From 4dd79783744adbfdc86f9454bffb5a92715a7f61 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 2 Dec 2025 18:33:45 -0800 Subject: [PATCH 01/16] [Bugfix] Fix regression on pooling models from PR#29621 (#29921) Signed-off-by: Roger Wang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/multimodal/parse.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 0d3b8289e4e12..650368dcb8fcd 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -134,11 +134,17 @@ class EmbeddingItems( or a list of embedding tensors (one per item). """ + def _unwrap( + self, item: torch.Tensor | MediaWithBytes[torch.Tensor] + ) -> torch.Tensor: + """Extract media from wrapper if present.""" + return item.media if isinstance(item, MediaWithBytes) else item + def get_count(self) -> int: return len(self.data) def get(self, index: int) -> torch.Tensor: - return self.data[index] + return self._unwrap(self.data[index]) def get_processor_data(self) -> Mapping[str, object]: return {} From 506ed87e876e9dca3c64ac83e21051c02e9cb2e3 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Tue, 2 Dec 2025 20:36:49 -0600 Subject: [PATCH 02/16] [ROCm][CI][Bugfix] Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers accuracy issues (#29909) Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 6 +----- requirements/rocm-test.txt | 4 ++-- .../models/multimodal/generation/conftest.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+), 7 deletions(-) create mode 100644 tests/models/multimodal/generation/conftest.py diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 4aabe2661088a..1b6bdabc7a539 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -65,7 +65,6 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples COPY 
--from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite -# Centralized v1 package - copied to both test and final stages COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 # ----------------------- @@ -98,7 +97,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system hf_transfer ENV HF_HUB_ENABLE_HF_TRANSFER=1 -# Copy in the v1 package +# Copy in the v1 package (for python-only install test group) COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 # Source code is used in the `python_only_compile.sh` test @@ -130,9 +129,6 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ && pip uninstall -y vllm \ && uv pip install --system *.whl -# Copy in the v1 package -COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 - ARG COMMON_WORKDIR # Copy over the benchmark scripts as well diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index ae61d4c6c6a81..394728b67eaa4 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -70,8 +70,8 @@ torchgeo==0.7.0 mteb==2.1.2 # Data processing -xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@eafd4db51b78acc64b3f0764ef27dfd206c28628 - # Test async scheduling +xgrammar==0.1.27 +# Test async scheduling # Utilities num2words==0.5.14 diff --git a/tests/models/multimodal/generation/conftest.py b/tests/models/multimodal/generation/conftest.py new file mode 100644 index 0000000000000..ee3ecdb10fdb8 --- /dev/null +++ b/tests/models/multimodal/generation/conftest.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Pytest configuration for vLLM tests.""" + +import torch + +from vllm.platforms import current_platform + + +def pytest_configure(config): + """Disable Flash/MemEfficient SDP on ROCm to avoid HF + 
Transformers accuracy issues. + """ + if not current_platform.is_rocm(): + return + + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_math_sdp(True) From d7284a2604ef3fe96f0779309caafb59860704bb Mon Sep 17 00:00:00 2001 From: Arpit Khandelwal <60464796+arpitkh101@users.noreply.github.com> Date: Tue, 2 Dec 2025 22:38:55 -0500 Subject: [PATCH 03/16] [Core] Rename PassConfig flags as per RFC #27995 (#29646) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: arpitkh101 Co-authored-by: Luka Govedič --- tests/compile/distributed/test_async_tp.py | 4 +- .../distributed/test_fusion_all_reduce.py | 2 +- tests/compile/distributed/test_fusions_e2e.py | 16 +-- .../distributed/test_sequence_parallelism.py | 18 +-- tests/compile/fullgraph/test_full_graph.py | 4 +- tests/compile/test_config.py | 77 +++++++++-- tests/compile/test_functionalization.py | 6 +- tests/compile/test_fusion.py | 4 +- tests/compile/test_fusion_attn.py | 2 +- tests/compile/test_noop_elimination.py | 4 +- tests/compile/test_pass_manager.py | 7 +- tests/compile/test_qk_norm_rope_fusion.py | 2 +- tests/compile/test_silu_mul_quant_fusion.py | 2 +- tests/distributed/test_sequence_parallel.py | 34 +++-- tests/test_config.py | 12 +- vllm/compilation/pass_manager.py | 13 +- vllm/config/compilation.py | 120 +++++++++++++++--- vllm/config/utils.py | 29 +++++ vllm/config/vllm.py | 76 ++++++----- vllm/v1/worker/gpu_model_runner.py | 5 +- vllm/v1/worker/gpu_worker.py | 2 +- vllm/v1/worker/utils.py | 2 +- 22 files changed, 318 insertions(+), 123 deletions(-) diff --git a/tests/compile/distributed/test_async_tp.py b/tests/compile/distributed/test_async_tp.py index 86d409f1eadb0..2eb18e25c98bf 100644 --- a/tests/compile/distributed/test_async_tp.py +++ b/tests/compile/distributed/test_async_tp.py @@ -326,7 +326,7 @@ def async_tp_pass_on_test_model( vllm_config = VllmConfig() 
vllm_config.compilation_config = CompilationConfig( pass_config=PassConfig( - enable_async_tp=True, + fuse_gemm_comms=True, ), ) vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) @@ -413,7 +413,7 @@ def test_async_tp_pass_correctness( "mode": CompilationMode.VLLM_COMPILE, "compile_sizes": [2, 4, 8], "splitting_ops": [], - "pass_config": {"enable_async_tp": async_tp_enabled}, + "pass_config": {"fuse_gemm_comms": async_tp_enabled}, } async_tp_args = [ diff --git a/tests/compile/distributed/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py index d401d57032752..fc8d1f98ebf87 100644 --- a/tests/compile/distributed/test_fusion_all_reduce.py +++ b/tests/compile/distributed/test_fusion_all_reduce.py @@ -295,7 +295,7 @@ def all_reduce_fusion_pass_on_test_model( ) ) vllm_config.compilation_config.pass_config = PassConfig( - enable_fi_allreduce_fusion=True, enable_noop=True + fuse_allreduce_rms=True, eliminate_noops=True ) vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) vllm_config.parallel_config.rank = local_rank # Setup rank for debug path diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 661172e1965b5..5d2786e122a61 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -192,7 +192,7 @@ def test_attn_quant( splitting_ops=splitting_ops, # Common mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True), + pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True), # Inductor caches custom passes by default as well via uuid inductor_compile_config={"force_disable_caches": True}, ) @@ -282,9 +282,9 @@ def test_tp2_attn_quant_allreduce_rmsnorm( # Common mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig( - enable_attn_fusion=True, - enable_noop=True, - enable_fi_allreduce_fusion=True, + fuse_attn_quant=True, + 
eliminate_noops=True, + fuse_allreduce_rms=True, ), # Inductor caches custom passes by default as well via uuid inductor_compile_config={"force_disable_caches": True}, @@ -384,10 +384,10 @@ def test_tp2_attn_quant_async_tp( # Common level=CompilationMode.VLLM_COMPILE, pass_config=PassConfig( - enable_attn_fusion=True, - enable_noop=True, - enable_sequence_parallelism=True, - enable_async_tp=True, + fuse_attn_quant=True, + eliminate_noops=True, + enable_sp=True, + fuse_gemm_comms=True, ), # Inductor caches custom passes by default as well via uuid inductor_compile_config={"force_disable_caches": True}, diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py index 30084dfd5a950..d9fdc3acc3d6f 100644 --- a/tests/compile/distributed/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -153,7 +153,7 @@ class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module): ] def ops_in_model(self): - if self.vllm_config.compilation_config.pass_config.enable_fusion: + if self.vllm_config.compilation_config.pass_config.fuse_norm_quant: return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default] elif RMSNorm.enabled(): return [ @@ -183,7 +183,7 @@ class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module): @pytest.mark.parametrize("seq_len", [16]) @pytest.mark.parametrize("hidden_size", [16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("enable_fusion", [True, False]) +@pytest.mark.parametrize("fuse_norm_quant", [True, False]) @pytest.mark.parametrize("dynamic", [False, True]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") def test_sequence_parallelism_pass( @@ -193,7 +193,7 @@ def test_sequence_parallelism_pass( seq_len: int, hidden_size: int, dtype: torch.dtype, - enable_fusion: bool, + fuse_norm_quant: bool, dynamic: bool, ): num_processes = 2 @@ -211,7 +211,7 @@ def 
test_sequence_parallelism_pass( seq_len, hidden_size, dtype, - enable_fusion, + fuse_norm_quant, dynamic, ), nprocs=nprocs, @@ -229,7 +229,7 @@ def sequence_parallelism_pass_on_test_model( seq_len: int, hidden_size: int, dtype: torch.dtype, - enable_fusion: bool, + fuse_norm_quant: bool, dynamic: bool, ): current_platform.seed_everything(0) @@ -260,9 +260,9 @@ def sequence_parallelism_pass_on_test_model( cudagraph_mode=CUDAGraphMode.NONE, # avoid piecewise warnings custom_ops=custom_ops_list, pass_config=PassConfig( - enable_sequence_parallelism=True, - enable_fusion=enable_fusion, - enable_noop=True, + enable_sp=True, + fuse_norm_quant=fuse_norm_quant, + eliminate_noops=True, ), ) # NoOp needed for fusion device_config = DeviceConfig(device=torch.device("cuda")) @@ -297,7 +297,7 @@ def sequence_parallelism_pass_on_test_model( sequence_parallelism_pass, ] - if enable_fusion: + if fuse_norm_quant: fusion_pass = RMSNormQuantFusionPass(vllm_config) passes_for_backend.append(fusion_pass) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 2c11ecef7f029..3cd1d4be2ebdc 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -122,7 +122,9 @@ def test_full_graph( CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm"], - pass_config=PassConfig(enable_fusion=True, enable_noop=True), + pass_config=PassConfig( + fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True + ), ), *model_info, ) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index a9e5ccee520e3..9e912c6d810d2 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy +import logging from contextlib import nullcontext from unittest.mock import patch @@ -10,8 +11,9 @@ from pydantic import ValidationError 
from vllm.compilation.counter import compilation_counter from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig -from vllm.config.compilation import CompilationMode +from vllm.config.compilation import CompilationMode, PassConfig from vllm.engine.arg_utils import EngineArgs +from vllm.logger import _print_warning_once from vllm.platforms import current_platform from vllm.utils.torch_utils import _is_torch_equal_or_newer @@ -191,7 +193,7 @@ def test_splitting_ops_dynamic(): config = VllmConfig( compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - pass_config={"enable_attn_fusion": True, "enable_noop": True}, + pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True), custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, ) @@ -206,7 +208,7 @@ def test_splitting_ops_dynamic(): config = VllmConfig( compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - pass_config={"enable_attn_fusion": True, "enable_noop": True}, + pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True), custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, # work around for accessing all attntion ops @@ -219,7 +221,7 @@ def test_splitting_ops_dynamic(): compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, - pass_config={"enable_attn_fusion": True, "enable_noop": True}, + pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True), custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, ) @@ -227,7 +229,7 @@ def test_splitting_ops_dynamic(): # With inductor graph partition, attn_fusion and splitting_ops # work together. Default splitting_ops include attention ops. 
assert config.compilation_config.splitting_ops_contain_attention() - # enable_attn_fusion is directly supported under + # fuse_attn_quant is directly supported under # use_inductor_graph_partition=True, and cudagraph_mode # is unchanged. assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE @@ -301,7 +303,7 @@ def test_should_split(): "cudagraph_capture_sizes", "max_cudagraph_capture_size", "tp_size", - "enable_sequence_parallelism", + "enable_sp", "max_num_batched_tokens", "cudagraph_mode", "expected_max_size", @@ -339,7 +341,7 @@ def test_cudagraph_sizes_post_init( cudagraph_capture_sizes, max_cudagraph_capture_size, tp_size, - enable_sequence_parallelism, + enable_sp, max_num_batched_tokens, cudagraph_mode, expected_max_size, @@ -355,11 +357,12 @@ def test_cudagraph_sizes_post_init( compilation_config = CompilationConfig( cudagraph_capture_sizes=cudagraph_capture_sizes, max_cudagraph_capture_size=max_cudagraph_capture_size, - pass_config={ - "enable_sequence_parallelism": enable_sequence_parallelism, - "enable_fusion": True, - "enable_noop": True, - }, + pass_config=PassConfig( + enable_sp=enable_sp, + fuse_norm_quant=True, + fuse_act_quant=True, + eliminate_noops=True, + ), cudagraph_mode=cudagraph_mode, ) engine_args = EngineArgs( @@ -375,3 +378,53 @@ def test_cudagraph_sizes_post_init( vllm_config.compilation_config.max_cudagraph_capture_size == expected_max_size ) + + +def test_pass_config_deprecation(caplog_vllm): + caplog_vllm.set_level(logging.WARNING) + + # Clear cache to ensure warnings are re-issued + _print_warning_once.cache_clear() + + # Test enable_fusion -> fuse_norm_quant, fuse_act_quant + caplog_vllm.clear() + config = PassConfig(enable_fusion=True) + assert "enable_fusion is deprecated" in caplog_vllm.text + assert config.fuse_norm_quant is True + assert config.fuse_act_quant is True + assert config.enable_fusion is None + + # Test enable_attn_fusion -> fuse_attn_quant + caplog_vllm.clear() + config = 
PassConfig(enable_attn_fusion=True) + assert "enable_attn_fusion is deprecated" in caplog_vllm.text + assert config.fuse_attn_quant is True + assert config.enable_attn_fusion is None + + # Test enable_noop -> eliminate_noops + caplog_vllm.clear() + config = PassConfig(enable_noop=True) + assert "enable_noop is deprecated" in caplog_vllm.text + assert config.eliminate_noops is True + assert config.enable_noop is None + + # Test enable_sequence_parallelism -> enable_sp + caplog_vllm.clear() + config = PassConfig(enable_sequence_parallelism=True) + assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text + assert config.enable_sp is True + assert config.enable_sequence_parallelism is None + + # Test enable_async_tp -> fuse_gemm_comms + caplog_vllm.clear() + config = PassConfig(enable_async_tp=True) + assert "enable_async_tp is deprecated" in caplog_vllm.text + assert config.fuse_gemm_comms is True + assert config.enable_async_tp is None + + # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms + caplog_vllm.clear() + config = PassConfig(enable_fi_allreduce_fusion=True) + assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text + assert config.fuse_allreduce_rms is True + assert config.enable_fi_allreduce_fusion is None diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 515e0a93ac2a8..7585915892700 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -223,7 +223,11 @@ def test_fix_functionalization( model_config=ModelConfig(dtype=dtype), compilation_config=CompilationConfig( custom_ops=["all"], - pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True), + pass_config=PassConfig( + fuse_norm_quant=do_fusion, + fuse_act_quant=do_fusion, + eliminate_noops=True, + ), ), ) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 286f2276367a0..d0ba8385f4a01 100644 --- a/tests/compile/test_fusion.py +++ 
b/tests/compile/test_fusion.py @@ -159,7 +159,9 @@ def test_fusion_rmsnorm_quant( compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, - pass_config=PassConfig(enable_fusion=True, enable_noop=True), + pass_config=PassConfig( + fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True + ), ), ) with vllm.config.set_current_vllm_config(vllm_config): diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 4d213e030edb5..9b4486e56c73e 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -373,7 +373,7 @@ def test_attention_quant_pattern( # Run model with attn fusion enabled vllm_config.compilation_config.pass_config = PassConfig( - enable_attn_fusion=True, enable_noop=True + fuse_attn_quant=True, eliminate_noops=True ) with ( set_current_vllm_config(vllm_config), diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py index 0ccc1a0161629..bfe08382fd949 100644 --- a/tests/compile/test_noop_elimination.py +++ b/tests/compile/test_noop_elimination.py @@ -51,7 +51,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size): vllm_config = VllmConfig( compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig(enable_noop=True), + pass_config=PassConfig(eliminate_noops=True), ) ) with vllm.config.set_current_vllm_config(vllm_config): @@ -99,7 +99,7 @@ def test_non_noop_slice_preserved(): vllm_config = VllmConfig( compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig(enable_noop=True), + pass_config=PassConfig(eliminate_noops=True), ) ) with vllm.config.set_current_vllm_config(vllm_config): diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 1c40c599f7487..6d0ba6b655031 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -64,8 +64,11 @@ def 
test_pass_manager_uuid(callable): # UUID should be different due to config change config2 = copy.deepcopy(config) - config2.compilation_config.pass_config.enable_fusion = ( - not config2.compilation_config.pass_config.enable_fusion + config2.compilation_config.pass_config.fuse_norm_quant = ( + not config2.compilation_config.pass_config.fuse_norm_quant + ) + config2.compilation_config.pass_config.fuse_act_quant = ( + not config2.compilation_config.pass_config.fuse_act_quant ) pass_manager3 = PostGradPassManager() pass_manager3.configure(config2) diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index 5ebb95b6db332..e0968ac799256 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -140,7 +140,7 @@ def test_qk_norm_rope_fusion( custom_ops=custom_ops, pass_config=PassConfig( enable_qk_norm_rope_fusion=True, - enable_noop=True, + eliminate_noops=True, ), ), ) diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 0ddb82b7c3fc2..c336a45955cb5 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -168,7 +168,7 @@ def test_fusion_silu_and_mul_quant( compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, - pass_config=PassConfig(enable_fusion=True, enable_noop=True), + pass_config=PassConfig(fuse_act_quant=True, eliminate_noops=True), ), ) diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index f38c509775ed5..0a7907aadeab5 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -32,7 +32,8 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" class ParallelSetup(NamedTuple): tp_size: int pp_size: int - enable_fusion: bool + fuse_norm_quant: bool + fuse_act_quant: bool eager_mode: bool chunked_prefill: bool @@ 
-66,7 +67,8 @@ class SPTestSettings: ParallelSetup( tp_size=tp_base, pp_size=pp_multiplier * pp_base, - enable_fusion=False, + fuse_norm_quant=False, + fuse_act_quant=False, eager_mode=eager_mode_val, chunked_prefill=chunked_prefill_val, ) @@ -97,7 +99,8 @@ class SPTestSettings: ParallelSetup( tp_size=tp_base, pp_size=pp_multiplier * pp_base, - enable_fusion=False, + fuse_norm_quant=False, + fuse_act_quant=False, eager_mode=eager_mode_val, chunked_prefill=chunked_prefill_val, ) @@ -126,7 +129,8 @@ class SPTestSettings: ParallelSetup( tp_size=tp_base, pp_size=pp_base, - enable_fusion=fusion_val, + fuse_norm_quant=fusion_val, + fuse_act_quant=fusion_val, eager_mode=True, chunked_prefill=False, ) @@ -162,7 +166,7 @@ def _compare_sp( test_options: SPTestOptions, num_gpus_available: int, use_inductor_graph_partition: bool, - enable_async_tp: bool, + fuse_gemm_comms: bool, *, method: Literal["generate", "encode"], is_multimodal: bool, @@ -170,7 +174,8 @@ def _compare_sp( ( tp_size, pp_size, - enable_fusion, + fuse_norm_quant, + fuse_act_quant, eager_mode, chunked_prefill, ) = parallel_setup @@ -248,10 +253,11 @@ def _compare_sp( "mode": CompilationMode.VLLM_COMPILE, "compile_sizes": [4, 8], "pass_config": { - "enable_sequence_parallelism": True, - "enable_async_tp": enable_async_tp, - "enable_fusion": enable_fusion, - "enable_noop": True, + "enable_sp": True, + "fuse_gemm_comms": fuse_gemm_comms, + "fuse_norm_quant": fuse_norm_quant, + "fuse_act_quant": fuse_act_quant, + "eliminate_noops": True, }, "use_inductor_graph_partition": use_inductor_graph_partition, } @@ -309,7 +315,7 @@ SP_TEST_MODELS = [ ], ) @pytest.mark.parametrize("use_inductor_graph_partition", [True, False]) -@pytest.mark.parametrize("enable_async_tp", [False]) # TODO: enable async TP +@pytest.mark.parametrize("fuse_gemm_comms", [False]) # TODO: enable async TP @create_new_process_for_each_test() def test_tp_sp_generation( model_id: str, @@ -319,7 +325,7 @@ def test_tp_sp_generation( test_options: 
SPTestOptions, num_gpus_available, use_inductor_graph_partition: bool, - enable_async_tp: bool, + fuse_gemm_comms: bool, ): if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): pytest.skip("inductor graph partition is only available in PyTorch 2.9+") @@ -328,7 +334,7 @@ def test_tp_sp_generation( if ( "fp8" in model_id.lower() and current_platform.get_device_capability() < (9, 0) - and (not enable_async_tp) + and (not fuse_gemm_comms) ): pytest.skip("FP8 reduction support begins with sm90 capable devices.") @@ -340,7 +346,7 @@ def test_tp_sp_generation( test_options, num_gpus_available, use_inductor_graph_partition, - enable_async_tp=enable_async_tp, + fuse_gemm_comms=fuse_gemm_comms, method="generate", is_multimodal=False, ) diff --git a/tests/test_config.py b/tests/test_config.py index b7ed68fea92ab..019c0d6d8733f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1023,17 +1023,17 @@ def test_vllm_config_explicit_overrides(): assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE # Explicit pass config flags to override defaults - pass_config = PassConfig(enable_noop=True, enable_attn_fusion=True) + pass_config = PassConfig(eliminate_noops=True, fuse_attn_quant=True) compilation_config = CompilationConfig(pass_config=pass_config) config = VllmConfig( optimization_level=OptimizationLevel.O0, compilation_config=compilation_config, ) - assert config.compilation_config.pass_config.enable_noop is True - assert config.compilation_config.pass_config.enable_attn_fusion is True + assert config.compilation_config.pass_config.eliminate_noops is True + assert config.compilation_config.pass_config.fuse_attn_quant is True # Explicit cudagraph mode override on quantized model at O2 - pass_config = PassConfig(enable_async_tp=True) + pass_config = PassConfig(fuse_gemm_comms=True) compilation_config = CompilationConfig( cudagraph_mode=CUDAGraphMode.NONE, pass_config=pass_config ) @@ -1043,7 +1043,7 @@ def 
test_vllm_config_explicit_overrides(): compilation_config=compilation_config, ) assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE - assert config.compilation_config.pass_config.enable_async_tp is True + assert config.compilation_config.pass_config.fuse_gemm_comms is True # Mode should still use default for O2 assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE @@ -1093,7 +1093,7 @@ def test_vllm_config_explicit_overrides(): compilation_config=compilation_config, ) # Explicit override should be respected - assert config.compilation_config.pass_config.enable_noop is False + assert config.compilation_config.pass_config.eliminate_noops is False # Other fields should still use defaults assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index fe2547d7fecaf..37f48721ea208 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -92,22 +92,23 @@ class PostGradPassManager(CustomGraphPass): # Set the current vllm config to allow tracing CustomOp instances with set_current_vllm_config(config, check_compile=False): - if self.pass_config.enable_noop: + if self.pass_config.eliminate_noops: self.passes += [NoOpEliminationPass(config)] - if self.pass_config.enable_sequence_parallelism: + if self.pass_config.enable_sp: self.passes += [SequenceParallelismPass(config)] - if self.pass_config.enable_async_tp: + if self.pass_config.fuse_gemm_comms: self.passes += [AsyncTPPass(config)] - if self.pass_config.enable_fi_allreduce_fusion: + if self.pass_config.fuse_allreduce_rms: self.passes += [AllReduceFusionPass(config)] - if self.pass_config.enable_fusion: + if self.pass_config.fuse_norm_quant: self.passes += [RMSNormQuantFusionPass(config)] + if self.pass_config.fuse_act_quant: self.passes += [ActivationQuantFusionPass(config)] - if 
self.pass_config.enable_attn_fusion: + if self.pass_config.fuse_attn_quant: self.passes += [AttnFusionPass(config)] if self.pass_config.enable_qk_norm_rope_fusion: diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 0f876c38169ac..963b091939e0e 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -13,7 +13,7 @@ from pydantic.dataclasses import dataclass import vllm.envs as envs from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass -from vllm.config.utils import config +from vllm.config.utils import config, handle_deprecated from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.import_utils import resolve_obj_by_qualname @@ -105,18 +105,43 @@ class PassConfig: improper state. """ + # New flags + fuse_norm_quant: bool = Field(default=None) + """Fuse the custom RMSNorm + quant ops.""" + fuse_act_quant: bool = Field(default=None) + """Fuse the custom SiluMul + quant ops.""" + fuse_attn_quant: bool = Field(default=None) + """Fuse the custom attention + quant ops.""" + eliminate_noops: bool = Field(default=None) + """Eliminate no-op ops.""" + enable_sp: bool = Field(default=None) + """Enable sequence parallelism.""" + fuse_gemm_comms: bool = Field(default=None) + """Enable async TP.""" + fuse_allreduce_rms: bool = Field(default=None) + """Enable flashinfer allreduce fusion.""" + + # Deprecated flags enable_fusion: bool = Field(default=None) - """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass.""" + """Deprecated in: v0.12.0. Use fuse_norm_quant and fuse_act_quant + instead. Will be removed in v0.13.0 or v1.0.0, whichever is sooner. + """ enable_attn_fusion: bool = Field(default=None) - """Whether to enable the custom attention+quant fusion pass.""" + """Deprecated in: v0.12.0. Use fuse_attn_quant instead. 
+ Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" enable_noop: bool = Field(default=None) - """Whether to enable the custom no-op elimination pass.""" + """Deprecated in: v0.12.0. Use eliminate_noops instead. + Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" enable_sequence_parallelism: bool = Field(default=None) - """Whether to enable sequence parallelism.""" + """Deprecated in: v0.12.0. Use enable_sp instead. + Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" enable_async_tp: bool = Field(default=None) - """Whether to enable async TP.""" + """Deprecated in: v0.12.0. Use fuse_gemm_comms instead. + Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" enable_fi_allreduce_fusion: bool = Field(default=None) - """Whether to enable flashinfer allreduce fusion.""" + """Deprecated in: v0.12.0. Use fuse_allreduce_rms instead. + Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" + fi_allreduce_fusion_max_size_mb: float | None = None """The threshold of the communicated tensor sizes under which vllm should use flashinfer fused allreduce. Specified as a @@ -136,7 +161,7 @@ class PassConfig: }, }, where key is the device capability""" enable_qk_norm_rope_fusion: bool = False - """Whether to enable the fused Q/K RMSNorm + RoPE pass.""" + """Enable fused Q/K RMSNorm + RoPE pass.""" # TODO(luka) better pass enabling system. 
@@ -174,6 +199,13 @@ class PassConfig: return InductorPass.hash_dict(asdict(self)) @field_validator( + "fuse_norm_quant", + "fuse_act_quant", + "fuse_attn_quant", + "eliminate_noops", + "enable_sp", + "fuse_gemm_comms", + "fuse_allreduce_rms", "enable_fusion", "enable_attn_fusion", "enable_noop", @@ -190,18 +222,71 @@ class PassConfig: return handler(value) def __post_init__(self) -> None: - if not self.enable_noop: - if self.enable_fusion: + # Handle deprecation and defaults + + # Map old flags to new flags and issue warnings + handle_deprecated( + self, + "enable_fusion", + ["fuse_norm_quant", "fuse_act_quant"], + "v0.13.0 or v1.0.0, whichever is sooner", + ) + + handle_deprecated( + self, + "enable_attn_fusion", + "fuse_attn_quant", + "v0.13.0 or v1.0.0, whichever is sooner", + ) + + handle_deprecated( + self, + "enable_sequence_parallelism", + "enable_sp", + "v0.13.0 or v1.0.0, whichever is sooner", + ) + + handle_deprecated( + self, + "enable_async_tp", + "fuse_gemm_comms", + "v0.13.0 or v1.0.0, whichever is sooner", + ) + + handle_deprecated( + self, + "enable_fi_allreduce_fusion", + "fuse_allreduce_rms", + "v0.13.0 or v1.0.0, whichever is sooner", + ) + + handle_deprecated( + self, + "enable_noop", + "eliminate_noops", + "v0.13.0 or v1.0.0, whichever is sooner", + ) + + # Force old flags to None to ensure they are not used + self.enable_fusion = None + self.enable_attn_fusion = None + self.enable_noop = None + self.enable_sequence_parallelism = None + self.enable_async_tp = None + self.enable_fi_allreduce_fusion = None + + if not self.eliminate_noops: + if self.fuse_norm_quant or self.fuse_act_quant: logger.warning_once( "Fusion enabled but reshape elimination disabled. " "RMSNorm/SiluMul + quant (fp8) fusion might not work" ) - if self.enable_attn_fusion: + if self.fuse_attn_quant: logger.warning_once( "Fusion enabled but reshape elimination disabled. 
" "Attention + quant (fp8) fusion might not work" ) - if self.enable_fi_allreduce_fusion: + if self.fuse_allreduce_rms: logger.warning_once( "Fusion enabled but reshape elimination disabled. " "Allreduce + rms norm + quant (fp8) fusion might not work" @@ -873,7 +958,7 @@ class CompilationConfig: self.set_splitting_ops_for_inductor_graph_partition() return - if self.pass_config.enable_attn_fusion: + if self.pass_config.fuse_attn_quant: # here use_inductor_graph_partition is False self.set_splitting_ops_for_attn_fusion() return @@ -915,12 +1000,12 @@ class CompilationConfig: self.splitting_ops = list(self._attention_ops) def set_splitting_ops_for_attn_fusion(self): - assert self.pass_config.enable_attn_fusion + assert self.pass_config.fuse_attn_quant if self.splitting_ops is None: self.splitting_ops = [] if self.cudagraph_mode.has_piecewise_cudagraphs(): logger.warning_once( - "enable_attn_fusion is incompatible with piecewise " + "fuse_attn_quant is incompatible with piecewise " "cudagraph when use_inductor_graph_partition is off. " "In this case, splitting_ops will be set to empty " "list, and cudagraph_mode will be set to FULL. 
" @@ -931,8 +1016,7 @@ class CompilationConfig: self.cudagraph_mode = CUDAGraphMode.FULL assert not self.splitting_ops_contain_attention(), ( - "attention ops should not be in splitting_ops " - "when enable_attn_fusion is True" + "attention ops should not be in splitting_ops when fuse_attn_quant is True" ) def splitting_ops_contain_attention(self) -> bool: @@ -1008,7 +1092,7 @@ class CompilationConfig: self, uniform_decode_query_len: int, tensor_parallel_size: int ): multiple_of = uniform_decode_query_len - if tensor_parallel_size > 1 and self.pass_config.enable_sequence_parallelism: + if tensor_parallel_size > 1 and self.pass_config.enable_sp: multiple_of = max(uniform_decode_query_len, tensor_parallel_size) if ( multiple_of % uniform_decode_query_len != 0 diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 02f2b75f608f1..3124fcf007396 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -19,6 +19,10 @@ import torch from pydantic.fields import FieldInfo from typing_extensions import runtime_checkable +from vllm.logger import init_logger + +logger = init_logger(__name__) + if TYPE_CHECKING: from _typeshed import DataclassInstance else: @@ -293,3 +297,28 @@ def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, ob def hash_factors(items: dict[str, object]) -> str: """Return a SHA-256 hex digest of the canonical items structure.""" return hashlib.sha256(json.dumps(items, sort_keys=True).encode()).hexdigest() + + +def handle_deprecated( + config: ConfigT, + old_name: str, + new_name_or_names: str | list[str], + removal_version: str, +) -> None: + old_val = getattr(config, old_name) + if old_val is None: + return + + if isinstance(new_name_or_names, str): + new_names = [new_name_or_names] + else: + new_names = new_name_or_names + + msg = ( + f"{old_name} is deprecated and will be removed in {removal_version}. " + f"Use {', '.join(new_names)} instead." 
+ ) + logger.warning(msg) + + for new_name in new_names: + setattr(config, new_name, old_val) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 5b3a9c437662b..735b0afbaaeb3 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -83,22 +83,33 @@ IS_DENSE = False # See https://github.com/vllm-project/vllm/issues/25689. -def enable_fusion(cfg: "VllmConfig") -> bool: - """Returns True if RMS norm or quant FP8 is enabled.""" +def enable_norm_fusion(cfg: "VllmConfig") -> bool: + """Enable if either RMS norm or quant FP8 custom op is active; + otherwise Inductor handles fusion.""" + return cfg.compilation_config.is_custom_op_enabled( "rms_norm" ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8") +def enable_act_fusion(cfg: "VllmConfig") -> bool: + """Enable if either SiLU+Mul or quant FP8 custom op is active; + otherwise Inductor handles fusion.""" + return cfg.compilation_config.is_custom_op_enabled( + "silu_and_mul" + ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8") + + OPTIMIZATION_LEVEL_00 = { "compilation_config": { "pass_config": { - "enable_noop": False, - "enable_fusion": False, - "enable_fi_allreduce_fusion": False, - "enable_attn_fusion": False, - "enable_sequence_parallelism": False, - "enable_async_tp": False, + "eliminate_noops": False, + "fuse_norm_quant": False, + "fuse_act_quant": False, + "fuse_allreduce_rms": False, + "fuse_attn_quant": False, + "enable_sp": False, + "fuse_gemm_comms": False, }, "cudagraph_mode": CUDAGraphMode.NONE, "use_inductor_graph_partition": False, @@ -107,12 +118,13 @@ OPTIMIZATION_LEVEL_00 = { OPTIMIZATION_LEVEL_01 = { "compilation_config": { "pass_config": { - "enable_noop": True, - "enable_fusion": enable_fusion, - "enable_fi_allreduce_fusion": False, - "enable_attn_fusion": False, - "enable_sequence_parallelism": False, - "enable_async_tp": False, + "eliminate_noops": True, + "fuse_norm_quant": enable_norm_fusion, + "fuse_act_quant": enable_act_fusion, + "fuse_allreduce_rms": False, + 
"fuse_attn_quant": False, + "enable_sp": False, + "fuse_gemm_comms": False, }, "cudagraph_mode": CUDAGraphMode.PIECEWISE, "use_inductor_graph_partition": False, @@ -121,12 +133,13 @@ OPTIMIZATION_LEVEL_01 = { OPTIMIZATION_LEVEL_02 = { "compilation_config": { "pass_config": { - "enable_noop": True, - "enable_fusion": enable_fusion, - "enable_fi_allreduce_fusion": False, - "enable_attn_fusion": IS_QUANTIZED, - "enable_sequence_parallelism": IS_DENSE, - "enable_async_tp": IS_DENSE, + "eliminate_noops": True, + "fuse_norm_quant": enable_norm_fusion, + "fuse_act_quant": enable_act_fusion, + "fuse_allreduce_rms": False, + "fuse_attn_quant": IS_QUANTIZED, + "enable_sp": IS_DENSE, + "fuse_gemm_comms": IS_DENSE, }, "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE, "use_inductor_graph_partition": False, @@ -135,12 +148,13 @@ OPTIMIZATION_LEVEL_02 = { OPTIMIZATION_LEVEL_03 = { "compilation_config": { "pass_config": { - "enable_noop": True, - "enable_fusion": enable_fusion, - "enable_fi_allreduce_fusion": False, - "enable_attn_fusion": IS_QUANTIZED, - "enable_sequence_parallelism": IS_DENSE, - "enable_async_tp": IS_DENSE, + "eliminate_noops": True, + "fuse_norm_quant": enable_norm_fusion, + "fuse_act_quant": enable_act_fusion, + "fuse_allreduce_rms": False, + "fuse_attn_quant": IS_QUANTIZED, + "enable_sp": IS_DENSE, + "fuse_gemm_comms": IS_DENSE, }, "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE, "use_inductor_graph_partition": False, @@ -645,9 +659,9 @@ class VllmConfig: # async tp is built on top of sequence parallelism # and requires it to be enabled. 
- if self.compilation_config.pass_config.enable_async_tp: - self.compilation_config.pass_config.enable_sequence_parallelism = True - if self.compilation_config.pass_config.enable_sequence_parallelism: + if self.compilation_config.pass_config.fuse_gemm_comms: + self.compilation_config.pass_config.enable_sp = True + if self.compilation_config.pass_config.enable_sp: if "-rms_norm" in self.compilation_config.custom_ops: logger.warning( "RMS norm force disabled, sequence parallelism might break" @@ -797,7 +811,7 @@ class VllmConfig: # Do this after all the updates to compilation_config.mode self.compilation_config.set_splitting_ops_for_v1() - if self.compilation_config.pass_config.enable_sequence_parallelism: + if self.compilation_config.pass_config.enable_sp: # With pipeline parallelism or dynamo partitioning, # native rms norm tracing errors due to incorrect residual shape. # Use custom rms norm to unblock. In the future, @@ -1062,7 +1076,7 @@ class VllmConfig: if ( self.parallel_config.tensor_parallel_size > 1 - and self.compilation_config.pass_config.enable_sequence_parallelism + and self.compilation_config.pass_config.enable_sp ): cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism( cudagraph_capture_sizes diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8c22ada029b1a..1b250a8bd009c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2417,10 +2417,7 @@ class GPUModelRunner( # Pad tokens to multiple of tensor_parallel_size when # enabled collective fusion for SP tp_size = self.vllm_config.parallel_config.tensor_parallel_size - if ( - self.compilation_config.pass_config.enable_sequence_parallelism - and tp_size > 1 - ): + if self.compilation_config.pass_config.enable_sp and tp_size > 1: return round_up(num_scheduled_tokens, tp_size) return num_scheduled_tokens diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index ed6fb32bcb2f6..edba07a423cda 
100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -552,7 +552,7 @@ class Worker(WorkerBase): if ( parallel_config.pipeline_parallel_size > 1 - and compilation_config.pass_config.enable_sequence_parallelism + and compilation_config.pass_config.enable_sp and forward_pass ): # currently only supported by V1 GPUModelRunner diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index bd88cb1b253f8..427a0d296b253 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -342,7 +342,7 @@ def is_residual_scattered_for_sp( partition), SP is always applied - Otherwise, SP is only applied for specific shapes in compile_sizes """ - if not vllm_config.compilation_config.pass_config.enable_sequence_parallelism: + if not vllm_config.compilation_config.pass_config.enable_sp: return False tp = vllm_config.parallel_config.tensor_parallel_size From b08025a83bf416d97d0547ac52c3909356e118c4 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 2 Dec 2025 23:57:28 -0500 Subject: [PATCH 04/16] [Docs] Discuss api key limitations in security guide (#29922) Signed-off-by: Russell Bryant --- docs/usage/security.md | 110 +++++++++++++++++++++++++++++++++ vllm/entrypoints/cli/openai.py | 4 ++ 2 files changed, 114 insertions(+) diff --git a/docs/usage/security.md b/docs/usage/security.md index 9d10b66a5a97f..74060d86f6854 100644 --- a/docs/usage/security.md +++ b/docs/usage/security.md @@ -108,6 +108,116 @@ networks. Consult your operating system or application platform documentation for specific firewall configuration instructions. +## API Key Authentication Limitations + +### Overview + +The `--api-key` flag (or `VLLM_API_KEY` environment variable) provides authentication for vLLM's HTTP server, but **only for OpenAI-compatible API endpoints under the `/v1` path prefix**. Many other sensitive endpoints are exposed on the same HTTP server without any authentication enforcement. 
+ +**Important:** Do not rely exclusively on `--api-key` for securing access to vLLM. Additional security measures are required for production deployments. + +### Protected Endpoints (Require API Key) + +When `--api-key` is configured, the following `/v1` endpoints require Bearer token authentication: + +- `/v1/models` - List available models +- `/v1/chat/completions` - Chat completions +- `/v1/completions` - Text completions +- `/v1/embeddings` - Generate embeddings +- `/v1/audio/transcriptions` - Audio transcription +- `/v1/audio/translations` - Audio translation +- `/v1/messages` - Anthropic-compatible messages API +- `/v1/responses` - Response management +- `/v1/score` - Scoring API +- `/v1/rerank` - Reranking API + +### Unprotected Endpoints (No API Key Required) + +The following endpoints **do not require authentication** even when `--api-key` is configured: + +**Inference endpoints:** + +- `/invocations` - SageMaker-compatible endpoint (routes to the same inference functions as `/v1` endpoints) +- `/inference/v1/generate` - Generate completions +- `/pooling` - Pooling API +- `/classify` - Classification API +- `/score` - Scoring API (non-`/v1` variant) +- `/rerank` - Reranking API (non-`/v1` variant) + +**Operational control endpoints (always enabled):** + +- `/pause` - Pause generation (causes denial of service) +- `/resume` - Resume generation +- `/scale_elastic_ep` - Trigger scaling operations + +**Utility endpoints:** + +- `/tokenize` - Tokenize text +- `/detokenize` - Detokenize tokens +- `/health` - Health check +- `/ping` - SageMaker health check +- `/version` - Version information +- `/load` - Server load metrics + +**Tokenizer information endpoint (only when `--enable-tokenizer-info-endpoint` is set):** + +This endpoint is **only available when the `--enable-tokenizer-info-endpoint` flag is set**. 
It may expose sensitive information such as chat templates and tokenizer configuration: + +- `/tokenizer_info` - Get comprehensive tokenizer information including chat templates and configuration + +**Development endpoints (only when `VLLM_SERVER_DEV_MODE=1`):** + +These endpoints are **only available when the environment variable `VLLM_SERVER_DEV_MODE` is set to `1`**. They are intended for development and debugging purposes and should never be enabled in production: + +- `/server_info` - Get detailed server configuration +- `/reset_prefix_cache` - Reset prefix cache (can disrupt service) +- `/reset_mm_cache` - Reset multimodal cache (can disrupt service) +- `/sleep` - Put engine to sleep (causes denial of service) +- `/wake_up` - Wake engine from sleep +- `/is_sleeping` - Check if engine is sleeping +- `/collective_rpc` - Execute arbitrary RPC methods on the engine (extremely dangerous) + +**Profiler endpoints (only when `VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE` are set):** + +These endpoints are only available when profiling is enabled and should only be used for local development: + +- `/start_profile` - Start PyTorch profiler +- `/stop_profile` - Stop PyTorch profiler + +**Note:** The `/invocations` endpoint is particularly concerning as it provides unauthenticated access to the same inference capabilities as the protected `/v1` endpoints. + +### Security Implications + +An attacker who can reach the vLLM HTTP server can: + +1. **Bypass authentication** by using non-`/v1` endpoints like `/invocations`, `/inference/v1/generate`, `/pooling`, `/classify`, `/score`, or `/rerank` to run arbitrary inference without credentials +2. **Cause denial of service** by calling `/pause` or `/scale_elastic_ep` without a token +3. **Access operational controls** to manipulate server state (e.g., pausing generation) +4. 
**If `--enable-tokenizer-info-endpoint` is set:** Access sensitive tokenizer configuration including chat templates, which may reveal prompt engineering strategies or other implementation details +5. **If `VLLM_SERVER_DEV_MODE=1` is set:** Execute arbitrary RPC commands via `/collective_rpc`, reset caches, put the engine to sleep, and access detailed server configuration + +### Recommended Security Practices + +#### 1. Minimize Exposed Endpoints + +**CRITICAL:** Never set `VLLM_SERVER_DEV_MODE=1` in production environments. Development endpoints expose extremely dangerous functionality including: + +- Arbitrary RPC execution via `/collective_rpc` +- Cache manipulation that can disrupt service +- Detailed server configuration disclosure + +Similarly, never enable profiler endpoints (`VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE`) in production. + +**Be cautious with `--enable-tokenizer-info-endpoint`:** Only enable the `/tokenizer_info` endpoint if you need to expose tokenizer configuration information. This endpoint reveals chat templates and tokenizer settings that may contain sensitive implementation details or prompt engineering strategies. + +#### 2. Deploy Behind a Reverse Proxy + +The most effective approach is to deploy vLLM behind a reverse proxy (such as nginx, Envoy, or a Kubernetes Gateway) that: + +- Explicitly allowlists only the endpoints you want to expose to end users +- Blocks all other endpoints, including the unauthenticated inference and operational control endpoints +- Implements additional authentication, rate limiting, and logging at the proxy layer + ## Reporting Security Vulnerabilities If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md). 
diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index fb49be370203e..1c18b193d1cdc 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -109,6 +109,10 @@ def _add_query_options(parser: FlexibleArgumentParser) -> FlexibleArgumentParser help=( "API key for OpenAI services. If provided, this api key " "will overwrite the api key obtained through environment variables." + " It is important to note that this option only applies to the " + "OpenAI-compatible API endpoints and NOT other endpoints that may " + "be present in the server. See the security guide in the vLLM docs " + "for more details." ), ) return parser From c719c40540a85c1e6aeee9af20f29db581da27f0 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Wed, 3 Dec 2025 13:15:50 +0800 Subject: [PATCH 05/16] [Bugfix] Defunctionalize TRTLLM AR+Norm op for avoiding extra clone kernel before it (#29631) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Signed-off-by: Luka Govedič Co-authored-by: Luka Govedič --- vllm/compilation/fix_functionalization.py | 12 ++++++++++++ vllm/compilation/fx_utils.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 126ad35e527ae..76068f86ebfb3 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -103,6 +103,18 @@ class FixFunctionalizationPass(VllmInductorPass): ]: mutated_args = {1: "result"} self.defunctionalize(graph, node, mutated_args) + elif ( + at_target + == torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default + ): + mutated_args = { + 1: "allreduce_in", + 2: "residual", + 3: "norm_out", + 4: "quant_out", + 5: "scale_out", + } + self.defunctionalize(graph, node, mutated_args) # For some 
reason we need to specify the args for both # silu_and_mul and silu_and_mul_quant. The kwargs # pathway gets the wrong answer. diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index f2497950fc22f..3650ee6b41745 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -75,8 +75,8 @@ def find_op_nodes( return assert isinstance(op, OpOverload) - if not op._schema.is_mutable: - yield from graph.find_nodes(op="call_function", target=op) + + yield from graph.find_nodes(op="call_function", target=op) for n in graph.find_nodes(op="call_function", target=auto_functionalized): if n.args[0] == op: From 0bec63fa317e1fbd62e19b0fc31c43c81bf89077 Mon Sep 17 00:00:00 2001 From: JackieWu Date: Wed, 3 Dec 2025 14:20:37 +0800 Subject: [PATCH 06/16] [BugFix] fix imgs_pos in hunyuan_vl (#29879) Co-authored-by: Isotr0py --- vllm/transformers_utils/processors/hunyuan_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/processors/hunyuan_vl.py b/vllm/transformers_utils/processors/hunyuan_vl.py index 615a8bff85912..f32ce115c866d 100644 --- a/vllm/transformers_utils/processors/hunyuan_vl.py +++ b/vllm/transformers_utils/processors/hunyuan_vl.py @@ -123,7 +123,7 @@ class HunYuanVLProcessor(ProcessorMixin): attention_mask = input_ids.ne(self.pad_id) text_inputs["attention_mask"] = attention_mask - text_inputs["imgs_pos"] = [self.get_imgs_pos(input_ids)] + text_inputs["imgs_pos"] = [self.get_imgs_pos(e) for e in input_ids] # image_inputs["imgs"] = [[image_inputs["pixel_values"]]] return_tensors = kwargs.pop("return_tensors", None) From bbfb55c29e7febb91e90f261dd9adb4200ee3a09 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 3 Dec 2025 15:49:34 +0800 Subject: [PATCH 07/16] [Misc] Allow `fetch_*` utils to access local files by default (#29932) Signed-off-by: DarkLight1337 --- vllm/multimodal/utils.py | 38 ++++++++++++++++++++++++++++++-------- vllm/multimodal/video.py | 2 +- 2 files changed, 31 
insertions(+), 9 deletions(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 1020554e2e073..1840220854858 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -67,8 +67,9 @@ class MediaConnector: to set num_frames for video, set `--media-io-kwargs '{"video":{"num_frames":40}}'` connection: HTTP connection client to download media contents. - allowed_local_media_path: A local directory to load media files - from. + allowed_local_media_path: A local directory to load media files from. + allowed_media_domains: If set, only media URLs that belong to this + domain can be used for multi-modal inputs. """ super().__init__() @@ -123,16 +124,16 @@ class MediaConnector: "Cannot load local files without `--allowed-local-media-path`." ) - filepath = Path(url2pathname(url_spec.path)) + filepath = Path(url2pathname(url_spec.netloc + url_spec.path)) if allowed_local_media_path not in filepath.resolve().parents: raise ValueError( f"The file path {filepath} must be a subpath " - f"of `--allowed-local-media-path` {allowed_local_media_path}." + f"of `--allowed-local-media-path {allowed_local_media_path}`." ) return media_io.load_file(filepath) - def _assert_url_in_allowed_media_domains(self, url_spec) -> None: + def _assert_url_in_allowed_media_domains(self, url_spec: ParseResult) -> None: if ( self.allowed_media_domains and url_spec.hostname not in self.allowed_media_domains @@ -489,9 +490,16 @@ def fetch_audio( Args: audio_url: URL of the audio file to fetch. audio_io_kwargs: Additional kwargs passed to handle audio IO. + + Warning: + This method has direct access to local files and is only intended + to be called by user code. Never call this from the online server! 
""" media_io_kwargs = None if not audio_io_kwargs else {"audio": audio_io_kwargs} - media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + media_connector = MediaConnector( + media_io_kwargs=media_io_kwargs, + allowed_local_media_path="/", + ) return media_connector.fetch_audio(audio_url) @@ -503,9 +511,16 @@ def fetch_image( Args: image_url: URL of the image file to fetch. image_io_kwargs: Additional kwargs passed to handle image IO. + + Warning: + This method has direct access to local files and is only intended + to be called by user code. Never call this from the online server! """ media_io_kwargs = None if not image_io_kwargs else {"image": image_io_kwargs} - media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + media_connector = MediaConnector( + media_io_kwargs=media_io_kwargs, + allowed_local_media_path="/", + ) return media_connector.fetch_image(image_url) @@ -517,7 +532,14 @@ def fetch_video( Args: video_url: URL of the video file to fetch. video_io_kwargs: Additional kwargs passed to handle video IO. + + Warning: + This method has direct access to local files and is only intended + to be called by user code. Never call this from the online server! 
""" media_io_kwargs = None if not video_io_kwargs else {"video": video_io_kwargs} - media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) + media_connector = MediaConnector( + media_io_kwargs=media_io_kwargs, + allowed_local_media_path="/", + ) return media_connector.fetch_video(video_url) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 763f90fde7b6d..abfc226a689c2 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -267,7 +267,7 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): return frames, metadata -class VideoMediaIO(MediaIO[npt.NDArray]): +class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]): def __init__( self, image_io: ImageMediaIO, From 3a7751485b71ce5ef927e4aa03b28602cb90811c Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Tue, 2 Dec 2025 23:59:23 -0800 Subject: [PATCH 08/16] [responsesAPI] support input output messages for non harmony models (#29549) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- .../openai/test_response_api_simple.py | 18 +++++++++++++++ vllm/entrypoints/context.py | 22 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 22 +++++++++++++++---- vllm/entrypoints/openai/serving_responses.py | 13 +++++------ 4 files changed, 64 insertions(+), 11 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_simple.py b/tests/entrypoints/openai/test_response_api_simple.py index 425b8199a0fd0..aee03199bc6f4 100644 --- a/tests/entrypoints/openai/test_response_api_simple.py +++ b/tests/entrypoints/openai/test_response_api_simple.py @@ -42,6 +42,24 @@ async def test_basic(client: OpenAI, model_name: str): assert response.status == "completed" +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_enable_response_messages(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="Hello?", + extra_body={"enable_response_messages": True}, + ) + assert response.status == 
"completed" + assert response.input_messages[0]["type"] == "raw_message_tokens" + assert type(response.input_messages[0]["message"]) is str + assert len(response.input_messages[0]["message"]) > 10 + assert type(response.input_messages[0]["tokens"][0]) is int + assert type(response.output_messages[0]["message"]) is str + assert len(response.output_messages[0]["message"]) > 10 + assert type(response.output_messages[0]["tokens"][0]) is int + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_reasoning_item(client: OpenAI, model_name: str): diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 1260f65dba59a..43783c92667af 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -23,6 +23,7 @@ from vllm.entrypoints.openai.parser.responses_parser import ( ) from vllm.entrypoints.openai.protocol import ( ResponseInputOutputItem, + ResponseRawMessageAndToken, ResponsesRequest, ) from vllm.entrypoints.responses_utils import construct_tool_dicts @@ -148,6 +149,8 @@ def _create_json_parse_error_messages( class SimpleContext(ConversationContext): + """This is a context that cannot handle MCP tool calls""" + def __init__(self): self.last_output = None self.num_prompt_tokens = 0 @@ -158,6 +161,9 @@ class SimpleContext(ConversationContext): # not implemented yet for SimpleContext self.all_turn_metrics = [] + self.input_messages: list[ResponseRawMessageAndToken] = [] + self.output_messages: list[ResponseRawMessageAndToken] = [] + def append_output(self, output) -> None: self.last_output = output if not isinstance(output, RequestOutput): @@ -166,6 +172,22 @@ class SimpleContext(ConversationContext): self.num_cached_tokens = output.num_cached_tokens or 0 self.num_output_tokens += len(output.outputs[0].token_ids or []) + if len(self.input_messages) == 0: + output_prompt = output.prompt or "" + output_prompt_token_ids = output.prompt_token_ids or [] + self.input_messages.append( + 
ResponseRawMessageAndToken( + message=output_prompt, + tokens=output_prompt_token_ids, + ) + ) + self.output_messages.append( + ResponseRawMessageAndToken( + message=output.outputs[0].text, + tokens=output.outputs[0].token_ids, + ) + ) + def append_tool_output(self, output) -> None: raise NotImplementedError("Should not be called.") diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0f4b2b4d7aad0..2d34a6a0cd5ad 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1598,6 +1598,20 @@ def serialize_messages(msgs): return [serialize_message(msg) for msg in msgs] if msgs else None +class ResponseRawMessageAndToken(OpenAIBaseModel): + """Class to show the raw message. + If message / tokens diverge, tokens is the source of truth""" + + message: str + tokens: list[int] + type: Literal["raw_message_tokens"] = "raw_message_tokens" + + +ResponseInputOutputMessage: TypeAlias = ( + list[ChatCompletionMessageParam] | list[ResponseRawMessageAndToken] +) + + class ResponsesResponse(OpenAIBaseModel): id: str = Field(default_factory=lambda: f"resp_{random_uuid()}") created_at: int = Field(default_factory=lambda: int(time.time())) @@ -1631,8 +1645,8 @@ class ResponsesResponse(OpenAIBaseModel): # These are populated when enable_response_messages is set to True # NOTE: custom serialization is needed # see serialize_input_messages and serialize_output_messages - input_messages: list[ChatCompletionMessageParam] | None = None - output_messages: list[ChatCompletionMessageParam] | None = None + input_messages: ResponseInputOutputMessage | None = None + output_messages: ResponseInputOutputMessage | None = None # --8<-- [end:responses-extra-params] # NOTE: openAI harmony doesn't serialize TextContent properly, @@ -1658,8 +1672,8 @@ class ResponsesResponse(OpenAIBaseModel): output: list[ResponseOutputItem], status: ResponseStatus, usage: ResponseUsage | None = None, - input_messages: 
list[ChatCompletionMessageParam] | None = None, - output_messages: list[ChatCompletionMessageParam] | None = None, + input_messages: ResponseInputOutputMessage | None = None, + output_messages: ResponseInputOutputMessage | None = None, ) -> "ResponsesResponse": incomplete_details: IncompleteDetails | None = None if status == "incomplete": diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 5ad86194ce1b2..3c9ae8e8c8087 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -86,6 +86,7 @@ from vllm.entrypoints.openai.protocol import ( ResponseCompletedEvent, ResponseCreatedEvent, ResponseInProgressEvent, + ResponseInputOutputMessage, ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent, ResponsesRequest, @@ -629,8 +630,8 @@ class OpenAIServingResponses(OpenAIServing): # "completed" is implemented as the "catch-all" for now. status: ResponseStatus = "completed" - input_messages = None - output_messages = None + input_messages: ResponseInputOutputMessage | None = None + output_messages: ResponseInputOutputMessage | None = None if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) @@ -670,12 +671,10 @@ class OpenAIServingResponses(OpenAIServing): output = self._make_response_output_items(request, final_output, tokenizer) - # TODO: context for non-gptoss models doesn't use messages - # so we can't get them out yet if request.enable_response_messages: - raise NotImplementedError( - "enable_response_messages is currently only supported for gpt-oss" - ) + input_messages = context.input_messages + output_messages = context.output_messages + # Calculate usage. 
assert final_res.prompt_token_ids is not None num_tool_output_tokens = 0 From 69520bc695ff8fa7fda66ef7c1a16761824ad354 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Tue, 2 Dec 2025 23:01:48 -1000 Subject: [PATCH 09/16] Add logging for cudagraph related info (#29825) Signed-off-by: Yong Hoon Shin --- vllm/compilation/cuda_graph.py | 94 ++++++++++++++++++++++++++++++ vllm/config/observability.py | 4 ++ vllm/engine/arg_utils.py | 6 ++ vllm/v1/core/sched/scheduler.py | 8 ++- vllm/v1/metrics/loggers.py | 14 +++++ vllm/v1/metrics/stats.py | 3 + vllm/v1/outputs.py | 4 ++ vllm/v1/worker/gpu_model_runner.py | 32 ++++++++-- vllm/v1/worker/gpu_worker.py | 2 +- 9 files changed, 161 insertions(+), 6 deletions(-) diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py index a2e0abfebc2c9..0748643a5299f 100644 --- a/vllm/compilation/cuda_graph.py +++ b/vllm/compilation/cuda_graph.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses +from collections import Counter from collections.abc import Callable from contextlib import ExitStack from typing import Any @@ -22,6 +23,99 @@ from vllm.utils.torch_utils import weak_ref_tensors logger = init_logger(__name__) +@dataclasses.dataclass(frozen=True) +class CUDAGraphStat: + num_unpadded_tokens: int + num_padded_tokens: int + num_paddings: int + runtime_mode: str + + +class CUDAGraphLogging: + """Aggregate and log cudagraph metrics""" + + COLUMN_HEADERS = [ + "Unpadded Tokens", + "Padded Tokens", + "Num Paddings", + "Runtime Mode", + "Count", + ] + + def __init__(self, cg_mode: CUDAGraphMode, cg_capture_sizes: list[int] | None): + self.reset() + self.cg_mode = str(cg_mode) + self.cg_capture_sizes = str(cg_capture_sizes or []) + + self.settings_header = ( + "**CUDAGraph Config Settings:**\n\n" + f"- Mode: {self.cg_mode}\n" + f"- Capture sizes: {self.cg_capture_sizes}\n\n" + "**CUDAGraph Stats:**\n\n" + ) + + def 
reset(self): + self.stats = [] + + def observe(self, cudagraph_stat: CUDAGraphStat): + self.stats.append(cudagraph_stat) + + def generate_metric_table(self) -> str: + stats_counts = Counter(self.stats) + + # Convert stats to rows of strings, in descending order of observed frequencies + rows = [] + for stat, count in sorted( + stats_counts.items(), key=lambda item: item[1], reverse=True + ): + rows.append( + [ + str(stat.num_unpadded_tokens), + str(stat.num_padded_tokens), + str(stat.num_paddings), + stat.runtime_mode, + str(count), + ] + ) + + # Calculate column widths (max of header and data) + col_widths = [] + for i, header_text in enumerate(self.COLUMN_HEADERS): + max_width = len(header_text) + for row in rows: + max_width = max(max_width, len(row[i])) + col_widths.append(max_width) + + table_header_list = [ + h.ljust(w) for h, w in zip(self.COLUMN_HEADERS, col_widths) + ] + table_header = "| " + " | ".join(table_header_list) + " |\n" + + table_separator = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|\n" + + # Create data rows with proper alignment + data_rows = [] + for row in rows: + formatted_row = [ + str(val).ljust(width) for val, width in zip(row, col_widths) + ] + data_rows.append("| " + " | ".join(formatted_row) + " |") + + return ( + self.settings_header + + table_header + + table_separator + + "\n".join(data_rows) + + "\n" + ) + + def log(self, log_fn=logger.info): + if not self.stats: + return + log_fn(self.generate_metric_table()) + self.reset() + + @dataclasses.dataclass class CUDAGraphEntry: batch_descriptor: BatchDescriptor diff --git a/vllm/config/observability.py b/vllm/config/observability.py index 656a5f8a9068e..fdc27aee380ef 100644 --- a/vllm/config/observability.py +++ b/vllm/config/observability.py @@ -55,6 +55,10 @@ class ObservabilityConfig: kv_cache_metrics_sample: float = Field(default=0.01, gt=0, le=1) """Sampling rate for KV cache metrics (0.0, 1.0]. 
Default 0.01 = 1% of blocks.""" + cudagraph_metrics: bool = False + """Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph + dispatch modes, and their observed frequencies at every logging interval).""" + @cached_property def collect_model_forward_time(self) -> bool: """Whether to collect model forward time for the request.""" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 096217da4fe44..fd07cded7bc51 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -518,6 +518,7 @@ class EngineArgs: kv_cache_metrics_sample: float = get_field( ObservabilityConfig, "kv_cache_metrics_sample" ) + cudagraph_metrics: bool = ObservabilityConfig.cudagraph_metrics scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls @@ -1021,6 +1022,10 @@ class EngineArgs: "--kv-cache-metrics-sample", **observability_kwargs["kv_cache_metrics_sample"], ) + observability_group.add_argument( + "--cudagraph-metrics", + **observability_kwargs["cudagraph_metrics"], + ) # Scheduler arguments scheduler_kwargs = get_kwargs(SchedulerConfig) @@ -1698,6 +1703,7 @@ class EngineArgs: collect_detailed_traces=self.collect_detailed_traces, kv_cache_metrics=self.kv_cache_metrics, kv_cache_metrics_sample=self.kv_cache_metrics_sample, + cudagraph_metrics=self.cudagraph_metrics, ) # Compilation config overrides diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 52b98ef654592..75a7385df38b1 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -7,6 +7,7 @@ from collections.abc import Iterable from typing import Any from vllm import envs +from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.config import VllmConfig from vllm.distributed.ec_transfer.ec_connector.base import ( ECConnectorMetadata, @@ -1037,6 +1038,7 @@ class Scheduler(SchedulerInterface): pooler_outputs = 
model_runner_output.pooler_output num_nans_in_logits = model_runner_output.num_nans_in_logits kv_connector_output = model_runner_output.kv_connector_output + cudagraph_stats = model_runner_output.cudagraph_stats outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: SpecDecodingStats | None = None @@ -1219,7 +1221,9 @@ class Scheduler(SchedulerInterface): finished_req_ids.clear() if ( - stats := self.make_stats(spec_decoding_stats, kv_connector_stats) + stats := self.make_stats( + spec_decoding_stats, kv_connector_stats, cudagraph_stats + ) ) is not None: # Return stats to only one of the front-ends. if (eco := next(iter(engine_core_outputs.values()), None)) is None: @@ -1420,6 +1424,7 @@ class Scheduler(SchedulerInterface): self, spec_decoding_stats: SpecDecodingStats | None = None, kv_connector_stats: KVConnectorStats | None = None, + cudagraph_stats: CUDAGraphStat | None = None, ) -> SchedulerStats | None: if not self.log_stats: return None @@ -1444,6 +1449,7 @@ class Scheduler(SchedulerInterface): kv_cache_eviction_events=eviction_events, spec_decoding_stats=spec_stats, kv_connector_stats=connector_stats_payload, + cudagraph_stats=cudagraph_stats, ) def make_spec_decoding_stats( diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index dec0e2d00aea8..6961e15c2d0c5 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -10,6 +10,7 @@ from typing import TypeAlias from prometheus_client import Counter, Gauge, Histogram import vllm.envs as envs +from vllm.compilation.cuda_graph import CUDAGraphLogging from vllm.config import SupportsMetricsInfo, VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorLogging, @@ -106,6 +107,12 @@ class LoggingStatLogger(StatLoggerBase): self.spec_decoding_logging = SpecDecodingLogging() kv_transfer_config = self.vllm_config.kv_transfer_config self.kv_connector_logging = KVConnectorLogging(kv_transfer_config) + self.cudagraph_logging 
= None + if self.vllm_config.observability_config.cudagraph_metrics: + self.cudagraph_logging = CUDAGraphLogging( + self.vllm_config.compilation_config.cudagraph_mode, + self.vllm_config.compilation_config.cudagraph_capture_sizes, + ) self.last_prompt_throughput: float = 0.0 self.last_generation_throughput: float = 0.0 self.engine_is_idle = False @@ -161,6 +168,11 @@ class LoggingStatLogger(StatLoggerBase): self.spec_decoding_logging.observe(scheduler_stats.spec_decoding_stats) if kv_connector_stats := scheduler_stats.kv_connector_stats: self.kv_connector_logging.observe(kv_connector_stats) + if ( + self.cudagraph_logging is not None + and scheduler_stats.cudagraph_stats is not None + ): + self.cudagraph_logging.observe(scheduler_stats.cudagraph_stats) if not self.aggregated: self.last_scheduler_stats = scheduler_stats if mm_cache_stats: @@ -240,6 +252,8 @@ class LoggingStatLogger(StatLoggerBase): self.spec_decoding_logging.log(log_fn=log_fn) self.kv_connector_logging.log(log_fn=log_fn) + if self.cudagraph_logging is not None: + self.cudagraph_logging.log(log_fn=log_fn) def log_engine_initialized(self): if self.vllm_config.cache_config.num_gpu_blocks: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index a3078eaa75dc5..733d3ae12e67f 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -7,6 +7,7 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any import vllm.envs as envs +from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.v1.spec_decode.metrics import SpecDecodingStats if TYPE_CHECKING: @@ -183,6 +184,8 @@ class SchedulerStats: waiting_lora_adapters: dict[str, int] = field(default_factory=dict) running_lora_adapters: dict[str, int] = field(default_factory=dict) + cudagraph_stats: CUDAGraphStat | None = None + @dataclass class RequestStateStats: diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 8110deb5a610b..88ac6b4aeb4bb 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py 
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, NamedTuple import numpy as np import torch +from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: @@ -169,6 +170,9 @@ class ModelRunnerOutput: # req_id -> num_nans_in_logits num_nans_in_logits: dict[str, int] | None = None + # information related to cudagraph execution + cudagraph_stats: CUDAGraphStat | None = None + # ModelRunnerOutput wrapper for async scheduling. class AsyncModelRunnerOutput(ABC): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1b250a8bd009c..3f043e3b2648b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -27,7 +27,7 @@ from vllm.attention.backends.abstract import ( ) from vllm.attention.layer import Attention, MLAAttention from vllm.compilation.counter import compilation_counter -from vllm.compilation.cuda_graph import CUDAGraphWrapper +from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper from vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.config import ( CompilationMode, @@ -257,6 +257,7 @@ class ExecuteModelState(NamedTuple): sample_hidden_states: torch.Tensor aux_hidden_states: list[torch.Tensor] | None ec_connector_output: ECConnectorOutput | None + cudagraph_stats: CUDAGraphStat | None class GPUModelRunner( @@ -2755,7 +2756,11 @@ class GPUModelRunner( force_uniform_decode: bool | None = None, force_has_lora: bool | None = None, ) -> tuple[ - CUDAGraphMode, BatchDescriptor, UBatchSlices | None, torch.Tensor | None + CUDAGraphMode, + BatchDescriptor, + UBatchSlices | None, + torch.Tensor | None, + CUDAGraphStat | None, ]: num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens) uniform_decode = ( @@ -2820,7 +2825,22 @@ class GPUModelRunner( # num_tokens_across_dp will no-longer be valid assert batch_descriptor.num_tokens == num_tokens_padded - return cudagraph_mode, batch_descriptor, 
ubatch_slices, num_tokens_across_dp + cudagraph_stats = None + if self.vllm_config.observability_config.cudagraph_metrics: + cudagraph_stats = CUDAGraphStat( + num_unpadded_tokens=num_tokens, + num_padded_tokens=batch_descriptor.num_tokens, + num_paddings=batch_descriptor.num_tokens - num_tokens, + runtime_mode=str(cudagraph_mode), + ) + + return ( + cudagraph_mode, + batch_descriptor, + ubatch_slices, + num_tokens_across_dp, + cudagraph_stats, + ) @torch.inference_mode() def execute_model( @@ -2918,6 +2938,7 @@ class GPUModelRunner( batch_desc, ubatch_slices, num_tokens_across_dp, + cudagraph_stats, ) = self._determine_batch_execution_and_padding( num_tokens=num_tokens_unpadded, num_reqs=num_reqs, @@ -3067,6 +3088,7 @@ class GPUModelRunner( sample_hidden_states, aux_hidden_states, ec_connector_output, + cudagraph_stats, ) self.kv_connector_output = kv_connector_output return None @@ -3102,6 +3124,7 @@ class GPUModelRunner( sample_hidden_states, aux_hidden_states, ec_connector_output, + cudagraph_stats, ) = self.execute_model_state # Clear ephemeral state. 
self.execute_model_state = None @@ -3217,6 +3240,7 @@ class GPUModelRunner( if self.supports_mm_inputs else None, num_nans_in_logits=num_nans_in_logits, + cudagraph_stats=cudagraph_stats, ) if not self.use_async_scheduling: @@ -3937,7 +3961,7 @@ class GPUModelRunner( num_sampled_tokens = np.ones(num_reqs, dtype=np.int32) - _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp = ( + _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp, _ = ( self._determine_batch_execution_and_padding( num_tokens=num_tokens_unpadded, num_reqs=num_reqs, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index edba07a423cda..a133575cbbced 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -564,7 +564,7 @@ class Worker(WorkerBase): # TODO(lucas): This is pretty gross; ideally we should only ever call # `_determine_batch_execution_and_padding` once (will get called again # in `execute_model`) but this requires a larger refactor of PP. - _, batch_desc, _, _ = ( + _, batch_desc, _, _, _ = ( self.model_runner._determine_batch_execution_and_padding( num_tokens=num_scheduled_tokens, num_reqs=len(num_scheduled_tokens_np), From 3f42b05fbc53e50813a1619f5fc770f17ac2a1b6 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 3 Dec 2025 17:26:39 +0800 Subject: [PATCH 10/16] [Refactor] [1/N] to simplify the vLLM serving architecture (#28040) Signed-off-by: chaunceyjiang --- tests/entrypoints/openai/test_basic.py | 2 +- vllm/entrypoints/api_server.py | 1 + vllm/entrypoints/openai/api_server.py | 455 +----------------- vllm/entrypoints/openai/serving_engine.py | 3 +- vllm/entrypoints/sagemaker/routes.py | 2 +- vllm/entrypoints/serve/__init__.py | 60 +++ vllm/entrypoints/serve/disagg/__init__.py | 0 vllm/entrypoints/serve/disagg/api_router.py | 110 +++++ vllm/entrypoints/serve/disagg/protocol.py | 90 ++++ .../disagg/serving.py} | 10 +- vllm/entrypoints/serve/elastic_ep/__init__.py | 0 .../serve/elastic_ep/api_router.py | 96 ++++ 
.../serve/elastic_ep/middleware.py | 49 ++ .../serve/instrumentator/__init__.py | 0 .../serve/instrumentator/health.py | 33 ++ .../serve/instrumentator/metrics.py | 46 ++ vllm/entrypoints/serve/lora/__init__.py | 0 .../lora/api_router.py} | 19 +- vllm/entrypoints/serve/profile/__init__.py | 0 vllm/entrypoints/serve/profile/api_router.py | 49 ++ vllm/entrypoints/serve/rlhf/__init__.py | 0 vllm/entrypoints/serve/rlhf/api_router.py | 102 ++++ vllm/entrypoints/serve/sleep/__init__.py | 0 vllm/entrypoints/serve/sleep/api_router.py | 60 +++ vllm/entrypoints/serve/tokenize/__init__.py | 0 vllm/entrypoints/serve/tokenize/api_router.py | 118 +++++ .../tokenize/serving.py} | 0 27 files changed, 850 insertions(+), 455 deletions(-) create mode 100644 vllm/entrypoints/serve/__init__.py create mode 100644 vllm/entrypoints/serve/disagg/__init__.py create mode 100644 vllm/entrypoints/serve/disagg/api_router.py create mode 100644 vllm/entrypoints/serve/disagg/protocol.py rename vllm/entrypoints/{openai/serving_tokens.py => serve/disagg/serving.py} (99%) create mode 100644 vllm/entrypoints/serve/elastic_ep/__init__.py create mode 100644 vllm/entrypoints/serve/elastic_ep/api_router.py create mode 100644 vllm/entrypoints/serve/elastic_ep/middleware.py create mode 100644 vllm/entrypoints/serve/instrumentator/__init__.py create mode 100644 vllm/entrypoints/serve/instrumentator/health.py create mode 100644 vllm/entrypoints/serve/instrumentator/metrics.py create mode 100644 vllm/entrypoints/serve/lora/__init__.py rename vllm/entrypoints/{dynamic_lora.py => serve/lora/api_router.py} (80%) create mode 100644 vllm/entrypoints/serve/profile/__init__.py create mode 100644 vllm/entrypoints/serve/profile/api_router.py create mode 100644 vllm/entrypoints/serve/rlhf/__init__.py create mode 100644 vllm/entrypoints/serve/rlhf/api_router.py create mode 100644 vllm/entrypoints/serve/sleep/__init__.py create mode 100644 vllm/entrypoints/serve/sleep/api_router.py create mode 100644 
vllm/entrypoints/serve/tokenize/__init__.py create mode 100644 vllm/entrypoints/serve/tokenize/api_router.py rename vllm/entrypoints/{openai/serving_tokenization.py => serve/tokenize/serving.py} (100%) diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index 3d581a300b6a9..1ff30de31bbe5 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -232,7 +232,7 @@ async def test_server_load(server: RemoteOpenAIServer): @pytest.mark.asyncio async def test_health_check_engine_dead_error(): # Import the health function directly to test it in isolation - from vllm.entrypoints.openai.api_server import health + from vllm.entrypoints.serve.instrumentator.health import health # Create a mock request that simulates what FastAPI would provide mock_request = Mock(spec=Request) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 154cdeb42a3ea..b59f7120551e0 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -118,6 +118,7 @@ async def init_app( ) ) app.state.engine_client = engine + app.state.args = args return app diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index cdc316b65ba78..2fa6afa2bacb5 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -20,21 +20,15 @@ from http import HTTPStatus from typing import Annotated, Any, Literal import model_hosting_container_standards.sagemaker as sagemaker_standards -import prometheus_client import pydantic -import regex as re import uvloop from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse -from prometheus_client import make_asgi_app -from prometheus_fastapi_instrumentator import Instrumentator from 
starlette.concurrency import iterate_in_threadpool from starlette.datastructures import URL, Headers, MutableHeaders, State -from starlette.routing import Mount from starlette.types import ASGIApp, Message, Receive, Scope, Send -from typing_extensions import assert_never import vllm.envs as envs from vllm.config import VllmConfig @@ -56,17 +50,11 @@ from vllm.entrypoints.openai.protocol import ( ChatCompletionResponse, CompletionRequest, CompletionResponse, - DetokenizeRequest, - DetokenizeResponse, ErrorInfo, ErrorResponse, - GenerateRequest, - GenerateResponse, ResponsesRequest, ResponsesResponse, StreamingResponsesResponse, - TokenizeRequest, - TokenizeResponse, TranscriptionRequest, TranscriptionResponseVariant, TranslationRequest, @@ -80,8 +68,6 @@ from vllm.entrypoints.openai.serving_models import ( OpenAIServingModels, ) from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses -from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization -from vllm.entrypoints.openai.serving_tokens import ServingTokens from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation, @@ -92,6 +78,11 @@ from vllm.entrypoints.pooling.classify.serving import ServingClassification from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling from vllm.entrypoints.pooling.score.serving import ServingScores +from vllm.entrypoints.serve.disagg.serving import ServingTokens +from vllm.entrypoints.serve.elastic_ep.middleware import ( + ScalingMiddleware, +) +from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization from vllm.entrypoints.tool_server import DemoToolServer, MCPToolServer, ToolServer from vllm.entrypoints.utils import ( cli_env_setup, @@ -109,8 +100,6 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.gc_utils import freeze_gc_heap from 
vllm.utils.network_utils import is_valid_ipv6_address from vllm.utils.system_utils import decorate_logs, set_ulimit -from vllm.v1.engine.exceptions import EngineDeadError -from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION prometheus_multiproc_dir: tempfile.TemporaryDirectory @@ -245,39 +234,6 @@ async def build_async_engine_client_from_engine_args( router = APIRouter() -class PrometheusResponse(Response): - media_type = prometheus_client.CONTENT_TYPE_LATEST - - -def mount_metrics(app: FastAPI): - """Mount prometheus metrics to a FastAPI app.""" - - registry = get_prometheus_registry() - - # `response_class=PrometheusResponse` is needed to return an HTTP response - # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8" - # instead of the default "application/json" which is incorrect. - # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364 - Instrumentator( - excluded_handlers=[ - "/metrics", - "/health", - "/load", - "/ping", - "/version", - "/server_info", - ], - registry=registry, - ).add().instrument(app).expose(app, response_class=PrometheusResponse) - - # Add prometheus asgi middleware to route /metrics requests - metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) - - # Workaround for 307 Redirect for /metrics - metrics_route.path_regex = re.compile("^/metrics(?P.*)$") - app.routes.append(metrics_route) - - def base(request: Request) -> OpenAIServing: # Reuse the existing instance return tokenization(request) @@ -323,16 +279,6 @@ def generate_tokens(request: Request) -> ServingTokens | None: return request.app.state.serving_tokens -@router.get("/health", response_class=Response) -async def health(raw_request: Request) -> Response: - """Health check.""" - try: - await engine_client(raw_request).check_health() - return Response(status_code=200) - except EngineDeadError: - return Response(status_code=503) - - 
@router.get("/load") async def get_server_load_metrics(request: Request): # This endpoint returns the current server load metrics. @@ -352,167 +298,6 @@ async def get_server_load_metrics(request: Request): return JSONResponse(content={"server_load": request.app.state.server_load_metrics}) -@router.post("/pause") -async def pause_generation( - raw_request: Request, - wait_for_inflight_requests: bool = Query(False), - clear_cache: bool = Query(True), -) -> JSONResponse: - """Pause generation requests to allow weight updates. - - Args: - wait_for_inflight_requests: When ``True`` waits for in-flight - requests to finish before pausing. When ``False`` (default), - aborts any in-flight requests immediately. - clear_cache: Whether to clear KV/prefix caches after draining. - """ - - engine = engine_client(raw_request) - - try: - await engine.pause_generation( - wait_for_inflight_requests=wait_for_inflight_requests, - clear_cache=clear_cache, - ) - return JSONResponse( - content={"status": "paused"}, - status_code=HTTPStatus.OK.value, - ) - - except ValueError as err: - return JSONResponse( - content={"error": str(err)}, - status_code=HTTPStatus.BAD_REQUEST.value, - ) - except Exception as err: # pragma: no cover - defensive - logger.exception("Failed to pause generation") - return JSONResponse( - content={"error": f"Failed to pause generation: {err}"}, - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, - ) - - -@router.post("/resume") -async def resume_generation(raw_request: Request) -> JSONResponse: - """Resume generation after a pause.""" - - engine = engine_client(raw_request) - - try: - await engine.resume_generation() - return JSONResponse( - content={"status": "resumed"}, - status_code=HTTPStatus.OK.value, - ) - except Exception as err: # pragma: no cover - defensive - logger.exception("Failed to resume generation") - return JSONResponse( - content={"error": f"Failed to resume generation: {err}"}, - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, - ) - - 
-@router.get("/is_paused") -async def is_paused(raw_request: Request) -> JSONResponse: - """Return the current pause status.""" - - engine = engine_client(raw_request) - - try: - paused = await engine.is_paused() - except Exception as err: # pragma: no cover - defensive - logger.exception("Failed to fetch pause status") - return JSONResponse( - content={"error": f"Failed to fetch pause status: {err}"}, - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, - ) - - return JSONResponse(content={"is_paused": paused}) - - -@router.post( - "/tokenize", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -async def tokenize(request: TokenizeRequest, raw_request: Request): - handler = tokenization(raw_request) - - try: - generator = await handler.create_tokenize(request, raw_request) - except NotImplementedError as e: - raise HTTPException( - status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e) - ) from e - except Exception as e: - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - - if isinstance(generator, ErrorResponse): - return JSONResponse( - content=generator.model_dump(), status_code=generator.error.code - ) - elif isinstance(generator, TokenizeResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -@router.post( - "/detokenize", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -async def detokenize(request: DetokenizeRequest, raw_request: Request): - handler = 
tokenization(raw_request) - - try: - generator = await handler.create_detokenize(request, raw_request) - except OverflowError as e: - raise RequestValidationError(errors=[str(e)]) from e - except Exception as e: - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - - if isinstance(generator, ErrorResponse): - return JSONResponse( - content=generator.model_dump(), status_code=generator.error.code - ) - elif isinstance(generator, DetokenizeResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -def maybe_register_tokenizer_info_endpoint(args): - """Conditionally register the tokenizer info endpoint if enabled.""" - if getattr(args, "enable_tokenizer_info_endpoint", False): - - @router.get("/tokenizer_info") - async def get_tokenizer_info(raw_request: Request): - """Get comprehensive tokenizer information.""" - result = await tokenization(raw_request).get_tokenizer_info() - return JSONResponse( - content=result.model_dump(), - status_code=result.error.code - if isinstance(result, ErrorResponse) - else 200, - ) - - @router.get("/v1/models") async def show_available_models(raw_request: Request): handler = models(raw_request) @@ -898,33 +683,6 @@ if envs.VLLM_SERVER_DEV_MODE: await engine_client(raw_request).reset_mm_cache() return Response(status_code=200) - @router.post("/sleep") - async def sleep(raw_request: Request): - # get POST params - level = raw_request.query_params.get("level", "1") - await engine_client(raw_request).sleep(int(level)) - # FIXME: in v0 with frontend multiprocessing, the sleep command - # is sent but does not finish yet when we return a response. 
- return Response(status_code=200) - - @router.post("/wake_up") - async def wake_up(raw_request: Request): - tags = raw_request.query_params.getlist("tags") - if tags == []: - # set to None to wake up all tags if no tags are provided - tags = None - logger.info("wake up the engine with tags: %s", tags) - await engine_client(raw_request).wake_up(tags) - # FIXME: in v0 with frontend multiprocessing, the wake-up command - # is sent but does not finish yet when we return a response. - return Response(status_code=200) - - @router.get("/is_sleeping") - async def is_sleeping(raw_request: Request): - logger.info("check whether the engine is sleeping") - is_sleeping = await engine_client(raw_request).is_sleeping() - return JSONResponse(content={"is_sleeping": is_sleeping}) - @router.post("/collective_rpc") async def collective_rpc(raw_request: Request): try: @@ -952,138 +710,13 @@ if envs.VLLM_SERVER_DEV_MODE: return Response(status_code=200) response: list[Any] = [] for result in results: - if result is None or isinstance(result, (dict, list)): + if result is None or isinstance(result, dict | list): response.append(result) else: response.append(str(result)) return JSONResponse(content={"results": response}) -@router.post( - "/scale_elastic_ep", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.OK.value: {"model": dict}, - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.REQUEST_TIMEOUT.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -async def scale_elastic_ep(raw_request: Request): - try: - body = await raw_request.json() - except json.JSONDecodeError as e: - raise HTTPException(status_code=400, detail="Invalid JSON format") from e # noqa: B904 - - new_data_parallel_size = body.get("new_data_parallel_size") - drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes - - if new_data_parallel_size is None: - raise HTTPException( - status_code=400, 
detail="new_data_parallel_size is required" - ) - - if not isinstance(new_data_parallel_size, int) or new_data_parallel_size <= 0: - raise HTTPException( - status_code=400, detail="new_data_parallel_size must be a positive integer" - ) - - if not isinstance(drain_timeout, int) or drain_timeout <= 0: - raise HTTPException( - status_code=400, detail="drain_timeout must be a positive integer" - ) - - # Set scaling flag to prevent new requests - global _scaling_elastic_ep - _scaling_elastic_ep = True - client = engine_client(raw_request) - try: - await client.scale_elastic_ep(new_data_parallel_size, drain_timeout) - return JSONResponse( - { - "message": f"Scaled to {new_data_parallel_size} data parallel engines", - } - ) - except TimeoutError as e: - raise HTTPException( - status_code=408, - detail="Scale failed due to request drain timeout " - f"after {drain_timeout} seconds", - ) from e - except Exception as e: - logger.error("Scale failed: %s", e) - raise HTTPException(status_code=500, detail="Scale failed") from e - finally: - _scaling_elastic_ep = False - - -@router.post("/is_scaling_elastic_ep") -async def is_scaling_elastic_ep(raw_request: Request): - return JSONResponse({"is_scaling_elastic_ep": _scaling_elastic_ep}) - - -@router.post( - "/inference/v1/generate", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -@load_aware_call -async def generate(request: GenerateRequest, raw_request: Request): - handler = generate_tokens(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support generate tokens API" - ) - try: - generator = await handler.serve_tokens(request, raw_request) - except Exception as e: - raise 
HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - if isinstance(generator, ErrorResponse): - return JSONResponse( - content=generator.model_dump(), status_code=generator.error.code - ) - - elif isinstance(generator, GenerateResponse): - return JSONResponse(content=generator.model_dump()) - - return StreamingResponse(content=generator, media_type="text/event-stream") - - -if envs.VLLM_TORCH_PROFILER_DIR: - logger.warning_once( - "Torch Profiler is enabled in the API server. This should ONLY be " - "used for local development!" - ) -elif envs.VLLM_TORCH_CUDA_PROFILE: - logger.warning_once( - "CUDA Profiler is enabled in the API server. This should ONLY be " - "used for local development!" - ) -if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE: - - @router.post("/start_profile") - async def start_profile(raw_request: Request): - logger.info("Starting profiler...") - await engine_client(raw_request).start_profile() - logger.info("Profiler started.") - return Response(status_code=200) - - @router.post("/stop_profile") - async def stop_profile(raw_request: Request): - logger.info("Stopping profiler...") - await engine_client(raw_request).stop_profile() - logger.info("Profiler stopped.") - return Response(status_code=200) - - def load_log_config(log_config_file: str | None) -> dict | None: if not log_config_file: return None @@ -1176,41 +809,6 @@ class XRequestIdMiddleware: return self.app(scope, receive, send_with_request_id) -# Global variable to track scaling state -_scaling_elastic_ep = False - - -class ScalingMiddleware: - """ - Middleware that checks if the model is currently scaling and - returns a 503 Service Unavailable response if it is. - - This middleware applies to all HTTP requests and prevents - processing when the model is in a scaling state. 
- """ - - def __init__(self, app: ASGIApp) -> None: - self.app = app - - def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]: - if scope["type"] != "http": - return self.app(scope, receive, send) - - # Check global scaling state - global _scaling_elastic_ep - if _scaling_elastic_ep: - # Return 503 Service Unavailable response - response = JSONResponse( - content={ - "error": "The model is currently scaling. Please try again later." - }, - status_code=503, - ) - return response(scope, receive, send) - - return self.app(scope, receive, send) - - def _extract_content_from_chunk(chunk_data: dict) -> str: """Extract content from a streaming response chunk.""" try: @@ -1353,15 +951,10 @@ def build_app(args: Namespace) -> FastAPI: ) else: app = FastAPI(lifespan=lifespan) + app.state.args = args + from vllm.entrypoints.serve import register_vllm_serve_api_routers - if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: - logger.warning( - "LoRA dynamic loading & unloading is enabled in the API server. " - "This should ONLY be used for local development!" - ) - from vllm.entrypoints.dynamic_lora import register_dynamic_lora_routes - - register_dynamic_lora_routes(router) + register_vllm_serve_api_routers(app) from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes @@ -1370,8 +963,6 @@ def build_app(args: Namespace) -> FastAPI: app.root_path = args.root_path - mount_metrics(app) - from vllm.entrypoints.pooling import register_pooling_api_routers register_pooling_api_routers(app) @@ -1462,31 +1053,6 @@ def build_app(args: Namespace) -> FastAPI: ) app = sagemaker_standards.bootstrap(app) - # Optional endpoints - if args.tokens_only: - - @app.post("/abort_requests") - async def abort_requests(raw_request: Request): - """ - Abort one or more requests. To be used in a - Disaggregated Everything setup. 
- """ - try: - body = await raw_request.json() - except json.JSONDecodeError as e: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail=f"JSON decode error: {e}", - ) from e - request_ids = body.get("request_ids") - if request_ids is None: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail="Missing 'request_ids' in request body", - ) - # Abort requests in background - asyncio.create_task(engine_client(raw_request).abort(request_ids)) - return Response(status_code=200) return app @@ -1515,7 +1081,7 @@ async def init_app_state( state.engine_client = engine_client state.log_stats = not args.disable_log_stats state.vllm_config = vllm_config - + state.args = args supported_tasks = await engine_client.get_supported_tasks() logger.info("Supported tasks: %s", supported_tasks) @@ -1839,7 +1405,6 @@ async def run_server_worker( args, client_config=client_config, ) as engine_client: - maybe_register_tokenizer_info_endpoint(args) app = build_app(args) await init_app_state(engine_client, app.state, args) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 1d89aa011af21..67291f45a9251 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -74,8 +74,6 @@ from vllm.entrypoints.openai.protocol import ( ErrorResponse, FunctionCall, FunctionDefinition, - GenerateRequest, - GenerateResponse, ResponsesRequest, TokenizeChatRequest, TokenizeCompletionRequest, @@ -87,6 +85,7 @@ from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig +from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs.data import PromptType 
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt diff --git a/vllm/entrypoints/sagemaker/routes.py b/vllm/entrypoints/sagemaker/routes.py index 108fdd773e321..ea88c0fc4b979 100644 --- a/vllm/entrypoints/sagemaker/routes.py +++ b/vllm/entrypoints/sagemaker/routes.py @@ -16,7 +16,6 @@ from vllm.entrypoints.openai.api_server import ( completion, create_chat_completion, create_completion, - health, validate_json_request, ) from vllm.entrypoints.openai.protocol import ( @@ -38,6 +37,7 @@ from vllm.entrypoints.pooling.score.api_router import ( score, ) from vllm.entrypoints.pooling.score.protocol import RerankRequest, ScoreRequest +from vllm.entrypoints.serve.instrumentator.health import health # TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers # (requires typing_extensions >= 4.13) diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py new file mode 100644 index 0000000000000..c4fcc92db931f --- /dev/null +++ b/vllm/entrypoints/serve/__init__.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from fastapi import FastAPI + + +def register_vllm_serve_api_routers(app: FastAPI): + from vllm.entrypoints.serve.lora.api_router import ( + attach_router as attach_lora_router, + ) + + attach_lora_router(app) + from vllm.entrypoints.serve.elastic_ep.api_router import ( + attach_router as attach_elastic_ep_router, + ) + + attach_elastic_ep_router(app) + + from vllm.entrypoints.serve.profile.api_router import ( + attach_router as attach_profile_router, + ) + + attach_profile_router(app) + + from vllm.entrypoints.serve.sleep.api_router import ( + attach_router as attach_sleep_router, + ) + + attach_sleep_router(app) + + from vllm.entrypoints.serve.tokenize.api_router import ( + attach_router as attach_tokenize_router, + ) + + attach_tokenize_router(app) + + from vllm.entrypoints.serve.disagg.api_router import ( + attach_router as 
attach_disagg_router, + ) + + attach_disagg_router(app) + + from vllm.entrypoints.serve.rlhf.api_router import ( + attach_router as attach_rlhf_router, + ) + + attach_rlhf_router(app) + + from vllm.entrypoints.serve.instrumentator.metrics import ( + attach_router as attach_metrics_router, + ) + + attach_metrics_router(app) + + from vllm.entrypoints.serve.instrumentator.health import ( + attach_router as attach_health_router, + ) + + attach_health_router(app) diff --git a/vllm/entrypoints/serve/disagg/__init__.py b/vllm/entrypoints/serve/disagg/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/disagg/api_router.py b/vllm/entrypoints/serve/disagg/api_router.py new file mode 100644 index 0000000000000..c38ede30dad1c --- /dev/null +++ b/vllm/entrypoints/serve/disagg/api_router.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import asyncio +import json +from http import HTTPStatus + +from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, Response +from fastapi.responses import JSONResponse, StreamingResponse + +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.openai.api_server import validate_json_request +from vllm.entrypoints.openai.protocol import ( + ErrorResponse, +) +from vllm.entrypoints.serve.disagg.protocol import ( + GenerateRequest, + GenerateResponse, +) +from vllm.entrypoints.serve.disagg.serving import ( + ServingTokens, +) +from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization +from vllm.entrypoints.utils import ( + load_aware_call, + with_cancellation, +) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def tokenization(request: Request) -> OpenAIServingTokenization: + return request.app.state.openai_serving_tokenization + + +def generate_tokens(request: Request) -> ServingTokens | None: + return request.app.state.serving_tokens 
+ + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +router = APIRouter() + + +@router.post( + "/inference/v1/generate", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def generate(request: GenerateRequest, raw_request: Request): + handler = generate_tokens(raw_request) + if handler is None: + return tokenization(raw_request).create_error_response( + message="The model does not support generate tokens API" + ) + try: + generator = await handler.serve_tokens(request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + + elif isinstance(generator, GenerateResponse): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + +def attach_router(app: FastAPI): + if getattr(app.state.args, "tokens_only", False): + + @router.post("/abort_requests") + async def abort_requests(raw_request: Request): + """ + Abort one or more requests. To be used in a + Disaggregated Everything setup. 
+ """ + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}", + ) from e + request_ids = body.get("request_ids") + if request_ids is None: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail="Missing 'request_ids' in request body", + ) + # Abort requests in background + asyncio.create_task(engine_client(raw_request).abort(request_ids)) + return Response(status_code=200) + + app.include_router(router) diff --git a/vllm/entrypoints/serve/disagg/protocol.py b/vllm/entrypoints/serve/disagg/protocol.py new file mode 100644 index 0000000000000..251fcf12ed7dd --- /dev/null +++ b/vllm/entrypoints/serve/disagg/protocol.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +from pydantic import BaseModel, Field + +from vllm.entrypoints.openai.protocol import ( + ChatCompletionLogProbs, + Logprob, + SamplingParams, + StreamOptions, +) +from vllm.utils import random_uuid + + +####### Tokens IN <> Tokens OUT ####### +class GenerateRequest(BaseModel): + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + token_ids: list[int] + """The token ids to generate text from.""" + + # features: MultiModalFeatureSpec + # TODO (NickLucche): implement once Renderer work is completed + features: str | None = None + """The processed MM inputs for the model.""" + + sampling_params: SamplingParams + """The sampling parameters for the model.""" + + model: str | None = None + + stream: bool | None = False + stream_options: StreamOptions | None = None + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit)." + ), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) + + +class GenerateResponseChoice(BaseModel): + index: int + logprobs: ChatCompletionLogProbs | None = None + # per OpenAI spec this is the default + finish_reason: str | None = "stop" + token_ids: list[int] | None = None + + +class GenerateResponse(BaseModel): + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + choices: list[GenerateResponseChoice] + + prompt_logprobs: list[dict[int, Logprob] | None] | None = None + + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) diff --git a/vllm/entrypoints/openai/serving_tokens.py b/vllm/entrypoints/serve/disagg/serving.py similarity index 99% rename from vllm/entrypoints/openai/serving_tokens.py rename to vllm/entrypoints/serve/disagg/serving.py index daa739e41fa07..5c1d17156a90d 100644 --- a/vllm/entrypoints/openai/serving_tokens.py +++ b/vllm/entrypoints/serve/disagg/serving.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + import asyncio import time from collections.abc import AsyncGenerator @@ -14,15 +16,17 @@ from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProbs, ChatCompletionLogProbsContent, ErrorResponse, - GenerateRequest, - GenerateResponse, - GenerateResponseChoice, PromptTokenUsageInfo, RequestResponseMetadata, UsageInfo, ) from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.serve.disagg.protocol import ( + GenerateRequest, + GenerateResponse, + GenerateResponseChoice, +) from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.logprobs import Logprob diff --git a/vllm/entrypoints/serve/elastic_ep/__init__.py b/vllm/entrypoints/serve/elastic_ep/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/elastic_ep/api_router.py b/vllm/entrypoints/serve/elastic_ep/api_router.py new file mode 100644 index 0000000000000..21d5d2e60778a --- /dev/null +++ b/vllm/entrypoints/serve/elastic_ep/api_router.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project + + +import json +from http import HTTPStatus + +from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request +from fastapi.responses import JSONResponse + +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.openai.api_server import validate_json_request +from vllm.entrypoints.openai.protocol import ( + ErrorResponse, +) +from vllm.entrypoints.serve.elastic_ep.middleware import ( + get_scaling_elastic_ep, + set_scaling_elastic_ep, +) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +router = APIRouter() + + +@router.post( + "/scale_elastic_ep", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"model": dict}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.REQUEST_TIMEOUT.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +async def scale_elastic_ep(raw_request: Request): + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException(status_code=400, detail="Invalid JSON format") from e # noqa: B904 + + new_data_parallel_size = body.get("new_data_parallel_size") + drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes + + if new_data_parallel_size is None: + raise HTTPException( + status_code=400, detail="new_data_parallel_size is required" + ) + + if not isinstance(new_data_parallel_size, int) or new_data_parallel_size <= 0: + raise HTTPException( + status_code=400, + detail="new_data_parallel_size must be a positive integer", + ) + + if not isinstance(drain_timeout, int) or drain_timeout <= 0: + raise HTTPException( + status_code=400, detail="drain_timeout must be a positive integer" + ) + + # Set scaling flag to prevent new requests + set_scaling_elastic_ep(True) + client = engine_client(raw_request) + try: + await 
client.scale_elastic_ep(new_data_parallel_size, drain_timeout) + return JSONResponse( + { + "message": f"Scaled to {new_data_parallel_size} data parallel engines", + } + ) + except TimeoutError as e: + raise HTTPException( + status_code=408, + detail="Scale failed due to request drain timeout " + f"after {drain_timeout} seconds", + ) from e + except Exception as e: + logger.error("Scale failed: %s", e) + raise HTTPException(status_code=500, detail="Scale failed") from e + finally: + set_scaling_elastic_ep(False) + + +@router.post("/is_scaling_elastic_ep") +async def is_scaling_elastic_ep(raw_request: Request): + return JSONResponse({"is_scaling_elastic_ep": get_scaling_elastic_ep()}) + + +def attach_router(app: FastAPI): + app.include_router(router) diff --git a/vllm/entrypoints/serve/elastic_ep/middleware.py b/vllm/entrypoints/serve/elastic_ep/middleware.py new file mode 100644 index 0000000000000..23f45eafeaa0d --- /dev/null +++ b/vllm/entrypoints/serve/elastic_ep/middleware.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Awaitable + +from fastapi.responses import JSONResponse +from starlette.types import ASGIApp, Receive, Scope, Send + +# Global variable to track scaling state +_scaling_elastic_ep = False + + +def get_scaling_elastic_ep(): + return _scaling_elastic_ep + + +def set_scaling_elastic_ep(value): + global _scaling_elastic_ep + _scaling_elastic_ep = value + + +class ScalingMiddleware: + """ + Middleware that checks if the model is currently scaling and + returns a 503 Service Unavailable response if it is. + + This middleware applies to all HTTP requests and prevents + processing when the model is in a scaling state. 
+ """ + + def __init__(self, app: ASGIApp) -> None: + self.app = app + + def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]: + if scope["type"] != "http": + return self.app(scope, receive, send) + + # Check global scaling state + if get_scaling_elastic_ep(): + # Return 503 Service Unavailable response + response = JSONResponse( + content={ + "error": "The model is currently scaling. Please try again later." + }, + status_code=503, + ) + return response(scope, receive, send) + + return self.app(scope, receive, send) diff --git a/vllm/entrypoints/serve/instrumentator/__init__.py b/vllm/entrypoints/serve/instrumentator/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/instrumentator/health.py b/vllm/entrypoints/serve/instrumentator/health.py new file mode 100644 index 0000000000000..029ef677aaa25 --- /dev/null +++ b/vllm/entrypoints/serve/instrumentator/health.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from fastapi import APIRouter, Request +from fastapi.responses import Response + +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger +from vllm.v1.engine.exceptions import EngineDeadError + +logger = init_logger(__name__) + + +router = APIRouter() + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +@router.get("/health", response_class=Response) +async def health(raw_request: Request) -> Response: + """Health check.""" + try: + await engine_client(raw_request).check_health() + return Response(status_code=200) + except EngineDeadError: + return Response(status_code=503) + + +def attach_router(app): + app.include_router(router) diff --git a/vllm/entrypoints/serve/instrumentator/metrics.py b/vllm/entrypoints/serve/instrumentator/metrics.py new file mode 100644 index 0000000000000..efe0c63a90714 --- /dev/null +++ 
b/vllm/entrypoints/serve/instrumentator/metrics.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import re + +import prometheus_client +from fastapi import FastAPI, Response +from prometheus_client import make_asgi_app +from prometheus_fastapi_instrumentator import Instrumentator +from starlette.routing import Mount + +from vllm.v1.metrics.prometheus import get_prometheus_registry + + +class PrometheusResponse(Response): + media_type = prometheus_client.CONTENT_TYPE_LATEST + + +def attach_router(app: FastAPI): + """Mount prometheus metrics to a FastAPI app.""" + + registry = get_prometheus_registry() + + # `response_class=PrometheusResponse` is needed to return an HTTP response + # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8" + # instead of the default "application/json" which is incorrect. + # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364 + Instrumentator( + excluded_handlers=[ + "/metrics", + "/health", + "/load", + "/ping", + "/version", + "/server_info", + ], + registry=registry, + ).add().instrument(app).expose(app, response_class=PrometheusResponse) + + # Add prometheus asgi middleware to route /metrics requests + metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) + + # Workaround for 307 Redirect for /metrics + metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$") + app.routes.append(metrics_route) diff --git a/vllm/entrypoints/serve/lora/__init__.py b/vllm/entrypoints/serve/lora/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/dynamic_lora.py b/vllm/entrypoints/serve/lora/api_router.py similarity index 80% rename from vllm/entrypoints/dynamic_lora.py rename to vllm/entrypoints/serve/lora/api_router.py index cc0f437e5c77f..6a57e73f334f2 100644 --- a/vllm/entrypoints/dynamic_lora.py +++ b/vllm/entrypoints/serve/lora/api_router.py @@
-1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + import model_hosting_container_standards.sagemaker as sagemaker_standards -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends, FastAPI, Request from fastapi.responses import JSONResponse, Response +from vllm import envs from vllm.entrypoints.openai.api_server import models, validate_json_request from vllm.entrypoints.openai.protocol import ( ErrorResponse, @@ -14,9 +17,18 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger logger = init_logger(__name__) +router = APIRouter() -def register_dynamic_lora_routes(router: APIRouter): +def attach_router(app: FastAPI): + if not envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: + """If LoRA dynamic loading & unloading is not enabled, do nothing.""" + return + logger.warning( + "LoRA dynamic loading & unloading is enabled in the API server. " + "This should ONLY be used for local development!" 
+ ) + @sagemaker_standards.register_load_adapter_handler( request_shape={ "lora_name": "body.name", @@ -54,4 +66,5 @@ def register_dynamic_lora_routes(router: APIRouter): return Response(status_code=200, content=response) - return router + # register the router + app.include_router(router) diff --git a/vllm/entrypoints/serve/profile/__init__.py b/vllm/entrypoints/serve/profile/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/profile/api_router.py b/vllm/entrypoints/serve/profile/api_router.py new file mode 100644 index 0000000000000..166f13764eb36 --- /dev/null +++ b/vllm/entrypoints/serve/profile/api_router.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from fastapi import APIRouter, FastAPI, Request +from fastapi.responses import Response + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +@router.post("/start_profile") +async def start_profile(raw_request: Request): + logger.info("Starting profiler...") + await engine_client(raw_request).start_profile() + logger.info("Profiler started.") + return Response(status_code=200) + + +@router.post("/stop_profile") +async def stop_profile(raw_request: Request): + logger.info("Stopping profiler...") + await engine_client(raw_request).stop_profile() + logger.info("Profiler stopped.") + return Response(status_code=200) + + +def attach_router(app: FastAPI): + if envs.VLLM_TORCH_PROFILER_DIR: + logger.warning_once( + "Torch Profiler is enabled in the API server. This should ONLY be " + "used for local development!" + ) + elif envs.VLLM_TORCH_CUDA_PROFILE: + logger.warning_once( + "CUDA Profiler is enabled in the API server. 
This should ONLY be " + "used for local development!" + ) + if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE: + app.include_router(router) diff --git a/vllm/entrypoints/serve/rlhf/__init__.py b/vllm/entrypoints/serve/rlhf/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/rlhf/api_router.py b/vllm/entrypoints/serve/rlhf/api_router.py new file mode 100644 index 0000000000000..3b37840ae0899 --- /dev/null +++ b/vllm/entrypoints/serve/rlhf/api_router.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from http import HTTPStatus + +from fastapi import APIRouter, FastAPI, Query, Request +from fastapi.responses import JSONResponse + +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +router = APIRouter() + + +@router.post("/pause") +async def pause_generation( + raw_request: Request, + wait_for_inflight_requests: bool = Query(False), + clear_cache: bool = Query(True), +) -> JSONResponse: + """Pause generation requests to allow weight updates. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight + requests to finish before pausing. When ``False`` (default), + aborts any in-flight requests immediately. + clear_cache: Whether to clear KV/prefix caches after draining. 
+ """ + + engine = engine_client(raw_request) + + try: + await engine.pause_generation( + wait_for_inflight_requests=wait_for_inflight_requests, + clear_cache=clear_cache, + ) + return JSONResponse( + content={"status": "paused"}, + status_code=HTTPStatus.OK.value, + ) + + except ValueError as err: + return JSONResponse( + content={"error": str(err)}, + status_code=HTTPStatus.BAD_REQUEST.value, + ) + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to pause generation") + return JSONResponse( + content={"error": f"Failed to pause generation: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + +@router.post("/resume") +async def resume_generation(raw_request: Request) -> JSONResponse: + """Resume generation after a pause.""" + + engine = engine_client(raw_request) + + try: + await engine.resume_generation() + return JSONResponse( + content={"status": "resumed"}, + status_code=HTTPStatus.OK.value, + ) + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to resume generation") + return JSONResponse( + content={"error": f"Failed to resume generation: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + +@router.get("/is_paused") +async def is_paused(raw_request: Request) -> JSONResponse: + """Return the current pause status.""" + + engine = engine_client(raw_request) + + try: + paused = await engine.is_paused() + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to fetch pause status") + return JSONResponse( + content={"error": f"Failed to fetch pause status: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + return JSONResponse(content={"is_paused": paused}) + + +def attach_router(app: FastAPI): + app.include_router(router) diff --git a/vllm/entrypoints/serve/sleep/__init__.py b/vllm/entrypoints/serve/sleep/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git 
a/vllm/entrypoints/serve/sleep/api_router.py b/vllm/entrypoints/serve/sleep/api_router.py new file mode 100644 index 0000000000000..bc01e185315c8 --- /dev/null +++ b/vllm/entrypoints/serve/sleep/api_router.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from fastapi import APIRouter, FastAPI, Request +from fastapi.responses import JSONResponse, Response + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +router = APIRouter() + + +@router.post("/sleep") +async def sleep(raw_request: Request): + # get POST params + level = raw_request.query_params.get("level", "1") + await engine_client(raw_request).sleep(int(level)) + # FIXME: in v0 with frontend multiprocessing, the sleep command + # is sent but does not finish yet when we return a response. + return Response(status_code=200) + + +@router.post("/wake_up") +async def wake_up(raw_request: Request): + tags = raw_request.query_params.getlist("tags") + if tags == []: + # set to None to wake up all tags if no tags are provided + tags = None + logger.info("wake up the engine with tags: %s", tags) + await engine_client(raw_request).wake_up(tags) + # FIXME: in v0 with frontend multiprocessing, the wake-up command + # is sent but does not finish yet when we return a response. + return Response(status_code=200) + + +@router.get("/is_sleeping") +async def is_sleeping(raw_request: Request): + logger.info("check whether the engine is sleeping") + is_sleeping = await engine_client(raw_request).is_sleeping() + return JSONResponse(content={"is_sleeping": is_sleeping}) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + logger.warning( + "SECURITY WARNING: Development endpoints are enabled! 
" + "This should NOT be used in production!" + ) + + app.include_router(router) diff --git a/vllm/entrypoints/serve/tokenize/__init__.py b/vllm/entrypoints/serve/tokenize/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/tokenize/api_router.py b/vllm/entrypoints/serve/tokenize/api_router.py new file mode 100644 index 0000000000000..a10e78c8d28ee --- /dev/null +++ b/vllm/entrypoints/serve/tokenize/api_router.py @@ -0,0 +1,118 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from http import HTTPStatus + +from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request +from fastapi.exceptions import RequestValidationError +from fastapi.responses import JSONResponse +from typing_extensions import assert_never + +from vllm.entrypoints.openai.api_server import validate_json_request +from vllm.entrypoints.openai.protocol import ( + DetokenizeRequest, + DetokenizeResponse, + ErrorResponse, + TokenizeRequest, + TokenizeResponse, +) +from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization +from vllm.entrypoints.utils import ( + with_cancellation, +) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def tokenization(request: Request) -> OpenAIServingTokenization: + return request.app.state.openai_serving_tokenization + + +router = APIRouter() + + +@router.post( + "/tokenize", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +async def tokenize(request: TokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) + + try: + generator = await handler.create_tokenize(request, raw_request) + except 
NotImplementedError as e: + raise HTTPException( + status_code=HTTPStatus.NOT_IMPLEMENTED.value, detail=str(e) + ) from e + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + elif isinstance(generator, TokenizeResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post( + "/detokenize", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +async def detokenize(request: DetokenizeRequest, raw_request: Request): + handler = tokenization(raw_request) + + try: + generator = await handler.create_detokenize(request, raw_request) + except OverflowError as e: + raise RequestValidationError(errors=[str(e)]) from e + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + elif isinstance(generator, DetokenizeResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +def attach_router(app: FastAPI): + if getattr(app.state.args, "enable_tokenizer_info_endpoint", False): + """Conditionally register the tokenizer info endpoint if enabled.""" + + @router.get("/tokenizer_info") + async def get_tokenizer_info(raw_request: Request): + """Get comprehensive tokenizer information.""" + result = await tokenization(raw_request).get_tokenizer_info() + return JSONResponse( + content=result.model_dump(), + status_code=result.error.code + if isinstance(result, 
ErrorResponse) + else 200, + ) + + app.include_router(router) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/serve/tokenize/serving.py similarity index 100% rename from vllm/entrypoints/openai/serving_tokenization.py rename to vllm/entrypoints/serve/tokenize/serving.py From 7fe9c1a2232275ee4cc7d65af3bc5b648543f367 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Wed, 3 Dec 2025 17:51:08 +0800 Subject: [PATCH 11/16] [CI] Add Async Eplb nightly CI tests (#29385) Signed-off-by: David Chen <530634352@qq.com> Signed-off-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Co-authored-by: Cyrus Leung --- .../deepseek_v2_lite_ep_async_eplb.sh | 73 ++++++++++++++++++ .../deepseek_v2_lite_ep_eplb.sh | 1 + .../qwen3_next_mtp_async_eplb.sh | 74 +++++++++++++++++++ .buildkite/test-pipeline.yaml | 20 ++++- vllm/distributed/eplb/rebalance_execute.py | 3 - 5 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh create mode 100644 .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh new file mode 100644 index 0000000000000..d7167161b0059 --- /dev/null +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +set -euxo pipefail + +# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] +THRESHOLD=${1:-0.25} +NUM_Q=${2:-1319} +PORT=${3:-8030} +OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} +mkdir -p "${OUT_DIR}" + +wait_for_server() { + local port=$1 + timeout 600 bash -c ' + until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do + sleep 1 + done' +} + +MODEL="deepseek-ai/DeepSeek-V2-lite" + +# Set BACKENDS based on platform +if command -v 
rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then + kill "${SERVER_PID}" 2>/dev/null || true + for _ in {1..20}; do + kill -0 "${SERVER_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -9 "${SERVER_PID}" 2>/dev/null || true + fi +} +trap cleanup EXIT + +for BACK in "${BACKENDS[@]}"; do + VLLM_DEEP_GEMM_WARMUP=skip \ + VLLM_ALL2ALL_BACKEND=$BACK \ + vllm serve "$MODEL" \ + --enforce-eager \ + --tensor-parallel-size 2 \ + --data-parallel-size 2 \ + --enable-expert-parallel \ + --enable-eplb \ + --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ + --trust-remote-code \ + --max-model-len 2048 \ + --port $PORT & + SERVER_PID=$! 
+ wait_for_server $PORT + + TAG=$(echo "$MODEL" | tr '/: \\n' '_____') + OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json" + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" +PY + + cleanup + SERVER_PID= + sleep 1 + PORT=$((PORT+1)) +done diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 8106f50f18f66..693418da6093e 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do --data-parallel-size 2 \ --enable-expert-parallel \ --enable-eplb \ + --eplb-config '{"window_size":200,"step_interval":600}' \ --trust-remote-code \ --max-model-len 2048 \ --port $PORT & diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh new file mode 100644 index 0000000000000..937a43d1a3221 --- /dev/null +++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +set -euxo pipefail + +# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] +THRESHOLD=${1:-0.25} +NUM_Q=${2:-1319} +PORT=${3:-8040} +OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} +mkdir -p "${OUT_DIR}" + +wait_for_server() { + local port=$1 + timeout 600 bash -c ' + until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do + sleep 1 + done' +} + +MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" + +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export 
VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then + kill "${SERVER_PID}" 2>/dev/null || true + for _ in {1..20}; do + kill -0 "${SERVER_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -9 "${SERVER_PID}" 2>/dev/null || true + fi +} +trap cleanup EXIT + +for BACK in "${BACKENDS[@]}"; do + VLLM_DEEP_GEMM_WARMUP=skip \ + VLLM_ALL2ALL_BACKEND=$BACK \ + vllm serve "$MODEL" \ + --enforce-eager \ + --tensor-parallel-size 4 \ + --enable-expert-parallel \ + --enable-eplb \ + --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ + --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ + --trust-remote-code \ + --max-model-len 2048 \ + --gpu-memory-utilization 0.9 \ + --port $PORT & + SERVER_PID=$! + wait_for_server $PORT + + TAG=$(echo "$MODEL" | tr '/: \\n' '_____') + OUT="${OUT_DIR}/${TAG}_${BACK}.json" + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" +PY + + cleanup + SERVER_PID= + sleep 1 + PORT=$((PORT+1)) +done diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 52c848c784e53..f79e9266559f6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1373,4 +1373,22 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + +- label: DeepSeek V2-Lite Async EPLB Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash 
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index 376dad8a72ef1..55856d940f001 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -322,9 +322,6 @@ async def transfer_layer( num_local_physical_experts = next(iter(expert_weights[0])).shape[0] assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts) assert num_physical_experts == ep_size * num_local_physical_experts - # A buffer to hold the expert weights in one layer during the exchange. - # NOTE: Currently we assume the same weights across different layers - # have the same shape. 
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer( num_local_experts=num_local_physical_experts, From a21cd9ed239b853bd587ffe3c9140fe68cd41f59 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 3 Dec 2025 18:05:10 +0800 Subject: [PATCH 12/16] [Bugfix] Fix incorrect `image_grid_thw` rank for HunyuanOCR from missing `merge_by_field_config=True` (#29950) Signed-off-by: Isotr0py --- .../vision_language_multi_image.py | 23 +++++++++++++++++++ vllm/model_executor/models/hunyuan_vision.py | 1 + 2 files changed, 24 insertions(+) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 2193b1ca9cf48..560ca768d1a6c 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -309,6 +309,28 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: ) +# HunyuanOCR +def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "tencent/HunyuanOCR" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholder = ( + "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>" # noqa: E501 + ) * len(image_urls) + prompt = f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>" + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_hyperclovax_seed_vision( question: str, image_urls: list[str] ) -> ModelRequestData: @@ -1322,6 +1344,7 @@ model_example_map = { "deepseek_ocr": load_deepseek_ocr, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, + "hunyuan_vl": load_hunyuan_vl, "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "idefics3": load_idefics3, "interns1": load_interns1, diff --git a/vllm/model_executor/models/hunyuan_vision.py 
b/vllm/model_executor/models/hunyuan_vision.py index 2950db571e6ee..6537b6df876a9 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -785,6 +785,7 @@ class HunYuanVLForConditionalGeneration( SupportsQuant, SupportsXDRoPE, ): + merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw"} # To ensure correct weight loading and mapping. From cc4e296ea62226632de5285621fd0cd287621ddc Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 3 Dec 2025 18:27:36 +0800 Subject: [PATCH 13/16] [CI/Build] Avoid duplicate empty inputs test for common multimodal generation tests (#29907) Signed-off-by: Isotr0py --- .../multimodal/generation/test_common.py | 14 +-- .../generation/vlm_utils/case_filtering.py | 114 +++++++++--------- .../multimodal/generation/vlm_utils/types.py | 4 +- 3 files changed, 69 insertions(+), 63 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index deaeea059ccaf..0eaf7198f91b7 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), "qwen2_5_omni": VLMTestInfo( @@ -152,7 +152,7 @@ VLM_TEST_SETTINGS = { auto_cls=AutoModelForTextToWaveform, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner, - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), "qwen3_vl": VLMTestInfo( @@ -173,7 +173,7 @@ 
VLM_TEST_SETTINGS = { auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner, - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[ pytest.mark.core_model, ], @@ -350,7 +350,7 @@ VLM_TEST_SETTINGS = { patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], - image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], + image_size_factors=[(1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], ), "fuyu": VLMTestInfo( models=["adept/fuyu-8b"], @@ -707,7 +707,7 @@ VLM_TEST_SETTINGS = { max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForCausalLM, - image_size_factors=[(), (0.25,)], + image_size_factors=[(0.25,)], marks=[ pytest.mark.skipif( Version(TRANSFORMERS_VERSION) == Version("4.57.3"), @@ -760,7 +760,7 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.cpu_model], ), "skywork_r1v": VLMTestInfo( @@ -812,7 +812,7 @@ VLM_TEST_SETTINGS = { max_model_len=4096, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.skip("Model initialization hangs")], ), ### Tensor parallel / multi-gpu broadcast tests diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py index d42150bcbf672..116eead7a70ad 100644 --- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py 
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py @@ -62,6 +62,65 @@ def get_filtered_test_settings( return matching_tests +def get_model_type_cases( + model_type: str, + test_info: VLMTestInfo, + test_type: VLMTestType, +): + # Ensure that something is wrapped as an iterable it's not already + ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,) + + # This is essentially the same as nesting a bunch of mark.parametrize + # decorators, but we do it programmatically to allow overrides for on + # a per-model basis, while still being able to execute each of these + # as individual test cases in pytest. + iter_kwargs = OrderedDict( + [ + ("model", ensure_wrapped(test_info.models)), + ("max_tokens", ensure_wrapped(test_info.max_tokens)), + ("num_logprobs", ensure_wrapped(test_info.num_logprobs)), + ("dtype", ensure_wrapped(test_info.dtype)), + ( + "distributed_executor_backend", + ensure_wrapped(test_info.distributed_executor_backend), + ), + ] + ) + + # num_frames is video only + if test_type == VLMTestType.VIDEO: + iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames) + iter_kwargs["needs_video_metadata"] = ensure_wrapped( + test_info.needs_video_metadata + ) + + # No sizes passed for custom inputs, since inputs are directly provided + if test_type not in ( + VLMTestType.CUSTOM_INPUTS, + VLMTestType.AUDIO, + ): + wrapped_sizes = get_wrapped_test_sizes(test_info, test_type) + if wrapped_sizes is None: + raise ValueError(f"Sizes must be set for test type {test_type}") + iter_kwargs["size_wrapper"] = wrapped_sizes + + # Otherwise expand the custom test options instead + elif test_type == VLMTestType.CUSTOM_INPUTS: + if test_info.custom_test_opts is None: + raise ValueError("Test has type CUSTOM_INPUTS, but none given") + iter_kwargs["custom_test_opts"] = test_info.custom_test_opts + + # Wrap all model cases in a pytest parameter & pass marks through + return [ + pytest.param( + model_type, + 
ExpandableVLMTestArgs(**{k: v for k, v in zip(iter_kwargs.keys(), case)}), + marks=test_info.marks if test_info.marks is not None else [], + ) + for case in list(itertools.product(*iter_kwargs.values())) + ] + + def get_parametrized_options( test_settings: dict[str, VLMTestInfo], test_type: VLMTestType, @@ -76,64 +135,11 @@ def get_parametrized_options( test_settings, test_type, create_new_process_for_each_test ) - # Ensure that something is wrapped as an iterable it's not already - ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,) - - def get_model_type_cases(model_type: str, test_info: VLMTestInfo): - # This is essentially the same as nesting a bunch of mark.parametrize - # decorators, but we do it programmatically to allow overrides for on - # a per-model basis, while still being able to execute each of these - # as individual test cases in pytest. - iter_kwargs = OrderedDict( - [ - ("model", ensure_wrapped(test_info.models)), - ("max_tokens", ensure_wrapped(test_info.max_tokens)), - ("num_logprobs", ensure_wrapped(test_info.num_logprobs)), - ("dtype", ensure_wrapped(test_info.dtype)), - ( - "distributed_executor_backend", - ensure_wrapped(test_info.distributed_executor_backend), - ), - ] - ) - - # num_frames is video only - if test_type == VLMTestType.VIDEO: - iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames) - iter_kwargs["needs_video_metadata"] = ensure_wrapped( - test_info.needs_video_metadata - ) - - # No sizes passed for custom inputs, since inputs are directly provided - if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO): - wrapped_sizes = get_wrapped_test_sizes(test_info, test_type) - if wrapped_sizes is None: - raise ValueError(f"Sizes must be set for test type {test_type}") - iter_kwargs["size_wrapper"] = wrapped_sizes - - # Otherwise expand the custom test options instead - elif test_type == VLMTestType.CUSTOM_INPUTS: - if test_info.custom_test_opts is None: - raise ValueError("Test has 
type CUSTOM_INPUTS, but none given") - iter_kwargs["custom_test_opts"] = test_info.custom_test_opts - - # Wrap all model cases in a pytest parameter & pass marks through - return [ - pytest.param( - model_type, - ExpandableVLMTestArgs( - **{k: v for k, v in zip(iter_kwargs.keys(), case)} - ), - marks=test_info.marks if test_info.marks is not None else [], - ) - for case in list(itertools.product(*iter_kwargs.values())) - ] - # Get a list per model type, where each entry contains a tuple of all of # that model type's cases, then flatten them into the top level so that # we can consume them in one mark.parametrize call. cases_by_model_type = [ - get_model_type_cases(model_type, test_info) + get_model_type_cases(model_type, test_info, test_type) for model_type, test_info in matching_tests.items() ] return list(itertools.chain(*cases_by_model_type)) diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index 0c03c84497125..ae2f754813590 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ b/tests/models/multimodal/generation/vlm_utils/types.py @@ -50,8 +50,8 @@ MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PL VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?" 
-IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)] -EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)] +IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)] +EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)] RunnerOutput = tuple[list[int], str, SampleLogprobs | None] From 42c194964341bea9fc59e0d35db04dfafc3c473d Mon Sep 17 00:00:00 2001 From: Tsukasa OI Date: Wed, 3 Dec 2025 19:33:46 +0900 Subject: [PATCH 14/16] [Bugfix][Quantization] Support BF16 tensors on GGUF (#29948) Signed-off-by: Tsukasa OI --- tests/models/quantization/test_gguf.py | 7 +++++++ vllm/model_executor/model_loader/weight_utils.py | 12 +++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py index 3b9597507ac1b..064ca94f3cbac 100644 --- a/tests/models/quantization/test_gguf.py +++ b/tests/models/quantization/test_gguf.py @@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig( gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf", ) +QWEN3_CONFIG = GGUFTestConfig( + original_model="Qwen/Qwen3-0.6B", + gguf_repo="unsloth/Qwen3-0.6B-GGUF", + gguf_filename="Qwen3-0.6B-BF16.gguf", +) + PHI3_CONFIG = GGUFTestConfig( original_model="microsoft/Phi-3.5-mini-instruct", gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF", @@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig( MODELS = [ # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458 QWEN2_CONFIG, + QWEN3_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0809bdfa9d4c2..0496b7a84507b 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -921,7 +921,17 @@ def gguf_quant_weights_iterator( name = gguf_to_hf_name_map[tensor.name] if weight_type.name not in ("F32", "BF16", "F16"): name = name.replace("weight", "qweight") - param = 
torch.tensor(weight) + if weight_type.name == "BF16" and tensor.data.dtype == np.uint8: + # BF16 is currently the only "quantization" type that isn't + # actually quantized but is read as a raw byte tensor. + # Reinterpret as `torch.bfloat16` tensor. + weight = weight.view(np.uint16) + if reader.byte_order == "S": + # GGUF endianness != system endianness + weight = weight.byteswap() + param = torch.tensor(weight).view(torch.bfloat16) + else: + param = torch.tensor(weight) yield name, param From 787b84a9fc9d1744f82addf40912e9fb84c0b4c5 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 3 Dec 2025 02:42:49 -0800 Subject: [PATCH 15/16] [Bugfix] Follow-up fix on MediaWithBytes (#29951) Signed-off-by: Roger Wang --- vllm/multimodal/base.py | 2 ++ vllm/multimodal/inputs.py | 3 ++- vllm/multimodal/parse.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 4a619fd303ca9..53eb4c591ef99 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -21,6 +21,8 @@ class MediaWithBytes(Generic[_T]): The wrapper delegates attribute access to the underlying media object, making it behave transparently like the wrapped type (e.g., PIL.Image). + + NOTE: Currently, this wrapper is used only for the image modality. """ media: _T diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index f4e38b1f3325f..397684fa2f83c 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: from PIL.Image import Image from transformers.feature_extraction_utils import BatchFeature + from .base import MediaWithBytes from .processing import MultiModalHashes else: @@ -59,7 +60,7 @@ Represents a single audio item, which can be passed to a HuggingFace `AudioProcessor`. 
""" -ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"] +ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"] """ A `transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 650368dcb8fcd..c3c7cc2c3da0e 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -484,7 +484,7 @@ class MultiModalDataParser: return ImageEmbeddingItems(data) if ( - isinstance(data, PILImage.Image) + isinstance(data, (PILImage.Image, MediaWithBytes)) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3 ): From b294e28db2c5dee61bc25157664edcada8b90b31 Mon Sep 17 00:00:00 2001 From: HDCharles <39544797+HDCharles@users.noreply.github.com> Date: Wed, 3 Dec 2025 06:00:56 -0500 Subject: [PATCH 16/16] [refactor] CTMoEMethods to use QuantizationArgs (#28871) Signed-off-by: HDCharles Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- .../compressed_tensors/compressed_tensors.py | 6 +- .../compressed_tensors_moe.py | 155 +++++++++--------- 2 files changed, 86 insertions(+), 75 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 02086c3c0052d..b91ecb59fee18 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -767,8 +767,10 @@ class CompressedTensorsConfig(QuantizationConfig): targets=self.target_scheme_map.keys(), fused_mapping=self.packed_modules_mapping, ) - - return self.target_scheme_map[matched_target] + scheme_dict = self.target_scheme_map[matched_target] + if scheme_dict.get("format") is None: + scheme_dict["format"] = self.quant_format + return scheme_dict return None diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 80ee443d4dd6a..c7368bf427fe1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -7,7 +7,11 @@ from enum import Enum import torch from compressed_tensors import CompressionFormat -from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy +from compressed_tensors.quantization import ( + ActivationOrdering, + QuantizationArgs, + QuantizationStrategy, +) from torch.nn.parameter import Parameter import vllm.envs as envs @@ -142,10 +146,26 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): # are supported + check if the layer is being ignored. weight_quant = scheme_dict.get("weights") input_quant = scheme_dict.get("input_activations") + format = scheme_dict.get("format") if quant_config._is_wNa16_group_channel(weight_quant, input_quant): # group_size=None means channelwise group_size = weight_quant.group_size or -1 + + valid_format_and_bits = ( + weight_quant.num_bits in WNA16_SUPPORTED_BITS + and format == CompressionFormat.pack_quantized.value + ) + + if not valid_format_and_bits: + raise ValueError( + "For Fused MoE layers, only format: ", + f"{CompressionFormat.pack_quantized.value} ", + f" and bits: {WNA16_SUPPORTED_BITS} is supported ", + f"but got format: {CompressionFormat.pack_quantized.value} " + f" and bits: {weight_quant.num_bits}", + ) + # Prefer to use the MarlinMoE kernel when it is supported. 
if ( not check_moe_marlin_supports_layer(layer, group_size) @@ -161,12 +181,12 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): ) logger.info_once("Using CompressedTensorsWNA16MoEMethod") return CompressedTensorsWNA16MoEMethod( - quant_config, layer.moe_config, layer_name + weight_quant, input_quant, layer.moe_config ) else: logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod") return CompressedTensorsWNA16MarlinMoEMethod( - quant_config, layer.moe_config, layer_name + weight_quant, input_quant, layer.moe_config ) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): return CompressedTensorsW4A4Nvfp4MoEMethod(layer.moe_config, layer_name) @@ -176,15 +196,15 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): or quant_config._is_fp8_w8a8(weight_quant, input_quant) ): return CompressedTensorsW8A8Fp8MoEMethod( - quant_config, layer.moe_config, layer_name + weight_quant, input_quant, layer.moe_config ) elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8MoEMethod( - quant_config, layer.moe_config, layer_name + weight_quant, input_quant, layer.moe_config ) elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant): return CompressedTensorsW4A8Int8MoEMethod( - quant_config, layer.moe_config, layer_name + weight_quant, input_quant, layer.moe_config ) else: raise RuntimeError( @@ -650,17 +670,19 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): def __init__( self, - quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + weight_quant: QuantizationArgs, + input_quant: QuantizationArgs, moe: FusedMoEConfig, layer_name: str | None = None, ): - super().__init__(moe) - self.quant_config = quant_config - self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights") - self.input_quant = self.quant_config.target_scheme_map["Linear"].get( - "input_activations" + from 
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsConfig, ) + super().__init__(moe) + self.weight_quant = weight_quant + self.input_quant = input_quant + per_tensor = ( self.weight_quant.strategy == QuantizationStrategy.TENSOR and self.input_quant.strategy == QuantizationStrategy.TENSOR @@ -698,11 +720,13 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() # cutlass path - self.is_fp8_w8a8_sm100 = quant_config._is_fp8_w8a8_sm100( + self.is_fp8_w8a8_sm100 = CompressedTensorsConfig._is_fp8_w8a8_sm100( self.weight_quant, self.input_quant ) self.use_cutlass = not self.block_quant and ( - quant_config._is_fp8_w8a8_sm90(self.weight_quant, self.input_quant) + CompressedTensorsConfig._is_fp8_w8a8_sm90( + self.weight_quant, self.input_quant + ) or self.is_fp8_w8a8_sm100 ) self.disable_expert_map = False @@ -1261,16 +1285,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): def __init__( self, - quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + weight_quant: QuantizationArgs, + input_quant: QuantizationArgs, moe: FusedMoEConfig, layer_name: str | None = None, ): super().__init__(moe) - self.quant_config = quant_config - self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights") - self.input_quant = self.quant_config.target_scheme_map["Linear"].get( - "input_activations" - ) + self.weight_quant = weight_quant + self.input_quant = input_quant per_channel = ( self.weight_quant.strategy == QuantizationStrategy.CHANNEL @@ -1414,36 +1436,27 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): def __init__( self, - quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + weight_quant: 
QuantizationArgs, + input_quant: QuantizationArgs | None, moe: FusedMoEConfig, layer_name: str | None = None, ): super().__init__(moe) - self.quant_config = quant_config - # TODO: @dsikka: refactor this to use schemes as other kernels - # are supported + check if the layer is being ignored. - config = self.quant_config.target_scheme_map["Linear"].get("weights") - self.num_bits = config.num_bits - self.packed_factor = 32 // config.num_bits - self.strategy = config.strategy - self.group_size = config.group_size - self.actorder = config.actorder - self.layer_name = layer_name - self.marlin_input_dtype = get_marlin_input_dtype(layer_name) - assert config.symmetric, "Only symmetric quantization is supported for MoE" + self.weight_quant = weight_quant + self.input_quant = input_quant + assert weight_quant.symmetric, ( + "Only symmetric quantization is supported for MoE" + ) + # Extract properties from weight_quant + self.num_bits = weight_quant.num_bits + self.packed_factor = 32 // weight_quant.num_bits + self.strategy = weight_quant.strategy + self.group_size = weight_quant.group_size + self.actorder = weight_quant.actorder - if not ( - self.quant_config.quant_format == CompressionFormat.pack_quantized.value - and self.num_bits in WNA16_SUPPORTED_BITS - ): - raise ValueError( - "For Fused MoE layers, only ", - f"{CompressionFormat.pack_quantized.value} ", - "is supported for the following bits: ", - f"{WNA16_SUPPORTED_BITS}", - ) self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits] self.use_marlin = True + self.marlin_input_dtype = get_marlin_input_dtype(layer_name) def create_weights( self, @@ -1812,35 +1825,26 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): def __init__( self, - quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + weight_quant: QuantizationArgs, + input_quant: QuantizationArgs | None, moe: FusedMoEConfig, layer_name: str | None = None, 
): super().__init__(moe) - self.quant_config = quant_config - # TODO: @dsikka: refactor this to use schemes as other kernels - # are supported + check if the layer is being ignored. - config = self.quant_config.target_scheme_map["Linear"].get("weights") - self.num_bits = config.num_bits - self.packed_factor = 32 // config.num_bits - self.strategy = config.strategy + self.weight_quant = weight_quant + self.input_quant = input_quant + # Extract properties from weight_quant + self.num_bits = weight_quant.num_bits + self.packed_factor = 32 // weight_quant.num_bits + self.strategy = weight_quant.strategy # channelwise is not supported by this kernel - assert config.strategy == "group" - self.group_size = config.group_size + assert weight_quant.strategy == "group" + self.group_size = weight_quant.group_size # grouped actorder isn't supported by this kernel - assert config.actorder != "group" - assert config.symmetric, "Only symmetric quantization is supported for MoE" - - if not ( - self.quant_config.quant_format == CompressionFormat.pack_quantized.value - and self.num_bits in WNA16_SUPPORTED_BITS - ): - raise ValueError( - "For Fused MoE layers, only ", - f"{CompressionFormat.pack_quantized.value} ", - "is supported for the following bits: ", - f"{WNA16_SUPPORTED_BITS}", - ) + assert weight_quant.actorder != "group" + assert weight_quant.symmetric, ( + "Only symmetric quantization is supported for MoE" + ) def create_weights( self, @@ -2065,28 +2069,33 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): def __init__( self, - quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 + weight_quant: QuantizationArgs, + input_quant: QuantizationArgs, moe: FusedMoEConfig, layer_name: str | None = None, ): super().__init__(moe) self.has_bias = self.moe.has_bias - self.quant_config = quant_config + self.weight_quant = weight_quant + self.input_quant = input_quant # Validate scheme: weights=W4 (channel or group), # activations=dynamic TOKEN (A8) 
- wq = self.quant_config.target_scheme_map["Linear"].get("weights") - aq = self.quant_config.target_scheme_map["Linear"].get("input_activations") # Must be dynamic per-token activations - if aq.strategy != QuantizationStrategy.TOKEN or not aq.dynamic: + if ( + input_quant.strategy != QuantizationStrategy.TOKEN + or not input_quant.dynamic + ): raise ValueError( "W4A8-int MoE needs dynamic per-token activation quantization." ) # Weight can be channel-wise (group_size=None) or group-wise - self.group_size = wq.group_size if (wq.group_size is not None) else -1 - if wq.num_bits != 4: + self.group_size = ( + weight_quant.group_size if (weight_quant.group_size is not None) else -1 + ) + if weight_quant.num_bits != 4: raise ValueError("This method only supports 4-bit weights (num_bits=4).") # CPU only