From 577c72a22743bb3ea9976d7029731237d8346e73 Mon Sep 17 00:00:00 2001 From: Fardin Hoque Date: Mon, 13 Oct 2025 15:22:31 -0700 Subject: [PATCH 01/92] [CI Perf]Prune Tests in kernel/mamba (#26538) Signed-off-by: Fardin Hoque Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- tests/kernels/mamba/test_causal_conv1d.py | 4 +-- tests/kernels/mamba/test_mamba_mixer2.py | 1 - tests/kernels/mamba/test_mamba_ssm.py | 22 ++++++++--------- tests/kernels/mamba/test_mamba_ssm_ssd.py | 30 ++++++----------------- 4 files changed, 21 insertions(+), 36 deletions(-) diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index d9023490d7fc2..4647b97c47718 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -183,7 +183,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("silu_activation", [False, True]) @pytest.mark.parametrize("has_bias", [False, True]) @pytest.mark.parametrize("seqlen", [1, 3]) @@ -265,7 +265,7 @@ def test_causal_conv1d_update_with_batch_gather( @pytest.mark.parametrize("silu_activation", [True]) @pytest.mark.parametrize("has_bias", [True]) @pytest.mark.parametrize("width", [4]) -@pytest.mark.parametrize("seqlen", [8, 30, 249, 2049, 4096]) +@pytest.mark.parametrize("seqlen", [8, 249, 4096]) @pytest.mark.parametrize("dim", [64, 4096]) @pytest.mark.parametrize("with_padding", [True, False]) @pytest.mark.parametrize("batch", [4, 10]) diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index d23daefa7b436..25934c409744b 100644 --- a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -25,7 +25,6 @@ from vllm.utils import update_environment_variables (64, 1), (64, 2), (64, 4), # hidden_size be divisible by num_gpus - (100, 5), # and n_groups must divide hidden_size ], ) @pytest.mark.parametrize("dtype", [torch.float16]) diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 9a6137239ebfc..c59fc7af0c897 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -229,8 +229,8 @@ def selective_scan_opcheck_fn( @pytest.mark.parametrize("wtype", [torch.float32]) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("seqlen", [128, 256, 512, 1024, 2048, 4096]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) +@pytest.mark.parametrize("seqlen", [128, 1024, 4096]) @pytest.mark.parametrize("has_delta_bias", [True]) @pytest.mark.parametrize("delta_softplus", [True]) @pytest.mark.parametrize("has_z", [True]) @@ -238,7 +238,7 @@ def selective_scan_opcheck_fn( @pytest.mark.parametrize("varBC_groups", [1, 2]) @pytest.mark.parametrize("is_variable_C", [True]) @pytest.mark.parametrize("is_variable_B", [True]) -@pytest.mark.parametrize("scan_chunks", [1, 2, 3]) +@pytest.mark.parametrize("scan_chunks", [1, 3]) def test_selective_scan( is_variable_B, is_variable_C, @@ -375,9 +375,9 @@ def test_selective_scan( ) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) 
+@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("has_z", [False, True]) -@pytest.mark.parametrize("dstate", [16, 32, 64]) +@pytest.mark.parametrize("dstate", [16, 64]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) def test_selective_state_update(dim, dstate, has_z, itype): device = "cuda" @@ -413,7 +413,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): @pytest.mark.parametrize("wtype", [torch.float32]) @pytest.mark.parametrize("itype", [torch.float32]) -@pytest.mark.parametrize("seqlen", [1, 128, 129, 256, 512, 1024, 2048, 4096]) +@pytest.mark.parametrize("seqlen", [1, 256, 1024, 4096]) @pytest.mark.parametrize("return_last_state", [True]) @pytest.mark.parametrize("has_delta_bias", [True]) @pytest.mark.parametrize("delta_softplus", [True]) @@ -589,9 +589,9 @@ def test_selective_scan_varlen( ) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("has_z", [True]) -@pytest.mark.parametrize("dstate", [16, 32, 64]) +@pytest.mark.parametrize("dstate", [16, 64]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) # tests correctness in case subset of the sequences are padded @pytest.mark.parametrize("with_padding", [True, False]) @@ -679,11 +679,11 @@ def test_selective_state_update_with_batch_indices( assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("has_z", [False, True]) @pytest.mark.parametrize("tie_hdim", [False, True]) -@pytest.mark.parametrize("ngroups", [1, 2, 4]) -@pytest.mark.parametrize("dstate", [16, 32, 64]) +@pytest.mark.parametrize("ngroups", [1, 4]) +@pytest.mark.parametrize("dstate", [16, 64]) @pytest.mark.parametrize("dim", [2048, 4096]) def test_selective_state_update_with_heads_with_batch_indices( dim, dstate, ngroups, has_z, tie_hdim, itype diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 57dcb789e97ba..0b0b82e484a1c 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -188,9 +188,9 @@ def generate_continuous_batched_examples( ) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32]) -@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128]) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) +@pytest.mark.parametrize("n_heads", [4, 16, 32]) +@pytest.mark.parametrize("d_head", [5, 8, 32, 128]) @pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)]) def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, itype): # this tests the kernels on a single example (bs=1) @@ -254,15 +254,14 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it ) -@pytest.mark.parametrize("itype", [torch.float32, torch.float16]) -@pytest.mark.parametrize("n_heads", [4, 8, 13]) -@pytest.mark.parametrize("d_head", [5, 16, 21, 32]) +@pytest.mark.parametrize("itype", [torch.float32]) +@pytest.mark.parametrize("n_heads", [4, 8]) +@pytest.mark.parametrize("d_head", [5, 16, 32]) @pytest.mark.parametrize( "seq_len_chunk_size_cases", [ # small-ish chunk_size (8) (64, 8, 2, [(64, 32), (64, 32)]), - (64, 8, 2, [(32, 32), (32, 32), (32, 32)]), (64, 8, 2, 
[(8, 8), (8, 8), (8, 8)]), # chunk size boundary ( 64, @@ -270,16 +269,7 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, it 2, [(4, 4), (4, 4), (4, 4), (4, 4)], ), # chunk_size larger than cont batches - ( - 64, - 8, - 5, - [ - (64, 32, 16, 8, 8), - (8, 16, 32, 16, 8), - (8, 8, 16, 32, 16), - ], - ), # mode examples with varied lengths + (64, 8, 5, [(64, 32, 16, 8, 8)]), # large-ish chunk_size (256) (64, 256, 1, [(5,), (1,), (1,), (1,)]), # irregular sizes with small sequences ( @@ -359,11 +349,7 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, @pytest.mark.parametrize("chunk_size", [8, 256]) @pytest.mark.parametrize( "seqlens", - [ - (16, 2, 8, 13), - (270, 88, 212, 203), - (16, 20), - ], + [(16, 20), (270, 88, 212, 203)], ) def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens): # This test verifies the correctness of the chunked prefill implementation From 7200a21cd14d48d548be4ce4af1870d3caf1a994 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 13 Oct 2025 18:26:37 -0400 Subject: [PATCH 02/92] [Bug] Fix Assertion error DeepEP/csrc/kernels/intranode.cu:928: 'false and Unsupported type' (#26532) Signed-off-by: yewentao256 --- .../layers/fused_moe/deepep_ht_prepare_finalize.py | 4 ++++ vllm/model_executor/layers/fused_moe/modular_kernel.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 40cc6d2cee988..a5c5c115f36c9 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -336,7 +336,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input=apply_router_weight_on_input, ) dbo_yield_and_switch_from_compute_to_comm() + assert fused_expert_output.dtype == torch.bfloat16, ( + f"Expected fused_expert_output bfloat16, got {fused_expert_output.dtype}" + ) combined_x, _, event = self.buffer.combine( + # HT combine only supports BF16 x=fused_expert_output, handle=handle, topk_weights=None, diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index a0ed88309df0c..0fa98b1c7f670 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -984,7 +984,7 @@ class FusedMoEModularKernel(torch.nn.Module): assert num_chunks == 0 workspace13 = None workspace2 = None - fused_out = torch.empty_like(a1q) + fused_out = torch.empty_like(a1q, dtype=in_dtype) else: assert num_chunks > 0 workspace13, workspace2, fused_out = self._allocate_buffers( From e3fdb627d98d0c4e8313988a9a81eac0935dbcab Mon Sep 17 00:00:00 2001 From: Morrison Turnansky Date: Mon, 13 Oct 2025 18:47:16 -0400 Subject: [PATCH 03/92] [FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: morrison-turnansky Signed-off-by: Morrison Turnansky Signed-off-by: Luka Govedič Co-authored-by: Luka Govedič Co-authored-by: Jiangyun Zhu --- tests/compile/piecewise/test_toy_llama.py | 31 +++----- tests/compile/test_basic_correctness.py | 54 ++++++++------ .../model_executor/test_enabled_custom_ops.py | 39 
+++++----- vllm/compilation/backends.py | 6 +- vllm/config/compilation.py | 73 ++++++++++++++++--- vllm/config/vllm.py | 14 ++++ vllm/model_executor/custom_op.py | 20 ++--- vllm/platforms/cpu.py | 2 - 8 files changed, 153 insertions(+), 86 deletions(-) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 45317b456af48..eaf0a15479e97 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -258,13 +258,13 @@ def tractable_computation( @torch.inference_mode def run_model( - llama_config, use_compile: bool, use_inductor: bool, split_attn: bool = False + llama_config, use_compile: bool, backend: str, split_attn: bool = False ) -> torch.Tensor: if use_compile: compilation_config = CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, - use_inductor=use_inductor, + backend=backend, cudagraph_capture_sizes=[1, 2], ) if split_attn: @@ -338,8 +338,8 @@ def run_model( return output.cpu() -@pytest.mark.parametrize("use_inductor", [True, False]) -def test_toy_llama(use_inductor: bool): +@pytest.mark.parametrize("backend", ["inductor", "eager"]) +def test_toy_llama(backend: str): # compare output with and without piecewise compilation llama_config = LlamaConfig( @@ -358,10 +358,10 @@ def test_toy_llama(use_inductor: bool): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(llama_config, use_inductor=False, use_compile=False)) - run_model(tractable_config, use_inductor=False, use_compile=False) + outputs.append(run_model(llama_config, backend="eager", use_compile=False)) + run_model(tractable_config, backend="eager", use_compile=False) - if use_inductor: + if backend == "inductor": kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0} else: kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} @@ -377,10 +377,8 @@ def test_toy_llama(use_inductor: bool): num_cudagraph_captured=2, **kwargs, ): - outputs.append( - run_model(llama_config, use_inductor=use_inductor, use_compile=True) - ) - run_model(tractable_config, use_inductor=use_inductor, use_compile=True) + outputs.append(run_model(llama_config, backend=backend, use_compile=True)) + run_model(tractable_config, backend=backend, use_compile=True) with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -395,16 +393,9 @@ def test_toy_llama(use_inductor: bool): ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): outputs.append( - run_model( - llama_config, - use_inductor=use_inductor, - use_compile=True, - split_attn=True, - ) + run_model(llama_config, backend=backend, use_compile=True, split_attn=True) ) - run_model( - tractable_config, use_inductor=use_inductor, use_compile=True, split_attn=True - ) + run_model(tractable_config, backend=backend, use_compile=True, split_attn=True) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 9bfd72260436b..ab6a17e149fcd 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -77,14 +77,15 @@ class TestSetting: method="encode", ), # vision language model - TestSetting( - model="microsoft/Phi-3.5-vision-instruct", - model_args=["--trust-remote-code", "--max-model-len", "2048"], - pp_size=2, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate_with_image", - ), + # See https://github.com/vllm-project/vllm/issues/26716. 
+ # TestSetting( + # model="microsoft/Phi-3.5-vision-instruct", + # model_args=["--trust-remote-code", "--max-model-len", "2048"], + # pp_size=2, + # tp_size=1, + # attn_backend="FLASH_ATTN", + # method="generate_with_image", + # ), ], ) def test_compile_correctness( @@ -109,41 +110,46 @@ def test_compile_correctness( with monkeypatch.context() as m: m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) final_args = [ - "--enforce-eager", *model_args, "-pp", str(pp_size), "-tp", str(tp_size), + "-O.cudagraph_mode=none", ] all_args: list[list[str]] = [] all_envs: list[dict[str, str] | None] = [] - for level in [ - CompilationLevel.NO_COMPILATION, + for comp_level in [ + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE, ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) + for level in [CompilationLevel.NO_COMPILATION, comp_level]: + all_args.append( + final_args + [f"-O.level={level}", "-O.backend=inductor"] + ) - # inductor will change the output, so we only compare if the output - # is close, not exactly the same. - compare_all_settings( - model, - all_args, - all_envs, - method=method if method != "generate" else "generate_close", - ) - all_envs.clear() - all_args.clear() + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. + compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close", + ) + all_envs.clear() + all_args.clear() for level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE, + CompilationLevel.PIECEWISE, ]: - all_args.append(final_args + [f"-O{level}"]) + all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"]) + all_envs.append({}) all_envs.append({}) compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index bf290079469aa..254e9b3ab8af0 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -36,55 +36,56 @@ class Relu3(ReLUSquaredActivation): @pytest.mark.parametrize( - "env, torch_level, use_inductor, ops_enabled, default_on", + "env, torch_level, backend, ops_enabled, default_on", [ # Default values based on compile level # - All by default (no Inductor compilation) - (None, 0, False, [True] * 4, True), - (None, 1, True, [True] * 4, True), - (None, 2, False, [True] * 4, True), + (None, 0, "eager", [True] * 4, True), + (None, 1, "eager", [True] * 4, True), + (None, 2, "eager", [True] * 4, True), + (None, 3, "eager", [True] * 4, True), # - None by default (with Inductor) - (None, 3, True, [False] * 4, False), - (None, 4, True, [False] * 4, False), - # - All by default (without Inductor) - (None, 3, False, [True] * 4, True), - (None, 4, False, [True] * 4, True), + (None, 0, "inductor", [True] * 4, True), + # - None by default (with Inductor) + (None, 1, "inductor", [False] * 4, False), + (None, 2, "inductor", [False] * 4, False), + (None, 3, "inductor", [False] * 4, False), # Explicitly enabling/disabling # # Default: all # # All but SiluAndMul - ("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True), + ("+rms_norm,-silu_and_mul", 0, "inductor", [1, 0, 1, 1], True), # Only ReLU3 - ("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False), + ("none,-rms_norm,+relu3", 1, "eager", [0, 0, 0, 1], False), # All but SiluAndMul - ("all,-silu_and_mul", 2, True, [1, 0, 1, 
1], True), + ("all,-silu_and_mul", 2, "inductor", [1, 0, 1, 1], True), # All but ReLU3 (even if ReLU2 is on) - ("-relu3,+relu2", 3, False, [1, 1, 1, 0], True), + ("-relu3,+relu2", 3, "eager", [1, 1, 1, 0], True), # RMSNorm and SiluAndMul - ("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False), + ("none,-relu3,+rms_norm,+silu_and_mul", 3, "eager", [1, 1, 0, 0], False), # All but RMSNorm - ("-rms_norm", 3, False, [0, 1, 1, 1], True), + ("-rms_norm", 3, "eager", [0, 1, 1, 1], True), # # Default: none # # Only ReLU3 - ("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False), + ("none,+relu3", 3, "inductor", [0, 0, 0, 1], False), # All but RMSNorm - ("all,-rms_norm", 4, True, [0, 1, 1, 1], True), + ("all,-rms_norm", 3, "inductor", [0, 1, 1, 1], True), ], ) def test_enabled_ops( env: str | None, torch_level: int, - use_inductor: bool, + backend: str, ops_enabled: list[int], default_on: bool, ): custom_ops = env.split(",") if env else [] vllm_config = VllmConfig( compilation_config=CompilationConfig( - use_inductor=bool(use_inductor), level=torch_level, custom_ops=custom_ops + backend=backend, level=torch_level, custom_ops=custom_ops ) ) with set_current_vllm_config(vllm_config): diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index e559fdb397fa3..46c433fe6aefb 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -41,7 +41,7 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: - if compilation_config.use_inductor: + if compilation_config.backend == "inductor": # Use standalone compile only if requested, version is new enough, # and the symbol actually exists in this PyTorch build. if ( @@ -55,6 +55,10 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: logger.debug("Using InductorAdaptor") return InductorAdaptor() else: + assert compilation_config.backend == "eager", ( + "Custom backends not supported with CompilationLevel.PIECEWISE" + ) + logger.debug("Using EagerAdaptor") return EagerAdaptor() diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 657c430049f86..5313112a19a60 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -15,6 +15,7 @@ from pydantic.dataclasses import dataclass from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.config.utils import config from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname if TYPE_CHECKING: @@ -187,7 +188,8 @@ class CompilationConfig: backend: str = "" """The backend for compilation. It needs to be a string: - - "" (empty string): use the default backend. + - "" (empty string): use the default backend ("inductor" on CUDA-alike + platforms). - "eager"/"openxla"/...: use the specified backend registered in PyTorch. - "full.module.name": a qualified name which can be used to import the @@ -196,7 +198,12 @@ class CompilationConfig: distributed setting. When the compilation level is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the compilation level is 3, the backend is used for the piecewise compilation - (it sees a part of the graph).""" + (it sees a part of the graph). The backend can not be custom for compilation + level 3, i.e. the backend must be either eager or inductor. 
Furthermore, + compilation is only piecewise if splitting ops is set accordingly and + use_inductor_cudagraphs_partition is off. Note that the default options for + splitting ops are sufficient for piecewise compilation. + """ custom_ops: list[str] = field(default_factory=list) """Fine-grained control over which custom ops to enable/disable. Use 'all' to enable all, 'none' to disable all. Also specify a list of custom op @@ -229,8 +236,12 @@ class CompilationConfig: If empty list [], no ops are excluded (suitable for full cudagraphs).""" # Inductor capture - use_inductor: bool = True - """Whether to use inductor compilation: + use_inductor: bool | None = None + """ + Whether to use inductor compilation. + + This flag is deprecated and will be removed in the next release 0.12.0. + Please use the 'backend' option instead. - False: inductor compilation is not used. graph runs in eager (custom_ops enabled by default). @@ -238,7 +249,11 @@ class CompilationConfig: One graph for symbolic shape and one graph per size in compile_sizes are compiled using configurations in inductor_compile_config. - This setting is ignored if level str | Callable: + """ + Initialize the backend for the compilation config from a vllm config. + Arguments: + vllm_config: The vllm config to initialize the backend from. + Returns: + The backend for the compilation config. + """ + if self.level is None: + raise ValueError( + "No compilation level is set. This method should only be \ + called via vllm config where the level is set if none is \ + provided." + ) if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -553,15 +604,15 @@ class CompilationConfig: torch_backends = list_backends(exclude_tags=tuple()) if self.level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]: - if self.backend == "": - return "eager" if self.backend in torch_backends: return self.backend return resolve_obj_by_qualname(self.backend) - # TODO: pass user-specified backend to piecewise compilation - # merge with the config use_inductor assert self.level == CompilationLevel.PIECEWISE + if self.backend not in ["eager", "inductor"]: + raise ValueError( + f"Invalid backend for piecewise compilation: {self.backend}" + ) from vllm.compilation.backends import VllmBackend @@ -710,7 +761,9 @@ class CompilationConfig: return self.level == CompilationLevel.PIECEWISE # Inductor partition case - return self.level > CompilationLevel.NO_COMPILATION and self.use_inductor + return ( + self.backend == "inductor" and self.level > CompilationLevel.NO_COMPILATION + ) def custom_op_log_check(self): """ diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index b15d122c9161a..c94101bf608f2 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -322,6 +322,20 @@ class VllmConfig: # NB: Passing both --enforce-eager and a compilation level # in V0 means the compilation level wins out. self.compilation_config.level = CompilationLevel.NO_COMPILATION + else: + assert self.compilation_config.level >= CompilationLevel.NO_COMPILATION + assert self.compilation_config.level <= CompilationLevel.PIECEWISE + + # If user does not set custom ops via none or all set it here based on + # compilation level and backend. 
+ if all(s not in self.compilation_config.custom_ops for s in ("all", "none")): + if ( + self.compilation_config.backend == "inductor" + and self.compilation_config.level > CompilationLevel.NO_COMPILATION + ): + self.compilation_config.custom_ops.append("none") + else: + self.compilation_config.custom_ops.append("all") # async tp is built on top of sequence parallelism # and requires it to be enabled. diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 7f75066f2c36f..9ef696d80712c 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -113,7 +113,9 @@ class CustomOp(nn.Module): custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): logger.warning_once( - "Custom op %s was not registered, which means it won't appear in the op registry. It will be enabled/disabled based on the global settings.", # noqa: E501 + "Custom op %s was not registered, which means it won't appear " + "in the op registry. It will be enabled/disabled based on the " + "global settings.", cls.__name__, ) return CustomOp.default_on() @@ -127,19 +129,17 @@ class CustomOp(nn.Module): @staticmethod def default_on() -> bool: """ - On by default if PyTorch Inductor is not used. - Specifying 'all' or 'none' in custom_op takes precedence. + Behavior controlled by `CompilationConfig.custom_ops`: On by default if + 'all', off by default if 'none'. + When PyTorch Inductor is used, 'none' is the default value, + otherwise 'all'. """ - from vllm.config import CompilationLevel - compilation_config = get_cached_compilation_config() - default_on = ( - compilation_config.level < CompilationLevel.PIECEWISE - or not compilation_config.use_inductor - ) count_none = compilation_config.custom_ops.count("none") count_all = compilation_config.custom_ops.count("all") - return default_on and not count_none > 0 or count_all > 0 + assert count_none + count_all == 1 + + return not count_none > 0 or count_all > 0 # Dictionary of all custom ops (classes, indexed by registered name). # To check if an op with a name is enabled, call .enabled() on the class. 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index ed6724b298a53..17d610ac16a39 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -275,8 +275,6 @@ class CpuPlatform(Platform): "epilogue_fusion": True, } ) - if compilation_config.use_inductor: - compilation_config.custom_ops = ["none"] if vllm_config.lora_config is not None: compilation_config.level = CompilationLevel.NO_COMPILATION From fa96fb9c707919b5a2a2d0b0e9feb610a3a1f75a Mon Sep 17 00:00:00 2001 From: Fardin Hoque Date: Mon, 13 Oct 2025 16:08:18 -0700 Subject: [PATCH 04/92] Pruning kernel Core Tests (#26727) Signed-off-by: Fardin Hoque --- tests/kernels/core/test_fused_quant_layernorm.py | 1 - tests/kernels/core/test_layernorm.py | 16 ++-------------- tests/kernels/core/test_permute_cols.py | 2 +- tests/kernels/core/test_pos_encoding.py | 4 ++-- 4 files changed, 5 insertions(+), 18 deletions(-) diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index 418c700bbf003..63b5a37d3c779 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -15,7 +15,6 @@ VEC_HIDDEN_SIZES = range(1024, 1030) # Avoid combinatorial explosion with full Cartesian product NUM_TOKENS_HIDDEN_SIZES = [ *[(1, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5120, 5137]], - *[(83, i) for i in [1, 1033, 2048, 5120]], *[(2048, i) for i in [1, 64, *VEC_HIDDEN_SIZES, 5137]], *[(4096, i) for i in [1, 64, 5137]], ] diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 7553d45e00576..aaa13c06623ac 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -11,19 +11,7 @@ from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing -HIDDEN_SIZES = [ - 8, - 768, - 769, - 770, - 771, - 5120, - 5124, - 5125, - 5126, - 8192, - 8199, -] # Arbitrary values for testing +HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] @@ -118,7 +106,7 @@ def test_poly_norm( @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0]) +@pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0]) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("strided_input", [False, True]) diff --git a/tests/kernels/core/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py index 1e264735cb3c2..08fdd0e055eac 100644 --- a/tests/kernels/core/test_permute_cols.py +++ b/tests/kernels/core/test_permute_cols.py @@ -9,7 +9,7 @@ from vllm._custom_ops import permute_cols @pytest.mark.parametrize("shape", [(1, 512), (544, 4096), (67, 8192)]) -@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) def test_permute_cols(shape, dtype): x = torch.randn(shape, dtype=dtype).cuda() perm = torch.randperm(x.shape[1]).to(torch.int).cuda() diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index e1ddc5de067bb..c35ee5016ba05 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -12,8 +12,8 @@ from 
vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform IS_NEOX_STYLE = [True, False] -DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 112, 120, 256] +DTYPES = [torch.bfloat16, torch.float] +HEAD_SIZES = [64, 80, 120, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [17] # Arbitrary values for testing BATCH_SIZES = [5] # Arbitrary values for testing From 35bc22f23ce3fcd9c57f318836ec102f7c85647a Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Mon, 13 Oct 2025 16:31:35 -0700 Subject: [PATCH 05/92] [ResponseAPI] Further polish message serialization and unit tests (#26728) Signed-off-by: Jialin Ouyang --- tests/entrypoints/openai/test_protocol.py | 36 +++++++++++++++++++ .../openai/test_response_api_with_harmony.py | 31 ---------------- vllm/entrypoints/openai/protocol.py | 2 +- 3 files changed, 37 insertions(+), 32 deletions(-) create mode 100644 tests/entrypoints/openai/test_protocol.py diff --git a/tests/entrypoints/openai/test_protocol.py b/tests/entrypoints/openai/test_protocol.py new file mode 100644 index 0000000000000..e9b1cfb58b502 --- /dev/null +++ b/tests/entrypoints/openai/test_protocol.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from openai_harmony import ( + Message, +) + +from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages + + +def test_serialize_message() -> None: + dict_value = {"a": 1, "b": "2"} + assert serialize_message(dict_value) == dict_value + + msg_value = { + "role": "assistant", + "name": None, + "content": [{"type": "text", "text": "Test 1"}], + "channel": "analysis", + } + msg = Message.from_dict(msg_value) + assert serialize_message(msg) == msg_value + + +def test_serialize_messages() -> None: + assert serialize_messages(None) is None + assert serialize_messages([]) is None + + dict_value = {"a": 3, "b": "4"} + msg_value = { + "role": "assistant", + "name": None, + "content": [{"type": "text", "text": "Test 2"}], + "channel": "analysis", + } + msg = Message.from_dict(msg_value) + assert serialize_messages([msg, dict_value]) == [msg_value, dict_value] diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 0720c8aa51219..57d88f84d2519 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -12,8 +12,6 @@ from openai_harmony import ( Message, ) -from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages - from ...utils import RemoteOpenAIServer MODEL_NAME = "openai/gpt-oss-20b" @@ -760,32 +758,3 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server): assert response.status == "completed" assert len(response.input_messages) > 0 assert len(response.output_messages) > 0 - - -def test_serialize_message() -> None: - dict_value = {"a": 1, "b": "2"} - assert serialize_message(dict_value) == dict_value - - msg_value = { - "role": "assistant", - "name": None, - "content": [{"type": "text", "text": "Test 1"}], - "channel": "analysis", - } - msg = Message.from_dict(msg_value) - assert serialize_message(msg) == msg_value - - -def test_serialize_messages() -> None: - assert serialize_messages(None) is None - assert serialize_messages([]) is None - - dict_value = {"a": 3, "b": "4"} - msg_value = { - "role": "assistant", - "name": None, - 
"content": [{"type": "text", "text": "Test 2"}], - "channel": "analysis", - } - msg = Message.from_dict(msg_value) - assert serialize_messages([msg, dict_value]) == [msg_value, dict_value] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index de5cf80105a58..1f2c40e703834 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2110,7 +2110,7 @@ def serialize_message(msg): """ if isinstance(msg, dict): return msg - elif hasattr(msg, "__dict__"): + elif hasattr(msg, "to_dict"): return msg.to_dict() else: # fallback to pyandic dump From d8bebb008a8bac5a0095288815de5cc79753ed38 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Mon, 13 Oct 2025 20:45:04 -0300 Subject: [PATCH 06/92] Add tests for chunked prefill and prefix cache with causal pooling models (#26526) Signed-off-by: Max de Bayser Co-authored-by: Ayush Singh --- tests/v1/e2e/test_pooling_chunked_prefill.py | 167 +++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 tests/v1/e2e/test_pooling_chunked_prefill.py diff --git a/tests/v1/e2e/test_pooling_chunked_prefill.py b/tests/v1/e2e/test_pooling_chunked_prefill.py new file mode 100644 index 0000000000000..a196e359920de --- /dev/null +++ b/tests/v1/e2e/test_pooling_chunked_prefill.py @@ -0,0 +1,167 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch.nn as nn + +from vllm.platforms import current_platform + +prompt = """ +Generals gathered in their masses +Just like witches at black masses +Evil minds that plot destruction +Sorcerer of death's construction +In the fields, the bodies burning +As the war machine keeps turning +Death and hatred to mankind +Poisoning their brainwashed minds +Oh, Lord, yeah + +Politicians hide themselves away +They only started the war +Why should they go out to fight? 
+They leave that all to the poor, yeah +Time will tell on their power minds +Making war just for fun +Treating people just like pawns in chess +Wait till their judgment day comes, yeah + +Now, in darkness, world stops turning +Ashes where their bodies burning +No more war pigs have the power +Hand of God has struck the hour +Day of Judgment, God is calling +On their knees, the war pigs crawling +Begging mercies for their sins +Satan, laughing, spreads his wings +Oh, Lord, yeah +""" + + +class WrapperPooler(nn.Module): + def __init__(self, pooler): + super().__init__() + self.pooler = pooler + self.chunks = [] + + def get_pooling_updates(self, task): + return self.pooler.get_pooling_updates(task) + + def forward( + self, + hidden_states, + pooling_metadata, + ): + self.chunks.append(hidden_states.shape[0]) + return self.pooler(hidden_states, pooling_metadata) + + +def inject_pooler(self): + model = self.get_model() + wrapper = WrapperPooler(model.pooler) + model.pooler = wrapper + + +def retrieve_chunks(self): + model = self.get_model() + chunks = model.pooler.chunks + model.pooler.chunks = [] + return chunks + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available") +def test_pooling_chunked_prefill(vllm_runner, monkeypatch): + """Test chunked prefill for pooling models with LastPool.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + model_id = "Qwen/Qwen3-Embedding-0.6B" + + chunk_size = 10 + + # Set chunking parameters to force chunked prefill + # Note: Chunked prefill is automatically handled by vLLM + # internally based on the model size and prompt + with vllm_runner( + model_id, + runner="pooling", + long_prefill_token_threshold=chunk_size, + tensor_parallel_size=1, + enforce_eager=True, + enable_chunked_prefill=True, + ) as llm: + llm.get_llm().llm_engine.collective_rpc(inject_pooler) + + tokenizer = llm.get_llm().get_tokenizer() + tokens = tokenizer(prompt)["input_ids"] + prompt_len = len(tokens) + full_chunks, last_chunk = divmod(prompt_len, chunk_size) + expected_chunks = [chunk_size] * full_chunks + if last_chunk: + expected_chunks.append(last_chunk) + llm.embed([prompt]) + chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0] + + # Check that PoolerWrapper was called and chunks were received + assert len(chunks) > 1 + assert chunks == expected_chunks + + # Disable chunked prefill + with vllm_runner( + model_id, + runner="pooling", + tensor_parallel_size=1, + enforce_eager=True, + ) as llm: + llm.get_llm().llm_engine.collective_rpc(inject_pooler) + llm.embed([prompt]) + chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0] + + # Check that PoolerWrapper was called and no chunks were received + assert len(chunks) == 1 + assert chunks[0] == prompt_len + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available") +def test_pooling_prefix_cache(vllm_runner, monkeypatch): + """Test chunked prefill for pooling models with LastPool.""" + + verses = prompt.split("\n\n") + + with monkeypatch.context() as m: + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + model_id = "Qwen/Qwen3-Embedding-0.6B" + + with vllm_runner( + model_id, + runner="pooling", + enable_prefix_caching=True, + tensor_parallel_size=1, + enforce_eager=True, + ) as llm: + llm.get_llm().llm_engine.collective_rpc(inject_pooler) + tokenizer = llm.get_llm().get_tokenizer() + + prompt1 = "\n\n".join([verses[0], verses[1]]) + prompt2 = "\n\n".join([verses[0], verses[2]]) + tokens1 = 
tokenizer(prompt1)["input_ids"] + tokens2 = tokenizer(prompt2)["input_ids"] + prompt1_len = len(tokens1) + prompt2_len = len(tokens2) + + llm.embed([prompt1]) + chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0] + + assert len(chunks) == 1 + assert chunks[0] == prompt1_len + + llm.embed([prompt2]) + chunks = llm.get_llm().llm_engine.collective_rpc(retrieve_chunks)[0] + + assert len(chunks) == 1 + assert chunks[0] <= prompt1_len + assert chunks[0] < prompt2_len + + cache_config = llm.get_llm().llm_engine.cache_config + print(f"{cache_config=}") + # Prefixes are cached in blocks + assert (prompt2_len - chunks[0]) % cache_config.block_size == 0 From 8317f723540a9aef2ad1390ac4952aaa13b46f46 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Mon, 13 Oct 2025 17:45:59 -0700 Subject: [PATCH 07/92] [Misc][DP] support customized aggregated logger for dp (#24354) Signed-off-by: Lu Fang --- .../multi_instance_data_parallel.py | 55 +++- tests/v1/engine/test_async_llm.py | 56 +++- tests/v1/metrics/test_engine_logger_apis.py | 8 +- vllm/engine/arg_utils.py | 7 + vllm/entrypoints/openai/api_server.py | 1 + vllm/v1/engine/async_llm.py | 4 + vllm/v1/engine/llm_engine.py | 2 + vllm/v1/metrics/loggers.py | 249 +++++++++++++----- 8 files changed, 297 insertions(+), 85 deletions(-) diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py index b46cea5619671..04d21e0489402 100644 --- a/examples/online_serving/multi_instance_data_parallel.py +++ b/examples/online_serving/multi_instance_data_parallel.py @@ -1,11 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import threading from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams +from vllm.v1.metrics.loggers import AggregatedLoggingStatLogger """ To run this example, run the following commands simultaneously with @@ -21,37 +23,64 @@ send a request to the instance with DP rank 1. """ +def _do_background_logging(engine, interval, stop_event): + try: + while not stop_event.is_set(): + asyncio.run(engine.do_log_stats()) + stop_event.wait(interval) + except Exception as e: + print(f"vLLM background logging shutdown: {e}") + pass + + async def main(): engine_args = AsyncEngineArgs( model="ibm-research/PowerMoE-3b", data_parallel_size=2, + tensor_parallel_size=1, dtype="auto", max_model_len=2048, data_parallel_address="127.0.0.1", data_parallel_rpc_port=62300, data_parallel_size_local=1, enforce_eager=True, + enable_log_requests=True, + disable_custom_all_reduce=True, ) - engine_client = AsyncLLMEngine.from_engine_args(engine_args) - + engine_client = AsyncLLMEngine.from_engine_args( + engine_args, + # Example: Using aggregated logger + stat_loggers=[AggregatedLoggingStatLogger], + ) + stop_logging_event = threading.Event() + logging_thread = threading.Thread( + target=_do_background_logging, + args=(engine_client, 5, stop_logging_event), + daemon=True, + ) + logging_thread.start() sampling_params = SamplingParams( temperature=0.7, top_p=0.9, max_tokens=100, ) + num_prompts = 10 + for i in range(num_prompts): + prompt = "Who won the 2004 World Series?" 
+ final_output: RequestOutput | None = None + async for output in engine_client.generate( + prompt=prompt, + sampling_params=sampling_params, + request_id=f"abcdef-{i}", + data_parallel_rank=1, + ): + final_output = output + if final_output: + print(final_output.outputs[0].text) - prompt = "Who won the 2004 World Series?" - final_output: RequestOutput | None = None - async for output in engine_client.generate( - prompt=prompt, - sampling_params=sampling_params, - request_id="abcdef", - data_parallel_rank=1, - ): - final_output = output - if final_output: - print(final_output.outputs[0].text) + stop_logging_event.set() + logging_thread.join() if __name__ == "__main__": diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 8f715c085b5d1..b9fa553142781 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -17,7 +17,12 @@ from vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind from vllm.utils import set_default_torch_num_threads from vllm.v1.engine.async_llm import AsyncLLM -from vllm.v1.metrics.loggers import LoggingStatLogger +from vllm.v1.metrics.loggers import ( + AggregatedLoggingStatLogger, + LoggingStatLogger, + PerEngineStatLoggerAdapter, + PrometheusStatLogger, +) if not current_platform.is_cuda(): pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True) @@ -384,6 +389,12 @@ class MockLoggingStatLogger(LoggingStatLogger): self.log = MagicMock() +class MockAggregatedStatLogger(AggregatedLoggingStatLogger): + def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): + super().__init__(vllm_config, engine_indexes) + self.log = MagicMock() + + @pytest.mark.asyncio async def test_customize_loggers(monkeypatch): """Test that we can customize the loggers. @@ -401,10 +412,45 @@ async def test_customize_loggers(monkeypatch): await engine.do_log_stats() - stat_loggers = engine.logger_manager.per_engine_logger_dict - assert len(stat_loggers) == 1 - assert len(stat_loggers[0]) == 2 # LoggingStatLogger + MockLoggingStatLogger - stat_loggers[0][0].log.assert_called_once() + stat_loggers = engine.logger_manager.stat_loggers + assert ( + len(stat_loggers) == 3 + ) # MockLoggingStatLogger + LoggingStatLogger + Promethus Logger + print(f"{stat_loggers=}") + stat_loggers[0].per_engine_stat_loggers[0].log.assert_called_once() + assert isinstance(stat_loggers[1], PerEngineStatLoggerAdapter) + assert isinstance(stat_loggers[1].per_engine_stat_loggers[0], LoggingStatLogger) + assert isinstance(stat_loggers[2], PrometheusStatLogger) + + +@pytest.mark.asyncio +async def test_customize_aggregated_loggers(monkeypatch): + """Test that we can customize the aggregated loggers. + If a customized logger is provided at the init, it should + be added to the default loggers. 
+ """ + + with monkeypatch.context() as m, ExitStack() as after: + m.setenv("VLLM_USE_V1", "1") + + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args( + TEXT_ENGINE_ARGS, + stat_loggers=[MockLoggingStatLogger, MockAggregatedStatLogger], + ) + after.callback(engine.shutdown) + + await engine.do_log_stats() + + stat_loggers = engine.logger_manager.stat_loggers + assert len(stat_loggers) == 4 + # MockLoggingStatLogger + MockAggregatedStatLogger + # + LoggingStatLogger + PrometheusStatLogger + stat_loggers[0].per_engine_stat_loggers[0].log.assert_called_once() + stat_loggers[1].log.assert_called_once() + assert isinstance(stat_loggers[2], PerEngineStatLoggerAdapter) + assert isinstance(stat_loggers[2].per_engine_stat_loggers[0], LoggingStatLogger) + assert isinstance(stat_loggers[3], PrometheusStatLogger) @pytest.mark.asyncio(scope="module") diff --git a/tests/v1/metrics/test_engine_logger_apis.py b/tests/v1/metrics/test_engine_logger_apis.py index bf780b1f36adf..6dd5b2b069c09 100644 --- a/tests/v1/metrics/test_engine_logger_apis.py +++ b/tests/v1/metrics/test_engine_logger_apis.py @@ -54,7 +54,7 @@ async def test_async_llm_replace_default_loggers(log_stats_enabled_engine_args): engine = AsyncLLM.from_engine_args( log_stats_enabled_engine_args, stat_loggers=[RayPrometheusStatLogger] ) - assert isinstance(engine.logger_manager.prometheus_logger, RayPrometheusStatLogger) + assert isinstance(engine.logger_manager.stat_loggers[0], RayPrometheusStatLogger) engine.shutdown() @@ -73,9 +73,11 @@ async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args): disabled_log_engine_args, stat_loggers=[DummyStatLogger] ) - assert len(engine.logger_manager.per_engine_logger_dict[0]) == 1 + assert len(engine.logger_manager.stat_loggers) == 2 + assert len(engine.logger_manager.stat_loggers[0].per_engine_stat_loggers) == 1 assert isinstance( - engine.logger_manager.per_engine_logger_dict[0][0], DummyStatLogger + engine.logger_manager.stat_loggers[0].per_engine_stat_loggers[0], + DummyStatLogger, ) # log_stats is still True, since custom stat loggers are used diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 54a0539f40479..f0eb3c2213384 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -410,6 +410,7 @@ class EngineArgs: max_logprobs: int = ModelConfig.max_logprobs logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode disable_log_stats: bool = False + aggregate_engine_logging: bool = False revision: str | None = ModelConfig.revision code_revision: str | None = ModelConfig.code_revision rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling") @@ -1043,6 +1044,12 @@ class EngineArgs: help="Disable logging statistics.", ) + parser.add_argument( + "--aggregate-engine-logging", + action="store_true", + help="Log aggregate rather than per-engine statistics " + "when using data parallelism.", + ) return parser @classmethod diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 96a0947c4bd31..ec5632523fe3c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -239,6 +239,7 @@ async def build_async_engine_client_from_engine_args( vllm_config=vllm_config, usage_context=usage_context, enable_log_requests=engine_args.enable_log_requests, + aggregate_engine_logging=engine_args.aggregate_engine_logging, disable_log_stats=engine_args.disable_log_stats, client_addresses=client_config, client_count=client_count, diff --git 
a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fbbe15b7b04f2..39cd1d97c280a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -57,6 +57,7 @@ class AsyncLLM(EngineClient): log_requests: bool = True, start_engine_loop: bool = True, stat_loggers: list[StatLoggerFactory] | None = None, + aggregate_engine_logging: bool = False, client_addresses: dict[str, str] | None = None, client_count: int = 1, client_index: int = 0, @@ -144,6 +145,7 @@ class AsyncLLM(EngineClient): custom_stat_loggers=stat_loggers, enable_default_loggers=log_stats, client_count=client_count, + aggregate_engine_logging=aggregate_engine_logging, ) self.logger_manager.log_engine_initialized() @@ -187,6 +189,7 @@ class AsyncLLM(EngineClient): usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: list[StatLoggerFactory] | None = None, enable_log_requests: bool = False, + aggregate_engine_logging: bool = False, disable_log_stats: bool = False, client_addresses: dict[str, str] | None = None, client_count: int = 1, @@ -209,6 +212,7 @@ class AsyncLLM(EngineClient): stat_loggers=stat_loggers, log_requests=enable_log_requests, log_stats=not disable_log_stats, + aggregate_engine_logging=aggregate_engine_logging, usage_context=usage_context, client_addresses=client_addresses, client_count=client_count, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index debf8a2192548..538fb6a04bd7b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -51,6 +51,7 @@ class LLMEngine: vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool, + aggregate_engine_logging: bool = False, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: list[StatLoggerFactory] | None = None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, @@ -132,6 +133,7 @@ class LLMEngine: vllm_config=vllm_config, custom_stat_loggers=stat_loggers, enable_default_loggers=log_stats, + aggregate_engine_logging=aggregate_engine_logging, ) self.logger_manager.log_engine_initialized() diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8c5abae2ae652..1a8fefdd1ddf8 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -24,7 +24,9 @@ from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm logger = init_logger(__name__) -StatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"] +PerEngineStatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"] +AggregateStatLoggerFactory = type["AggregateStatLoggerBase"] +StatLoggerFactory = AggregateStatLoggerFactory | PerEngineStatLoggerFactory class StatLoggerBase(ABC): @@ -54,6 +56,14 @@ class StatLoggerBase(ABC): pass +class AggregateStatLoggerBase(StatLoggerBase): + """Abstract base class for loggers that + aggregate across multiple DP engines.""" + + @abstractmethod + def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): ... 
+ + class LoggingStatLogger(StatLoggerBase): def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): self.engine_index = engine_index @@ -72,6 +82,8 @@ class LoggingStatLogger(StatLoggerBase): self.kv_connector_logging = KVConnectorLogging(kv_tranfer_config) self.last_prompt_throughput: float = 0.0 self.last_generation_throughput: float = 0.0 + self.engine_is_idle = False + self.aggregated = False def _reset(self, now): self.last_log_time = now @@ -92,6 +104,10 @@ class LoggingStatLogger(StatLoggerBase): return 0.0 return float(tracked_stats / delta_time) + @property + def log_prefix(self): + return "Engine {:03d}: ".format(self.engine_index) + def record( self, scheduler_stats: SchedulerStats | None, @@ -110,34 +126,37 @@ class LoggingStatLogger(StatLoggerBase): self.spec_decoding_logging.observe(scheduler_stats.spec_decoding_stats) if kv_connector_stats := scheduler_stats.kv_connector_stats: self.kv_connector_logging.observe(kv_connector_stats) - self.last_scheduler_stats = scheduler_stats - + if not self.aggregated: + self.last_scheduler_stats = scheduler_stats if mm_cache_stats: self.mm_caching_metrics.observe(mm_cache_stats) - def log(self): + def _update_stats(self): now = time.monotonic() prompt_throughput = self._get_throughput(self.num_prompt_tokens, now) generation_throughput = self._get_throughput(self.num_generation_tokens, now) self._reset(now) - - scheduler_stats = self.last_scheduler_stats - - log_fn = logger.info - if not any( + self.engine_is_idle = not any( ( prompt_throughput, generation_throughput, self.last_prompt_throughput, self.last_generation_throughput, ) - ): - # Avoid log noise on an idle production system - log_fn = logger.debug + ) self.last_generation_throughput = generation_throughput self.last_prompt_throughput = prompt_throughput + def aggregate_scheduler_stats(self): + # noop for per engine loggers + return + + def log(self): + self._update_stats() + self.aggregate_scheduler_stats() + # Avoid log noise on an idle production system + log_fn = logger.debug if self.engine_is_idle else logger.info # Format and print output. 
log_parts = [ "Avg prompt throughput: %.1f tokens/s", @@ -148,11 +167,11 @@ class LoggingStatLogger(StatLoggerBase): "Prefix cache hit rate: %.1f%%", ] log_args = [ - prompt_throughput, - generation_throughput, - scheduler_stats.num_running_reqs, - scheduler_stats.num_waiting_reqs, - scheduler_stats.kv_cache_usage * 100, + self.last_prompt_throughput, + self.last_generation_throughput, + self.last_scheduler_stats.num_running_reqs, + self.last_scheduler_stats.num_waiting_reqs, + self.last_scheduler_stats.kv_cache_usage * 100, self.prefix_caching_metrics.hit_rate * 100, ] if not self.mm_caching_metrics.empty: @@ -160,8 +179,7 @@ class LoggingStatLogger(StatLoggerBase): log_args.append(self.mm_caching_metrics.hit_rate * 100) log_fn( - "Engine %03d: " + ", ".join(log_parts), - self.engine_index, + self.log_prefix + ", ".join(log_parts), *log_args, ) @@ -178,7 +196,114 @@ class LoggingStatLogger(StatLoggerBase): ) -class PrometheusStatLogger(StatLoggerBase): +class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase): + def __init__( + self, + vllm_config: VllmConfig, + engine_indexes: list[int], + ): + self.engine_indexes = engine_indexes + self.last_scheduler_stats_dict: dict[int, SchedulerStats] = { + idx: SchedulerStats() for idx in self.engine_indexes + } + LoggingStatLogger.__init__(self, vllm_config, engine_index=-1) + self.aggregated = True + + @property + def log_prefix(self): + return "{} Engines Aggregated: ".format(len(self.engine_indexes)) + + def record( + self, + scheduler_stats: SchedulerStats | None, + iteration_stats: IterationStats | None, + mm_cache_stats: MultiModalCacheStats | None = None, + engine_idx: int = 0, + ): + if engine_idx not in self.engine_indexes: + logger.warning("Unexpected engine_idx: %d", engine_idx) + return + LoggingStatLogger.record( + self, + scheduler_stats, + iteration_stats, + mm_cache_stats=mm_cache_stats, + engine_idx=engine_idx, + ) + if scheduler_stats is not None: + self.last_scheduler_stats_dict[engine_idx] = scheduler_stats + + def aggregate_scheduler_stats(self): + self.last_scheduler_stats = SchedulerStats() + for last_scheduler_stats in self.last_scheduler_stats_dict.values(): + self.last_scheduler_stats.num_waiting_reqs += ( + last_scheduler_stats.num_waiting_reqs + ) + self.last_scheduler_stats.num_running_reqs += ( + last_scheduler_stats.num_running_reqs + ) + self.last_scheduler_stats.num_corrupted_reqs += ( + last_scheduler_stats.num_corrupted_reqs + ) + self.last_scheduler_stats.kv_cache_usage += ( + last_scheduler_stats.kv_cache_usage + ) + self.last_scheduler_stats.kv_cache_usage /= len(self.last_scheduler_stats_dict) + + def log(self): + LoggingStatLogger.log(self) + + def log_engine_initialized(self): + if self.vllm_config.cache_config.num_gpu_blocks: + logger.info( + "%d Engines: vllm cache_config_info with initialization " + "after num_gpu_blocks is: %d", + len(self.engine_indexes), + self.vllm_config.cache_config.num_gpu_blocks, + ) + + +class PerEngineStatLoggerAdapter(AggregateStatLoggerBase): + def __init__( + self, + vllm_config: VllmConfig, + engine_indexes: list[int], + per_engine_stat_logger_factory: PerEngineStatLoggerFactory, + ) -> None: + self.per_engine_stat_loggers = {} + self.engine_indexes = engine_indexes + for engine_index in engine_indexes: + self.per_engine_stat_loggers[engine_index] = per_engine_stat_logger_factory( + vllm_config, engine_index + ) + + def record( + self, + scheduler_stats: SchedulerStats | None, + iteration_stats: IterationStats | None, + mm_cache_stats: 
MultiModalCacheStats | None = None, + engine_idx: int = 0, + ): + if engine_idx not in self.per_engine_stat_loggers: + logger.warning("Unexpected engine_idx: %d", engine_idx) + return + self.per_engine_stat_loggers[engine_idx].record( + scheduler_stats, + iteration_stats, + mm_cache_stats=mm_cache_stats, + engine_idx=engine_idx, + ) + + def log(self): + for per_engine_stat_logger in self.per_engine_stat_loggers.values(): + per_engine_stat_logger.log() + + def log_engine_initialized(self): + for per_engine_stat_logger in self.per_engine_stat_loggers.values(): + per_engine_stat_logger.log_engine_initialized() + + +class PrometheusStatLogger(AggregateStatLoggerBase): _gauge_cls = Gauge _counter_cls = Counter _histogram_cls = Histogram @@ -189,6 +314,7 @@ class PrometheusStatLogger(StatLoggerBase): ): if engine_indexes is None: engine_indexes = [0] + self.engine_indexes = engine_indexes unregister_vllm_metrics() @@ -880,14 +1006,14 @@ class StatLoggerManager: engine_idxs: list[int] | None = None, custom_stat_loggers: list[StatLoggerFactory] | None = None, enable_default_loggers: bool = True, + aggregate_engine_logging: bool = False, client_count: int = 1, ): - self.engine_idxs = engine_idxs if engine_idxs else [0] - - factories: list[StatLoggerFactory] = [] + self.engine_indexes = engine_idxs if engine_idxs else [0] + self.stat_loggers: list[AggregateStatLoggerBase] = [] + stat_logger_factories: list[StatLoggerFactory] = [] if custom_stat_loggers is not None: - factories.extend(custom_stat_loggers) - + stat_logger_factories.extend(custom_stat_loggers) if enable_default_loggers and logger.isEnabledFor(logging.INFO): if client_count > 1: logger.warning( @@ -895,27 +1021,35 @@ class StatLoggerManager: "disabling stats logging to avoid incomplete stats." ) else: - factories.append(LoggingStatLogger) - - # engine_idx: StatLogger - self.per_engine_logger_dict: dict[int, list[StatLoggerBase]] = {} - prometheus_factory = PrometheusStatLogger - for engine_idx in self.engine_idxs: - loggers: list[StatLoggerBase] = [] - for logger_factory in factories: - # If we get a custom prometheus logger, use that - # instead. This is typically used for the ray case. - if isinstance(logger_factory, type) and issubclass( - logger_factory, PrometheusStatLogger - ): - prometheus_factory = logger_factory - continue - loggers.append(logger_factory(vllm_config, engine_idx)) # type: ignore - self.per_engine_logger_dict[engine_idx] = loggers - - # For Prometheus, need to share the metrics between EngineCores. - # Each EngineCore's metrics are expressed as a unique label. 
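The aggregation above is a plain reduction over the per-engine snapshots: request counts are summed, while KV-cache usage is summed and then divided by the number of engines so it stays a fraction. A compact sketch of the same reduction over a stand-in dataclass (the names here are illustrative, not the vLLM `SchedulerStats` type):

```python
from dataclasses import dataclass

@dataclass
class EngineSnapshot:
    # Illustrative stand-in for one engine's SchedulerStats snapshot.
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0
    kv_cache_usage: float = 0.0  # fraction of KV-cache blocks in use, in [0, 1]

def aggregate(snapshots: dict[int, EngineSnapshot]) -> EngineSnapshot:
    agg = EngineSnapshot()
    for snap in snapshots.values():
        agg.num_running_reqs += snap.num_running_reqs
        agg.num_waiting_reqs += snap.num_waiting_reqs
        agg.kv_cache_usage += snap.kv_cache_usage
    # Counts add up across engines; utilization is averaged.
    agg.kv_cache_usage /= max(len(snapshots), 1)
    return agg

stats = {0: EngineSnapshot(3, 1, 0.40), 1: EngineSnapshot(5, 0, 0.60)}
combined = aggregate(stats)
assert combined.num_running_reqs == 8
assert abs(combined.kv_cache_usage - 0.50) < 1e-9
```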
- self.prometheus_logger = prometheus_factory(vllm_config, engine_idxs) + default_logger_factory = ( + AggregatedLoggingStatLogger + if aggregate_engine_logging + else LoggingStatLogger + ) + stat_logger_factories.append(default_logger_factory) + custom_prometheus_logger: bool = False + for stat_logger_factory in stat_logger_factories: + if isinstance(stat_logger_factory, type) and issubclass( + stat_logger_factory, AggregateStatLoggerBase + ): + global_stat_logger = stat_logger_factory( + vllm_config=vllm_config, + engine_indexes=self.engine_indexes, + ) + if isinstance(global_stat_logger, PrometheusStatLogger): + custom_prometheus_logger = True + else: + # per engine logger + global_stat_logger = PerEngineStatLoggerAdapter( + vllm_config=vllm_config, + engine_indexes=self.engine_indexes, + per_engine_stat_logger_factory=stat_logger_factory, # type: ignore[arg-type] + ) + self.stat_loggers.append(global_stat_logger) + if not custom_prometheus_logger: + self.stat_loggers.append( + PrometheusStatLogger(vllm_config, self.engine_indexes) + ) def record( self, @@ -926,9 +1060,7 @@ class StatLoggerManager: ): if engine_idx is None: engine_idx = 0 - - per_engine_loggers = self.per_engine_logger_dict[engine_idx] - for logger in per_engine_loggers: + for logger in self.stat_loggers: logger.record( scheduler_stats, iteration_stats, @@ -936,21 +1068,10 @@ class StatLoggerManager: engine_idx=engine_idx, ) - self.prometheus_logger.record( - scheduler_stats, - iteration_stats, - mm_cache_stats=mm_cache_stats, - engine_idx=engine_idx, - ) - def log(self): - for per_engine_loggers in self.per_engine_logger_dict.values(): - for logger in per_engine_loggers: - logger.log() + for logger in self.stat_loggers: + logger.log() def log_engine_initialized(self): - self.prometheus_logger.log_engine_initialized() - - for per_engine_loggers in self.per_engine_logger_dict.values(): - for logger in per_engine_loggers: - logger.log_engine_initialized() + for agg_logger in self.stat_loggers: + agg_logger.log_engine_initialized() From 3e051bda82efe351ecfb5bb21de1606accc976d4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 13 Oct 2025 21:12:52 -0400 Subject: [PATCH 08/92] [UX] Replace VLLM_ALL2ALL_BACKEND with --all2all-backend (#26732) Signed-off-by: mgoin --- docs/design/dbo.md | 4 +- docs/serving/expert_parallel_deployment.md | 38 ++++++++++--------- vllm/config/parallel.py | 33 +++++++++++++++- vllm/config/vllm.py | 8 ++-- .../base_device_communicator.py | 3 ++ .../device_communicators/cuda_communicator.py | 15 ++++---- .../device_communicators/xpu_communicator.py | 12 +++--- vllm/engine/arg_utils.py | 5 +++ .../model_executor/layers/fused_moe/config.py | 12 +++--- .../quantization/utils/flashinfer_fp4_moe.py | 2 +- vllm/platforms/cuda.py | 4 +- vllm/v1/engine/utils.py | 5 ++- 12 files changed, 90 insertions(+), 51 deletions(-) diff --git a/docs/design/dbo.md b/docs/design/dbo.md index d92c47c80f951..f2d98ccd063fa 100644 --- a/docs/design/dbo.md +++ b/docs/design/dbo.md @@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve * `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch * `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch -Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `VLLM_ALL2ALL_BACKEND` environment variable must be set to `deepep_low_latency` if your workload is 
primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests. +Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests. Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled. -EX: `VLLM_ALL2ALL_BACKEND=deepep_low_latency vllm serve --model="deepseek-ai/DeepSeek-V2-Lite" --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo` +EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency` Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES` diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index 93ed383395f27..cd6515dde75ef 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -14,13 +14,16 @@ Before using EP, you need to install the necessary dependencies. We are actively ### Backend Selection Guide -vLLM provides three communication backends for EP: +vLLM provides multiple communication backends for EP. Use `--all2all-backend` to select one: | Backend | Use Case | Features | Best For | |---------|----------|----------|----------| -| `pplx` | Single node | Chunked prefill support | Development, best for intra-node deployments | -| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout | High-throughput scenarios, prefill-dominated workloads | -| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout | Low-latency scenarios, decode-dominated workloads | +| `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration | +| `pplx` | Single node | Chunked prefill support, efficient intra-node communication | Single-node deployments, development | +| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios | +| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios | +| `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes | +| `naive` | Testing/debugging | Simple broadcast-based implementation | Debugging, not recommended for production | ## Single Node Deployment @@ -47,11 +50,11 @@ The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parall ```bash # Single node EP deployment with pplx backend -VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 \ - vllm serve deepseek-ai/DeepSeek-V3-0324 \ - --tensor-parallel-size 1 \ # Tensor parallelism across 1 GPU +vllm serve deepseek-ai/DeepSeek-V3-0324 \ + --tensor-parallel-size 1 \ # Tensor parallelism across 1 GPU --data-parallel-size 8 \ # Data parallelism across 8 processes - --enable-expert-parallel # Enable expert parallelism + --enable-expert-parallel \ # Enable expert parallelism + --all2all-backend pplx # Use pplx communication backend ``` ## Multi-Node Deployment @@ -70,8 +73,8 @@ The following example deploys `DeepSeek-V3-0324` across 2 nodes using `deepep_lo ```bash # 
Node 1 (Primary - handles incoming requests) -VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \ - vllm serve deepseek-ai/DeepSeek-V3-0324 \ +vllm serve deepseek-ai/DeepSeek-V3-0324 \ + --all2all-backend deepep_low_latency \ --tensor-parallel-size 1 \ # TP size per node --enable-expert-parallel \ # Enable EP --data-parallel-size 16 \ # Total DP size across all nodes @@ -81,8 +84,8 @@ VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \ --api-server-count=8 # Number of API servers for load handling (scaling this out to total ranks are recommended) # Node 2 (Secondary - headless mode, no API server) -VLLM_ALL2ALL_BACKEND=deepep_low_latency VLLM_USE_DEEP_GEMM=1 \ - vllm serve deepseek-ai/DeepSeek-V3-0324 \ +vllm serve deepseek-ai/DeepSeek-V3-0324 \ + --all2all-backend deepep_low_latency \ --tensor-parallel-size 1 \ # TP size per node --enable-expert-parallel \ # Enable EP --data-parallel-size 16 \ # Total DP size across all nodes @@ -169,11 +172,12 @@ Single node deployment with EPLB enabled: ```bash # Single node with EPLB load balancing -VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 vllm serve deepseek-ai/DeepSeek-V3-0324 \ - --tensor-parallel-size 1 \ # Tensor parallelism - --data-parallel-size 8 \ # Data parallelism - --enable-expert-parallel \ # Enable EP - --enable-eplb \ # Enable load balancer +vllm serve deepseek-ai/DeepSeek-V3-0324 \ + --tensor-parallel-size 1 \ # Tensor parallelism + --data-parallel-size 8 \ # Data parallelism + --enable-expert-parallel \ # Enable EP + --all2all-backend pplx \ # Use pplx communication backend + --enable-eplb \ # Enable load balancer --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}' ``` diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 084e458f88309..b7ef0fef68330 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -113,6 +113,25 @@ class ParallelConfig: with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1 will have experts [1, 3]. This strategy can help improve load balancing for grouped expert models with no redundant experts.""" + all2all_backend: ( + Literal[ + "naive", + "pplx", + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter", + "flashinfer_all2allv", + ] + | None + ) = None + """All2All backend for MoE expert parallel communication. If not set, uses + the value from VLLM_ALL2ALL_BACKEND environment variable. Available options: + - "naive": Naive all2all implementation using broadcasts + - "allgather_reducescatter": All2all based on allgather and reducescatter + - "pplx": Use pplx kernels + - "deepep_high_throughput": Use deepep high-throughput kernels + - "deepep_low_latency": Use deepep low-latency kernels + - "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl""" num_redundant_experts: int | None = None """`num_redundant_experts` is deprecated and has been replaced with `eplb_config.num_redundant_experts`. This will be removed in v0.12.0. 
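With the new field in place, the backend can also be selected from the Python API; a minimal sketch, assuming the `EngineArgs` keyword mirrors the CLI flag added in this patch (the model name is just the one used in the examples above):

```python
from vllm.engine.arg_utils import EngineArgs

# Python-side equivalent of:
#   vllm serve deepseek-ai/DeepSeek-V2-Lite --enable-expert-parallel \
#       --all2all-backend deepep_low_latency
args = EngineArgs(
    model="deepseek-ai/DeepSeek-V2-Lite",
    enable_expert_parallel=True,
    all2all_backend="deepep_low_latency",
)
# Leaving all2all_backend as None falls back to the (now deprecated)
# VLLM_ALL2ALL_BACKEND environment variable in ParallelConfig.__post_init__.
print(args.all2all_backend)
```

The same keyword is forwarded to `ParallelConfig.all2all_backend` by `create_engine_config`, as the `arg_utils.py` hunk later in this patch shows.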
@@ -341,7 +360,7 @@ class ParallelConfig: @property def use_sequence_parallel_moe(self) -> bool: return ( - envs.VLLM_ALL2ALL_BACKEND + self.all2all_backend in ( "allgather_reducescatter", "naive", @@ -390,7 +409,7 @@ class ParallelConfig: factors.append(self.tensor_parallel_size) factors.append(self.enable_expert_parallel) factors.append(self.data_parallel_size) - factors.append(envs.VLLM_ALL2ALL_BACKEND) + factors.append(self.all2all_backend) factors.append(self.enable_eplb) if self.enable_eplb: factors.append(self.eplb_config.log_balancedness) @@ -400,6 +419,16 @@ class ParallelConfig: return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__(self) -> None: + # Set all2all_backend from env var if not specified, with deprecation warning + if self.all2all_backend is None: + self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND + if envs.is_set("VLLM_ALL2ALL_BACKEND"): + logger.warning_once( + "VLLM_ALL2ALL_BACKEND environment variable is deprecated and " + "will be removed in a future release. Please use the " + "--all2all-backend command-line argument instead." + ) + # Forward deprecated fields to their new location if self.num_redundant_experts is not None: self.eplb_config.num_redundant_experts = self.num_redundant_experts diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index c94101bf608f2..4da164c1a0a96 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -523,13 +523,13 @@ class VllmConfig: ) if self.parallel_config.enable_dbo: - a2a_backend = envs.VLLM_ALL2ALL_BACKEND + a2a_backend = self.parallel_config.all2all_backend assert a2a_backend in ["deepep_low_latency", "deepep_high_throughput"], ( "Microbatching currently only supports the deepep_low_latency and " f"deepep_high_throughput all2all backend. {a2a_backend} is not " - "supported. To fix set the VLLM_ALL2ALL_BACKEND environment " - "variable to deepep_low_latency or deepep_high_throughput and " - "install the DeepEP kernels." + "supported. To fix use --all2all-backend=deepep_low_latency or " + "--all2all-backend=deepep_high_throughput and install the DeepEP" + " kernels." ) if not self.model_config.disable_cascade_attn: diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 007c65acedb9b..9566dbac7f22f 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -111,6 +111,7 @@ class DeviceCommunicatorBase: self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank) use_ep = False + all2all_backend = None from vllm.config import get_current_vllm_config config = get_current_vllm_config() @@ -119,9 +120,11 @@ class DeviceCommunicatorBase: # where all data parallel ranks execute forward together), # we initialize the all2all manager used in expert parallel. 
use_ep = config.parallel_config.data_parallel_size > 1 + all2all_backend = config.parallel_config.all2all_backend self.is_ep_communicator = "ep" in unique_name self.use_all2all = self.is_ep_communicator and use_ep + self.all2all_backend = all2all_backend self.all2all_manager: All2AllManagerBase | None = None def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 39b02311fe873..971a87f57dbb9 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -91,33 +91,32 @@ class CudaCommunicator(DeviceCommunicatorBase): self.qr_comm = QuickAllReduce(group=self.cpu_group, device=self.device) if self.use_all2all: - all2all_backend = envs.VLLM_ALL2ALL_BACKEND - if all2all_backend == "naive": + if self.all2all_backend == "naive": from .all2all import NaiveAll2AllManager self.all2all_manager = NaiveAll2AllManager(self.cpu_group) - elif all2all_backend == "allgather_reducescatter": + elif self.all2all_backend == "allgather_reducescatter": from .all2all import AgRsAll2AllManager self.all2all_manager = AgRsAll2AllManager(self.cpu_group) - elif all2all_backend == "pplx": + elif self.all2all_backend == "pplx": from .all2all import PPLXAll2AllManager self.all2all_manager = PPLXAll2AllManager(self.cpu_group) - elif all2all_backend == "deepep_high_throughput": + elif self.all2all_backend == "deepep_high_throughput": from .all2all import DeepEPHTAll2AllManager self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group) - elif all2all_backend == "deepep_low_latency": + elif self.all2all_backend == "deepep_low_latency": from .all2all import DeepEPLLAll2AllManager self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group) - elif all2all_backend == "flashinfer_all2allv": + elif self.all2all_backend == "flashinfer_all2allv": from .all2all import FlashInferAllToAllManager self.all2all_manager = FlashInferAllToAllManager(self.cpu_group) else: - raise ValueError(f"Unknown all2all backend: {all2all_backend}") + raise ValueError(f"Unknown all2all backend: {self.all2all_backend}") if is_global_first_rank(): logger.info( diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py index 83e336511059b..ad61fdfb8ea52 100644 --- a/vllm/distributed/device_communicators/xpu_communicator.py +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -6,7 +6,6 @@ import torch import torch.distributed as dist from torch.distributed import ProcessGroup -import vllm.envs as envs from vllm.logger import init_logger from .base_device_communicator import DeviceCommunicatorBase @@ -24,15 +23,14 @@ class XpuCommunicator(DeviceCommunicatorBase): ): super().__init__(cpu_group, device, device_group, unique_name) if self.use_all2all: - all2all_backend = envs.VLLM_ALL2ALL_BACKEND - if all2all_backend != "naive": + if self.all2all_backend != "naive": logger.warning( - "`%s` all2all manager is not supported on XPU." + "`%s` all2all manager is not supported on XPU. 
" "Falling back to `naive` all2all manager for XPU.", - all2all_backend, + self.all2all_backend, ) - all2all_backend = "naive" - if all2all_backend == "naive": + self.all2all_backend = "naive" + if self.all2all_backend == "naive": from .all2all import NaiveAll2AllManager self.all2all_manager = NaiveAll2AllManager(self.cpu_group) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f0eb3c2213384..09c8b4ca02c57 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -371,6 +371,7 @@ class EngineArgs: data_parallel_hybrid_lb: bool = False data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel + all2all_backend: str | None = ParallelConfig.all2all_backend enable_dbo: bool = ParallelConfig.enable_dbo dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold @@ -763,6 +764,9 @@ class EngineArgs: parallel_group.add_argument( "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"] ) + parallel_group.add_argument( + "--all2all-backend", **parallel_kwargs["all2all_backend"] + ) parallel_group.add_argument("--enable-dbo", **parallel_kwargs["enable_dbo"]) parallel_group.add_argument( "--dbo-decode-token-threshold", @@ -1461,6 +1465,7 @@ class EngineArgs: data_parallel_backend=self.data_parallel_backend, data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, enable_expert_parallel=self.enable_expert_parallel, + all2all_backend=self.all2all_backend, enable_dbo=self.enable_dbo, dbo_decode_token_threshold=self.dbo_decode_token_threshold, dbo_prefill_token_threshold=self.dbo_prefill_token_threshold, diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 377116124522c..38ea6acc0fc50 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -641,6 +641,7 @@ class FusedMoEParallelConfig: ep_rank: int use_ep: bool # whether to use EP or not + all2all_backend: str # all2all backend for MoE communication @property def use_all2all_kernels(self): @@ -648,21 +649,18 @@ class FusedMoEParallelConfig: @property def use_pplx_kernels(self): - return self.use_all2all_kernels and envs.VLLM_ALL2ALL_BACKEND == "pplx" + return self.use_all2all_kernels and self.all2all_backend == "pplx" @property def use_deepep_ht_kernels(self): return ( self.use_all2all_kernels - and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" + and self.all2all_backend == "deepep_high_throughput" ) @property def use_deepep_ll_kernels(self): - return ( - self.use_all2all_kernels - and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency" - ) + return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency" @staticmethod def make( @@ -762,6 +760,7 @@ class FusedMoEParallelConfig: ep_size=1, ep_rank=0, use_ep=False, + all2all_backend=vllm_parallel_config.all2all_backend, ) # DP + EP / TP + EP / DP + TP + EP assert use_ep @@ -777,6 +776,7 @@ class FusedMoEParallelConfig: ep_size=ep_size, ep_rank=ep_rank, use_ep=True, + all2all_backend=vllm_parallel_config.all2all_backend, ) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 1c6b5de83b2ba..ddb74a27dc122 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ 
b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -58,7 +58,7 @@ def build_flashinfer_fp4_cutlass_moe_prepare_finalize( ) -> mk.FusedMoEPrepareAndFinalize: """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel""" use_dp = moe.moe_parallel_config.dp_size > 1 - enable_alltoallv = envs.VLLM_ALL2ALL_BACKEND == "flashinfer_all2allv" + enable_alltoallv = moe.moe_parallel_config.all2all_backend == "flashinfer_all2allv" return create_flashinfer_prepare_finalize( use_dp=use_dp, use_nvfp4=True, enable_alltoallv=enable_alltoallv ) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b51421b6a32d3..0252c3acb08c1 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -192,7 +192,7 @@ class CudaPlatformBase(Platform): compilation_config = vllm_config.compilation_config if ( - envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" + parallel_config.all2all_backend == "deepep_high_throughput" and parallel_config.data_parallel_size > 1 and compilation_config.cudagraph_mode != CUDAGraphMode.NONE ): @@ -204,7 +204,7 @@ class CudaPlatformBase(Platform): "kernels are optimized for prefill and are incompatible with " "CUDA Graphs. " "In order to use CUDA Graphs for decode-optimized workloads, " - "set VLLM_ALL2ALL_BACKEND to another option, such as " + "use --all2all-backend with another option, such as " "deepep_low_latency, pplx, or allgather_reducescatter." ) compilation_config.cudagraph_mode = CUDAGraphMode.NONE diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index e617abf6b2c7d..159b779111c44 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -356,9 +356,10 @@ class CoreEngineActorManager: ) device_str = current_platform.ray_device_key + all2all_backend = vllm_config.parallel_config.all2all_backend if envs.VLLM_RAY_DP_PACK_STRATEGY == "fill" and ( - envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" - or envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency" + all2all_backend == "deepep_high_throughput" + or all2all_backend == "deepep_low_latency" ): raise ValueError( "DeepEP kernels require EP ranks [0,7] (same for [8,15], ...) " From b59dd19b55036050cf491501614fb46cedf5c5c1 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Mon, 13 Oct 2025 18:15:34 -0700 Subject: [PATCH 09/92] [compile] Enable sequence parallelism for full cuda graph without specifying compile sizes (#26681) Signed-off-by: angelayi --- vllm/compilation/collective_fusion.py | 11 +++++++++-- vllm/compilation/inductor_pass.py | 2 +- vllm/compilation/pass_manager.py | 4 +++- vllm/compilation/sequence_parallelism.py | 20 +++++++++++++++++++- vllm/compilation/vllm_inductor_pass.py | 2 ++ 5 files changed, 34 insertions(+), 5 deletions(-) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 1dc8888607f54..7c85c89bcd7ac 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -431,8 +431,15 @@ class AsyncTPPass(VllmPatternMatcherPass): self.dump_patterns(config, self.patterns) - def is_applicable_for_shape(self, shape: int | None) -> bool: - # only do replace for specific shapes + def is_applicable(self, shape: int | None) -> bool: + # This pass is applied on top of the sequence parallelism pass. + # It inherits the same applicability condition as `SequenceParallelismPass`. + # See `SequenceParallelismPass.is_applicable` for more details. 
+ if ( + not self.compilation_config.splitting_ops + or self.compilation_config.use_inductor_graph_partition + ): + return True tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index b9ec3cf6c5edb..4b263fa6f5a2b 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -96,7 +96,7 @@ class InductorPass(CustomGraphPass): encoded = json.dumps(dict_, sort_keys=True).encode("utf-8") return hashlib.sha256(encoded).hexdigest() - def is_applicable_for_shape(self, shape: int | None): + def is_applicable(self, shape: int | None): return True diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index e323fa1f77349..55fe235e2d2c1 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -71,9 +71,11 @@ class PostGradPassManager(CustomGraphPass): shape = get_pass_context().runtime_shape for pass_ in self.passes: - if pass_.is_applicable_for_shape(shape): + if pass_.is_applicable(shape): pass_(graph) VllmInductorPass.dump_prefix += 1 + else: + logger.debug("Skipping %s with shape %s", pass_, shape) # post-cleanup goes before fix_functionalization # because it requires a functional graph diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 8ff530cebd82d..31624a8fdcc0f 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -482,7 +482,25 @@ class SequenceParallelismPass(VllmPatternMatcherPass): ).register(self.patterns) self.dump_patterns(config, self.patterns) - def is_applicable_for_shape(self, shape: int | None) -> bool: + def is_applicable(self, shape: int | None) -> bool: + # When sequence parallelism is enabled, the residual tensor from RMSNorm + # needs to be split along the sequence dimension. However, this dimension + # is symbolic during piecewise compilation, and splitting symbolic shapes + # is not supported. + # + # This pass is therefore only applied when the sequence dimension is + # concrete: + # 1. In full-graph compilation mode (no Dynamo splitting ops are used). + # For this case we always pad num_tokens to be a multiple of + # tensor_parallel_size, so there's no need to check shape % tp_size == 0. + # 2. For specific shape provided during compilation (e.g., from + # `compile_sizes`), which must be divisible by the tensor-parallel + # size. 
+ if ( + not self.compilation_config.splitting_ops + or self.compilation_config.use_inductor_graph_partition + ): + return True tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index ad83e7b3e0c2e..beac928b5d718 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -3,6 +3,7 @@ import functools import operator import time +import weakref from typing import ClassVar import regex as re @@ -28,6 +29,7 @@ class VllmInductorPass(InductorPass): """Keep track of pass index for debug dump ordering.""" def __init__(self, config: VllmConfig): + self.compilation_config = weakref.proxy(config.compilation_config) self.pass_config = config.compilation_config.pass_config self.model_dtype = config.model_config.dtype if config.model_config else None self.device = config.device_config.device if config.device_config else None From cfded80793a595ea8bac867b24654c8e0ee26afb Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Mon, 13 Oct 2025 18:46:44 -0700 Subject: [PATCH 10/92] [Easy] Fix env type check errors from VLLM_DEBUG_LOG_API_SERVER_RESPONSE (#26742) Signed-off-by: Jialin Ouyang --- vllm/envs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/envs.py b/vllm/envs.py index c3686477d88d1..d93ae8b9c2250 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: CUDA_VISIBLE_DEVICES: str | None = None VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 VLLM_API_KEY: str | None = None + VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False S3_ACCESS_KEY_ID: str | None = None S3_SECRET_ACCESS_KEY: str | None = None S3_ENDPOINT_URL: str | None = None From 8a0af6a5610b7eb6232bc5f66fda40a46b275869 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 14 Oct 2025 10:12:09 +0800 Subject: [PATCH 11/92] [build][torch.compile] upgrade depyf version (#26702) Signed-off-by: youkaichao --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index a7aa801208969..ec668e16d0e97 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -39,7 +39,7 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
compressed-tensors == 0.12.2 # required for compressed-tensors -depyf==0.19.0 # required for profiling and debugging with compilation config +depyf==0.20.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files python-json-logger # Used by logging as per examples/others/logging_configuration.md From 8ae169286f93dfdf1c84f0a31eb2ee6a8debdbce Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 13 Oct 2025 22:22:16 -0400 Subject: [PATCH 12/92] [torch.compile] Unwrap fused_marlin_moe custom op (#26739) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/kernels/moe/test_moe.py | 5 ++- .../layers/fused_moe/__init__.py | 1 - .../layers/fused_moe/fused_marlin_moe.py | 39 ------------------- .../layers/quantization/awq_marlin.py | 3 +- .../compressed_tensors_moe.py | 7 ++-- .../model_executor/layers/quantization/fp8.py | 3 +- .../layers/quantization/gptq_marlin.py | 3 +- .../layers/quantization/modelopt.py | 3 +- .../layers/quantization/mxfp4.py | 7 +++- .../layers/quantization/quark/quark_moe.py | 3 +- 10 files changed, 22 insertions(+), 52 deletions(-) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 6b391c173f0bc..966e2f8f3b131 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -26,6 +26,7 @@ from vllm.model_executor.layers.fused_moe.config import ( int4_w4a16_moe_quant_config, int8_w8a16_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, modular_triton_fused_moe, @@ -724,7 +725,7 @@ def test_fused_marlin_moe( with set_current_vllm_config(vllm_config): torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, expert_map=e_map) - marlin_output = torch.ops.vllm.fused_marlin_moe( + marlin_output = fused_marlin_moe( a, qweight1, qweight2, @@ -837,7 +838,7 @@ def test_fused_marlin_moe_with_bias(m): with set_current_vllm_config(vllm_config): torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, b_bias1, b_bias2) - marlin_output = torch.ops.vllm.fused_marlin_moe( + marlin_output = fused_marlin_moe( a, qweight1, qweight2, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 247919dcc8440..cb31045971bd8 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -51,7 +51,6 @@ __all__ = [ if HAS_TRITON: # import to register the custom ops - import vllm.model_executor.layers.fused_moe.fused_marlin_moe # noqa from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( BatchedDeepGemmExperts, ) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 58ed826ba037b..57e17f324d2e8 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( maybe_warn_marlin_atomic_add, ) from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import direct_register_custom_op def fused_marlin_moe( @@ -241,44 +240,6 @@ def fused_marlin_moe( return torch.sum(intermediate_cache3.view(-1, topk, K), dim=1, out=output) -def 
fused_marlin_moe_fake( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - gating_output: torch.Tensor | None, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - quant_type_id: int, - apply_router_weight_on_input: bool = False, - global_num_experts: int = -1, - global_scale1: torch.Tensor | None = None, - global_scale2: torch.Tensor | None = None, - expert_map: torch.Tensor | None = None, - g_idx1: torch.Tensor | None = None, - g_idx2: torch.Tensor | None = None, - sort_indices1: torch.Tensor | None = None, - sort_indices2: torch.Tensor | None = None, - w1_zeros: torch.Tensor | None = None, - w2_zeros: torch.Tensor | None = None, - workspace: torch.Tensor | None = None, - intermediate_cache13: torch.Tensor | None = None, - intermediate_cache2: torch.Tensor | None = None, - is_k_full: bool = True, - output: torch.Tensor | None = None, - inplace: bool = False, -) -> torch.Tensor: - return torch.empty_like(hidden_states) - - -direct_register_custom_op( - op_name="fused_marlin_moe", - op_func=fused_marlin_moe, - fake_impl=fused_marlin_moe_fake, -) - - class MarlinExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__(self, quant_config: FusedMoEQuantConfig): # TODO (varun) : Enable activation quantization diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index e1633d392dbf6..d96c657e01192 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, @@ -604,7 +605,7 @@ class AWQMoEMethod(FusedMoEMethodBase): indices_type=self.topk_indices_dtype, ) - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_qweight, layer.w2_qweight, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index efc5bd3639f4b..3cc726aafd298 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.cpu_fused_moe import select_experts from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( is_valid_flashinfer_cutlass_fused_moe, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP, @@ -462,7 +463,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): # if self.use_marlin: assert self.fused_experts is None - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_weight, layer.w2_weight, @@ -1067,7 +1068,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): if self.use_marlin: assert activation == "silu", f"{activation} not supported for Marlin MoE." 
assert self.fused_experts is None - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_weight, layer.w2_weight, @@ -1654,7 +1655,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): indices_type=self.topk_indices_dtype, ) - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_weight_packed, layer.w2_weight_packed, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 9a03105fafbf6..02b1896a89964 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -26,6 +26,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.linear import ( LinearBase, @@ -1196,7 +1197,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): elif self.use_marlin: assert activation == "silu", f"{activation} not supported for Marlin MoE." assert self.fused_experts is None - result = torch.ops.vllm.fused_marlin_moe( + result = fused_marlin_moe( x, layer.w13_weight, layer.w2_weight, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index dd86c990259f1..b22c3c125eada 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, @@ -765,7 +766,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): indices_type=self.topk_indices_dtype, ) - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_qweight, layer.w2_qweight, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7c7769455e8a4..0f0638899bf1e 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( is_valid_flashinfer_cutlass_fused_moe, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, @@ -1701,7 +1702,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): # if self.use_marlin: assert self.fused_experts is None - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_weight, layer.w2_weight, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 5d78b82e3ee7c..a7f9fdcb5513e 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -21,7 +21,10 @@ from vllm.model_executor.layers.fused_moe.config import ( mxfp4_w4a16_moe_quant_config, ocp_mx_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import MarlinExperts +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + MarlinExperts, + 
fused_marlin_moe, +) from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( OAITritonExperts, ) @@ -947,7 +950,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): e_score_correction_bias=e_score_correction_bias, ) - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_weight, layer.w2_weight, diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 778317e3a9592..c13cf7007e68f 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import ( fp8_w8a8_moe_quant_config, ocp_mx_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( is_rocm_aiter_moe_enabled, ) @@ -402,7 +403,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): ) if self.use_marlin: assert activation == "silu", f"{activation} not supported for Marlin MoE." - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_weight, layer.w2_weight, From 29350922c64a808a6de3b0e31fbadc2aebd6ba3f Mon Sep 17 00:00:00 2001 From: Heng Guo Date: Tue, 14 Oct 2025 11:03:16 +0800 Subject: [PATCH 13/92] [Feature][Quantization] auto_round format add support for regex (#24024) Signed-off-by: n1ck-guo Signed-off-by: Heng Guo Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../layers/quantization/auto_round.py | 44 ++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index 2889bc92dccb9..0e4815be603e2 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -4,6 +4,7 @@ from fractions import Fraction from typing import TYPE_CHECKING, Any +import regex as re import torch from vllm.logger import init_logger @@ -128,11 +129,44 @@ class AutoRoundConfig(QuantizationConfig): def get_layer_config(self, layer, layer_name: str): def get_config(name: str, quantized: bool = True): - cfg = self.extra_config.get(name, {}) if self.extra_config else {} + if not self.extra_config: + return ( + self.weight_bits if quantized else 16, + self.group_size if quantized else -1, + self.sym if quantized else True, + ) + + # exact match first + if name in self.extra_config: + cfg = self.extra_config[name] + return ( + cfg.get("bits", self.weight_bits if quantized else 16), + cfg.get("group_size", self.group_size if quantized else -1), + cfg.get("sym", self.sym if quantized else True), + ) + + REGEX_SPECIAL_CHARS = set(r"*+?^$()[]{}|\\") + for pattern, cfg in self.extra_config.items(): + if not isinstance(pattern, str) or not any( + c in REGEX_SPECIAL_CHARS for c in pattern + ): + continue + + try: + if re.search(re.compile(pattern), name) is not None: + return ( + cfg.get("bits", self.weight_bits if quantized else 16), + cfg.get("group_size", self.group_size if quantized else -1), + cfg.get("sym", self.sym if quantized else True), + ) + except re.error: + # Invalid regex, ignore. 
+ continue + return ( - cfg.get("bits", self.weight_bits if quantized else 16), - cfg.get("group_size", self.group_size if quantized else -1), - cfg.get("sym", self.sym if quantized else True), + self.weight_bits if quantized else 16, + self.group_size if quantized else -1, + self.sym if quantized else True, ) # 1. Exact match from config @@ -176,7 +210,7 @@ class AutoRoundConfig(QuantizationConfig): f"consistent quant config for {sub_names}" ) - # 5. Fallback + # 5. Fallback or try a regular expression match return get_config(layer_name, quantized) def check_quantized(self, weight_bits: int) -> bool: From fe3edb4cf0027c99ff37f891349ed8e6d464b02e Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Tue, 14 Oct 2025 01:25:43 -0300 Subject: [PATCH 14/92] Add support for the /rerank endpoint in vllm bench serve (#26602) Signed-off-by: Max de Bayser --- docs/contributing/benchmarks.md | 46 +++++++ vllm/benchmarks/datasets.py | 129 +++++++++++++++++++ vllm/benchmarks/lib/endpoint_request_func.py | 49 ++++++- 3 files changed, 218 insertions(+), 6 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 6b1eabf3d67fa..0f2c4a5d7f069 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -35,6 +35,7 @@ th { | Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` | | Random | ✅ | ✅ | `synthetic` | | RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` | +| RandomForReranking | ✅ | ✅ | `synthetic` | | Prefix Repetition | ✅ | ✅ | `synthetic` | | HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` | | HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` | @@ -878,6 +879,51 @@ vllm bench serve \ +#### Reranker Benchmark + +Benchmark the performance of rerank requests in vLLM. + +
+Show more + +Unlike generative models which use Completions API or Chat Completions API, +you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API. + +For reranking, the only supported dataset is `--dataset-name random-rerank` + +Start the server: + +```bash +vllm serve BAAI/bge-reranker-v2-m3 +``` + +Run the benchmark: + +```bash +vllm bench serve \ + --model BAAI/bge-reranker-v2-m3 \ + --backend vllm-rerank \ + --endpoint /v1/rerank \ + --dataset-name random-rerank \ + --tokenizer BAAI/bge-reranker-v2-m3 \ + --random-input-len 512 \ + --num-prompts 10 \ + --random-batch-size 5 +``` + +For reranker models, this will create `num_prompts / random_batch_size` requests with +`random_batch_size` "documents" where each one has close to `random_input_len` tokens. +In the example above, this results in 2 rerank requests with 5 "documents" each where +each document has close to 512 tokens. + +Please note that the `/v1/rerank` is also supported by embedding models. So if you're running +with an embedding model, also set `--no_reranker`. Because in this case the query is +treated as a individual prompt by the server, here we send `random_batch_size - 1` documents +to account for the extra prompt which is the query. The token accounting to report the +throughput numbers correctly is also adjusted. + +
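For reference, the benchmark's `vllm-rerank` backend posts one query plus a list of documents per request, mirroring the server's rerank schema. Below is a minimal hand-rolled request against the server started above; the `requests` library and the Jina-style `results`/`relevance_score` response fields are assumptions for illustration, not part of the benchmark code:

```python
import requests

# Assumes: vllm serve BAAI/bge-reranker-v2-m3 (listening on the default port 8000)
payload = {
    "model": "BAAI/bge-reranker-v2-m3",
    "query": "What is the capital of France?",
    "documents": [
        "Paris is the capital and most populous city of France.",
        "Mount Everest is Earth's highest mountain above sea level.",
    ],
}
resp = requests.post("http://localhost:8000/v1/rerank", json=payload, timeout=30)
resp.raise_for_status()
for item in resp.json()["results"]:  # assumed Jina-style response layout
    print(item["index"], item["relevance_score"])
```

This is the same `{model, query, documents}` body that `async_request_vllm_rerank` builds in the `endpoint_request_func.py` hunk later in this patch.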
+ [](){ #performance-benchmarks } ## Performance Benchmarks diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 331d31c1d0e63..d610389ddb6b0 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -572,6 +572,7 @@ class RandomDataset(BenchmarkDataset): # Ensure the lower bound for output length is at least 1 to # prevent sampling 0 tokens. output_low = max(output_low, 1) + output_high = max(output_high, 1) if input_low > input_high: raise ValueError( @@ -638,6 +639,112 @@ class RandomDataset(BenchmarkDataset): return prompt, total_input_len, token_mismatch +# ----------------------------------------------------------------------------- +# Random Dataset Implementation (Synthetic Data) +# ----------------------------------------------------------------------------- + + +class RandomDatasetForReranking(RandomDataset): + """ + Random dataset specialized for the needs of scoring: + - Batches of inputs + - Inputs composed of pairs + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, + input_len: int = RandomDataset.DEFAULT_INPUT_LEN, + batchsize: int = 1, + is_reranker: bool = True, + **kwargs, + ) -> list[SampleRequest]: + n_sep_tokens = int(is_reranker) + + query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len + + query_lens, _, query_offsets = self.get_sampling_params( + 1, range_ratio, query_len_param, 0, tokenizer + ) + + query_len = int(query_lens[0]) + + if not is_reranker: + assert num_requests > 1 and batchsize > 1 + num_requests -= 1 + batchsize -= 1 + doc_len_param = input_len + else: + doc_len_param = input_len - query_len - n_sep_tokens + + doc_lens, _, doc_offsets = self.get_sampling_params( + num_requests, range_ratio, doc_len_param, 0, tokenizer + ) + vocab_size = tokenizer.vocab_size + + query_prompt, query_input_len, token_mismatch_total = ( + self.generate_token_sequence( + tokenizer=tokenizer, + prefix_token_ids=[], + prefix_len=0, + vocab_size=vocab_size, + input_len=query_len, + offset=int(query_offsets[0]), + index=0, + ) + ) + + requests = [] + for i in range(num_requests): + prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501 + tokenizer=tokenizer, + prefix_token_ids=[], + prefix_len=0, + vocab_size=vocab_size, + input_len=int(doc_lens[i]), + offset=int(doc_offsets[i]), + index=i + 1, + ) + token_mismatch_total += token_mismatch + requests.append((prompt, total_input_len)) + + batch_requests = [] + # Create batched requests + for i in range(0, num_requests, batchsize): + batch = requests[i : i + batchsize] + query_contrib = ( + (query_input_len + n_sep_tokens) * len(batch) + if is_reranker + else query_input_len + ) + batch_requests.append( + SampleRequest( + prompt=[query_prompt] + [req[0] for req in batch], + prompt_len=query_contrib + sum(req[1] for req in batch), + expected_output_len=0, + request_id=request_id_prefix + str(i // batchsize), + ) + ) + + if token_mismatch_total != 0: + logger.warning( + "Across all generated prompts, there were %d %s tokens " + "than expected after decoding and re-encoding. 
This is " + "expected due to the imperfect nature of the sampling " + "procedure.", + abs(token_mismatch_total), + "more" if token_mismatch_total > 0 else "fewer", + ) + + return batch_requests + + # ----------------------------------------------------------------------------- # MultiModalDataset Implementation # ----------------------------------------------------------------------------- @@ -1149,6 +1256,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "sonnet", "random", "random-mm", + "random-rerank", "hf", "custom", "prefix_repetition", @@ -1292,6 +1400,14 @@ def add_dataset_parser(parser: FlexibleArgumentParser): default=1, help=("Batch size for random sampling. Only used for embeddings benchmark."), ) + random_group.add_argument( + "--no-reranker", + action="store_true", + help=( + "Whether the model supports reranking natively." + " Only used for reranker benchmark." + ), + ) # random multimodal dataset options random_mm_group = parser.add_argument_group( @@ -1678,6 +1794,19 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: request_id_prefix=args.request_id_prefix, no_oversample=args.no_oversample, ), + "random-rerank": lambda: RandomDatasetForReranking( + random_seed=args.seed, + dataset_path=args.dataset_path, + disable_shuffle=args.disable_shuffle, + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + input_len=args.random_input_len, + range_ratio=args.random_range_ratio, + request_id_prefix=args.request_id_prefix, + batchsize=args.random_batch_size, + is_reranker=not args.no_reranker, + ), "prefix_repetition": lambda: PrefixRepetitionRandomDataset( random_seed=args.seed, dataset_path=args.dataset_path, diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 2e5c100a3031d..4f427a31b9ee1 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -64,7 +64,7 @@ class StreamedResponseHandler: class RequestFuncInput: """The input for the request function.""" - prompt: str + prompt: str | list[str] api_url: str prompt_len: int output_len: int @@ -484,7 +484,7 @@ async def async_request_openai_audio( return output -async def _run_openai_embeddings( +async def _run_pooling_request( session: aiohttp.ClientSession, api_url: str, payload: dict[str, Any], @@ -497,7 +497,7 @@ async def _run_openai_embeddings( try: async with session.post(url=api_url, headers=headers, json=payload) as response: if response.status == 200: - output.latency = time.perf_counter() - st + output.ttft = output.latency = time.perf_counter() - st data = await response.json() output.success = True output.generated_text = "" @@ -536,7 +536,43 @@ async def async_request_openai_embeddings( } _update_headers_common(headers, request_func_input) - return await _run_openai_embeddings( + return await _run_pooling_request( + session, + api_url, + payload=payload, + headers=headers, + pbar=pbar, + ) + + +async def async_request_vllm_rerank( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + _validate_api_url(api_url, "vLLM score API", "rerank") + + assert ( + isinstance(request_func_input.prompt, list) + and len(request_func_input.prompt) > 1 + ) + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "query": request_func_input.prompt[0], + "documents": request_func_input.prompt[1:], + } + + headers 
= { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + return await _run_pooling_request( session, api_url, payload=payload, @@ -572,7 +608,7 @@ async def async_request_openai_embeddings_chat( } _update_headers_common(headers, request_func_input) - return await _run_openai_embeddings( + return await _run_pooling_request( session, api_url, payload=payload, @@ -685,7 +721,7 @@ async def async_request_infinity_embeddings( } _update_headers_common(headers, request_func_input) - return await _run_openai_embeddings( + return await _run_pooling_request( session, api_url, payload=payload, @@ -722,6 +758,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "infinity-embeddings": async_request_infinity_embeddings, "infinity-embeddings-clip": async_request_infinity_embeddings_clip, # (Infinity embedding server does not support vlm2vec) + "vllm-rerank": async_request_vllm_rerank, } OPENAI_COMPATIBLE_BACKENDS = [ From 2e36cdbe2b4bff4370a02105fdfbdd015d6ec4e8 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Tue, 14 Oct 2025 12:51:55 +0800 Subject: [PATCH 15/92] [Docs] Add a start tag to build.inc.md (#26747) Signed-off-by: windsonsea --- docs/getting_started/installation/cpu/arm.inc.md | 2 +- docs/getting_started/installation/cpu/build.inc.md | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md index e45baa0aa4938..15fce69b44871 100644 --- a/docs/getting_started/installation/cpu/arm.inc.md +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -23,7 +23,7 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] ---8<-- "docs/getting_started/installation/cpu/build.inc.md" +--8<-- "docs/getting_started/installation/cpu/build.inc.md:extra-information" Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md index 4bd4d39a6f80b..f99497128fd37 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -1,3 +1,5 @@ +# --8<-- [start:extra-information] + First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: ```bash @@ -39,7 +41,4 @@ If you want to develop vLLM, install it in editable mode instead. VLLM_TARGET_DEVICE=cpu python setup.py develop ``` -!!! note - If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM. 
- # --8<-- [end:extra-information] From 4497c8f82114ef82bc3dd903b790e27c29b22de1 Mon Sep 17 00:00:00 2001 From: XiongfeiWei Date: Mon, 13 Oct 2025 22:04:23 -0700 Subject: [PATCH 16/92] Fix lora tests failure in TPU CI due to the removal of LoRA bias (#26723) Signed-off-by: Xiongfei Wei --- vllm/v1/worker/tpu_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 6fd71259dbcbf..828f09cbc8d8d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -2128,12 +2128,11 @@ def replace_set_lora(model): lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: torch.Tensor | None, - bias: torch.Tensor | None = None, ): # TODO: The integer index leads to a recompilation, but converting it # to a tensor doesn't seem to work anymore. This might be fixed with a # later release of torch_xla. - self._original_set_lora(index, lora_a, lora_b, embeddings_tensor, bias) + self._original_set_lora(index, lora_a, lora_b, embeddings_tensor) torch_xla.sync(wait=False) def _tpu_reset_lora(self, index: int): From 4821ac1b4d36c5780ec9769c40fcacc0f8c41a7d Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 14 Oct 2025 13:57:26 +0800 Subject: [PATCH 17/92] [CI] [ROCm] Automate CC list for ROCm related issue (#26753) Signed-off-by: vllmellm --- .github/workflows/issue_autolabel.yml | 138 ++++++++++++++++++-------- 1 file changed, 95 insertions(+), 43 deletions(-) diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index c2b17abe811cd..7d565ef9f2e45 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -13,6 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Label issues based on keywords + id: label-step uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | @@ -42,7 +43,6 @@ jobs: searchIn: "body" }, ], - // Substring search - matches anywhere in text (partial matches) substrings: [ { @@ -89,14 +89,12 @@ jobs: term: "hip_", searchIn: "both" }, - // ROCm tools and libraries { term: "hipify", searchIn: "both" }, ], - // Regex patterns - for complex pattern matching regexPatterns: [ { @@ -107,13 +105,17 @@ jobs: } ], }, + // Add more label configurations here as needed + // example: { + // keywords: [...], + // substrings: [...], + // regexPatterns: [...] 
+ // }, }; - // Helper function to create regex based on search type function createSearchRegex(term, type) { // Escape special regex characters in the term const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); - switch (type) { case 'keyword': // Word boundary search - matches whole words only @@ -125,16 +127,13 @@ jobs: throw new Error(`Unknown search type: ${type}`); } } - // Helper function to find matching terms in text with line information function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { const matches = []; const lines = text.split('\n'); - for (const termConfig of searchTerms) { let regex; let term, searchIn, pattern, description, flags; - // Handle different input formats (string or object) if (typeof termConfig === 'string') { term = termConfig; @@ -146,21 +145,17 @@ jobs: description = termConfig.description; flags = termConfig.flags; } - // Skip if this term shouldn't be searched in the current location if (searchIn !== 'both' && searchIn !== searchLocation) { continue; } - // Create appropriate regex if (searchType === 'regex') { regex = new RegExp(pattern, flags || "gi"); } else { regex = createSearchRegex(term, searchType); } - const termMatches = []; - // Check each line for matches lines.forEach((line, lineIndex) => { const lineMatches = line.match(regex); @@ -175,15 +170,14 @@ jobs: originalTerm: term || pattern, description: description, // Show context around the match in the line - context: line.length > 100 ? - line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), - line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' + context: line.length > 100 ? + line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), + line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
: line.trim() }); }); } }); - if (termMatches.length > 0) { matches.push({ term: term || (description || pattern), @@ -196,64 +190,48 @@ jobs: }); } } - return matches; } - // Helper function to check if label should be added async function processLabel(labelName, config) { const body = context.payload.issue.body || ""; const title = context.payload.issue.title || ""; - core.notice(`Processing label: ${labelName}`); core.notice(`Issue Title: "${title}"`); core.notice(`Issue Body length: ${body.length} characters`); - let shouldAddLabel = false; let allMatches = []; let reason = ''; - const keywords = config.keywords || []; const substrings = config.substrings || []; const regexPatterns = config.regexPatterns || []; - core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); - // Search in title if (title.trim()) { core.notice(`Searching in title: "${title}"`); - const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); - allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); } - // Search in body if (body.trim()) { core.notice(`Searching in body (${body.length} characters)`); - const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); - allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); } - if (allMatches.length > 0) { core.notice(`Found ${allMatches.length} matching term(s):`); - for (const termMatch of allMatches) { const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; - if (termMatch.searchType === 'regex') { core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); } else { core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); } - // Show details for each match termMatch.matches.forEach((match, index) => { core.notice(` ${index + 1}. 
Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); @@ -266,7 +244,6 @@ jobs: } }); } - shouldAddLabel = true; const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); @@ -274,13 +251,10 @@ jobs: const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); - reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; } - core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); core.notice(`Reason: ${reason || 'No matching terms found'}`); - if (shouldAddLabel) { const existingLabels = context.payload.issue.labels.map(l => l.name); if (!existingLabels.includes(labelName)) { @@ -296,14 +270,92 @@ jobs: core.notice(`Label "${labelName}" already present.`); return false; } - core.notice(`No matching terms found for label "${labelName}".`); return false; } - // Process all configured labels - const processLabels = Object.entries(labelConfig) - .map(([labelName, config]) => processLabel(labelName, config)); - const labelsAdded = await Promise.all(processLabels); - const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0); - core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); \ No newline at end of file + const labelsAddedResults = await Promise.all( + Object.entries(labelConfig).map(([labelName, config]) => + processLabel(labelName, config).then(added => ({ labelName, added })) + ) + ); + + const numLabelsAdded = labelsAddedResults.filter(r => r.added).length; + core.notice(`Processing complete. 
${numLabelsAdded} label(s) added.`); + + // Return which labels were added for the next step + const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName); + core.setOutput('labels_added', JSON.stringify(addedLabels)); + return addedLabels; + + - name: CC users for labeled issues + if: steps.label-step.outputs.labels_added != '[]' + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 + with: + script: | + // Configuration: Map labels to GitHub users to CC + // You can add multiple users per label, and multiple label configurations + const ccConfig = { + rocm: { + users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3'] + message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions + }, + // Add more label -> user mappings here + // Example: + // cuda: { + // users: ['user1', 'user2'], + // message: 'CC {users} for CUDA-related issue' + // }, + // performance: { + // users: ['perfexpert'], + // message: 'CC {users} for performance issue' + // }, + }; + + const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}'); + core.notice(`Labels added: ${labelsAdded.join(', ')}`); + + // Get existing comments to check for already mentioned users + const comments = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const issueBody = context.payload.issue.body || ''; + const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n'); + + // Process each label that was added + for (const label of labelsAdded) { + if (ccConfig[label]) { + const config = ccConfig[label]; + const usersToMention = []; + + // Check which users haven't been mentioned yet + for (const user of config.users) { + const mentionPattern = new RegExp(`@${user}\\b`, 'i'); + if (!mentionPattern.test(allExistingText)) { + usersToMention.push(user); + } else { + core.notice(`@${user} already mentioned for label "${label}", skipping`); + } + } + + // Post comment if there are users to mention + if (usersToMention.length > 0) { + const mentions = usersToMention.map(u => `@${u}`).join(' '); + const message = config.message.replace('{users}', mentions); + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: message + }); + + core.notice(`CC comment added for label "${label}": ${mentions}`); + } else { + core.notice(`All users for label "${label}" already mentioned, skipping comment`); + } + } + } \ No newline at end of file From d3cc8427c0a930937389d0ce6ec99405e7117929 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Tue, 14 Oct 2025 01:10:23 -0500 Subject: [PATCH 18/92] [ci] Adding the test-amd.yaml for test definitions for the AMD backend. (alternative PR) (#26718) Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 1265 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1265 insertions(+) create mode 100644 .buildkite/test-amd.yaml diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml new file mode 100644 index 0000000000000..b2a3a0a775baa --- /dev/null +++ b/.buildkite/test-amd.yaml @@ -0,0 +1,1265 @@ +# In this file, you can add more tests to run either by adding a new step or +# adding a new command to an existing step. See different options here for examples. 
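+# For instance, a new step is just another entry under the `steps:` list defined
+# further below; the commented sketch here is illustrative only (the label, paths
+# and command are made-up placeholders, not an existing test):
+#
+# - label: My Feature Test # 5min
+#   timeout_in_minutes: 10
+#   source_file_dependencies:
+#   - vllm/my_feature/
+#   - tests/my_feature
+#   commands:
+#   - pytest -v -s my_feature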
+
+# This script will be fed into the Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# to generate the final pipeline yaml file.
+
+# Documentation
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# command(str): the single command to run for tests. incompatible with commands.
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+#   in this case, commands must be specified. the first command runs on the first host, the second
+#   command runs on the second host.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+#   and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt in the test for; if empty, the test will always run.
+
+# When adding a test
+# - If the test belongs to an existing group, add it there
+# - If the test is short, add to any existing step
+# - If the test takes more than 10min, then it is okay to create a new step.
+#   Note that all steps execute in parallel.
+
+steps:
+##### fast check tests #####
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies.
Please check the error message and add the package to whitelist + # in /vllm/tools/generate_nightly_torch_test.py + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + +- label: Async Engine, Inputs, Utils, Worker Test # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + +- label: Python-only Installation Test # 10min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Basic Correctness Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + +- label: Entrypoints Unit Tests # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + fast_check: true + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration Test (LLM) # 30min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration Test (API Server) # 100min + timeout_in_minutes: 130 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: 
Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/test_basic_correctness + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 
python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + +- label: EPLB Algorithm Test # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +##### fast check tests ##### +##### 1 GPU test ##### + +- label: Regression Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Engine Test # 25min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + #grade: Blocking + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization + +- label: V1 Test e2e + engine # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + # Integration test for streaming correctness (requires special branch). 
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Test others (CPU) # 5 mins + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples Test # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Samplers Test # 56min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + +- label: LoRA Test %N # 20min each + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + 
--ignore=lora/test_llm_with_multi_loras.py + parallelism: 4 + +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_pass_manager.py + - pytest -v -s compile/test_fusion.py + - pytest -v -s compile/test_fusion_attn.py + - pytest -v -s compile/test_functionalization.py + - pytest -v -s compile/test_silu_mul_quant_fusion.py + - pytest -v -s compile/test_sequence_parallelism.py + - pytest -v -s compile/test_async_tp.py + - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py + - pytest -v -s compile/test_noop_elimination.py + +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/piecewise/ + +- label: PyTorch Fullgraph Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_full_graph.py + +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - tests/kernels/core + commands: + - pytest -v -s kernels/core + +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Model Executor Test # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor + - tests/model_executor + - 
tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Benchmarks # 11min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ + +- label: Quantization Test # 70min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ + +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: OpenAI API correctness # 22min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ + +- label: OpenAI-Compatible Tool Use # 23 min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) # 5 mins + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use + +##### models test ##### + +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + 
source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. 
Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 80 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Models Test (Extended) 1 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models Test (Extended) 2 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models Test + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* + +- label: Transformers Nightly Models Test + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper + +- label: Blackwell Test # 38 min + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/fusion.py + - vllm/compilation/fusion_attn.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + # Fusion + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + +- label: Blackwell GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Blackwell Quantized MoE Test + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - 
vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + +##### 1 GPU test ##### +##### multi gpus test ##### + +- label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed Tests (2 GPUs) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - 
vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Model Tests (2 GPUs) # 37min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' + +- label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins + +- label: Pipeline + Context Parallelism Test # 45min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: 
mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + +- label: LoRA TP Test (Distributed) # 17 min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + + +- label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + + +##### multi gpus test ##### +##### A100 test ##### + +- label: Distributed Tests (A100) # optional + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### H200 test ##### +- label: Distrubted Tests (H200) # optional + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + +##### B200 test ##### +- label: Distributed Tests (B200) # optional + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + 
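+
+# Hardware-targeted steps reuse the same schema via the `gpu`, `num_gpus` and
+# `optional` fields documented at the top of this file. The commented sketch below
+# is illustrative only (the label and test path are made-up placeholders):
+#
+# - label: My Hardware-Specific Test # optional
+#   gpu: b200
+#   num_gpus: 2
+#   optional: true # only runs when unblocked manually or on scheduled nightly runs
+#   working_dir: "/vllm-workspace/"
+#   commands:
+#   - pytest -v -s tests/my_hardware_test.py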
+##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh From 481545b397572ff460f4eee747df2066cde9501c Mon Sep 17 00:00:00 2001 From: Ryan Li Date: Tue, 14 Oct 2025 14:52:21 +0800 Subject: [PATCH 19/92] scheduler.py: Update the name of the default scheduler. (#26758) Signed-off-by: Ryan Li --- vllm/config/scheduler.py | 12 ++++++------ vllm/engine/arg_utils.py | 5 ----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 061aa4d4a4f5b..d5eb077309238 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -131,12 +131,12 @@ class SchedulerConfig: some image tokens can be scheduled (like TTTTIIIII, leaving IIIII), it will be scheduled as TTTT in one step and IIIIIIIIII in the next.""" - # scheduler class or path. "vllm.core.scheduler.Scheduler" (default) - # or "mod.custom_class". - scheduler_cls: str | type[object] = "vllm.core.scheduler.Scheduler" - """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the - default scheduler. Can be a class directly or the path to a class of form - "mod.custom_class".""" + # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler" + # (default) or "mod.custom_class". + scheduler_cls: str | type[object] = "vllm.v1.core.sched.scheduler.Scheduler" + """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is + the default scheduler. Can be a class directly or the path to a class of + form "mod.custom_class".""" disable_hybrid_kv_cache_manager: bool = False """If set to True, KV cache manager will allocate the same size of KV cache diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 09c8b4ca02c57..14be20b3a5d45 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1742,11 +1742,6 @@ class EngineArgs: self.enable_prefix_caching = incremental_prefill_supported logger.info("(%s) prefix caching by default", action) - # V1 should use the new scheduler by default. - # Swap it only if this arg is set to the original V0 default - if self.scheduler_cls == EngineArgs.scheduler_cls: - self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler" - # When no user override, set the default values based on the usage # context. # Use different default values for different hardware. From 01ad27faff35286670b50d800e27e58b548b1fea Mon Sep 17 00:00:00 2001 From: CSWYF3634076 Date: Tue, 14 Oct 2025 14:55:23 +0800 Subject: [PATCH 20/92] [Model][Bugfix]fix ernie45 load failed due to ernie45 eplb code (#26684) Signed-off-by: wangyafeng --- vllm/model_executor/models/ernie45_moe.py | 34 +++++++++++++++-------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index e01f26731cd92..607589e68ef33 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -23,7 +23,8 @@ # limitations under the License. 
"""Inference-only ErineMoE model compatible with HuggingFace weights.""" -from collections.abc import Iterable +import typing +from collections.abc import Callable, Iterable from itertools import islice from typing import Any @@ -139,10 +140,10 @@ class Ernie4_5_MoeMoE(nn.Module): # Load balancing settings. vllm_config = get_current_vllm_config() - parallel_config = vllm_config.parallel_config + eplb_config = vllm_config.parallel_config.eplb_config self.enable_eplb = enable_eplb - self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_redundant_experts = eplb_config.num_redundant_experts self.n_logical_experts = self.n_routed_experts self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts self.n_local_physical_experts = self.n_physical_experts // self.ep_size @@ -426,8 +427,10 @@ class Ernie4_5_MoeModel(nn.Module): self.vocab_size = config.vocab_size self.config = config parallel_config = vllm_config.parallel_config + eplb_config = parallel_config.eplb_config enable_eplb = parallel_config.enable_eplb - self.num_redundant_experts = parallel_config.num_redundant_experts + + self.num_redundant_experts = eplb_config.num_redundant_experts if get_pp_group().is_first_rank: self.embed_tokens = VocabParallelEmbedding( @@ -570,20 +573,27 @@ class Ernie4_5_MoeModel(nn.Module): # Skip loading extra bias for GPTQ models. if ( - name.endswith(".bias") or name.endswith("_bias") - ) and name not in params_dict: + name_mapped.endswith(".bias") or name_mapped.endswith("_bias") + ) and name_mapped not in params_dict: continue - param = params_dict[name] - - weight_loader = param.weight_loader - weight_loader( + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. 
+ weight_loader = typing.cast( + Callable[..., bool], param.weight_loader + ) + success = weight_loader( param, loaded_weight, - name, + name_mapped, shard_id=shard_id, expert_id=expert_id, + return_success=True, ) - break + if success: + name = name_mapped + break else: if is_expert_weight: # We've checked that this is an expert weight From d32c611f455766c9d67034b5e0f8e66f28f4a3ba Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Tue, 14 Oct 2025 00:04:00 -0700 Subject: [PATCH 21/92] [CI/Build] Use 127.0.0.1 instead of localhost in utils (#26750) Signed-off-by: Ye (Charlotte) Qi --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 8fee507084382..5bfdf703390ee 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -157,7 +157,7 @@ class RemoteOpenAIServer: self.host = None self.port = None else: - self.host = str(args.host or "localhost") + self.host = str(args.host or "127.0.0.1") self.port = int(args.port) self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None From fd85c9f4263de8cf9bc9f51bef9471344436614c Mon Sep 17 00:00:00 2001 From: Max Wittig Date: Tue, 14 Oct 2025 09:17:39 +0200 Subject: [PATCH 22/92] [Bugfix][FE]: Always include usage with `--enable-force-include-usage ` (#20983) Signed-off-by: Max Wittig Signed-off-by: Antoine Auger Co-authored-by: Antoine Auger --- pyproject.toml | 1 + .../openai/test_enable_force_include_usage.py | 126 ++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 + vllm/entrypoints/openai/run_batch.py | 8 ++ vllm/entrypoints/openai/serving_chat.py | 15 +-- vllm/entrypoints/openai/serving_completion.py | 16 +-- vllm/entrypoints/openai/serving_engine.py | 3 - vllm/entrypoints/openai/serving_responses.py | 1 - .../openai/serving_transcription.py | 4 + vllm/entrypoints/openai/speech_to_text.py | 7 +- vllm/entrypoints/utils.py | 19 ++- 11 files changed, 172 insertions(+), 30 deletions(-) create mode 100644 tests/entrypoints/openai/test_enable_force_include_usage.py diff --git a/pyproject.toml b/pyproject.toml index eb9bdb593baac..95dda76063bc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ markers = [ "distributed: run this test only in distributed GPU tests", "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", + "extra_server_args: extra arguments to pass to the server fixture", ] [tool.ty.src] diff --git a/tests/entrypoints/openai/test_enable_force_include_usage.py b/tests/entrypoints/openai/test_enable_force_include_usage.py new file mode 100644 index 0000000000000..3ddf2308eb1d5 --- /dev/null +++ b/tests/entrypoints/openai/test_enable_force_include_usage.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import openai +import pytest +import pytest_asyncio + +from ...utils import RemoteOpenAIServer + + +@pytest.fixture(scope="module") +def chat_server_with_force_include_usage(request): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "128", + "--enforce-eager", + "--max-num-seqs", + "1", + "--enable-force-include-usage", + "--port", + "55857", + "--gpu-memory-utilization", + "0.2", + ] + + with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def 
chat_client_with_force_include_usage(chat_server_with_force_include_usage): + async with chat_server_with_force_include_usage.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_chat_with_enable_force_include_usage( + chat_client_with_force_include_usage: openai.AsyncOpenAI, +): + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ] + + stream = await chat_client_with_force_include_usage.chat.completions.create( + model="Qwen/Qwen3-0.6B", + messages=messages, + max_completion_tokens=10, + extra_body=dict(min_tokens=10), + temperature=0.0, + stream=True, + ) + last_completion_tokens = 0 + async for chunk in stream: + if not len(chunk.choices): + assert chunk.usage.prompt_tokens >= 0 + assert ( + last_completion_tokens == 0 + or chunk.usage.completion_tokens > last_completion_tokens + or ( + not chunk.choices + and chunk.usage.completion_tokens == last_completion_tokens + ) + ) + assert chunk.usage.total_tokens == ( + chunk.usage.prompt_tokens + chunk.usage.completion_tokens + ) + else: + assert chunk.usage is None + + +@pytest.fixture(scope="module") +def transcription_server_with_force_include_usage(): + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-num-seqs", + "1", + "--enforce-eager", + "--enable-force-include-usage", + "--gpu-memory-utilization", + "0.2", + ] + + with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def transcription_client_with_force_include_usage( + transcription_server_with_force_include_usage, +): + async with ( + transcription_server_with_force_include_usage.get_async_client() as async_client + ): + yield async_client + + +@pytest.mark.asyncio +async def test_transcription_with_enable_force_include_usage( + transcription_client_with_force_include_usage, winning_call +): + res = ( + await transcription_client_with_force_include_usage.audio.transcriptions.create( + model="openai/whisper-large-v3-turbo", + file=winning_call, + language="en", + temperature=0.0, + stream=True, + timeout=30, + ) + ) + + async for chunk in res: + if not len(chunk.choices): + # final usage sent + usage = chunk.usage + assert isinstance(usage, dict) + assert usage["prompt_tokens"] > 0 + assert usage["completion_tokens"] > 0 + assert usage["total_tokens"] > 0 + else: + assert not hasattr(chunk, "usage") diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ec5632523fe3c..fd80ba7a9afca 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1808,6 +1808,7 @@ async def init_app_state( state.openai_serving_models, request_logger=request_logger, log_error_stack=args.log_error_stack, + enable_force_include_usage=args.enable_force_include_usage, ) if "transcription" in supported_tasks else None @@ -1818,6 +1819,7 @@ async def init_app_state( state.openai_serving_models, request_logger=request_logger, log_error_stack=args.log_error_stack, + enable_force_include_usage=args.enable_force_include_usage, ) if "transcription" in supported_tasks else None diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index ecee27a329d22..c8ca6e7d29baa 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -104,6 +104,13 @@ def make_arg_parser(parser: 
FlexibleArgumentParser): default=False, help="If set to True, enable prompt_tokens_details in usage.", ) + parser.add_argument( + "--enable-force-include-usage", + action="store_true", + default=False, + help="If set to True, include usage on every request " + "(even when stream_options is not specified)", + ) return parser @@ -361,6 +368,7 @@ async def run_batch( chat_template=None, chat_template_content_format="auto", enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, ) if "generate" in supported_tasks else None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 96525f2068593..26027112eb589 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -58,7 +58,7 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_l from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall -from vllm.entrypoints.utils import get_max_tokens +from vllm.entrypoints.utils import get_max_tokens, should_include_usage from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.logprobs import Logprob @@ -101,7 +101,6 @@ class OpenAIServingChat(OpenAIServing): models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - enable_force_include_usage=enable_force_include_usage, log_error_stack=log_error_stack, ) @@ -352,7 +351,6 @@ class OpenAIServingChat(OpenAIServing): conversation, tokenizer, request_metadata, - enable_force_include_usage=self.enable_force_include_usage, ) try: @@ -518,7 +516,6 @@ class OpenAIServingChat(OpenAIServing): conversation: list[ConversationMessage], tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, - enable_force_include_usage: bool, ) -> AsyncGenerator[str, None]: created_time = int(time.time()) chunk_object_type: Final = "chat.completion.chunk" @@ -596,13 +593,9 @@ class OpenAIServingChat(OpenAIServing): return stream_options = request.stream_options - if stream_options: - include_usage = stream_options.include_usage or enable_force_include_usage - include_continuous_usage = ( - include_usage and stream_options.continuous_usage_stats - ) - else: - include_usage, include_continuous_usage = False, False + include_usage, include_continuous_usage = should_include_usage( + stream_options, self.enable_force_include_usage + ) try: async for res in result_generator: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 7af64306023a3..7cbe9c69435c3 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -27,7 +27,7 @@ from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.renderer import RenderConfig -from vllm.entrypoints.utils import get_max_tokens +from vllm.entrypoints.utils import get_max_tokens, should_include_usage from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt from vllm.logger import init_logger from vllm.logprobs import Logprob @@ -56,11 +56,11 @@ class OpenAIServingCompletion(OpenAIServing): models=models, 
request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - enable_force_include_usage=enable_force_include_usage, log_error_stack=log_error_stack, ) self.enable_prompt_tokens_details = enable_prompt_tokens_details self.default_sampling_params = self.model_config.get_diff_sampling_param() + self.enable_force_include_usage = enable_force_include_usage if self.default_sampling_params: source = self.model_config.generation_config source = "model" if source == "auto" else source @@ -256,7 +256,6 @@ class OpenAIServingCompletion(OpenAIServing): num_prompts=num_prompts, tokenizer=tokenizer, request_metadata=request_metadata, - enable_force_include_usage=self.enable_force_include_usage, ) # Non-streaming response @@ -320,7 +319,6 @@ class OpenAIServingCompletion(OpenAIServing): num_prompts: int, tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, - enable_force_include_usage: bool, ) -> AsyncGenerator[str, None]: num_choices = 1 if request.n is None else request.n previous_text_lens = [0] * num_choices * num_prompts @@ -331,13 +329,9 @@ class OpenAIServingCompletion(OpenAIServing): first_iteration = True stream_options = request.stream_options - if stream_options: - include_usage = stream_options.include_usage or enable_force_include_usage - include_continuous_usage = ( - include_usage and stream_options.continuous_usage_stats - ) - else: - include_usage, include_continuous_usage = False, False + include_usage, include_continuous_usage = should_include_usage( + stream_options, self.enable_force_include_usage + ) try: async for prompt_idx, res in result_generator: diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index a041950ffd20b..3965d2dac0887 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -249,7 +249,6 @@ class OpenAIServing: *, request_logger: RequestLogger | None, return_tokens_as_token_ids: bool = False, - enable_force_include_usage: bool = False, log_error_stack: bool = False, ): super().__init__() @@ -260,8 +259,6 @@ class OpenAIServing: self.request_logger = request_logger self.return_tokens_as_token_ids = return_tokens_as_token_ids - self.enable_force_include_usage = enable_force_include_usage - self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) self._apply_mistral_chat_template_async = make_async( apply_mistral_chat_template, executor=self._tokenizer_executor diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 3b9015efd305d..744df98a4278e 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -127,7 +127,6 @@ class OpenAIServingResponses(OpenAIServing): models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - enable_force_include_usage=enable_force_include_usage, log_error_stack=log_error_stack, ) diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index d043f55648d2c..33da7034afabc 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -37,6 +37,7 @@ class OpenAIServingTranscription(OpenAISpeechToText): request_logger: RequestLogger | None, return_tokens_as_token_ids: bool = False, log_error_stack: bool = False, + enable_force_include_usage: bool = False, ): super().__init__( engine_client=engine_client, @@ -45,6 +46,7 @@ class 
OpenAIServingTranscription(OpenAISpeechToText): return_tokens_as_token_ids=return_tokens_as_token_ids, task_type="transcribe", log_error_stack=log_error_stack, + enable_force_include_usage=enable_force_include_usage, ) async def create_transcription( @@ -96,6 +98,7 @@ class OpenAIServingTranslation(OpenAISpeechToText): request_logger: RequestLogger | None, return_tokens_as_token_ids: bool = False, log_error_stack: bool = False, + enable_force_include_usage: bool = False, ): super().__init__( engine_client=engine_client, @@ -104,6 +107,7 @@ class OpenAIServingTranslation(OpenAISpeechToText): return_tokens_as_token_ids=return_tokens_as_token_ids, task_type="translate", log_error_stack=log_error_stack, + enable_force_include_usage=enable_force_include_usage, ) async def create_translation( diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index fa6e962a1dd70..e012f43260c2b 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -58,6 +58,7 @@ class OpenAISpeechToText(OpenAIServing): return_tokens_as_token_ids: bool = False, task_type: Literal["transcribe", "translate"] = "transcribe", log_error_stack: bool = False, + enable_force_include_usage: bool = False, ): super().__init__( engine_client=engine_client, @@ -74,6 +75,8 @@ class OpenAISpeechToText(OpenAIServing): self.model_config, task_type ) + self.enable_force_include_usage = enable_force_include_usage + self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB if self.default_sampling_params: @@ -261,9 +264,7 @@ class OpenAISpeechToText(OpenAIServing): completion_tokens = 0 num_prompt_tokens = 0 - include_usage = ( - request.stream_include_usage if request.stream_include_usage else False - ) + include_usage = self.enable_force_include_usage or request.stream_include_usage include_continuous_usage = ( request.stream_continuous_usage_stats if include_usage and request.stream_continuous_usage_stats diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 1504705cf0e2b..c006a76d3cdf4 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -14,7 +14,11 @@ from starlette.background import BackgroundTask, BackgroundTasks from vllm.engine.arg_utils import EngineArgs from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + CompletionRequest, + StreamOptions, +) from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser @@ -237,3 +241,16 @@ def log_non_default_args(args: Namespace | EngineArgs): ) logger.info("non-default args: %s", non_default_args) + + +def should_include_usage( + stream_options: StreamOptions | None, enable_force_include_usage: bool +) -> tuple[bool, bool]: + if stream_options: + include_usage = stream_options.include_usage or enable_force_include_usage + include_continuous_usage = include_usage and bool( + stream_options.continuous_usage_stats + ) + else: + include_usage, include_continuous_usage = enable_force_include_usage, False + return include_usage, include_continuous_usage From 577d498212022f95dc3a59746b1da1c6ed23eaba Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 14 Oct 2025 15:49:59 +0800 Subject: [PATCH 23/92] [Plugin] Make plugin group clear (#26757) Signed-off-by: wangxiyuan --- vllm/platforms/__init__.py | 4 ++-- vllm/plugins/__init__.py | 
7 +++++++ vllm/plugins/io_processors/__init__.py | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index d63ef78f5b2d2..b9140b4fe676b 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -6,7 +6,7 @@ from itertools import chain from typing import TYPE_CHECKING from vllm import envs -from vllm.plugins import load_plugins_by_group +from vllm.plugins import PLATFORM_PLUGINS_GROUP, load_plugins_by_group from vllm.utils import resolve_obj_by_qualname, supports_xccl from .interface import CpuArchEnum, Platform, PlatformEnum @@ -188,7 +188,7 @@ builtin_platform_plugins = { def resolve_current_platform_cls_qualname() -> str: - platform_plugins = load_plugins_by_group("vllm.platform_plugins") + platform_plugins = load_plugins_by_group(PLATFORM_PLUGINS_GROUP) activated_plugins = [] diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index fe04b759a12c8..0d8988f27959f 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -9,7 +9,14 @@ import vllm.envs as envs logger = logging.getLogger(__name__) +# Default plugins group will be loaded in all processes(process0, engine core +# process and worker processes) DEFAULT_PLUGINS_GROUP = "vllm.general_plugins" +# IO processor plugins group will be loaded in process0 only +IO_PROCESSOR_PLUGINS_GROUP = "vllm.io_processor_plugins" +# Platform plugins group will be loaded in all processes when +# `vllm.platforms.current_platform` is called and the value not initialized, +PLATFORM_PLUGINS_GROUP = "vllm.platform_plugins" # make sure one process only loads plugins once plugins_loaded = False diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py index cb58bfe75f1d7..c7b01ae341440 100644 --- a/vllm/plugins/io_processors/__init__.py +++ b/vllm/plugins/io_processors/__init__.py @@ -4,7 +4,7 @@ import logging from vllm.config import VllmConfig -from vllm.plugins import load_plugins_by_group +from vllm.plugins import IO_PROCESSOR_PLUGINS_GROUP, load_plugins_by_group from vllm.plugins.io_processors.interface import IOProcessor from vllm.utils import resolve_obj_by_qualname @@ -37,7 +37,7 @@ def get_io_processor( # Load all installed plugin in the group multimodal_data_processor_plugins = load_plugins_by_group( - "vllm.io_processor_plugins" + IO_PROCESSOR_PLUGINS_GROUP ) loadable_plugins = {} From d2f816d6ff99ec0623f6596b90925f8164e6c7a6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 14 Oct 2025 17:36:21 +0800 Subject: [PATCH 24/92] [Bugfix] Standardize merging multimodal embeddings (#26771) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/ernie45_vl.py | 6 +++--- vllm/model_executor/models/glm4_1v.py | 6 +++--- vllm/model_executor/models/hyperclovax_vision.py | 6 +++--- vllm/model_executor/models/interns1.py | 6 +++--- vllm/model_executor/models/internvl.py | 6 +++--- vllm/model_executor/models/keye.py | 6 +++--- vllm/model_executor/models/llava_onevision.py | 4 ++-- vllm/model_executor/models/minicpmo.py | 4 ++-- vllm/model_executor/models/minicpmv.py | 8 ++++---- vllm/model_executor/models/nano_nemotron_vl.py | 6 +++--- vllm/model_executor/models/nemotron_vl.py | 4 ++-- vllm/model_executor/models/ovis2_5.py | 6 +++--- vllm/model_executor/models/phi4_multimodal.py | 4 ++-- vllm/model_executor/models/phi4mm.py | 4 ++-- vllm/model_executor/models/qwen2_5_omni_thinker.py | 8 ++++---- vllm/model_executor/models/qwen2_5_vl.py | 10 +++++----- vllm/model_executor/models/qwen2_vl.py | 6 +++--- 
vllm/model_executor/models/qwen3_omni_moe_thinker.py | 8 ++++---- vllm/model_executor/models/qwen3_vl.py | 6 +++--- 19 files changed, 57 insertions(+), 57 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index dc465c87cf4b9..f40bd01deccd5 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -1645,12 +1645,12 @@ class Ernie4_5_VLMoeForConditionalGeneration( for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_video_input(video_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 6e58f8c32f8ad..132f26253b367 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1608,11 +1608,11 @@ class Glm4vForConditionalGeneration( for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - vision_embeddings = self._process_image_input(multimodal_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "video": video_embeddings = self._process_video_input(multimodal_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings def forward( diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index ad39443f93daa..3d28ba951b94e 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -749,12 +749,12 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_video_input(video_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 28a4a1e8d2596..176aa3252d67b 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -753,12 +753,12 @@ class InternS1ForConditionalGeneration( for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_vision_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_vision_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_vision_input(video_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += 
tuple(video_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 28a35595f43aa..05b822d6fdbf5 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1358,12 +1358,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA) for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_vision_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_vision_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_vision_input(video_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 028162fdbf110..292a07c00d07b 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1459,12 +1459,12 @@ class BaseKeyeModule(nn.Module): for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_video_input(video_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings def forward( diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index c9a27728eb735..c4cae240ea469 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -881,8 +881,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - vision_embeddings = self._process_image_input(multimodal_input) - multimodal_embeddings += tuple(vision_embeddings) + image_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "video": video_embeddings = self._process_video_pixels(multimodal_input) multimodal_embeddings += tuple(video_embeddings) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 371c9607c5c5b..fa2feb0ba10b4 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -762,7 +762,7 @@ class MiniCPMO(MiniCPMV2_6): for modality in modalities: if modality == "audios": audio_input = modalities["audios"] - audio_features = self._process_audio_input(audio_input) - multimodal_embeddings += tuple(audio_features) + audio_embeddings = self._process_audio_input(audio_input) + multimodal_embeddings += tuple(audio_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 173cab3bffc10..ef2bbac756541 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1129,12 +1129,12 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): for modality in modalities: if 
modality == "images": image_input = modalities["images"] - image_features = self._process_vision_input(image_input) - multimodal_embeddings += tuple(image_features) + image_embeddings = self._process_vision_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] - video_features = self._process_vision_input(video_input) - multimodal_embeddings += tuple(video_features) + video_embeddings = self._process_vision_input(video_input) + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index dfb7cb7fe6bd4..e874aaa0fc7ad 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1263,12 +1263,12 @@ class NemotronH_Nano_VL_V2( for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_video_input(video_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 42f70ef105a5d..2f78e2f60c93b 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -575,8 +575,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index b4e2f42be5979..758611afb9a46 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -616,12 +616,12 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP): for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_visual_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_visual_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_visual_input(video_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index b99e3a5a1fd84..207bd000c5b7a 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -1430,8 +1430,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): if modality == "images": audio_projection_mode = "vision" image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += tuple(vision_embeddings) + image_embeddings = 
self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "audios": audio_input = modalities["audios"] audio_embeddings = self._process_audio_input( diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index dce31f9d0aac6..a54d4d15ba9bb 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1248,8 +1248,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): if modality == "images": audio_projection_mode = "vision" image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += tuple(vision_embeddings) + image_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "audios": audio_input = modalities["audios"] audio_embeddings = self._process_audio_input( diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 07f814ef64187..c40b97a2c4e09 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -1210,14 +1210,14 @@ class Qwen2_5OmniThinkerForConditionalGeneration( for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - vision_embeddings = self._process_image_input(multimodal_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "video": video_embeddings = self._process_video_input(multimodal_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) if modality == "audio": audio_embeddings = self._process_audio_input(multimodal_input) - multimodal_embeddings += audio_embeddings + multimodal_embeddings += tuple(audio_embeddings) return multimodal_embeddings # TODO (ywang96): support overlapping modality embeddings so that diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 3f205307cb225..3079d3b9a41aa 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1586,19 +1586,19 @@ class Qwen2_5_VLForConditionalGeneration( for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - vision_embeddings = self._process_image_input(multimodal_input) + image_embeddings = self._process_image_input(multimodal_input) if self.is_multimodal_pruning_enabled: - vision_embeddings = self._postprocess_image_embeds_evs( - vision_embeddings, multimodal_input + image_embeddings = self._postprocess_image_embeds_evs( + image_embeddings, multimodal_input ) - multimodal_embeddings += vision_embeddings + multimodal_embeddings += tuple(image_embeddings) if modality == "video": video_embeddings = self._process_video_input(multimodal_input) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( video_embeddings, multimodal_input ) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings def forward( diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 8069039b0c560..821a9d13dc6f7 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1561,12 +1561,12 @@ class 
Qwen2VLForConditionalGeneration( for modality in modalities: if modality == "images": image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "videos": video_input = modalities["videos"] video_embeddings = self._process_video_input(video_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index b1eceaa6ef41d..d565a0108432a 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -1260,14 +1260,14 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - vision_embeddings = self._process_image_input(multimodal_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "video": video_embeddings = self._process_video_input(multimodal_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) if modality == "audio": audio_embeddings = self._process_audio_input(multimodal_input) - multimodal_embeddings += audio_embeddings + multimodal_embeddings += tuple(audio_embeddings) return multimodal_embeddings def get_input_embeddings( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 39714faf9833e..f114aae25c51b 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1601,11 +1601,11 @@ class Qwen3VLForConditionalGeneration( for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - vision_embeddings = self._process_image_input(multimodal_input) - multimodal_embeddings += vision_embeddings + image_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings += tuple(image_embeddings) if modality == "video": video_embeddings = self._process_video_input(multimodal_input) - multimodal_embeddings += video_embeddings + multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings def _compute_deepstack_embeds( From 74704d455386528892f54b2dc78b5e282db5a7e0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 14 Oct 2025 17:42:45 +0800 Subject: [PATCH 25/92] [Model] Use merge_by_field_config for MM models (O-P) (#26776) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/phi3v.py | 23 ++----- vllm/model_executor/models/phi4_multimodal.py | 64 ++++-------------- vllm/model_executor/models/phi4mm.py | 65 ++++--------------- 3 files changed, 30 insertions(+), 122 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 93cc7af176d21..b86fe67fb4768 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -56,7 +56,6 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from .clip import CLIPVisionModel 
@@ -70,7 +69,6 @@ from .utils import ( AutoWeightsLoader, WeightsMapper, _merge_multimodal_embeddings, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -564,6 +562,8 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): dummy_inputs=Phi3VDummyInputsBuilder, ) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant): + merge_by_field_config = True + hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.vision_embed_tokens.wte": "embed_tokens", @@ -631,8 +631,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant) if pixel_values is not None: return Phi3VImagePixelInputs( type="pixel_values", - pixel_values=flatten_bn(pixel_values), - image_sizes=flatten_bn(image_sizes, concat=True), + pixel_values=pixel_values, + image_sizes=image_sizes, resolve_bindings={ "h": CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size, "w": CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size, @@ -642,7 +642,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant) if image_embeds is not None: return Phi3VImageEmbeddingInputs( type="image_embeds", - data=flatten_bn(image_embeds), + data=image_embeds, ) raise AssertionError("This line should be unreachable.") @@ -652,19 +652,10 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant) image_input: Phi3VImageInputs, ) -> torch.Tensor: if image_input["type"] == "image_embeds": - image_data = image_input["data"] - if is_list_of(image_data, torch.Tensor): - # it's already a list of tensors - return image_data - if len(image_data.shape) == 3: - # 3D tensor - return list(torch.unbind(image_data, dim=0)) - raise ValueError( - "We expect batched 2D tensors; " - "this can be either a list of 2D tensors or a single 3D tensor." - ) + return image_input["data"] assert self.vision_embed_tokens is not None + image_embeds = self.vision_embed_tokens( image_input["pixel_values"], image_input["image_sizes"] ) diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index 207bd000c5b7a..4799b7aba7f76 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -64,7 +64,6 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from .idefics2_vision_model import Idefics2VisionTransformer @@ -72,7 +71,6 @@ from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .utils import ( AutoWeightsLoader, WeightsMapper, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -672,7 +670,7 @@ class Phi4MMImagePixelInputs(TensorSchema): type: Literal["pixel_values"] - data: Annotated[ + pixel_values: Annotated[ torch.Tensor | list[torch.Tensor], TensorShape( "bn", "p", 3, "h", "w", dynamic_dims={"p"} @@ -721,7 +719,7 @@ class Phi4MMAudioFeatureInputs(TensorSchema): type: Literal["audio_features"] - data: Annotated[ + audio_features: Annotated[ torch.Tensor | list[torch.Tensor], TensorShape("bn", "t", 80, dynamic_dims={"t"}), ] @@ -1189,6 +1187,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): Implements the Phi-4-multimodal-instruct model in vLLM. 
""" + merge_by_field_config = True + packed_modules_mapping = { "qkv_proj": [ "qkv_proj", @@ -1273,7 +1273,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): if audio_features is not None: return Phi4MMAudioFeatureInputs( - type="audio_features", data=flatten_bn(audio_features) + type="audio_features", + audio_features=audio_features, ) if audio_embeds is not None: @@ -1298,7 +1299,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): if audio_input["type"] == "audio_embeds": return audio_input["data"] - audio_features = audio_input["data"] + audio_features = audio_input["audio_features"] # (e.g. multiple examples) and the second dim is the multi-audio dim # (e.g. multiple audios in the same example) @@ -1315,8 +1316,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): def _parse_and_validate_image_input( self, **kwargs: object ) -> Phi4MMImagePixelInputs | None: - image_pixel_values: NestedTensors = kwargs.get("image_pixel_values") - if image_pixel_values is None: + pixel_values = kwargs.get("image_pixel_values") + if pixel_values is None: return None image_sizes = kwargs.get("image_sizes") @@ -1328,52 +1329,9 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): and num_img_tokens is not None ), "Missing image inputs" - if is_list_of(image_pixel_values, torch.Tensor): - assert all(p.dim() == 5 for p in image_pixel_values), ( - "Incorrect image inputs" - ) - # list len is batch_size. - # each tensor has dimension: num_img_per_example, num_hd_patches, - # channels, height, width. - # need to pad along num_hd_patches. - # mask size num_img_per_prompt, num_hd_patches, feat_h, heat_w. - image_pixel_values = cat_with_pad(image_pixel_values, dim=0) - elif isinstance(image_pixel_values, torch.Tensor): - # dimension: batch_size, num_img_per_example, num_hd_patches, - # channels, height, width. - # we flatten first 2 dims to make it a single large batch for - # SigLIP Encoder. 
- assert image_pixel_values.dim() == 6, "Incorrect image inputs" - image_pixel_values = image_pixel_values.flatten(0, 1) - else: - raise ValueError("Incorrect image_pixel_values inputs") - - if isinstance(image_attention_mask, list): - image_attention_mask = cat_with_pad(image_attention_mask, dim=0) - elif isinstance(image_attention_mask, torch.Tensor): - image_attention_mask = image_attention_mask.flatten(0, 1) - else: - raise ValueError("Incorrect image_attention_mask inputs") - - if isinstance(image_sizes, list): - image_sizes = torch.cat(image_sizes, dim=0) - elif isinstance(image_sizes, torch.Tensor): - image_sizes = image_sizes.flatten(0, 1) - else: - raise ValueError("Incorrect image_sizes inputs") - - if isinstance(num_img_tokens, list): - num_img_tokens = [ - n for num_tensor in num_img_tokens for n in num_tensor.tolist() - ] - elif isinstance(num_img_tokens, torch.Tensor): - num_img_tokens = num_img_tokens.flatten(0, 1).tolist() - else: - raise ValueError("Incorrect num_img_tokens inputs") - return Phi4MMImagePixelInputs( type="pixel_values", - data=image_pixel_values, + pixel_values=pixel_values, image_sizes=image_sizes, image_attention_mask=image_attention_mask, num_img_tokens=num_img_tokens, @@ -1405,7 +1363,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: dtype = next(self.image_embed.parameters()).dtype - pixel_values = image_input["data"].to(dtype) + pixel_values = image_input["pixel_values"].to(dtype) image_sizes = image_input["image_sizes"] image_attention_mask = image_input["image_attention_mask"] image_embeds = self.image_embed( diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index a54d4d15ba9bb..acad72b058fcd 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -50,13 +50,12 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of from vllm.utils.tensor_schema import TensorSchema, TensorShape from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .phi4mm_audio import AudioEmbedding -from .utils import AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix +from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix # <|endoftext10|> (see vocab.json in hf model) _IMAGE_PLACEHOLDER_TOKEN_ID = 200010 @@ -467,7 +466,7 @@ class Phi4MMImagePixelInputs(TensorSchema): type: Literal["pixel_values"] - data: Annotated[ + pixel_values: Annotated[ torch.Tensor | list[torch.Tensor], TensorShape( "bn", "p", 3, "h", "w", dynamic_dims={"p"} @@ -499,7 +498,7 @@ class Phi4MMAudioFeatureInputs(TensorSchema): type: Literal["audio_features"] - data: Annotated[ + audio_features: Annotated[ torch.Tensor | list[torch.Tensor], TensorShape("bn", "t", 80, dynamic_dims={"t"}), ] @@ -986,6 +985,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): Implements the Phi-4-multimodal-instruct model in vLLM. 
""" + merge_by_field_config = True + packed_modules_mapping = { "qkv_proj": [ "qkv_proj", @@ -1094,7 +1095,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): if audio_features is not None: return Phi4MMAudioFeatureInputs( - type="audio_features", data=flatten_bn(audio_features) + type="audio_features", + audio_features=audio_features, ) if audio_embeds is not None: @@ -1119,7 +1121,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): if audio_input["type"] == "audio_embeds": return audio_input["data"] - audio_features = audio_input["data"] + audio_features = audio_input["audio_features"] # (e.g. multiple examples) and the second dim is the multi-audio dim # (e.g. multiple audios in the same example) @@ -1136,8 +1138,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): def _parse_and_validate_image_input( self, **kwargs: object ) -> Phi4MMImagePixelInputs | None: - input_image_embeds: NestedTensors = kwargs.get("input_image_embeds") - if input_image_embeds is None: + pixel_values = kwargs.get("input_image_embeds") + if pixel_values is None: return None image_sizes = kwargs.get("image_sizes") @@ -1149,52 +1151,9 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): and num_img_tokens is not None ), "Missing image inputs" - if is_list_of(input_image_embeds, torch.Tensor): - assert all(p.dim() == 5 for p in input_image_embeds), ( - "Incorrect image inputs" - ) - # list len is batch_size. - # each tensor has dimension: num_img_per_example, num_hd_patches, - # channels, height, width. - # need to pad along num_hd_patches. - # mask size num_img_per_prompt, num_hd_patches, feat_h, heat_w. - input_image_embeds = cat_with_pad(input_image_embeds, dim=0) - elif isinstance(input_image_embeds, torch.Tensor): - # dimension: batch_size, num_img_per_example, num_hd_patches, - # channels, height, width. - # we flatten first 2 dims to make it a single large batch for - # SigLIP Encoder. 
- assert input_image_embeds.dim() == 6, "Incorrect image inputs" - input_image_embeds = input_image_embeds.flatten(0, 1) - else: - raise ValueError("Incorrect input_image_embeds inputs") - - if isinstance(image_attention_mask, list): - image_attention_mask = cat_with_pad(image_attention_mask, dim=0) - elif isinstance(image_attention_mask, torch.Tensor): - image_attention_mask = image_attention_mask.flatten(0, 1) - else: - raise ValueError("Incorrect image_attention_mask inputs") - - if isinstance(image_sizes, list): - image_sizes = torch.cat(image_sizes, dim=0) - elif isinstance(image_sizes, torch.Tensor): - image_sizes = image_sizes.flatten(0, 1) - else: - raise ValueError("Incorrect image_sizes inputs") - - if isinstance(num_img_tokens, list): - num_img_tokens = [ - n for num_tensor in num_img_tokens for n in num_tensor.tolist() - ] - elif isinstance(num_img_tokens, torch.Tensor): - num_img_tokens = num_img_tokens.flatten(0, 1).tolist() - else: - raise ValueError("Incorrect num_img_tokens inputs") - return Phi4MMImagePixelInputs( type="pixel_values", - data=input_image_embeds, + pixel_values=pixel_values, image_sizes=image_sizes, image_attention_mask=image_attention_mask, num_img_tokens=num_img_tokens, @@ -1223,7 +1182,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): self, image_input: Phi4MMImagePixelInputs ) -> list[torch.Tensor]: dtype = next(self.vision_encoder.parameters()).dtype - pixel_values = image_input["data"].to(dtype) + pixel_values = image_input["pixel_values"].to(dtype) image_sizes = image_input["image_sizes"] image_attention_mask = image_input["image_attention_mask"] image_embeds = self.vision_encoder( From 7e6edb14698c1a760272dd44363a288aeb07b571 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 14 Oct 2025 04:46:05 -0500 Subject: [PATCH 26/92] [NIXL][HeteroTP] Enable KV transfer from HND prefill to NHD decode (#26556) Signed-off-by: Chendi Xue --- docs/features/nixl_connector_usage.md | 10 +++ .../nixl_integration/run_accuracy_test.sh | 13 +++- .../kv_connector/unit/test_nixl_connector.py | 60 ++++++++++++++++- tests/v1/kv_connector/unit/utils.py | 2 + vllm/config/kv_transfer.py | 3 + .../kv_connector/v1/nixl_connector.py | 67 ++++++++++++++++++- 6 files changed, 150 insertions(+), 5 deletions(-) diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index 795b0c77d610e..bfc0e0d86c6ae 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -156,6 +156,16 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`). Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior. 
+## Experimental Feature + +### Heterogenuous KV Layout support + +Support use case: Prefill with 'HND' and decode with 'NHD' with experimental configuration + +```bash +--kv-transfer-config '{..., "enable_permute_local_kv":"True"}' +``` + ## Example Scripts/Code Refer to these example scripts in the vLLM repository: diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 3bf722900df37..ed6154462bb2b 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -19,11 +19,18 @@ done echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE" +DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD +if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then + KV_CONFIG_HETERO_LAYOUT=',"enable_permute_local_kv":"True"' +else + KV_CONFIG_HETERO_LAYOUT='' +fi + # Build the kv-transfer-config once if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then - KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"'${KV_CONFIG_HETERO_LAYOUT}'}' else - KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}" + KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\""${KV_CONFIG_HETERO_LAYOUT}"}" fi # Models to run @@ -117,6 +124,7 @@ run_tests_for_model() { # Build the command with or without model-specific args BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \ + VLLM_KV_CACHE_LAYOUT='HND' \ UCX_NET_DEVICES=all \ VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \ vllm serve $model_name \ @@ -157,6 +165,7 @@ run_tests_for_model() { # Build the command with or without model-specific args BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID \ + VLLM_KV_CACHE_LAYOUT=$DECODER_KV_LAYOUT \ UCX_NET_DEVICES=all \ VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT \ vllm serve $model_name \ diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 71f5d4b2b0fd9..a911ddc56b023 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -286,9 +286,12 @@ def test_prompt_less_than_block_size(): class FakeNixlConnectorWorker(NixlConnectorWorker): REMOTE_ENGINE_ID = "remote_engine" - def __init__(self, *args, hand_shake_latency: float = 1.8, **kwargs): + def __init__( + self, *args, hand_shake_latency: float = 1.8, kv_cache_layout="HND", **kwargs + ): super().__init__(*args, **kwargs) self._hand_shake_latency = hand_shake_latency + self.kv_cache_layout = kv_cache_layout def _nixl_handshake( self, host: str, port: int, remote_tp_size: int, expected_engine_id: str @@ -564,10 +567,63 @@ class TestNixlHandshake: # We don't check layout for homogeneous TP and MLA for now, as the # whole block is moved. - worker.add_remote_agent(meta, remote_tp_size=2) + with pytest.raises(RuntimeError): + # mismatched layout is expected to fail + worker.add_remote_agent(meta, remote_tp_size=2) with pytest.raises(AssertionError): worker.add_remote_agent(meta, remote_tp_size=1) + @patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, + ) + def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental( + self, dist_init + ): + """ + Verify that adding a remote agent fails if kv_cache_layout differs. 
+ This test is only relevant for heterogeneous TP. + """ + vllm_config = create_vllm_config(enable_permute_local_kv=True) + + # Mock TP world size to 2 to force heterogeneous TP when + # remote_tp_size=1 + with patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size", # noqa: E501 + return_value=2, + ): + # Initialize connector and worker (with fake NIXL wrapper) + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector.connector_worker = FakeNixlConnectorWorker( + vllm_config, + connector.engine_id, + hand_shake_latency=0, + kv_cache_layout="NHD", + ) + worker = connector.connector_worker + + # Minimal local registration params used by add_remote_agent + worker.slot_size_per_layer = [2048] + worker.block_len_per_layer = [2048 * worker.block_size] + worker.num_blocks = 1 + worker.dst_num_blocks[worker.engine_id] = worker.num_blocks + + # Metadata with different kv_cache_layout than local worker + meta = NixlAgentMetadata( + engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + agent_metadata=FakeNixlWrapper.AGENT_METADATA, + kv_caches_base_addr=[0], + num_blocks=1, + # prefill TP=1, decode TP=2, remote block_lens is double to local + block_lens=[i * 2 for i in worker.block_len_per_layer], + attn_backend_name=worker.backend_name, + kv_cache_layout="HND", + ) + + # We don't check layout for homogeneous TP and MLA for now, as the + # whole block is moved. + worker.add_remote_agent(meta, remote_tp_size=1) + # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which # we put here is important. First run ray, it will clean up the resources, then diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index b07fd0536a436..e7f505d55e7a4 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -83,6 +83,7 @@ def create_vllm_config( block_size: int = 16, max_model_len: int = 10000, enable_chunked_prefill: bool = True, + enable_permute_local_kv: bool = False, ) -> VllmConfig: """Initialize VllmConfig For Testing.""" scheduler_config = SchedulerConfig( @@ -108,6 +109,7 @@ def create_vllm_config( kv_transfer_config = KVTransferConfig( kv_connector="NixlConnector", kv_role="kv_both", + enable_permute_local_kv=enable_permute_local_kv, ) return VllmConfig( scheduler_config=scheduler_config, diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py index d7a9d5808319e..eafd0e015a88d 100644 --- a/vllm/config/kv_transfer.py +++ b/vllm/config/kv_transfer.py @@ -61,6 +61,9 @@ class KVTransferConfig: """The Python module path to dynamically load the KV connector from. Only supported in V1.""" + enable_permute_local_kv: bool = False + """Experiment feature flag to enable HND to NHD KV Transfer""" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 8c4c82f76ff29..490f209373db3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -563,6 +563,7 @@ class NixlConnectorWorker: self.world_size = get_tensor_model_parallel_world_size() self.tp_group = get_tp_group() self.num_blocks = 0 + self.enable_permute_local_kv = False # KV Caches and nixl tracking data. 
self.device_type = current_platform.device_type @@ -1094,6 +1095,23 @@ class NixlConnectorWorker: is_kv_replicated = self._tp_size[engine_id] // total_num_kv_heads >= 1 remote_block_len = nixl_agent_meta.block_lens[0] + if nixl_agent_meta.kv_cache_layout != self.kv_cache_layout: + if ( + self.vllm_config.kv_transfer_config is not None + and self.vllm_config.kv_transfer_config.enable_permute_local_kv + and nixl_agent_meta.kv_cache_layout == "HND" + ): + logger.info( + "Remote is HND and local is NHD, enabled additional permute " + "on local device KV." + ) + self.enable_permute_local_kv = True + else: + raise RuntimeError( + "Heterogeneous TP expects same kv_cache_layout. " + "Or enable experimental feature to use HND to NHD support by " + "setting 'enable_permute_local_kv'=True in --kv-transfer-config." + ) if self.use_mla or is_kv_replicated: # With replicated KV cache, only the number of blocks can differ. assert self.block_len_per_layer == nixl_agent_meta.block_lens, ( @@ -1114,7 +1132,10 @@ class NixlConnectorWorker: remote_block_size //= 2 if tp_ratio > 1: # Heterogeneous TP expects same kv_cache_layout. - assert nixl_agent_meta.kv_cache_layout == self.kv_cache_layout + if nixl_agent_meta.kv_cache_layout == "NHD": + raise ValueError( + "Heterogeneous TP is not supported for remote with NHD." + ) if self.device_type == "xpu": raise ValueError("Heterogeneous TP is not supported on XPU") @@ -1226,6 +1247,41 @@ class NixlConnectorWorker: "d2h", ) + def permute_device_kv(self, block_ids: list[int]): + """Transforms the layout of received KV cache blocks to the local format. + + This method corrects layout mismatches from direct memory copies by + permuting the tensor dimensions. + + - **Source Layout:** `[num_blocks, n_kv_head, block_size, head_dim]` + - **Target Layout:** `[num_blocks, block_size, n_kv_head, head_dim]` + + Args: + block_ids: A list of block IDs to update and permute. + + Implementation: + - x = blocks_to_update.reshape(src_shape) # view local kv with sender layout + - permuted_blocks = x.permute(*inv_order) # transpose n_kv_heads, block_size + - cache.index_copy_(0, indices, permuted_blocks) # copy permuted kv back + + """ + split_k_and_v = not (self.use_mla or self._use_pallas or self._use_flashinfer) + inv_order = [0, 2, 1, 3] + sample_cache = list(self.device_kv_caches.values())[0][0] + target_shape = list(sample_cache.shape) + target_shape[0] = -1 + src_shape = tuple(target_shape[i] for i in inv_order) + indices = torch.tensor(block_ids, device=sample_cache.device) + + for _, cache_or_caches in self.device_kv_caches.items(): + cache_list = cache_or_caches if split_k_and_v else [cache_or_caches] + for cache in cache_list: + blocks_to_update = cache.index_select(0, indices) + permuted_blocks = blocks_to_update.reshape(src_shape).permute( + *inv_order + ) + cache.index_copy_(0, indices, permuted_blocks) + def get_finished(self) -> tuple[set[str], set[str]]: """ Get requests that are done sending or recving on this specific worker. 
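The `reshape(src_shape).permute(*inv_order)` step in `permute_device_kv` above works because NIXL copies block bytes verbatim, so after a transfer the local NHD-shaped cache briefly holds HND-ordered data; viewing those bytes with the sender's HND shape and swapping the heads/block axes back recovers the local layout. A minimal standalone sketch of that correction (illustrative shapes only, not real vLLM cache tensors):

```python
# Standalone illustration: why reshape(src_shape).permute(inv_order) recovers
# the local NHD layout after a byte-for-byte copy from an HND sender.
import torch

num_blocks, n_kv_heads, block_size, head_dim = 2, 4, 8, 16

# Sender (prefill) stores KV blocks in HND order:
# [num_blocks, n_kv_heads, block_size, head_dim].
sender_hnd = torch.arange(
    num_blocks * n_kv_heads * block_size * head_dim, dtype=torch.float32
).reshape(num_blocks, n_kv_heads, block_size, head_dim)

# NIXL moves raw block bytes, so the local NHD-shaped cache now holds
# HND-ordered data under the wrong logical shape.
local_nhd_cache = sender_hnd.reshape(num_blocks, block_size, n_kv_heads, head_dim)

# Same correction as permute_device_kv: view the bytes with the sender's
# shape, then swap the n_kv_heads/block_size axes back into the local order.
inv_order = (0, 2, 1, 3)
fixed = local_nhd_cache.reshape(
    num_blocks, n_kv_heads, block_size, head_dim
).permute(*inv_order)

# Reference: what the blocks would look like if written in NHD directly.
expected = sender_hnd.permute(*inv_order)
assert torch.equal(fixed, expected)
```

With this correction applied in `get_finished`, a decoder launched with `VLLM_KV_CACHE_LAYOUT=NHD` can receive blocks from an HND prefiller once `enable_permute_local_kv` is set in `--kv-transfer-config`, which is exactly the path the updated accuracy-test script exercises.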
@@ -1273,6 +1329,15 @@ class NixlConnectorWorker: del self._reqs_to_send[req_id] done_sending.add(req_id) + if self.enable_permute_local_kv and len(done_recving) > 0: + block_ids = [] + for req_id in done_recving: + meta = self._recving_metadata.pop(req_id) + assert meta, f"{req_id} not found in recving_metadata list" + block_ids += meta.local_block_ids + + self.permute_device_kv(block_ids) + return done_sending, done_recving def _get_new_notifs(self) -> set[str]: From d1d063a588205161fe5613d5a6d76311ce0dc83e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 14 Oct 2025 18:03:46 +0800 Subject: [PATCH 27/92] [Chore] Use `max_transformers_version` for Qwen-VL test (#26792) Signed-off-by: DarkLight1337 --- tests/models/multimodal/generation/test_common.py | 2 -- tests/models/registry.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 0572898368d6d..f124220bb16d9 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -707,8 +707,6 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, - # FIXME: https://github.com/huggingface/transformers/issues/38358 - marks=[pytest.mark.skip("Model initialization fails")], ), "qwen2_vl": VLMTestInfo( models=["Qwen/Qwen2-VL-2B-Instruct"], diff --git a/tests/models/registry.py b/tests/models/registry.py index fbc11c2ddfd4c..ec12e82ea36ea 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -752,6 +752,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen/Qwen-VL", extras={"chat": "Qwen/Qwen-VL-Chat"}, trust_remote_code=True, + max_transformers_version="4.53.3", + transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501 hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, ), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo( From 70b1b330e10f5eba8bf003500834d214c8b4a559 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 14 Oct 2025 11:05:15 +0100 Subject: [PATCH 28/92] Don't allow `typos` to fix by default (#26785) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 832c3edcdc7fe..121bdb750de5d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,6 +16,7 @@ repos: rev: v1.38.1 hooks: - id: typos + args: [--force-exclude] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v21.1.2 hooks: From ef9676a1f1af444d35160794b79c85594c517a6c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 14 Oct 2025 18:21:53 +0800 Subject: [PATCH 29/92] [Doc] ruff format some Python examples (#26767) Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 44 +-- docs/configuration/optimization.md | 24 +- docs/contributing/model/basic.md | 4 +- docs/contributing/model/multimodal.md | 42 +-- docs/contributing/model/registration.md | 2 +- docs/contributing/model/transcription.md | 12 +- docs/deployment/frameworks/cerebrium.md | 4 +- docs/deployment/frameworks/dstack.md | 4 +- docs/deployment/frameworks/haystack.md | 2 +- .../frameworks/hf_inference_endpoints.md | 36 +-- docs/deployment/frameworks/litellm.md | 13 +- .../retrieval_augmented_generation.md | 4 +- 
docs/design/cuda_graphs.md | 16 +- docs/design/io_processor_plugins.md | 27 +- docs/design/metrics.md | 12 +- docs/design/prefix_caching.md | 4 +- docs/features/lora.md | 11 +- docs/features/multimodal_inputs.md | 277 ++++++++++-------- docs/features/reasoning_outputs.md | 56 ++-- docs/features/tool_calling.md | 37 +-- 20 files changed, 341 insertions(+), 290 deletions(-) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 26b95ad053337..2b0654fa6d463 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs. ```python from vllm import LLM -llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", - tensor_parallel_size=2) +llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` !!! warning @@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option). ```python from vllm import LLM -llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2) +llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) ``` ## Reduce CUDA Graphs @@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag: ```python from vllm import LLM -llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True) +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) ``` ## Adjust cache size @@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem from vllm import LLM # Accept up to 3 images and 1 video per prompt -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"image": 3, "video": 1}) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 3, "video": 1}, +) ``` You can go a step further and disable unused modalities completely by setting its limit to zero. @@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a from vllm import LLM # Accept any number of images but no videos -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"video": 0}) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"video": 0}, +) ``` You can even run a multi-modal model for text-only inference: @@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference: from vllm import LLM # Don't accept images. Just text. 
-llm = LLM(model="google/gemma-3-27b-it", - limit_mm_per_prompt={"image": 0}) +llm = LLM( + model="google/gemma-3-27b-it", + limit_mm_per_prompt={"image": 0}, +) ``` ### Configurable options @@ -173,14 +175,14 @@ Here are some examples: from vllm import LLM # Available for Qwen2-VL series models -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28 +) # Available for InternVL series models -llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) +llm = LLM( + model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12 +) ``` diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 5c74610ebd290..24c1efa61f286 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -100,7 +100,7 @@ from vllm import LLM llm = LLM( model="meta-llama/Llama-3.3-70B-Instruct, tensor_parallel_size=4, - pipeline_parallel_size=2 + pipeline_parallel_size=2, ) ``` @@ -257,18 +257,24 @@ Examples: ```python # Use a larger cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_cache_gb=8) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_cache_gb=8, +) # Use a shared-memory based IPC cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - tensor_parallel_size=2, - mm_processor_cache_type="shm", - mm_processor_cache_gb=8) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + tensor_parallel_size=2, + mm_processor_cache_type="shm", + mm_processor_cache_gb=8, +) # Disable the cache -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_cache_gb=0) +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_cache_gb=0, +) ``` ### Cache Placement diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index aafdb1058e03c..a423f4e683378 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -73,8 +73,8 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, ) -> torch.Tensor: ... ``` diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index 724dc2284e282..721081dffb499 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -16,7 +16,7 @@ Further update the model as follows: ... @classmethod - def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): return "" @@ -45,14 +45,14 @@ Further update the model as follows: ... 
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: - assert self.vision_encoder is not None image_features = self.vision_encoder(image_input) return self.multi_modal_projector(image_features) def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - + self, + **kwargs: object, + ) -> MultiModalEmbeddings | None: # Validate the multimodal input keyword arguments image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: @@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m For example, if the model supports any number of images but only one video per prompt: ```python -def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: +def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None, "video": 1} ``` @@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, + mm_options: Mapping[str, BaseDummyOptions] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) @@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ```python def get_image_size_with_most_features(self) -> ImageSize: image_processor = self.get_image_processor() - return ImageSize(width=image_processor.size["width"], - height=image_processor.size["height"]) + return ImageSize( + width=image_processor.size["width"], + height=image_processor.size["height"], + ) ``` Fuyu does not expect image placeholders in the inputs to HF processor, so @@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in return { "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images, - overrides=image_overrides) + self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images, + overrides=image_overrides, + ) } ``` @@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies image_width=image_size.width, image_height=image_size.height, ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows return PromptUpdateDetails.select_token_id( image_tokens + [bos_token_id], @@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies image_width=image_size.width, image_height=image_size.height, ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows return PromptUpdateDetails.select_token_id( image_tokens + [bos_token_id], @@ -810,9 +812,11 @@ to register them to the multi-modal registry: from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY -+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, -+ info=YourProcessingInfo, -+ dummy_inputs=YourDummyInputsBuilder) ++ @MULTIMODAL_REGISTRY.register_processor( ++ YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder, ++ ) class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md index 35f35ffa4cde6..3bb4f961ef15f 100644 --- 
a/docs/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -42,7 +42,7 @@ def register(): ModelRegistry.register_model( "YourModelForCausalLM", - "your_code:YourModelForCausalLM" + "your_code:YourModelForCausalLM", ) ``` diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index 4ce748ce1fed4..59f14a5ea27b9 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -15,6 +15,7 @@ Declare supported languages and capabilities: - Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper). ??? code "supported_languages and supports_transcription_only" + ```python from typing import ClassVar, Mapping, Literal import numpy as np @@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor This is for controlling general behavior of the API when serving your model: ??? code "get_speech_to_text_config()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: ??? code "get_generation_prompt()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: ??? code "get_generation_prompt()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface If your model requires a language and you want a default, override this method (see Whisper): ??? code "validate_language()" + ```python @classmethod def validate_language(cls, language: str | None) -> str | None: if language is None: logger.warning( - "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") + "Defaulting to language='en'. If you wish to transcribe " + "audio in a different language, pass the `language` field " + "in the TranscriptionRequest." + ) language = "en" return super().validate_language(language) ``` @@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo Provide a fast duration→token estimate to improve streaming usage statistics: ??? code "get_num_audio_tokens()" + ```python class YourASRModel(nn.Module, SupportsTranscription): ... @@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi Relevant server logic: ??? code "_preprocess_speech_to_text()" + ```python # vllm/entrypoints/openai/speech_to_text.py async def _preprocess_speech_to_text(...): diff --git a/docs/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md index 1f233c3204a15..960347d9525c4 100644 --- a/docs/deployment/frameworks/cerebrium.md +++ b/docs/deployment/frameworks/cerebrium.md @@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference ??? console "Command" - ```python + ```bash curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ -H 'Content-Type: application/json' \ -H 'Authorization: ' \ @@ -81,7 +81,7 @@ You should get a response like: ??? 
console "Response" - ```python + ```json { "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", "result": { diff --git a/docs/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md index fe4d87f78f2aa..9d2c7f5bb565f 100644 --- a/docs/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK: client = OpenAI( base_url="https://gateway.", - api_key="" + api_key="", ) completion = client.chat.completions.create( @@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK: "role": "user", "content": "Compose a poem that explains the concept of recursion in programming.", } - ] + ], ) print(completion.choices[0].message.content) diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md index 836305cf15c42..b53b829d6d3c0 100644 --- a/docs/deployment/frameworks/haystack.md +++ b/docs/deployment/frameworks/haystack.md @@ -34,7 +34,7 @@ pip install vllm haystack-ai api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), model="mistralai/Mistral-7B-Instruct-v0.1", api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", - generation_kwargs = {"max_tokens": 512} + generation_kwargs={"max_tokens": 512}, ) response = generator.run( diff --git a/docs/deployment/frameworks/hf_inference_endpoints.md b/docs/deployment/frameworks/hf_inference_endpoints.md index 75a234bdf1422..d39bb9a899c8a 100644 --- a/docs/deployment/frameworks/hf_inference_endpoints.md +++ b/docs/deployment/frameworks/hf_inference_endpoints.md @@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo import os client = OpenAI( - base_url = DEPLOYMENT_URL, - api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens + base_url=DEPLOYMENT_URL, + api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens ) chat_completion = client.chat.completions.create( - model = "HuggingFaceTB/SmolLM3-3B", - messages = [ + model="HuggingFaceTB/SmolLM3-3B", + messages=[ { "role": "user", "content": [ { "type": "text", - "text": "Give me a brief explanation of gravity in simple terms." + "text": "Give me a brief explanation of gravity in simple terms.", } - ] + ], } ], - stream = True + stream=True, ) for message in chat_completion: - print(message.choices[0].delta.content, end = "") + print(message.choices[0].delta.content, end="") ``` !!! note @@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg import os client = OpenAI( - base_url = DEPLOYMENT_URL, - api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens + base_url=DEPLOYMENT_URL, + api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens ) chat_completion = client.chat.completions.create( - model = "ibm-granite/granite-docling-258M", - messages = [ + model="ibm-granite/granite-docling-258M", + messages=[ { "role": "user", "content": [ { "type": "image_url", "image_url": { - "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png" - } + "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png", + }, }, { "type": "text", - "text": "Convert this page to docling." 
- } + "text": "Convert this page to docling.", + }, ] } ], - stream = True + stream=True, ) for message in chat_completion: - print(message.choices[0].delta.content, end = "") + print(message.choices[0].delta.content, end="") ``` !!! note diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md index 0d6c3729911ad..9ea7c0373d2a1 100644 --- a/docs/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -36,15 +36,16 @@ pip install vllm litellm ```python import litellm - messages = [{ "content": "Hello, how are you?","role": "user"}] + messages = [{"content": "Hello, how are you?", "role": "user"}] # hosted_vllm is prefix key word and necessary response = litellm.completion( - model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name - messages=messages, - api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", - temperature=0.2, - max_tokens=80) + model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name + messages=messages, + api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", + temperature=0.2, + max_tokens=80, + ) print(response) ``` diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md index d86ab1600f126..37f90ef08f32e 100644 --- a/docs/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -40,7 +40,7 @@ pip install -U vllm \ 1. Run the script - ```python + ```bash python retrieval_augmented_generation_with_langchain.py ``` @@ -78,6 +78,6 @@ pip install vllm \ 1. Run the script: - ```python + ```bash python retrieval_augmented_generation_with_llamaindex.py ``` diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index f88a29f6eadd8..315746b0ef674 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -106,9 +106,11 @@ The dispatch code looks like: batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...) runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor) # execution -with set_forward_context(..., - cudagraph_runtime_mode=runtime_mode, - batch_descriptor=batch_descriptor): +with set_forward_context( + ..., + cudagraph_runtime_mode=runtime_mode, + batch_descriptor=batch_descriptor, +): output = self.model(...) 
``` @@ -202,10 +204,10 @@ from vllm.config import CUDAGraphMode compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} model = vllm.LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - dtype='auto', - compilation_config = compilation_config, - ) + model="meta-llama/Llama-3.1-8B-Instruct", + dtype="auto", + compilation_config=compilation_config, +) sampling_params = vllm.SamplingParams( temperature=0, # greedy decoding max_tokens=1024, diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index e70ee4a076e54..682fc5c413e2d 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin IO Processor plugins implement the `IOProcessor` interface (): ```python -IOProcessorInput = TypeVar('IOProcessorInput') -IOProcessorOutput = TypeVar('IOProcessorOutput') +IOProcessorInput = TypeVar("IOProcessorInput") +IOProcessorOutput = TypeVar("IOProcessorOutput") class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): @@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): def pre_process( self, prompt: IOProcessorInput, - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, - ) -> Union[PromptType, Sequence[PromptType]]: + ) -> PromptType | Sequence[PromptType]: raise NotImplementedError async def pre_process_async( self, prompt: IOProcessorInput, - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, - ) -> Union[PromptType, Sequence[PromptType]]: + ) -> PromptType | Sequence[PromptType]: return self.pre_process(prompt, request_id, **kwargs) @abstractmethod - def post_process(self, - model_output: Sequence[PoolingRequestOutput], - request_id: Optional[str] = None, - **kwargs) -> IOProcessorOutput: + def post_process( + self, + model_output: Sequence[PoolingRequestOutput], + request_id: str | None = None, + **kwargs, + ) -> IOProcessorOutput: raise NotImplementedError async def post_process_async( self, model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]], - request_id: Optional[str] = None, + request_id: str | None = None, **kwargs, ) -> IOProcessorOutput: collected_output = [item async for i, item in model_output] @@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): @abstractmethod def output_to_response( - self, plugin_output: IOProcessorOutput) -> IOProcessorResponse: + self, plugin_output: IOProcessorOutput + ) -> IOProcessorResponse: raise NotImplementedError ``` diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 90b2fd32f2979..c4a2d72a2f4a4 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -478,15 +478,17 @@ us with: ```python if seq_group.is_finished(): - if (seq_group.metrics.first_scheduled_time is not None and - seq_group.metrics.first_token_time is not None): + if ( + seq_group.metrics.first_scheduled_time is not None + and seq_group.metrics.first_token_time is not None + ): time_queue_requests.append( seq_group.metrics.first_scheduled_time - - seq_group.metrics.arrival_time) + seq_group.metrics.arrival_time + ) ... if seq_group.metrics.time_in_queue is not None: - time_in_queue_requests.append( - seq_group.metrics.time_in_queue) + time_in_queue_requests.append(seq_group.metrics.time_in_queue) ``` This seems duplicative, and one of them should be removed. 
The latter diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index 9941837bf1652..270699df623e0 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -112,8 +112,8 @@ class KVCacheBlock: ref_cnt: int # The pointers to form a doubly linked list for the free queue. - prev_free_block: Optional["KVCacheBlock"] = None - next_free_block: Optional["KVCacheBlock"] = None + prev_free_block: "KVCacheBlock | None" = None + next_free_block: "KVCacheBlock | None" = None ``` There are two design points to highlight: diff --git a/docs/features/lora.md b/docs/features/lora.md index db794b2ebd71d..d3b44520a5a79 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter. sampling_params = SamplingParams( temperature=0, max_tokens=256, - stop=["[/assistant]"] + stop=["[/assistant]"], ) prompts = [ @@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter. outputs = llm.generate( prompts, sampling_params, - lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path), ) ``` @@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin: lora_request = LoRARequest( lora_name=lora_name, lora_path=local_path, - lora_int_id=abs(hash(lora_name)) + lora_int_id=abs(hash(lora_name)), ) return lora_request ``` @@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au if has_audio: question = f"<|audio|>{question}" chat = [ - { - "role": "user", - "content": question - } + {"role": "user", "content": question}, ] return tokenizer.apply_chat_template(chat, tokenize=False) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index dcc5ea3b90964..8f75f714d4b01 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis outputs = llm.generate({ "prompt": prompt, - "multi_modal_data": { - "image": [image1, image2] - }, + "multi_modal_data": {"image": [image1, image2]}, }) for o in outputs: @@ -183,21 +181,24 @@ conversation = [ {"role": "assistant", "content": "Hello! How can I assist you today?"}, { "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - },{ - "type": "image_pil", - "image_pil": image_pil - }, { - "type": "image_embeds", - "image_embeds": image_embeds - }, { - "type": "text", - "text": "What's in these images?" - }], + "content": [ + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + { + "type": "image_pil", + "image_pil": image_pil, + }, + { + "type": "image_embeds", + "image_embeds": image_embeds, + }, + { + "type": "text", + "text": "What's in these images?", + }, + ], }, ] @@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with message = { "role": "user", "content": [ - {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + { + "type": "text", + "text": "Describe this set of frames. 
Consider the frames to be a part of the same video.", + }, ], } for i in range(len(video_frames)): @@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f # Custom black background for dark theme llm = LLM( model="llava-hf/llava-1.5-7b-hf", - media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}} + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}, ) # Custom brand color background (e.g., blue) llm = LLM( model="llava-hf/llava-1.5-7b-hf", - media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}} + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}, ) ``` @@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown limit_mm_per_prompt={"video": 1}, ) - sampling_params = SamplingParams( - max_tokens=1024, - ) + sampling_params = SamplingParams(max_tokens=1024) video_messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": [ + { + "role": "system", + "content": "You are a helpful assistant.", + }, + { + "role": "user", + "content": [ {"type": "text", "text": "describe this video."}, { "type": "video", "video": video_path, "total_pixels": 20480 * 28 * 28, - "min_pixels": 16 * 28 * 28 - } + "min_pixels": 16 * 28 * 28, + }, ] }, ] @@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows: chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What’s in this image?"}, - { - "type": "image_url", - "image_url": { - url": image_url + messages=[ + { + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. + { + "type": "text", + "text": "What’s in this image?", }, - "uuid": image_url # Optional - }, - ], - }], + { + "type": "image_url", + "image_url": {"url": image_url}, + "uuid": image_url, # Optional + }, + ], + } + ], ) print("Chat completion output:", chat_response.choices[0].message.content) @@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows: chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What are the animals in these images?"}, - { - "type": "image_url", - "image_url": { - "url": image_url_duck + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the animals in these images?", }, - "uuid": image_url_duck # Optional - }, - { - "type": "image_url", - "image_url": { - "url": image_url_lion + { + "type": "image_url", + "image_url": {"url": image_url_duck}, + "uuid": image_url_duck, # Optional }, - "uuid": image_url_lion # Optional - }, - ], - }], + { + "type": "image_url", + "image_url": {"url": image_url_lion}, + "uuid": image_url_lion, # Optional + }, + ], + } + ], ) print("Chat completion output:", chat_response.choices[0].message.content) ``` @@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows: ## Use video url in the payload chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this video?" 
- }, - { - "type": "video_url", - "video_url": { - "url": video_url + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this video?", }, - "uuid": video_url # Optional - }, - ], - }], + { + "type": "video_url", + "video_url": {"url": video_url}, + "uuid": video_url, # Optional + }, + ], + } + ], model=model, max_completion_tokens=64, ) @@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows: audio_base64 = encode_base64_content_from_url(audio_url) chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "input_audio", - "input_audio": { - "data": audio_base64, - "format": "wav" + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?", }, - "uuid": audio_url # Optional - }, - ], - }], + { + "type": "input_audio", + "input_audio": { + "data": audio_base64, + "format": "wav", + }, + "uuid": audio_url, # Optional + }, + ], + }, + ], model=model, max_completion_tokens=64, ) @@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag ```python chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?", }, - "uuid": audio_url # Optional - }, - ], - }], + { + "type": "audio_url", + "audio_url": {"url": audio_url}, + "uuid": audio_url, # Optional + }, + ], + } + ], model=model, max_completion_tokens=64, ) @@ -747,43 +759,48 @@ The following example demonstrates how to pass image embeddings to the OpenAI se # Basic usage - this is equivalent to the LLaVA example for offline inference model = "llava-hf/llava-1.5-7b-hf" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": f"{base64_image_embedding}", - "uuid": image_url # Optional + "uuid": image_url, # Optional } # Pass additional parameters (available to Qwen2-VL and MiniCPM-V) model = "Qwen/Qwen2-VL-2B-Instruct" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": { - "image_embeds": f"{base64_image_embedding}" , # Required - "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct + "image_embeds": f"{base64_image_embedding}", # Required + "image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct }, - "uuid": image_url # Optional + "uuid": image_url, # Optional } model = "openbmb/MiniCPM-V-2_6" - embeds = { + embeds = { "type": "image_embeds", "image_embeds": { - "image_embeds": f"{base64_image_embedding}" , # Required - "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 + "image_embeds": f"{base64_image_embedding}", # Required + "image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6 }, - "uuid": image_url # Optional + "uuid": image_url, # Optional } chat_completion = client.chat.completions.create( messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": [ { - "type": "text", - "text": "What's in this image?", + "role": "system", + "content": "You are a helpful assistant.", }, - embeds, - ], - }, - ], + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?", + }, + embeds, + ], + }, + ], 
model=model, ) ``` @@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit { "type": "image_embeds", "image_embeds": None, - "uuid": image_uuid + "uuid": image_uuid, }, # input_audio: { "type": "input_audio", "input_audio": None, - "uuid": audio_uuid + "uuid": audio_uuid, }, # PIL Image: { "type": "image_pil", - "image_pil": None - "uuid": image_uuid - } + "image_pil": None, + "uuid": image_uuid, + }, ``` diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 389b3cb21ef5d..ab04a1efcc083 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -117,9 +117,11 @@ OpenAI Python client library does not officially support `reasoning_content` att # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` # For Qwen3 series, if you want to disable thinking in reasoning mode, add: # extra_body={"chat_template_kwargs": {"enable_thinking": False}} - stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) + stream = client.chat.completions.create( + model=model, + messages=messages, + stream=True, + ) print("client: Start streaming chat completions...") printed_reasoning_content = False @@ -159,27 +161,29 @@ The reasoning content is also available when both tool calling and the reasoning client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} - }, - "required": ["location", "unit"] - } + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location", "unit"], + } + }, } - }] + ] response = client.chat.completions.create( model=client.models.list().data[0].id, messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], tools=tools, - tool_choice="auto" + tool_choice="auto", ) print(response) @@ -225,7 +229,7 @@ You can add a new `ReasoningParser` similar to Union[DeltaMessage, None]: + ) -> DeltaMessage | None: """ Instance method that should be implemented for extracting reasoning from an incomplete response; for use when handling reasoning calls and @@ -235,8 +239,10 @@ You can add a new `ReasoningParser` similar to tuple[Optional[str], Optional[str]]: + self, + model_output: str, + request: ChatCompletionRequest | ResponsesRequest, + ) -> tuple[str | None, str | None]: """ Extract reasoning content from a complete model-generated string. 
@@ -274,10 +280,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner @classmethod def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: - return cls(start_token_id=tokenizer.encode( - "", add_special_tokens=False)[0], - end_token_id=tokenizer.encode("", - add_special_tokens=False)[0]) + return cls( + start_token_id=tokenizer.encode("", add_special_tokens=False)[0], + end_token_id=tokenizer.encode("", add_special_tokens=False)[0], + ) def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.end_token_id in input_ids diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index e57a8945971f5..02a700c09d391 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools: return f"Getting the weather for {location} in {unit}..." tool_functions = {"get_weather": get_weather} - tools = [{ - "type": "function", - "function": { - "name": "get_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} + }, + "required": ["location", "unit"], }, - "required": ["location", "unit"] - } - } - }] + }, + }, + ] response = client.chat.completions.create( model=client.models.list().data[0].id, messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], tools=tools, - tool_choice="auto" + tool_choice="auto", ) tool_call = response.choices[0].message.tool_calls[0].function @@ -402,8 +404,7 @@ Here is a summary of a plugin file: # adjust request. e.g.: set skip special tokens # to False for tool call output. 
- def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: + def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: return request # implement the tool call parse for stream call @@ -416,7 +417,7 @@ Here is a summary of a plugin file: current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: return delta # implement the tool parse for non-stream call From 780eb03d9b5da36325ab5ebddbf23bec31a789df Mon Sep 17 00:00:00 2001 From: Chauncey Date: Tue, 14 Oct 2025 18:27:07 +0800 Subject: [PATCH 30/92] [CI] Fix test_tool_id_kimi_k2 (#26787) Signed-off-by: chaunceyjiang --- .../openai/test_completion_with_function_calling.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 44d4176655375..6833f8d96d1c4 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -247,10 +247,10 @@ async def test_tool_id_kimi_k2( ) assert chat_completion.choices[0].message.tool_calls is not None assert len(chat_completion.choices[0].message.tool_calls) > 0 - assert ( - chat_completion.choices[0].message.tool_calls[0].id - == "functions.get_current_weather:0" - ) + assert chat_completion.choices[0].message.tool_calls[0].id in [ + "functions.get_current_weather:0", + "functions.get_forecast:1", + ] else: # Streaming test output_stream = await k2_client.chat.completions.create( @@ -266,7 +266,10 @@ async def test_tool_id_kimi_k2( if chunk.choices and chunk.choices[0].delta.tool_calls: output.extend(chunk.choices[0].delta.tool_calls) for o in output: - assert o.id is None or o.id == "functions.get_current_weather:0" + assert o.id is None or o.id in [ + "functions.get_current_weather:0", + "functions.get_forecast:1", + ] @pytest.mark.asyncio From 9c4cb68339047f11f6ebd615a0b2eadc42acb0cb Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 14 Oct 2025 19:55:10 +0800 Subject: [PATCH 31/92] [Chore] Remove `SupportsV0Only` interface and update supported models docs (#26783) Signed-off-by: DarkLight1337 --- docs/models/supported_models.md | 453 +++++++++++------------ docs/usage/v1_guide.md | 6 - tests/models/registry.py | 4 - tests/models/test_initialization.py | 12 +- vllm/config/model.py | 4 - vllm/engine/arg_utils.py | 7 - vllm/model_executor/models/__init__.py | 4 - vllm/model_executor/models/interfaces.py | 21 -- vllm/model_executor/models/registry.py | 11 - 9 files changed, 220 insertions(+), 302 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index aece0022d9401..3ae24f602d8c2 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -335,108 +335,108 @@ th { } -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ | -| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | -| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | ✅︎ | -| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | -| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | -| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ | -| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | -| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ | -| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ | -| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `FlexOlmoForCausalLM` | FlexOlmo | `allenai/FlexOlmo-7x7B-1T`, `allenai/FlexOlmo-7x7B-1T-RT`, etc. | | ✅︎ | ✅︎ | -| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | -| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ | -| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | -| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ | -| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | ✅︎ | -| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | -| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | -| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | ✅︎ | -| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ | -| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ | -| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | ✅︎ | -| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ | -| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ | -| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ | -| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | -| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | -| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ | -| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ | -| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ | -| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ | -| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. 
| | | ✅︎ | -| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ | -| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ | -| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ |✅︎ | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | +| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | +| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | +| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | +| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | +| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | +| `BailingMoeV2ForCausalLM` | Ling | `inclusionAI/Ling-mini-2.0`, etc. | ✅︎ | ✅︎ | +| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | +| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | +| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | +| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | +| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | +| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | +| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | +| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | +| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | +| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | +| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | +| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | +| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | +| `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | +| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | +| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | +| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. 
| ✅︎ | ✅︎ | +| `FlexOlmoForCausalLM` | FlexOlmo | `allenai/FlexOlmo-7x7B-1T`, `allenai/FlexOlmo-7x7B-1T-RT`, etc. | | ✅︎ | +| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | +| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | +| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | +| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | +| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | +| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | +| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | +| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | +| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | +| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | +| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | +| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | +| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | +| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | +| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | +| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | +| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | +| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | +| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | +| `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | +| `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. 
| ✅︎ | ✅︎ | +| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | +| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | +| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | +| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | +| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | +| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | +| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | +| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | +| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | +| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | +| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | +| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | +| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | +| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | +| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | +| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | +| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | +| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | +| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | +| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | +| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | +| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | +| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | +| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | +| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | +| `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | +| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. 
| | | +| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | +| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | +| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | +| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | +| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | +| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | +| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | +| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | +| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ | Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | !!! note Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. @@ -453,21 +453,21 @@ See [this page](./pooling_models.md) for more information on how to use pooling These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ | -| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma3TextModel`C | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | -| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ | -| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ | -| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | ✅︎ | -| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. 
| | | ✅︎ | -| `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `BertModel`C | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | +| `Gemma2Model`C | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | +| `Gemma3TextModel`C | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | +| `GteModel`C | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | +| `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | +| `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | +| `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | +| `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -494,11 +494,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding These models primarily support the [`LLM.classify`](./pooling_models.md#llmclassify) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | +| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | +| `*Model`C, `*ForCausalLM`C, etc. 
| Generative models | N/A | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -511,16 +511,16 @@ If your model is not in the above list, we will try to automatically convert the Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ | -| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | ✅︎ | -| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | +| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | +| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | +| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -553,13 +553,13 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | \* | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | +| `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | C Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -575,10 +575,10 @@ If your model is not in the above list, we will try to automatically convert the These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------| -| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ | -| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|-----------------------------|-----------------------------------------| +| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | +| `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | !!! note Named Entity Recognition (NER) usage, please refer to , . @@ -604,29 +604,6 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model. -!!! important - **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) - or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: - - Offline inference: - - ```python - from vllm import LLM - - llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, - ) - ``` - - Online serving: - - ```bash - vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' - ``` - - **This is no longer required if you are using vLLM V1.** - !!! tip For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache. 
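    As a rough offline-inference sketch of that tip (the model name below is just one of the hybrid models listed above, and the modality keys must match whatever that model actually supports):

    ```python
    from vllm import LLM

    # Setting every supported modality to 0 skips loading the multimodal
    # modules, leaving more GPU memory for the KV cache (text-only mode).
    llm = LLM(
        model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
        limit_mm_per_prompt={"image": 0},
    )
    ```

    The online-serving equivalent is the `--limit-mm-per-prompt '{"image":0}'` flag shown above.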
@@ -663,70 +640,70 @@ See [this page](generative_models.md) for more information on how to use generat These models primarily accept the [`LLM.generate`](./generative_models.md#llmgenerate) API. Chat/Instruct models additionally support the [`LLM.chat`](./generative_models.md#llmchat) API. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| -| `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | ✅︎ | -| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | -| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ | -| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ | -| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | -| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | -| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ | -| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | -| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | -| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | -| `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | -| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | -| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | -| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ | -| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ | -| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ | -| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | -| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + IE+ | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ | -| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ | -| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ | -| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | -| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | -| `MiDashengLMModel` | MiDashengLM | T + A+ | `mispeech/midashenglm-7b` | | ✅︎ | ✅︎ | -| `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | -| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | -| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | -| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ | -| `Ovis2_5` | Ovis2.5 | T + I+ + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | ✅︎ | -| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | -| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | -| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ | -| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ | -| `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | -| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | ✅︎ | -| `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | -| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | -| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | -| `Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | ✅︎ | -| `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | ✅︎ | -| `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|--------|-------------------|----------------------|---------------------------| +| `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | +| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | +| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | +| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | +| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | +| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | +| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | +| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | +| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | +| `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | +| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. 
| ✅︎ | ✅︎ | +| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | +| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | +| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | +| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | +| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | +| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | +| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | +| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | +| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | +| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | +| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + IE+ | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | +| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | +| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | +| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | +| `MiDashengLMModel` | MiDashengLM | T + A+ | `mispeech/midashenglm-7b` | | ✅︎ | +| `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | +| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | +| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | +| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | +| `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | +| `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | +| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | +| `Ovis2_5` | Ovis2.5 | T + I+ + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | +| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. 
| | ✅︎ | +| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | +| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | +| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | +| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | +| `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | +| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | +| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | +| `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | +| `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | +| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | +| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | +| `Step3VLForConditionalGeneration` | Step3-VL | T + I+ | `stepfun-ai/step3` | | ✅︎ | +| `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | +| `Tarsier2ForConditionalGeneration`^ | Tarsier2 | T + IE+ + VE+ | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it! 
-| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------| -| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------| +| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM.     • For example, to use DeepSeek-VL2 series models: @@ -811,11 +788,11 @@ Some models are supported only via the [Transformers backend](#transformers). Th Speech2Text models trained specifically for Automatic Speech Recognition. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|-------------------|----------------------|---------------------------|---------------------| -| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | ✅︎ | -| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|----------------------|---------------------------| +| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | +| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | +| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ### Pooling Models @@ -830,12 +807,12 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A The following table lists those that are tested in vLLM. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------| -| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | ✅︎ | -| `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ | -| `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ | -| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* | \* | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|--------|-------------------|----------------------|---------------------------| +| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. 
| | | +| `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | +| `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | +| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* | C Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. @@ -847,9 +824,9 @@ The following table lists those that are tested in vLLM. Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | -|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| -| `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ | +| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|--------|-------------------|----------------------|---------------------------| +| `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 340aaf54bb720..889648b3e7ed2 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -88,12 +88,6 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the | **Mamba Models** | 🟢 (Mamba-2), 🟢 (Mamba-1) | | **Multimodal Models** | 🟢 Functional | -vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol. - -!!! tip - - This corresponds to the V1 column in our [list of supported models](../models/supported_models.md). - See below for the status of models that are not yet supported or have more features planned in V1. 
#### Embedding Models diff --git a/tests/models/registry.py b/tests/models/registry.py index ec12e82ea36ea..c6dbae3a5347c 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -76,9 +76,6 @@ class _HfExamplesInfo: trust_remote_code: bool = False """The ``trust_remote_code`` level required to load the model.""" - v0_only: bool = False - """The model is only available with the vLLM V0 engine.""" - hf_overrides: dict[str, Any] = field(default_factory=dict) """The ``hf_overrides`` required to load the model.""" @@ -694,7 +691,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo( "MiniMaxAI/MiniMax-VL-01", trust_remote_code=True, - v0_only=True, ), "Mistral3ForConditionalGeneration": _HfExamplesInfo( "mistralai/Mistral-Small-3.1-24B-Instruct-2503", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index f501798ffa36b..80bee3d8cf86c 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -88,13 +88,15 @@ def can_initialize( # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config return 1, 0, scheduler_kv_cache_config + if model_arch == "MiniMaxVL01ForConditionalGeneration": + pytest.skip( + "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`" + ) + with ( patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1), monkeypatch.context() as m, ): - if model_info.v0_only: - # NOTE(woosuk): skip the test for V0-only models - return if model_arch == "GptOssForCausalLM": # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when @@ -132,8 +134,6 @@ def can_initialize( @pytest.mark.parametrize("model_arch", MINIMAL_MODEL_ARCH_LIST) def test_can_initialize_small_subset(model_arch: str, monkeypatch: pytest.MonkeyPatch): """Test initializing small subset of supported models""" - if model_arch == "Lfm2ForCausalLM": - pytest.skip("Skipping until test supports V1-only models") can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS) @@ -144,8 +144,6 @@ def test_can_initialize_large_subset(model_arch: str, monkeypatch: pytest.Monkey This test covers the complement of the tests covered in the "small subset" test. """ - if model_arch == "Lfm2ForCausalLM": - pytest.skip("Skipping until test supports V1-only models") can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS) diff --git a/vllm/config/model.py b/vllm/config/model.py index 0069dc6cca946..0c84c3f0af341 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1622,10 +1622,6 @@ class ModelConfig: def has_inner_state(self): return self._model_info.has_inner_state - @property - def is_v1_compatible(self) -> bool: - return not self._model_info.supports_v0_only - @property def use_mla(self) -> bool: return self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 14be20b3a5d45..801c30dc94786 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1606,13 +1606,6 @@ class EngineArgs: ) return False - # No Mamba or Encoder-Decoder so far. - if not model_config.is_v1_compatible: - _raise_or_fallback( - feature_name=model_config.architectures, recommend_to_remove=False - ) - return False - # No Concurrent Partial Prefills so far. 
if ( self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index b56cb33400480..9f8dd042bf83a 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -8,14 +8,12 @@ from .interfaces import ( SupportsMultiModal, SupportsPP, SupportsTranscription, - SupportsV0Only, has_inner_state, supports_lora, supports_mrope, supports_multimodal, supports_pp, supports_transcription, - supports_v0_only, ) from .interfaces_base import ( VllmModelForPooling, @@ -43,6 +41,4 @@ __all__ = [ "supports_pp", "SupportsTranscription", "supports_transcription", - "SupportsV0Only", - "supports_v0_only", ] diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 5137cc261cc45..d25a0c18d1659 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -877,27 +877,6 @@ def supports_transcription( return getattr(model, "supports_transcription", False) -@runtime_checkable -class SupportsV0Only(Protocol): - """Models with this interface are not compatible with V1 vLLM.""" - - supports_v0_only: ClassVar[Literal[True]] = True - - -@overload -def supports_v0_only(model: type[object]) -> TypeIs[type[SupportsV0Only]]: ... - - -@overload -def supports_v0_only(model: object) -> TypeIs[SupportsV0Only]: ... - - -def supports_v0_only( - model: type[object] | object, -) -> TypeIs[type[SupportsV0Only]] | TypeIs[SupportsV0Only]: - return getattr(model, "supports_v0_only", False) - - @runtime_checkable class SupportsEagle3(Protocol): """The interface required for models that support diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 92ad19a20e024..3de94979ed30a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -44,7 +44,6 @@ from .interfaces import ( supports_multimodal_raw_input_only, supports_pp, supports_transcription, - supports_v0_only, ) from .interfaces_base import ( get_default_pooling_type, @@ -479,7 +478,6 @@ class _ModelInfo: has_noops: bool supports_transcription: bool supports_transcription_only: bool - supports_v0_only: bool @staticmethod def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": @@ -504,7 +502,6 @@ class _ModelInfo: supports_transcription_only=( supports_transcription(model) and model.supports_transcription_only ), - supports_v0_only=supports_v0_only(model), has_noops=has_noops(model), ) @@ -1063,14 +1060,6 @@ class _ModelRegistry: model_cls, _ = self.inspect_model_cls(architectures, model_config) return model_cls.supports_transcription_only - def is_v1_compatible( - self, - architectures: str | list[str], - model_config: ModelConfig, - ) -> bool: - model_cls, _ = self.inspect_model_cls(architectures, model_config) - return not model_cls.supports_v0_only - ModelRegistry = _ModelRegistry( { From c715ba373508f293acae1e67e3fbb6c054e6ce12 Mon Sep 17 00:00:00 2001 From: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:00:54 +0200 Subject: [PATCH 32/92] [Feature] Change vllm.py with pydantic validation (#26726) Signed-off-by: Vladislav Signed-off-by: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/vllm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/config/vllm.py 
b/vllm/config/vllm.py index 4da164c1a0a96..b0ed12894065d 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -7,13 +7,13 @@ import json import os import time from contextlib import contextmanager -from dataclasses import field, replace +from dataclasses import replace from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar import torch -from pydantic import ConfigDict +from pydantic import ConfigDict, Field from pydantic.dataclasses import dataclass import vllm.envs as envs @@ -57,23 +57,23 @@ class VllmConfig: # TODO: use default_factory once default constructing ModelConfig doesn't # try to download a model - model_config: ModelConfig = None # type: ignore + model_config: ModelConfig = Field(default=None) """Model configuration.""" - cache_config: CacheConfig = field(default_factory=CacheConfig) + cache_config: CacheConfig = Field(default_factory=CacheConfig) """Cache configuration.""" - parallel_config: ParallelConfig = field(default_factory=ParallelConfig) + parallel_config: ParallelConfig = Field(default_factory=ParallelConfig) """Parallel configuration.""" - scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig) + scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig) """Scheduler configuration.""" - device_config: DeviceConfig = field(default_factory=DeviceConfig) + device_config: DeviceConfig = Field(default_factory=DeviceConfig) """Device configuration.""" - load_config: LoadConfig = field(default_factory=LoadConfig) + load_config: LoadConfig = Field(default_factory=LoadConfig) """Load configuration.""" lora_config: LoRAConfig | None = None """LoRA configuration.""" speculative_config: SpeculativeConfig | None = None """Speculative decoding configuration.""" - structured_outputs_config: StructuredOutputsConfig = field( + structured_outputs_config: StructuredOutputsConfig = Field( default_factory=StructuredOutputsConfig ) """Structured outputs configuration.""" @@ -81,7 +81,7 @@ class VllmConfig: """Observability configuration.""" quant_config: QuantizationConfig | None = None """Quantization configuration.""" - compilation_config: CompilationConfig = field(default_factory=CompilationConfig) + compilation_config: CompilationConfig = Field(default_factory=CompilationConfig) """`torch.compile` and cudagraph capture configuration for the model. As a shorthand, `-O` can be used to directly specify the compilation @@ -103,7 +103,7 @@ class VllmConfig: # some opaque config, only used to provide additional information # for the hash computation, mainly used for testing, debugging or out of # tree config registration. - additional_config: dict | SupportsHash = field(default_factory=dict) + additional_config: dict | SupportsHash = Field(default_factory=dict) """Additional config for specified platform. Different platforms may support different configs. Make sure the configs are valid for the platform you are using. 
Contents must be hashable.""" From fdd32750f0bce9edd05f85c3550d6ebc3b06931f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 14 Oct 2025 20:06:35 +0800 Subject: [PATCH 33/92] [CI/Build] Cleanup LoRA test (#26752) Signed-off-by: Jee Jee Li --- tests/lora/test_chatglm3_tp.py | 5 ----- tests/lora/test_llama_tp.py | 3 --- tests/lora/test_minicpmv_tp.py | 6 +++--- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index d8058c5f87a81..f4f151180decb 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -58,7 +58,6 @@ def test_chatglm3_lora(chatglm3_lora_files): max_loras=4, max_lora_rank=64, trust_remote_code=True, - enable_chunked_prefill=True, ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) @@ -70,7 +69,6 @@ def test_chatglm3_lora(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() def test_chatglm3_lora_tp4(chatglm3_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -81,7 +79,6 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=False, - enable_chunked_prefill=True, ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) @@ -93,7 +90,6 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): @multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use @@ -107,7 +103,6 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): tensor_parallel_size=4, trust_remote_code=True, fully_sharded_loras=True, - enable_chunked_prefill=True, gpu_memory_utilization=0.85, ) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 50fd63d35cded..e1d6a8674a01a 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -113,7 +113,6 @@ def test_llama_lora(sql_lora_files): @multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() def test_llama_lora_tp4(sql_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -127,7 +126,6 @@ def test_llama_lora_tp4(sql_lora_files): @multi_gpu_test(num_gpus=4) -@create_new_process_for_each_test() def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -142,7 +140,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): @multi_gpu_test(num_gpus=2) -@create_new_process_for_each_test() def test_tp2_serialize_and_deserialize_lora( tmp_path, sql_lora_files, sql_lora_huggingface_id ): diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index ce98fe2f86137..1cf8ed602b6a4 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -8,7 +8,7 @@ from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest from vllm.platforms import current_platform -from ..utils import create_new_process_for_each_test +from ..utils import multi_gpu_test MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" @@ -88,7 +88,7 @@ def test_minicpmv_lora(minicpmv_lora_files): current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm", ) -@create_new_process_for_each_test() +@multi_gpu_test(num_gpus=4) def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -112,7 +112,7 @@ def 
test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): current_platform.is_rocm(), reason="MiniCPM-V dependency xformers incompatible with ROCm", ) -@create_new_process_for_each_test() +@multi_gpu_test(num_gpus=4) def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, From ea97940d6c2a0b56affa0d288b046acde4be616e Mon Sep 17 00:00:00 2001 From: Jaya Yuan Date: Tue, 14 Oct 2025 21:07:50 +0800 Subject: [PATCH 34/92] [DCP] Support Decode Context Parallel (DCP) for GQA with FlashAttention (#24864) Signed-off-by: yuanyongjie.yyj Signed-off-by: FENP <32334296+FENP@users.noreply.github.com> Signed-off-by: Jaya Yuan --- tests/distributed/test_context_parallel.py | 6 +- tests/models/registry.py | 5 +- vllm/attention/ops/common.py | 10 +- vllm/config/model.py | 17 ++ vllm/v1/attention/backends/flash_attn.py | 202 ++++++++++++++++++--- vllm/v1/attention/backends/utils.py | 1 + vllm/v1/worker/gpu_model_runner.py | 1 + 7 files changed, 209 insertions(+), 33 deletions(-) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 149b502a85a75..5495640af07eb 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -204,17 +204,21 @@ def _compare_cp_with_tp( CP_TEXT_GENERATION_MODELS = { - # [MLA attention only] "deepseek-ai/DeepSeek-V2-Lite-Chat": [ CPTestSettings.detailed(), CPTestSettings.detailed(tp_base=2), ], + "bigcode/gpt_bigcode-santacoder": [ + CPTestSettings.detailed(), + CPTestSettings.detailed(tp_base=2), + ], } CP_TEST_MODELS = [ # TODO support other models # [LANGUAGE GENERATION] "deepseek-ai/DeepSeek-V2-Lite-Chat", + "bigcode/gpt_bigcode-santacoder", ] diff --git a/tests/models/registry.py b/tests/models/registry.py index c6dbae3a5347c..617dc30691aa8 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -262,7 +262,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo( "bigcode/starcoder", - extras={"tiny": "bigcode/tiny_starcoder_py"}, + extras={ + "tiny": "bigcode/tiny_starcoder_py", + "santacoder": "bigcode/gpt_bigcode-santacoder", + }, min_transformers_version="4.55.1", transformers_version_reason="HF model broken in 4.55.0", ), diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 1234e1b2e46a8..b6b7ecd2552a7 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -173,6 +173,7 @@ def cp_lse_ag_out_rs( cp_attn_lse: torch.Tensor, cp_group: GroupCoordinator, ctx: CPTritonContext = None, + return_lse=False, ): """ cp_attn_out: [ B, H, D ] @@ -192,8 +193,15 @@ def cp_lse_ag_out_rs( cp_attn_lse = cp_attn_lse.contiguous() lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) - out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + assert out.is_contiguous() out = cp_group.reduce_scatter(out, dim=1) + + if return_lse: + cp_num_heads = lse.shape[1] // cp_group.world_size + cp_rank = cp_group.rank_in_group + lse = lse[:, cp_num_heads * cp_rank : cp_num_heads * (cp_rank + 1)] + return out, lse return out diff --git a/vllm/config/model.py b/vllm/config/model.py index 0c84c3f0af341..2be939eb654d8 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1202,6 +1202,23 @@ class ModelConfig: "Supported models implement the `SupportsPP` interface." 
) + decode_context_parallel_size = parallel_config.decode_context_parallel_size + if decode_context_parallel_size > 1 and not self.use_mla: + total_num_kv_heads = self.get_total_num_kv_heads() + assert tensor_parallel_size > total_num_kv_heads, ( + f"tensor parallel size {tensor_parallel_size} must be greater " + f"than total num kv heads {total_num_kv_heads} when enable " + f"decode context parallel for GQA/MQA" + ) + + max_dcp_size = tensor_parallel_size // total_num_kv_heads + assert decode_context_parallel_size <= max_dcp_size, ( + f"decode context parallel size must less than or equal to " + f"(tensor parallel size {tensor_parallel_size} // total " + f"num kv heads {total_num_kv_heads}) = {max_dcp_size}, " + f"but got {decode_context_parallel_size}" + ) + def get_sliding_window(self) -> int | None: """Get the sliding window size from the HF text config if present.""" return getattr(self.hf_text_config, "sliding_window", None) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index fb5ff499de2cd..fa4e34536135d 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -17,6 +17,7 @@ from vllm.attention.backends.abstract import ( is_quantized_kv_cache, ) from vllm.attention.layer import Attention +from vllm.attention.ops.common import cp_lse_ag_out_rs from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.attention.utils.fa_utils import ( flash_attn_supports_fp8, @@ -32,6 +33,7 @@ if is_flash_attn_varlen_func_available(): ) from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.distributed.parallel_state import get_dcp_group from vllm.logger import init_logger from vllm.utils import cdiv from vllm.v1.attention.backends.utils import ( @@ -147,6 +149,10 @@ class FlashAttentionMetadata: prefix_kv_lens: torch.Tensor | None suffix_kv_lens: torch.Tensor | None + # For GQA DCP + max_dcp_context_kv_len: int | None = None + dcp_context_kv_lens: torch.Tensor | None = None + # Optional aot scheduling scheduler_metadata: torch.Tensor | None = None prefix_scheduler_metadata: torch.Tensor | None = None @@ -216,6 +222,16 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad self.max_num_splits = 0 # No upper bound on the number of splits. 
self.aot_schedule = get_flash_attn_version() == 3 + try: + from vllm.distributed.parallel_state import get_dcp_group + + self.dcp_world_size = get_dcp_group().world_size + self.dcp_rank = get_dcp_group().rank_in_group + except AssertionError: + # DCP might not be initialized in testing + self.dcp_world_size = 1 + self.dcp_rank = 0 + self.use_full_cuda_graph = ( self.compilation_config.cudagraph_mode.has_full_cudagraphs() ) @@ -306,7 +322,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad batch_size=batch_size, max_seqlen_q=max_query_len, max_seqlen_k=max_seq_len, - num_heads_q=self.num_heads_q, + num_heads_q=self.num_heads_q * self.dcp_world_size, num_heads_kv=self.num_heads_kv, headdim=self.headdim, cache_seqlens=seqlens, @@ -320,8 +336,35 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad return None use_cascade = common_prefix_len > 0 + max_dcp_context_kv_len = 0 + dcp_context_kv_lens = None - if use_cascade: + cu_prefix_query_lens = None + prefix_kv_lens = None + suffix_kv_lens = None + prefix_scheduler_metadata = None + + if self.dcp_world_size > 1: + query_kv_lens_cpu = ( + common_attn_metadata.query_start_loc_cpu[1:] + - common_attn_metadata.query_start_loc_cpu[:-1] + ) + dcp_context_kv_lens_cpu = seq_lens_cpu - query_kv_lens_cpu + dcp_context_kv_lens_cpu = dcp_context_kv_lens_cpu // self.dcp_world_size + ( + self.dcp_rank <= (dcp_context_kv_lens_cpu - 1) % self.dcp_world_size + ) + dcp_context_kv_lens = dcp_context_kv_lens_cpu.to(self.device) + max_dcp_context_kv_len = dcp_context_kv_lens.max().item() + + scheduler_metadata = schedule( + batch_size=num_reqs, + cu_query_lens=query_start_loc, + max_query_len=max_query_len, + seqlens=dcp_context_kv_lens, + max_seq_len=max_dcp_context_kv_len, + causal=False, + ) + elif use_cascade: cu_prefix_query_lens = torch.tensor( [0, num_actual_tokens], dtype=torch.int32, device=self.device ) @@ -348,10 +391,6 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad causal=True, ) else: - cu_prefix_query_lens = None - prefix_kv_lens = None - suffix_kv_lens = None - prefix_scheduler_metadata = None scheduler_metadata = schedule( batch_size=num_reqs, cu_query_lens=query_start_loc, @@ -379,6 +418,8 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad seq_lens=seq_lens, block_table=block_table_tensor, slot_mapping=slot_mapping, + max_dcp_context_kv_len=max_dcp_context_kv_len, + dcp_context_kv_lens=dcp_context_kv_lens, use_cascade=use_cascade, common_prefix_len=common_prefix_len, scheduler_metadata=scheduler_metadata, @@ -396,6 +437,8 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad class FlashAttentionImpl(AttentionImpl): + can_return_lse_for_decode: bool = True + def __init__( self, num_heads: int, @@ -562,30 +605,45 @@ class FlashAttentionImpl(AttentionImpl): descale_shape = (cu_seqlens_q.shape[0] - 1, self.num_kv_heads) - flash_attn_varlen_func( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=cu_seqlens_q, - max_seqlen_q=max_seqlen_q, - seqused_k=seqused_k, - max_seqlen_k=max_seqlen_k, - softmax_scale=self.scale, - causal=attn_metadata.causal, - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=block_table, - softcap=self.logits_soft_cap, - scheduler_metadata=scheduler_metadata, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - 
k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - num_splits=attn_metadata.max_num_splits, - s_aux=self.sinks, - ) - return output + if self.dcp_world_size > 1: + self._forward_with_dcp( + query[:num_actual_tokens], + key[:num_actual_tokens], + value[:num_actual_tokens], + key_cache, + value_cache, + output[:num_actual_tokens], + attn_metadata, + q_descale=layer._q_scale.expand(descale_shape), + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) + return output + else: + flash_attn_varlen_func( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + seqused_k=seqused_k, + max_seqlen_k=max_seqlen_k, + softmax_scale=self.scale, + causal=attn_metadata.causal, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=block_table, + softcap=self.logits_soft_cap, + scheduler_metadata=scheduler_metadata, + fa_version=self.vllm_flash_attn_version, + q_descale=layer._q_scale.expand(descale_shape), + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + num_splits=attn_metadata.max_num_splits, + s_aux=self.sinks, + ) + return output # Cascade attention (rare case). cascade_attention( @@ -615,6 +673,86 @@ class FlashAttentionImpl(AttentionImpl): ) return output + def _forward_with_dcp( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + output: torch.Tensor, + attn_metadata: FlashAttentionMetadata, + q_descale: torch.Tensor | None = None, + k_descale: torch.Tensor | None = None, + v_descale: torch.Tensor | None = None, + ) -> torch.Tensor: + cu_seqlens_q = attn_metadata.query_start_loc + max_seqlen_q = attn_metadata.max_query_len + block_table = attn_metadata.block_table + + query = query.contiguous() + query_across_dcp = get_dcp_group().all_gather(query, dim=1) + context_attn_out, context_lse = flash_attn_varlen_func( + q=query_across_dcp, + k=key_cache, + v=value_cache, + out=None, + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + seqused_k=attn_metadata.dcp_context_kv_lens, + max_seqlen_k=attn_metadata.max_dcp_context_kv_len, + softmax_scale=self.scale, + causal=False, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=block_table, + softcap=self.logits_soft_cap, + return_softmax_lse=True, + scheduler_metadata=attn_metadata.scheduler_metadata, + fa_version=self.vllm_flash_attn_version, + q_descale=q_descale, + k_descale=k_descale, + v_descale=v_descale, + ) + # FA returns LSE in shape [ H, B ] but cp_lse_ag_out_rs wants [ B, H ] + context_attn_out_cor, context_lse_cor = cp_lse_ag_out_rs( + context_attn_out, + context_lse.transpose(0, 1), + get_dcp_group(), + return_lse=True, + ) + context_lse_cor = context_lse_cor.transpose(0, 1).contiguous() + + query_attn_out, query_lse = flash_attn_varlen_func( + q=query, + k=key, + v=value, + out=None, + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + cu_seqlens_k=cu_seqlens_q, + max_seqlen_k=max_seqlen_q, + softmax_scale=self.scale, + causal=attn_metadata.causal, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + softcap=self.logits_soft_cap, + return_softmax_lse=True, + fa_version=self.vllm_flash_attn_version, + q_descale=q_descale, + k_descale=k_descale, + v_descale=v_descale, + ) + assert context_attn_out_cor.shape == query_attn_out.shape 
+ assert context_lse_cor.shape == query_lse.shape + merge_attn_states( + output, + context_attn_out_cor, + context_lse_cor, + query_attn_out, + query_lse, + ) + def _forward_encoder_attention( self, query: torch.Tensor, @@ -684,6 +822,7 @@ def use_cascade_attention( use_sliding_window: bool, use_local_attention: bool, num_sms: int, + dcp_world_size: int, ) -> bool: """Decide whether to use cascade attention. @@ -705,6 +844,9 @@ def use_cascade_attention( num_reqs = len(query_lens) if num_reqs < 8: return False + # disable cascade attention for DCP + if dcp_world_size > 1: + return False # Heuristics to decide whether using cascade attention is beneficial. # 1. When FlashDecoding is not used for normal attention, cascade attention diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index beb267f196fb9..cb5855548098b 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -345,6 +345,7 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): use_sliding_window: bool, use_local_attention: bool, num_sms: int, + dcp_world_size: int, ) -> bool: return False diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5c2893bd09266..ce174664710bb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1523,6 +1523,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): use_sliding_window=use_sliding_window, use_local_attention=use_local_attention, num_sms=self.num_sms, + dcp_world_size=self.dcp_world_size, ) return common_prefix_len if use_cascade else 0 From e9f1b8c9e9ab9dd659e95a43b38329f34a805281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 14 Oct 2025 21:26:11 +0800 Subject: [PATCH 35/92] Adjusted the model order of the model registration file (#26798) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- vllm/model_executor/models/registry.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3de94979ed30a..c43964285c052 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -60,9 +60,6 @@ _TEXT_GENERATION_MODELS = { "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "ArceeForCausalLM": ("arcee", "ArceeForCausalLM"), "ArcticForCausalLM": ("arctic", "ArcticForCausalLM"), - "MiniMaxForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), - "MiniMaxText01ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), - "MiniMaxM1ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), # baichuan-7b, upper case 'C' in the class name "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-13b, lower case 'c' in the class name @@ -87,8 +84,10 @@ _TEXT_GENERATION_MODELS = { "Ernie4_5_MoeForCausalLM": ("ernie45_moe", "Ernie4_5_MoeForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "Exaone4ForCausalLM": ("exaone4", "Exaone4ForCausalLM"), - "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), + "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), + "FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"), "FlexOlmoForCausalLM": ("flex_olmo", "FlexOlmoForCausalLM"), "GemmaForCausalLM": ("gemma", 
"GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), @@ -126,11 +125,12 @@ _TEXT_GENERATION_MODELS = { "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), - "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), - "FalconH1ForCausalLM": ("falcon_h1", "FalconH1ForCausalLM"), "Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), + "MiniMaxForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), + "MiniMaxText01ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), + "MiniMaxM1ForCausalLM": ("minimax_text_01", "MiniMaxText01ForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), # transformers's mpt class has lower case From ca683a2a729d894286e7fef6afcb4d34b75e37ca Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 14 Oct 2025 06:40:59 -0700 Subject: [PATCH 36/92] use combo kernel to fuse qk-norm and qk-rope (#26682) Signed-off-by: Boyuan Feng --- vllm/config/compilation.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 5313112a19a60..60aef2f6f7e1c 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -513,6 +513,16 @@ class CompilationConfig: if isinstance(self.pass_config, dict): self.pass_config = PassConfig(**self.pass_config) + if ( + is_torch_equal_or_newer("2.9.0.dev") + and "combo_kernels" not in self.inductor_compile_config + and "benchmark_combo_kernel" not in self.inductor_compile_config + ): + # use horizontal fusion, which is useful for fusing qk-norm and + # qk-rope when query and key have different shapes. + self.inductor_compile_config["combo_kernels"] = True + self.inductor_compile_config["benchmark_combo_kernel"] = True + # migrate the deprecated flags if not self.use_cudagraph: logger.warning( From 88a49745af3ad85cec643bf9f22eacc2436042fc Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 14 Oct 2025 22:32:36 +0800 Subject: [PATCH 37/92] [issues template] Encourage the author implement their own ideas (#26671) Signed-off-by: wang.yuqi --- .github/ISSUE_TEMPLATE/100-documentation.yml | 5 +++++ .github/ISSUE_TEMPLATE/400-bug-report.yml | 5 +++++ .github/ISSUE_TEMPLATE/450-ci-failure.yml | 5 +++++ .github/ISSUE_TEMPLATE/500-feature-request.yml | 5 +++++ .github/ISSUE_TEMPLATE/600-new-model.yml | 5 +++++ .github/ISSUE_TEMPLATE/700-performance-discussion.yml | 5 +++++ .github/ISSUE_TEMPLATE/750-RFC.yml | 5 +++++ 7 files changed, 35 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/100-documentation.yml b/.github/ISSUE_TEMPLATE/100-documentation.yml index 74d397b231acd..1c3f4e8b039cc 100644 --- a/.github/ISSUE_TEMPLATE/100-documentation.yml +++ b/.github/ISSUE_TEMPLATE/100-documentation.yml @@ -20,6 +20,11 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: plan-to-implement + attributes: + options: + - label: Are you planning to implement the proposed documentation improvements? 
- type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index 8c5c28cd77cff..9b235a46c3a85 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -99,6 +99,11 @@ body: - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. Thanks for reporting 🙏! +- type: checkboxes + id: plan-to-implement + attributes: + options: + - label: Are you planning to fix this bug? - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/450-ci-failure.yml b/.github/ISSUE_TEMPLATE/450-ci-failure.yml index 7af0e0673a2f3..2716f7c6c9a18 100644 --- a/.github/ISSUE_TEMPLATE/450-ci-failure.yml +++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml @@ -67,3 +67,8 @@ body: attributes: value: > Thanks for reporting 🙏! +- type: checkboxes + id: plan-to-implement + attributes: + options: + - label: Are you planning to fix this CI Failure? diff --git a/.github/ISSUE_TEMPLATE/500-feature-request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml index 097d88f50930d..c99b62d21d79b 100644 --- a/.github/ISSUE_TEMPLATE/500-feature-request.yml +++ b/.github/ISSUE_TEMPLATE/500-feature-request.yml @@ -29,6 +29,11 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: plan-to-implement + attributes: + options: + - label: Are you planning to implement this feature? - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml index 5f0125ef98096..45a465a7e8af7 100644 --- a/.github/ISSUE_TEMPLATE/600-new-model.yml +++ b/.github/ISSUE_TEMPLATE/600-new-model.yml @@ -31,6 +31,11 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: plan-to-implement + attributes: + options: + - label: Are you planning to implement this new model? - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml index 3d31c11550167..3c3c02cacb528 100644 --- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml @@ -50,6 +50,11 @@ body: attributes: value: > Thanks for contributing 🎉! +- type: checkboxes + id: plan-to-implement + attributes: + options: + - label: Are you planning to implement the performance enhancements? - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index c0e009855964a..fbbfabe5f56aa 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -43,6 +43,11 @@ body: Any other things you would like to mention. validations: required: false +- type: checkboxes + id: plan-to-implement + attributes: + options: + - label: Are you planning to implement this feature? 
- type: checkboxes id: askllm attributes: From 720394de43c80c7b3a3022d449c068b99499df18 Mon Sep 17 00:00:00 2001 From: Qier Li Date: Tue, 14 Oct 2025 10:38:07 -0400 Subject: [PATCH 38/92] [KVConnector][Metrics] Aggregate scheduler-side KVConnectorStats (#26046) Signed-off-by: Qier Li --- .../kv_connector/unit/test_nixl_connector.py | 69 +++++++++++++++++++ vllm/v1/core/sched/scheduler.py | 4 ++ 2 files changed, 73 insertions(+) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index a911ddc56b023..869e80a1af88c 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -839,6 +839,75 @@ def test_multi_kv_connector_stats_aggregation(): assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6 +@patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, +) +def test_scheduler_kv_connector_stats_aggregation(): + """Test scheduler and worker KV connector stats aggregation.""" + from vllm.v1.core.sched.output import SchedulerOutput + + scheduler = create_scheduler(create_vllm_config()) + + # Worker stats with transfer metrics + worker_stats = NixlKVConnectorStats() + worker_stats.record_transfer(get_default_xfer_telemetry()) + worker_stats.data["remote_tokens"] = [] + + # Scheduler stats with custom metric (needs dummy transfer to avoid being skipped) + scheduler_stats = NixlKVConnectorStats() + scheduler_stats.data.update( + { # dummy transfer just for testing, to bypass is_empty() check + "transfer_duration": [0], + "post_duration": [0], + "bytes_transferred": [0], + "num_descriptors": [0], + "remote_tokens": [128], + } + ) + + # Mock the scheduler connector's stats method + scheduler.connector.get_kv_connector_stats = lambda: MultiKVConnectorStats( + data={"NixlConnector": scheduler_stats} + ) + + model_output = ModelRunnerOutput( + req_ids=["req_0"], + req_id_to_index={"req_0": 0}, + sampled_token_ids=[[123]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + kv_connector_stats=MultiKVConnectorStats( + data={"NixlConnector": worker_stats} + ) + ), + ) + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=None, + num_scheduled_tokens={"req_0": 1}, + total_num_scheduled_tokens=1, + scheduled_spec_decode_tokens={}, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=[0], + finished_req_ids=set(), + free_encoder_mm_hashes=set(), + structured_output_request_ids={}, + grammar_bitmask=None, + ) + + engine_core_outputs = scheduler.update_from_output(scheduler_output, model_output) + + final_stats = next( + iter(engine_core_outputs.values()) + ).scheduler_stats.kv_connector_stats + nixl_stats = final_stats["NixlConnector"] + assert nixl_stats.num_successful_transfers == 2 + assert nixl_stats.data["remote_tokens"] == [128] + + @pytest.mark.parametrize("distributed_executor_backend", ["ray", None]) @patch( "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index cbbdf48c6e0cd..55d7f17d5081e 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -924,6 +924,10 @@ class Scheduler(SchedulerInterface): kv_connector_stats = ( kv_connector_output.kv_connector_stats if kv_connector_output else None ) + if kv_connector_stats and self.connector: + stats = self.connector.get_kv_connector_stats() + 
if stats: + kv_connector_stats = kv_connector_stats.aggregate(stats) failed_kv_load_req_ids = None if kv_connector_output and kv_connector_output.invalid_block_ids: From df850c4912d30d583682b7fc4d872aa5fd90bced Mon Sep 17 00:00:00 2001 From: Chauncey Date: Tue, 14 Oct 2025 23:31:43 +0800 Subject: [PATCH 39/92] [Feature][Responses API] Stream Function Call - harmony (#24317) Signed-off-by: chaunceyjiang --- .../openai/test_response_api_with_harmony.py | 202 ++++++++++++------ vllm/entrypoints/openai/serving_responses.py | 81 ++++++- 2 files changed, 213 insertions(+), 70 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 57d88f84d2519..4251d06435c11 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -16,6 +16,22 @@ from ...utils import RemoteOpenAIServer MODEL_NAME = "openai/gpt-oss-20b" +GET_WEATHER_SCHEMA = { + "type": "function", + "name": "get_weather", + "description": "Get current temperature for provided coordinates in celsius.", # noqa + "parameters": { + "type": "object", + "properties": { + "latitude": {"type": "number"}, + "longitude": {"type": "number"}, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, +} + @pytest.fixture(scope="module") def server(): @@ -305,6 +321,54 @@ async def test_streaming_types(client: OpenAI, model_name: str): assert len(stack_of_event_types) == 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_with_streaming_types(client: OpenAI, model_name: str): + # this links the "done" type with the "start" type + # so every "done" type should have a corresponding "start" type + # and every open block should be closed by the end of the stream + pairs_of_event_types = { + "response.completed": "response.created", + "response.output_item.done": "response.output_item.added", + "response.output_text.done": "response.output_text.delta", + "response.reasoning_text.done": "response.reasoning_text.delta", + "response.reasoning_part.done": "response.reasoning_part.added", + "response.function_call_arguments.done": "response.function_call_arguments.delta", # noqa + } + + tools = [GET_WEATHER_SCHEMA] + input_list = [ + { + "role": "user", + "content": "What's the weather like in Paris today?", + } + ] + stream_response = await client.responses.create( + model=model_name, + input=input_list, + tools=tools, + stream=True, + ) + + stack_of_event_types = [] + async for event in stream_response: + if event.type == "response.created": + stack_of_event_types.append(event.type) + elif event.type == "response.completed": + assert stack_of_event_types[-1] == pairs_of_event_types[event.type] + stack_of_event_types.pop() + if event.type.endswith("added"): + stack_of_event_types.append(event.type) + elif event.type.endswith("delta"): + if stack_of_event_types[-1] == event.type: + continue + stack_of_event_types.append(event.type) + elif event.type.endswith("done"): + assert stack_of_event_types[-1] == pairs_of_event_types[event.type] + stack_of_event_types.pop() + assert len(stack_of_event_types) == 0 + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("background", [True, False]) @@ -483,23 +547,7 @@ def call_function(name, args): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def 
test_function_calling(client: OpenAI, model_name: str): - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get current temperature for provided coordinates in celsius.", # noqa - "parameters": { - "type": "object", - "properties": { - "latitude": {"type": "number"}, - "longitude": {"type": "number"}, - }, - "required": ["latitude", "longitude"], - "additionalProperties": False, - }, - "strict": True, - } - ] + tools = [GET_WEATHER_SCHEMA] response = await client.responses.create( model=model_name, @@ -565,21 +613,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str): }, "strict": True, }, - { - "type": "function", - "name": "get_weather", - "description": "Get current temperature for provided coordinates in celsius.", # noqa - "parameters": { - "type": "object", - "properties": { - "latitude": {"type": "number"}, - "longitude": {"type": "number"}, - }, - "required": ["latitude", "longitude"], - "additionalProperties": False, - }, - "strict": True, - }, + GET_WEATHER_SCHEMA, ] response = await client.responses.create( @@ -643,23 +677,7 @@ async def test_function_calling_multi_turn(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_function_calling_required(client: OpenAI, model_name: str): - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get current temperature for provided coordinates in celsius.", # noqa - "parameters": { - "type": "object", - "properties": { - "latitude": {"type": "number"}, - "longitude": {"type": "number"}, - }, - "required": ["latitude", "longitude"], - "additionalProperties": False, - }, - "strict": True, - } - ] + tools = [GET_WEATHER_SCHEMA] with pytest.raises(BadRequestError): await client.responses.create( @@ -689,23 +707,7 @@ async def test_system_message_with_tools(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_function_calling_full_history(client: OpenAI, model_name: str): - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get current temperature for provided coordinates in celsius.", # noqa - "parameters": { - "type": "object", - "properties": { - "latitude": {"type": "number"}, - "longitude": {"type": "number"}, - }, - "required": ["latitude", "longitude"], - "additionalProperties": False, - }, - "strict": True, - } - ] + tools = [GET_WEATHER_SCHEMA] input_messages = [ {"role": "user", "content": "What's the weather like in Paris today?"} @@ -745,6 +747,74 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str): assert response_2.output_text is not None +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_calling_with_stream(client: OpenAI, model_name: str): + tools = [GET_WEATHER_SCHEMA] + input_list = [ + { + "role": "user", + "content": "What's the weather like in Paris today?", + } + ] + stream_response = await client.responses.create( + model=model_name, + input=input_list, + tools=tools, + stream=True, + ) + assert stream_response is not None + final_tool_calls = {} + final_tool_calls_named = {} + async for event in stream_response: + if event.type == "response.output_item.added": + if event.item.type != "function_call": + continue + final_tool_calls[event.output_index] = event.item + final_tool_calls_named[event.item.name] = event.item + elif event.type == "response.function_call_arguments.delta": + index = 
event.output_index + tool_call = final_tool_calls[index] + if tool_call: + tool_call.arguments += event.delta + final_tool_calls_named[tool_call.name] = tool_call + elif event.type == "response.function_call_arguments.done": + assert event.arguments == final_tool_calls_named[event.name].arguments + for tool_call in final_tool_calls.values(): + if ( + tool_call + and tool_call.type == "function_call" + and tool_call.name == "get_weather" + ): + args = json.loads(tool_call.arguments) + result = call_function(tool_call.name, args) + input_list += [tool_call] + break + assert result is not None + response = await client.responses.create( + model=model_name, + input=input_list + + [ + { + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + } + ], + tools=tools, + stream=True, + ) + assert response is not None + async for event in response: + # check that no function call events in the stream + assert event.type != "response.function_call_arguments.delta" + assert event.type != "response.function_call_arguments.done" + # check that the response contains output text + if event.type == "response.completed": + assert len(event.response.output) > 0 + assert event.response.output_text is not None + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_output_messages_enabled(client: OpenAI, model_name: str, server): diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 744df98a4278e..51e2856a5a9dd 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -23,6 +23,8 @@ from openai.types.responses import ( ResponseCodeInterpreterToolCallParam, ResponseContentPartAddedEvent, ResponseContentPartDoneEvent, + ResponseFunctionCallArgumentsDeltaEvent, + ResponseFunctionCallArgumentsDoneEvent, ResponseFunctionToolCall, ResponseFunctionWebSearch, ResponseOutputItem, @@ -927,6 +929,11 @@ class OpenAIServingResponses(OpenAIServing): # to add the tool call request to prev_outputs so that the # parse_response_input can find the tool call request when # parsing the tool call output. 
+ if ( + isinstance(response_msg, dict) + and response_msg.get("type") == "function_call" + ): + response_msg = ResponseFunctionToolCall.model_validate(response_msg) if isinstance(response_msg, ResponseFunctionToolCall): prev_outputs.append(response_msg) return messages @@ -1398,19 +1405,48 @@ class OpenAIServingResponses(OpenAIServing): current_output_index = 0 current_item_id: str = "" sent_output_item_added = False - + is_first_function_call_delta = False async for ctx in result_generator: assert isinstance(ctx, StreamingHarmonyContext) if ctx.is_expecting_start(): current_output_index += 1 sent_output_item_added = False - + is_first_function_call_delta = False if len(ctx.parser.messages) > 0: previous_item = ctx.parser.messages[-1] if previous_item.recipient is not None: - # Deal with tool call here - pass + # Deal with tool call + if previous_item.recipient.startswith("functions."): + function_name = previous_item.recipient[len("functions.") :] + yield _increment_sequence_number_and_return( + ResponseFunctionCallArgumentsDoneEvent( + type="response.function_call_arguments.done", + arguments=previous_item.content[0].text, + name=function_name, + item_id=current_item_id, + output_index=current_output_index, + sequence_number=-1, + ) + ) + function_call_item = ResponseFunctionToolCall( + type="function_call", + arguments=previous_item.content[0].text, + name=function_name, + item_id=current_item_id, + output_index=current_output_index, + sequence_number=-1, + call_id=f"fc_{random_uuid()}", + status="completed", + ) + yield _increment_sequence_number_and_return( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=function_call_item, + ) + ) elif previous_item.channel == "analysis": content = ResponseReasoningTextContent( text=previous_item.content[0].text, @@ -1766,6 +1802,43 @@ class OpenAIServingResponses(OpenAIServing): ), ) ) + # developer tools will be triggered on the commentary channel + # and recipient starts with "functions.TOOL_NAME" + if ( + ctx.parser.current_channel == "commentary" + and ctx.parser.current_recipient + and ctx.parser.current_recipient.startswith("functions.") + ): + if is_first_function_call_delta is False: + is_first_function_call_delta = True + fc_name = ctx.parser.current_recipient[len("functions.") :] + tool_call_item = ResponseFunctionToolCall( + name=fc_name, + type="function_call", + id=current_item_id, + call_id=f"call_{random_uuid()}", + arguments="", + status="in_progress", + ) + current_item_id = f"fc_{random_uuid()}" + yield _increment_sequence_number_and_return( + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=tool_call_item, + ) + ) + else: + yield _increment_sequence_number_and_return( + ResponseFunctionCallArgumentsDeltaEvent( + item_id=current_item_id, + delta=ctx.parser.last_content_delta, + output_index=current_output_index, + sequence_number=-1, + type="response.function_call_arguments.delta", + ) + ) async def responses_stream_generator( self, From e6cdbd6792eb3a33f41963a1f1ea4e6314d4a27c Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 14 Oct 2025 23:37:34 +0800 Subject: [PATCH 40/92] Revert "[issues template] Encourage the author implement their own ideas" (#26814) --- .github/ISSUE_TEMPLATE/100-documentation.yml | 5 ----- .github/ISSUE_TEMPLATE/400-bug-report.yml | 5 ----- .github/ISSUE_TEMPLATE/450-ci-failure.yml | 5 ----- 
.github/ISSUE_TEMPLATE/500-feature-request.yml | 5 ----- .github/ISSUE_TEMPLATE/600-new-model.yml | 5 ----- .github/ISSUE_TEMPLATE/700-performance-discussion.yml | 5 ----- .github/ISSUE_TEMPLATE/750-RFC.yml | 5 ----- 7 files changed, 35 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/100-documentation.yml b/.github/ISSUE_TEMPLATE/100-documentation.yml index 1c3f4e8b039cc..74d397b231acd 100644 --- a/.github/ISSUE_TEMPLATE/100-documentation.yml +++ b/.github/ISSUE_TEMPLATE/100-documentation.yml @@ -20,11 +20,6 @@ body: attributes: value: > Thanks for contributing 🎉! -- type: checkboxes - id: plan-to-implement - attributes: - options: - - label: Are you planning to implement the proposed documentation improvements? - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index 9b235a46c3a85..8c5c28cd77cff 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -99,11 +99,6 @@ body: - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. Thanks for reporting 🙏! -- type: checkboxes - id: plan-to-implement - attributes: - options: - - label: Are you planning to fix this bug? - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/450-ci-failure.yml b/.github/ISSUE_TEMPLATE/450-ci-failure.yml index 2716f7c6c9a18..7af0e0673a2f3 100644 --- a/.github/ISSUE_TEMPLATE/450-ci-failure.yml +++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml @@ -67,8 +67,3 @@ body: attributes: value: > Thanks for reporting 🙏! -- type: checkboxes - id: plan-to-implement - attributes: - options: - - label: Are you planning to fix this CI Failure? diff --git a/.github/ISSUE_TEMPLATE/500-feature-request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml index c99b62d21d79b..097d88f50930d 100644 --- a/.github/ISSUE_TEMPLATE/500-feature-request.yml +++ b/.github/ISSUE_TEMPLATE/500-feature-request.yml @@ -29,11 +29,6 @@ body: attributes: value: > Thanks for contributing 🎉! -- type: checkboxes - id: plan-to-implement - attributes: - options: - - label: Are you planning to implement this feature? - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml index 45a465a7e8af7..5f0125ef98096 100644 --- a/.github/ISSUE_TEMPLATE/600-new-model.yml +++ b/.github/ISSUE_TEMPLATE/600-new-model.yml @@ -31,11 +31,6 @@ body: attributes: value: > Thanks for contributing 🎉! -- type: checkboxes - id: plan-to-implement - attributes: - options: - - label: Are you planning to implement this new model? - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml index 3c3c02cacb528..3d31c11550167 100644 --- a/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +++ b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml @@ -50,11 +50,6 @@ body: attributes: value: > Thanks for contributing 🎉! -- type: checkboxes - id: plan-to-implement - attributes: - options: - - label: Are you planning to implement the performance enhancements? 
- type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index fbbfabe5f56aa..c0e009855964a 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -43,11 +43,6 @@ body: Any other things you would like to mention. validations: required: false -- type: checkboxes - id: plan-to-implement - attributes: - options: - - label: Are you planning to implement this feature? - type: checkboxes id: askllm attributes: From 6d87a2838cfbddbfd0a13df6998e0e35bd816703 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 14 Oct 2025 11:47:49 -0400 Subject: [PATCH 41/92] [Config] Remove Unused Environment Variable `VLLM_DISABLE_PAD_FOR_CUDAGRAPH` (#26743) Signed-off-by: yewentao256 --- vllm/envs.py | 7 ------- vllm/v1/worker/gpu_model_runner.py | 1 - 2 files changed, 8 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index d93ae8b9c2250..8d8c96297d914 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -198,7 +198,6 @@ if TYPE_CHECKING: VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True VLLM_TUNED_CONFIG_FOLDER: str | None = None - VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False @@ -1304,12 +1303,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool( int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0")) ), - # Disable padding to CUDA graph capture batch sizes. - # TODO(wentao): https://github.com/vllm-project/vllm/issues/23378 - # After the issue is fixed, we can remove this flag. - "VLLM_DISABLE_PAD_FOR_CUDAGRAPH": lambda: bool( - int(os.getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0")) - ), # Used to force set up loopback IP "VLLM_LOOPBACK_IP": lambda: os.getenv("VLLM_LOOPBACK_IP", ""), # Used to set the process name prefix for vLLM processes. 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ce174664710bb..67a93fed52749 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2067,7 +2067,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int: if ( self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH and hasattr(self, "cudagraph_batch_sizes") and self.cudagraph_batch_sizes and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1] From efc8f7d8141eeed311d5f25f5e366a1a5bdc35bf Mon Sep 17 00:00:00 2001 From: Reza Barazesh <3146276+rzabarazesh@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:45:06 -0400 Subject: [PATCH 42/92] Update coveragerc and add codecov.yml for path fixes (#26435) Signed-off-by: Reza Barazesh --- .coveragerc | 17 ++++++++++++++++- codecov.yml | 12 ++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 codecov.yml diff --git a/.coveragerc b/.coveragerc index bc6342956109b..b7a9fdb4e05a8 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,10 @@ [run] -source = vllm +# Track the installed vllm package (this is what actually gets imported during tests) +# Use wildcard pattern to match the installed location +source = + vllm + */dist-packages/vllm + */site-packages/vllm omit = */tests/* */test_* @@ -12,6 +17,16 @@ omit = */benchmarks/* */docs/* +[paths] +# Map all possible vllm locations to a canonical "vllm" path +# This ensures coverage.combine properly merges data from different test runs +source = + vllm + /vllm-workspace/src/vllm + /vllm-workspace/vllm + */site-packages/vllm + */dist-packages/vllm + [report] exclude_lines = pragma: no cover diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000000000..304c0be8105fc --- /dev/null +++ b/codecov.yml @@ -0,0 +1,12 @@ +codecov: + require_ci_to_pass: false + +fixes: + # Map source code paths to repository root paths + # Wildcards match any Python version (python3.*) + - "/vllm-workspace/src/vllm/::vllm/" + - "/vllm-workspace/vllm/::vllm/" + - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/" + - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/" + - "/usr/lib/python3.*/dist-packages/vllm/::vllm/" + - "/usr/lib/python3.*/site-packages/vllm/::vllm/" From 04b5f9802da90f27636549677ec280d34236a51d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 14 Oct 2025 13:52:05 -0400 Subject: [PATCH 43/92] [CI] Raise VLLM_MAX_SIZE_MB to 500 due to failing Build wheel - CUDA 12.9 (#26722) Signed-off-by: mgoin --- .buildkite/check-wheel-size.py | 4 ++-- docker/Dockerfile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 76f6d7aeca0d8..77ee313687fc8 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -5,11 +5,11 @@ import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB # Note that we have 800 MiB quota, please use it wisely. # See https://github.com/pypi/support/issues/6326 . # Please also sync the value with the one in Dockerfile. 
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450)) +VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500)) def print_top_10_largest_files(zip_file): diff --git a/docker/Dockerfile b/docker/Dockerfile index 3a0db3cc49f61..f9e07acb855c3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py -ARG VLLM_MAX_SIZE_MB=450 +ARG VLLM_MAX_SIZE_MB=500 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ From aba48f7db10508fe69294990d6d3aaf6c04f2326 Mon Sep 17 00:00:00 2001 From: Ze'ev Klapow Date: Tue, 14 Oct 2025 14:20:39 -0400 Subject: [PATCH 44/92] [Kernel][MoE] Add MoE tunings for GLM 4.6-FP8 and GLM 4.5 Air on NVidia B200 (#26818) --- .../E=32,N=1408,device_name=NVIDIA_B200.json | 147 ++++++++++++++++++ ...evice_name=NVIDIA_B200,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ .../E=64,N=1408,device_name=NVIDIA_B200.json | 147 ++++++++++++++++++ 3 files changed, 441 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json new file mode 100644 index 0000000000000..8ed3ad3527170 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..7ffa2ac894871 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json new file mode 100644 index 0000000000000..9952f80834794 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.4.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} From c3a722fcb2c95c61af3ce5e822b6eb8285abeb85 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 14 Oct 2025 14:38:59 -0400 Subject: [PATCH 45/92] [CI Failure] Fix tests with missing TinyLlama-1.1B-Chat-v1.0-FP8-e2e (#26816) Signed-off-by: mgoin --- 
tests/compile/test_async_tp.py | 2 +- tests/compile/test_fusion_all_reduce.py | 2 +- tests/compile/test_sequence_parallelism.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index d396d3940f67f..102a929bf2409 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -332,7 +332,7 @@ def async_tp_pass_on_test_model( # this is a fake model name to construct the model config # in the vllm_config, it's not really used. - model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8" vllm_config.model_config = ModelConfig( model=model_name, trust_remote_code=True, dtype=dtype, seed=42 ) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 7e5c460db1744..455d1bb039057 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -229,7 +229,7 @@ def all_reduce_fusion_pass_on_test_model( # this is a fake model name to construct the model config # in the vllm_config, it's not really used. - model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8" vllm_config.model_config = ModelConfig( model=model_name, trust_remote_code=True, dtype=dtype, seed=42 ) diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index afb31cb95be09..6abab88e63696 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -278,7 +278,7 @@ def sequence_parallelism_pass_on_test_model( # this is a fake model name to construct the model config # in the vllm_config, it's not really used. - model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + model_name = "RedHatAI/Llama-3.2-1B-Instruct-FP8" vllm_config.model_config = ModelConfig( model=model_name, trust_remote_code=True, dtype=dtype, seed=42 ) From 87efc681dbd57ab8e522c1d5cf50dc6ee5f7db2d Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Tue, 14 Oct 2025 11:54:12 -0700 Subject: [PATCH 46/92] llama4_vision_rope: add HIP override to accept (q, k) and avoid (positions, q, k) mismatch (#26790) Signed-off-by: Huamin Li <3ericli@gmail.com> --- .../layers/rotary_embedding/llama4_vision_rope.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py index efef8877bcaae..6241cb5abbc8e 100644 --- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py @@ -78,3 +78,10 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): key: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None]: return self.forward_native(query, key) + + def forward_hip( # type: ignore[override] + self, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + return self.forward_native(query, key) From 82af928c41886a9e7e56467a08c9b99982f7c980 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 14 Oct 2025 15:38:20 -0400 Subject: [PATCH 47/92] [Attention][Spec Decode] FlashMLA spec decode support (#26541) Signed-off-by: Matthew Bonanni --- tests/v1/attention/test_mla_backends.py | 229 ++++++++++++------ vllm/v1/attention/backends/mla/common.py | 49 +++- .../attention/backends/mla/flashattn_mla.py | 5 +- 
.../attention/backends/mla/flashinfer_mla.py | 6 +- vllm/v1/attention/backends/mla/flashmla.py | 18 +- 5 files changed, 215 insertions(+), 92 deletions(-) diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 35f7c61458f2d..f41f63ed2af46 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -1,6 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for v1 MLA backends without GPUModelRunner dependency.""" +"""Tests for v1 MLA backends without GPUModelRunner dependency. + +Known Issues: +- FLASH_ATTN_MLA backend occasionally produces NaN values in + test_backend_correctness[mixed_small] when run after + test_backend_correctness[small_prefill], but passes when run alone. +""" import pytest import torch @@ -14,6 +20,8 @@ from tests.v1.attention.utils import ( ) from vllm import _custom_ops as ops from vllm.attention.backends.registry import _Backend +from vllm.attention.ops.flashmla import is_flashmla_dense_supported +from vllm.config.vllm import set_current_vllm_config from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec @@ -29,6 +37,10 @@ BACKENDS_TO_TEST = [ if not torch.cuda.is_available() or torch.cuda.get_device_properties(0).major < 10: BACKENDS_TO_TEST.remove(_Backend.CUTLASS_MLA) +# Remove FLASHMLA from the list if not supported +if not is_flashmla_dense_supported()[0]: + BACKENDS_TO_TEST.remove(_Backend.FLASHMLA) + torch.manual_seed(42) @@ -66,6 +78,12 @@ BATCH_SPECS = { "large_prefill": BatchSpec(seq_lens=[4096] * 8, query_lens=[32] * 8), "single_decode": BatchSpec(seq_lens=[1024], query_lens=[1]), "single_prefill": BatchSpec(seq_lens=[1024], query_lens=[64]), + "spec_decode_small": BatchSpec( + seq_lens=[128, 256, 512, 1024], query_lens=[4, 4, 4, 4] + ), + "spec_decode_medium": BatchSpec( + seq_lens=[512, 1024, 2048, 512, 1024, 2048], query_lens=[8, 8, 8, 8, 8, 8] + ), } @@ -239,61 +257,64 @@ def run_attention_backend( builder_cls, impl_cls = try_get_attention_backend(backend) - # Build metadata - builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) - attn_metadata = builder.build( - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - ) + # Set the current vllm config so that get_current_vllm_config() works + # in the backend implementations + with set_current_vllm_config(vllm_config): + # Build metadata + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) - # Instantiate MLA implementation - num_heads = vllm_config.model_config.get_num_attention_heads( - vllm_config.parallel_config - ) - num_kv_heads = vllm_config.model_config.get_num_kv_heads( - vllm_config.parallel_config - ) - head_size = vllm_config.model_config.get_head_size() - scale = 1.0 / (head_size**0.5) - impl = impl_cls( - num_heads=num_heads, - head_size=head_size, - scale=scale, - num_kv_heads=num_kv_heads, - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=kv_lora_rank, - qk_nope_head_dim=qk_nope_head_dim, - qk_rope_head_dim=qk_rope_head_dim, - qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, - v_head_dim=v_head_dim, - 
kv_b_proj=mock_kv_b_proj, - ) + # Instantiate MLA implementation + num_heads = vllm_config.model_config.get_num_attention_heads( + vllm_config.parallel_config + ) + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config + ) + head_size = vllm_config.model_config.get_head_size() + scale = 1.0 / (head_size**0.5) + impl = impl_cls( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="auto", + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, + v_head_dim=v_head_dim, + kv_b_proj=mock_kv_b_proj, + ) - # Process weights to create W_UK_T and W_UV attributes needed by MLA - act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) - impl.process_weights_after_loading(act_dtype) + # Process weights to create W_UK_T and W_UV attributes needed by MLA + act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) + impl.process_weights_after_loading(act_dtype) - # Create mock layer and output buffer - mock_layer = MockAttentionLayer(device) - num_tokens = query.shape[0] - output = torch.empty( - num_tokens, num_heads * v_head_dim, dtype=query.dtype, device=query.device - ) + # Create mock layer and output buffer + mock_layer = MockAttentionLayer(device) + num_tokens = query.shape[0] + output = torch.empty( + num_tokens, num_heads * v_head_dim, dtype=query.dtype, device=query.device + ) - # Run forward pass - # NOTE: The query, key, and value are already shaped correctly - # in the calling test function. - output = impl.forward( - mock_layer, query, kv_c, k_pe, kv_cache, attn_metadata, output=output - ) + # Run forward pass + # NOTE: The query, key, and value are already shaped correctly + # in the calling test function. + output = impl.forward( + mock_layer, query, kv_c, k_pe, kv_cache, attn_metadata, output=output + ) - return output + return output @pytest.mark.parametrize( @@ -309,6 +330,8 @@ def run_attention_backend( "large_prefill", "single_decode", "single_prefill", + "spec_decode_small", + "spec_decode_medium", ], ) @pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) @@ -328,10 +351,39 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): simulated paged KV cache. 5. Comparing the vLLM backend's output to the ground-truth SDPA output. 
""" + from vllm.v1.attention.backends.mla.common import QueryLenSupport + batch_spec = BATCH_SPECS[batch_spec_name] - vllm_config = create_vllm_config( - model_name=model, max_model_len=max(batch_spec.seq_lens), num_gpu_blocks=2048 + is_spec_decode_test = batch_spec_name.startswith("spec_decode") + spec_decode_backends = {_Backend.FLASH_ATTN_MLA, _Backend.FLASHMLA} + + block_size = 16 + required_blocks = sum( + (seq_len + block_size - 1) // block_size for seq_len in batch_spec.seq_lens ) + # Add 1 for null block at index 0, and some buffer + num_gpu_blocks = required_blocks + 1 + 100 + + vllm_config = create_vllm_config( + model_name=model, + max_model_len=max(batch_spec.seq_lens), + num_gpu_blocks=num_gpu_blocks, + block_size=block_size, + ) + + # For spec decode tests, add a speculative_config to set the reorder_batch_threshold + if is_spec_decode_test: + from vllm.config import SpeculativeConfig + + # Get the query length from the batch spec (they should all be uniform) + query_len = batch_spec.query_lens[0] + # Set num_speculative_tokens to query_len - 1 + # (since threshold is 1 + num_spec_tokens) + # Use ngram method which doesn't require a draft model + vllm_config.speculative_config = SpeculativeConfig( + method="ngram", num_speculative_tokens=query_len - 1 + ) + device = torch.device("cuda:0") kv_cache_spec = create_standard_kv_cache_spec(vllm_config) @@ -395,11 +447,37 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): # K_PE (rope component): [s_len, 1, qk_rope_head_dim] k_pe_full = torch.randn(s_len, 1, qk_rope_head_dim, dtype=dtype, device=device) - # Determine if this is decode or prefill + # Determine if this sequence uses the decode pipeline or prefill + # pipeline for each backend + # NOTE: For spec decode tests with uniform query_len > 1, backends that + # support spec decode (FLASH_ATTN_MLA with varlen support, FLASHMLA with + # uniform support) will use the decode pipeline (MQA-style), while + # backends that only support single-token queries will use the prefill + # pipeline (MHA-style). This ensures the reference implementation + # matches each backend's actual decode/prefill pipeline path. 
is_decode = [] - for i, backend in enumerate(BACKENDS_TO_TEST): + for backend_idx, backend in enumerate(BACKENDS_TO_TEST): builder_cls, _ = try_get_attention_backend(backend) - is_decode.append(q_len <= builder_cls.reorder_batch_threshold) + if is_spec_decode_test: + query_len_support = getattr( + builder_cls, "query_len_support", QueryLenSupport.SINGLE_ONLY + ) + supports_spec = query_len_support != QueryLenSupport.SINGLE_ONLY + is_decode.append(supports_spec) + else: + threshold = getattr(builder_cls, "reorder_batch_threshold", None) + query_len_support = getattr( + builder_cls, "query_len_support", QueryLenSupport.SINGLE_ONLY + ) + within_threshold = q_len <= threshold if threshold else False + if ( + within_threshold + and query_len_support == QueryLenSupport.UNIFORM + and i > 0 + ): + first_q_len = query_lens[0] + within_threshold = q_len == first_q_len + is_decode.append(within_threshold) # Split q into nope and rope components q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1) @@ -478,11 +556,11 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): sdpa_out_i_prefill = sdpa_out_i_prefill.transpose(1, 2).squeeze(0) sdpa_out_i_prefill = sdpa_out_i_prefill.flatten(start_dim=-2) - for i, backend in enumerate(BACKENDS_TO_TEST): - if is_decode[i]: - all_sdpa_outputs[i].append(sdpa_out_i_decode) + for backend_idx, backend in enumerate(BACKENDS_TO_TEST): + if is_decode[backend_idx]: + all_sdpa_outputs[backend_idx].append(sdpa_out_i_decode) else: - all_sdpa_outputs[i].append(sdpa_out_i_prefill) + all_sdpa_outputs[backend_idx].append(sdpa_out_i_prefill) # Inputs for vLLM MLA backends are just the new tokens all_q_vllm.append(q_c) @@ -497,9 +575,9 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): query_vllm = torch.cat(all_q_vllm, dim=0) kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0) k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0) - sdpa_outputs = [] - for i, backend in enumerate(BACKENDS_TO_TEST): - sdpa_outputs.append(torch.cat(all_sdpa_outputs[i], dim=0)) + sdpa_outputs = {} + for backend_idx, backend in enumerate(BACKENDS_TO_TEST): + sdpa_outputs[backend] = torch.cat(all_sdpa_outputs[backend_idx], dim=0) # Create mock kv_b_proj using the same weights as reference implementation from vllm.model_executor.layers.linear import ColumnParallelLinear @@ -516,7 +594,7 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): kv_b_proj_weight = kv_b_proj_weight.view( kv_lora_rank, num_q_heads * (qk_nope_head_dim + v_head_dim) ) - mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T) + mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T, requires_grad=False) # Create metadata using original batch spec common_attn_metadata = create_common_attn_metadata( @@ -537,7 +615,11 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): ) # 4. 
Run vLLM backends and compare - for i, backend_name in enumerate(BACKENDS_TO_TEST): + for backend_idx, backend_name in enumerate(BACKENDS_TO_TEST): + # Skip backends that don't support spec decode for spec decode tests + if is_spec_decode_test and backend_name not in spec_decode_backends: + continue + backend_output = run_attention_backend( backend_name, kv_cache_spec, @@ -556,14 +638,17 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): mock_kv_b_proj, ) + # Use backend_idx to get the correct SDPA output for this backend + expected_output = sdpa_outputs[backend_name] + # Check shape and dtype consistency - assert backend_output.shape == sdpa_outputs[i].shape, ( + assert backend_output.shape == expected_output.shape, ( f"[{backend_name}] shape {backend_output.shape} != " - f"SDPA shape {sdpa_outputs[i].shape}" + f"SDPA shape {expected_output.shape}" ) - assert backend_output.dtype == sdpa_outputs[i].dtype, ( + assert backend_output.dtype == expected_output.dtype, ( f"[{backend_name}] dtype {backend_output.dtype} != " - f"SDPA dtype {sdpa_outputs[i].dtype}" + f"SDPA dtype {expected_output.dtype}" ) assert torch.isfinite(backend_output).all(), ( @@ -574,12 +659,12 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): rtol = 1e-2 atol = 5e-1 - max_diff = torch.max(torch.abs(backend_output - sdpa_outputs[i])).item() + max_diff = torch.max(torch.abs(backend_output - expected_output)).item() max_rel_diff = torch.max( - torch.abs(backend_output - sdpa_outputs[i]) / torch.abs(sdpa_outputs[i]) + torch.abs(backend_output - expected_output) / torch.abs(expected_output) ).item() all_close = torch.allclose( - backend_output, sdpa_outputs[i], rtol=rtol, atol=atol + backend_output, expected_output, rtol=rtol, atol=atol ) assert all_close, ( diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index da56b5c9d3d22..f7e6f12363ad8 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -190,6 +190,7 @@ return curr_o @ W_O import functools from abc import abstractmethod from dataclasses import dataclass, field +from enum import Enum from typing import ClassVar, Generic, TypeVar import torch @@ -227,6 +228,24 @@ from vllm.v1.attention.backends.utils import ( ) from vllm.v1.kv_cache_interface import AttentionSpec + +class QueryLenSupport(Enum): + """Defines the level of query length support for an attention backend's + decode pipeline. + + - SINGLE_ONLY: Decode pipeline only supports single-token queries + (query_len=1) + - UNIFORM: Decode pipeline supports uniform multi-token queries + (all requests must have same query_len > 1) + - VARLEN: Decode pipeline supports variable-length queries + (mixed query lengths in same batch) + """ + + SINGLE_ONLY = "single_only" + UNIFORM = "uniform" + VARLEN = "varlen" + + try: from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -460,19 +479,18 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): understand this class """ - # Whether the backend supports reordering the batch such that - # short sequences (i.e. verification for speculative decoding) are - # classified as decode requests. - # If True, this will increase `reorder_batch_threshold` (below) when - # speculative decoding is enabled, and set `require_uniform=True` when - # when reordering the batch. Non-uniform decode requests will - # fall back to prefill in this case. 
- supports_uniform_spec_as_decode: ClassVar[bool] = False + # Defines the level of query length support for this backend. + # - SINGLE_ONLY: Only single-token queries (no spec decode support) + # - UNIFORM: Supports uniform multi-token queries (spec decode with uniform lengths) + # - VARLEN: Supports variable-length queries (spec decode with mixed lengths) + # If set to UNIFORM or VARLEN, this will increase `reorder_batch_threshold` when + # speculative decoding is enabled. + query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.SINGLE_ONLY # The threshold for reordering the batch into decode and prefill requests. # If > 1, the batch will be reordered such that requests with # query length <= threshold are classified as decode requests. - # Use `supports_uniform_spec_as_decode` (above) to set this automatically + # Use `query_len_support` (above) to set this automatically # when speculative decoding is enabled. reorder_batch_threshold: int = 1 @@ -599,11 +617,18 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): device=device, ) - supports_spec_as_decode = self.supports_uniform_spec_as_decode + supports_spec_decode = self.query_len_support != QueryLenSupport.SINGLE_ONLY self._init_reorder_batch_threshold( - self.reorder_batch_threshold, supports_spec_as_decode + self.reorder_batch_threshold, supports_spec_decode ) + # Validate consistency between query_len_support and reorder_batch_threshold + if self.query_len_support == QueryLenSupport.SINGLE_ONLY: + assert self.reorder_batch_threshold == 1, ( + f"reorder_batch_threshold must be 1 when query_len_support is " + f"SINGLE_ONLY, got {self.reorder_batch_threshold}" + ) + def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata): qo_indptr = prefill.query_start_loc @@ -745,7 +770,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): split_decodes_and_prefills( common_attn_metadata, decode_threshold=self.reorder_batch_threshold, - require_uniform=self.supports_uniform_spec_as_decode, + require_uniform=(self.query_len_support != QueryLenSupport.VARLEN), ) ) diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 6e1586969fd4a..446f1c4f1f961 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -24,6 +24,7 @@ from vllm.v1.attention.backends.mla.common import ( MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder, + QueryLenSupport, ) from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec @@ -66,8 +67,8 @@ class FlashAttnMLAMetadata(MLACommonMetadata[FlashAttnMLADecodeMetadata]): class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]): cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH - - reorder_batch_threshold: int = 512 + query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.VARLEN + reorder_batch_threshold: int = 512 # process small prefills with decode pathway def __init__( self, diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index add1c8dc972f5..44807c39cad30 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -13,6 +13,7 @@ from vllm.v1.attention.backends.mla.common import ( MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder, + QueryLenSupport, ) from vllm.v1.attention.backends.utils import 
AttentionCGSupport @@ -22,11 +23,8 @@ FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024 class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]): - # enable spec-as-decode optimization - supports_uniform_spec_as_decode: ClassVar[bool] = True - - # enable full CUDA Graph support for decode-only capture cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH + query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM class FlashInferMLABackend(MLACommonBackend): diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index d8d1ab2c6cc0c..b15c09294c6b7 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -20,8 +20,13 @@ from vllm.v1.attention.backends.mla.common import ( MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder, + QueryLenSupport, +) +from vllm.v1.attention.backends.utils import ( + AttentionCGSupport, + reshape_attn_output_for_spec_decode, + reshape_query_for_spec_decode, ) -from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) @@ -62,6 +67,9 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]): class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH + query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM + reorder_batch_threshold: int = 512 # process small prefills with decode pathway + # ^ TODO(matt): tune this def __init__( self, @@ -216,8 +224,12 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): q = torch.cat(q, dim=-1) assert isinstance(q, torch.Tensor) + + num_decodes = attn_metadata.num_decodes + q = reshape_query_for_spec_decode(q, num_decodes) + o, lse = flash_mla_with_kvcache( - q=q.unsqueeze(1), # Add seqlen dim of 1 (decode) + q=q, k_cache=kv_c_and_k_pe_cache.unsqueeze(-2), # Add head dim of 1 block_table=attn_metadata.decode.block_table, cache_seqlens=attn_metadata.decode.seq_lens, @@ -230,4 +242,6 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): descale_k=layer._k_scale.reshape(1), ) + o = reshape_attn_output_for_spec_decode(o) + return o, lse From acaa2c0a4a53dbb57f85f1042b1a6f1e3f24cef5 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 14 Oct 2025 12:58:43 -0700 Subject: [PATCH 48/92] [Core] Reuse empty block lists whenever possible in KVCacheBlocks to mitigate GC costs (#24964) Signed-off-by: Jialin Ouyang --- vllm/v1/core/block_pool.py | 4 +- vllm/v1/core/kv_cache_coordinator.py | 5 +- vllm/v1/core/kv_cache_manager.py | 51 ++++++++++++++------ vllm/v1/core/sched/scheduler.py | 4 +- vllm/v1/core/single_type_kv_cache_manager.py | 15 ++++-- 5 files changed, 53 insertions(+), 26 deletions(-) diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index cd22db410a6e2..15c06a0b107d8 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from typing import Any from vllm.distributed.kv_events import ( @@ -328,7 +328,7 @@ class BlockPool: ) return True - def touch(self, blocks: tuple[list[KVCacheBlock], ...]) -> None: + def touch(self, blocks: tuple[Sequence[KVCacheBlock], ...]) -> None: """Touch a block increases its 
reference count by 1, and may remove the block from the free queue. This is used when a block is hit by another request with the same prefix. diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index ece382277255f..137e5e0cdb6d2 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod +from collections.abc import Sequence from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock @@ -51,7 +52,7 @@ class KVCacheCoordinator(ABC): self, request_id: str, num_tokens: int, - new_computed_blocks: tuple[list[KVCacheBlock], ...], + new_computed_blocks: tuple[Sequence[KVCacheBlock], ...], num_encoder_tokens: int, ) -> int: """ @@ -84,7 +85,7 @@ class KVCacheCoordinator(ABC): return num_blocks_to_allocate def save_new_computed_blocks( - self, request_id: str, new_computed_blocks: tuple[list[KVCacheBlock], ...] + self, request_id: str, new_computed_blocks: tuple[Sequence[KVCacheBlock], ...] ) -> None: """ Add the new computed blocks to the request. diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 7a1025fc2bb4f..ff221048dbd19 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools +from collections.abc import Sequence from dataclasses import dataclass from typing import Literal, overload @@ -23,7 +25,7 @@ class KVCacheBlocks: structure from the Scheduler. """ - blocks: tuple[list[KVCacheBlock], ...] + blocks: tuple[Sequence[KVCacheBlock], ...] """ `blocks[i][j]` refers to the i-th kv_cache_group and the j-th block of tokens.We don't use block of @@ -31,12 +33,20 @@ class KVCacheBlocks: kv_cache_groups have the same number of blocks, which is true for now but will be broken if we want to give different block_size to different kv_cache_groups in the future. + + Each single type KVCacheBlocks could be represented as: + - list[KVCacheBlock] for more than one KVCacheBlock + - an empty tuple for requests without KVCacheBlock + (a precomputed KVCacheBlocks is in KVCacheManager to avoid GC overhead) """ def __add__(self, other: "KVCacheBlocks") -> "KVCacheBlocks": """Adds two KVCacheBlocks instances.""" return KVCacheBlocks( - tuple(blk1 + blk2 for blk1, blk2 in zip(self.blocks, other.blocks)) + tuple( + list(itertools.chain(blk1, blk2)) + for blk1, blk2 in zip(self.blocks, other.blocks) + ) ) @overload @@ -74,8 +84,10 @@ class KVCacheBlocks: return [block.block_id for block in self.blocks[0] if block.block_hash is None] def new_empty(self) -> "KVCacheBlocks": - """Creates a new KVCacheBlocks instance with no blocks.""" - return KVCacheBlocks(tuple([] for _ in range(len(self.blocks)))) + """ + Creates a new KVCacheBlocks instance with no blocks. + """ + return KVCacheBlocks(tuple(() for _ in range(len(self.blocks)))) class KVCacheManager: @@ -131,6 +143,15 @@ class KVCacheManager: self.block_pool = self.coordinator.block_pool self.kv_cache_config = kv_cache_config + # Pre-constructed KVCacheBlocks with no blocks, callers should use this + # via create_kv_cache_blocks instead of creating new ones to avoid GC + # overhead. + # + # We use nested tuples to ensure the empty KVCacheBlocks is immutable. 
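# A minimal, self-contained sketch of the reuse pattern applied here (the
# _Blocks type below is a stand-in, not the real KVCacheBlocks): one immutable
# empty instance is built once and handed back whenever a request needs no
# blocks, so the hot path stops allocating short-lived empty lists that the
# garbage collector would otherwise have to track.
from dataclasses import dataclass


@dataclass(frozen=True)
class _Blocks:
    # One inner sequence per KV-cache group; nested tuples keep the shared
    # empty instance immutable.
    groups: tuple[tuple, ...]


_NUM_GROUPS = 2
_EMPTY = _Blocks(tuple(() for _ in range(_NUM_GROUPS)))


def _make_blocks(groups: tuple[list, ...]) -> _Blocks:
    # Only build a new wrapper when at least one group is non-empty;
    # otherwise return the shared singleton.
    return _Blocks(tuple(tuple(g) for g in groups)) if any(groups) else _EMPTY


assert _make_blocks(([], [])) is _EMPTY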
+ self.empty_kv_cache_blocks = KVCacheBlocks( + tuple(() for _ in range(self.num_kv_cache_groups)) + ) + @property def usage(self) -> float: """Get the KV cache usage. @@ -170,7 +191,7 @@ class KVCacheManager: request.sampling_params is not None and request.sampling_params.prompt_logprobs is not None ): - return self.create_empty_block_list(), 0 + return self.empty_kv_cache_blocks, 0 # NOTE: When all tokens hit the cache, we must recompute the last token # to obtain logits. Thus, set max_cache_hit_length to prompt_length - 1. @@ -198,7 +219,7 @@ class KVCacheManager: self.prefix_cache_stats.queries += request.num_tokens self.prefix_cache_stats.hits += num_new_computed_tokens - return KVCacheBlocks(computed_blocks), num_new_computed_tokens + return (self.create_kv_cache_blocks(computed_blocks), num_new_computed_tokens) def allocate_slots( self, @@ -251,9 +272,7 @@ class KVCacheManager: if new_computed_blocks is not None: new_computed_block_list = new_computed_blocks.blocks else: - new_computed_block_list = tuple( - [] for _ in range(len(self.kv_cache_config.kv_cache_groups)) - ) + new_computed_block_list = self.empty_kv_cache_blocks.blocks # Free the blocks that are skipped during the attention computation # (e.g., tokens outside the sliding window). @@ -305,7 +324,7 @@ class KVCacheManager: # P/D: delay caching blocks if we have to recv from # remote. Update state for locally cached blocks. if not self.enable_caching or delay_cache_blocks: - return KVCacheBlocks(new_blocks) + return self.create_kv_cache_blocks(new_blocks) # NOTE(woosuk): We want to commit (cache) up to num_computed_tokens + # num_new_tokens, but must exclude "non-committable" tokens (e.g., @@ -316,7 +335,7 @@ class KVCacheManager: ) self.coordinator.cache_blocks(request, num_tokens_to_cache) - return KVCacheBlocks(new_blocks) + return self.create_kv_cache_blocks(new_blocks) def free(self, request: Request) -> None: """Free the blocks allocated for the request. @@ -388,7 +407,7 @@ class KVCacheManager: def get_blocks(self, request_id: str) -> KVCacheBlocks: """Get the blocks of a request.""" - return KVCacheBlocks(self.coordinator.get_blocks(request_id)) + return self.create_kv_cache_blocks(self.coordinator.get_blocks(request_id)) def get_block_ids(self, request_id: str) -> tuple[list[int], ...]: """Get the block ids of a request.""" @@ -399,6 +418,8 @@ class KVCacheManager: if self.enable_caching: self.coordinator.cache_blocks(request, num_computed_tokens) - def create_empty_block_list(self) -> KVCacheBlocks: - """Creates a new KVCacheBlocks instance with no blocks.""" - return KVCacheBlocks(tuple([] for _ in range(self.num_kv_cache_groups))) + def create_kv_cache_blocks( + self, blocks: tuple[list[KVCacheBlock], ...] + ) -> KVCacheBlocks: + # Only create new KVCacheBlocks for non-empty blocks + return KVCacheBlocks(blocks) if any(blocks) else self.empty_kv_cache_blocks diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 55d7f17d5081e..9a1d31268ab7c 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -421,9 +421,7 @@ class Scheduler(SchedulerInterface): # KVTransfer: WAITING reqs have num_computed_tokens > 0 # after async KV recvs are completed. 
else: - new_computed_blocks = ( - self.kv_cache_manager.create_empty_block_list() - ) + new_computed_blocks = self.kv_cache_manager.empty_kv_cache_blocks num_new_local_computed_tokens = 0 num_computed_tokens = request.num_computed_tokens diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 7984a6ce29df7..586034182686b 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -3,6 +3,7 @@ import itertools from abc import ABC, abstractmethod from collections import defaultdict +from collections.abc import Sequence from vllm.utils import cdiv from vllm.v1.core.block_pool import BlockPool @@ -61,7 +62,10 @@ class SingleTypeKVCacheManager(ABC): self._null_block = block_pool.null_block def get_num_blocks_to_allocate( - self, request_id: str, num_tokens: int, new_computed_blocks: list[KVCacheBlock] + self, + request_id: str, + num_tokens: int, + new_computed_blocks: Sequence[KVCacheBlock], ) -> int: """ Get the number of blocks needed to be allocated for the request. @@ -93,7 +97,7 @@ class SingleTypeKVCacheManager(ABC): return num_new_blocks + num_evictable_computed_blocks def save_new_computed_blocks( - self, request_id: str, new_computed_blocks: list[KVCacheBlock] + self, request_id: str, new_computed_blocks: Sequence[KVCacheBlock] ) -> None: """ Add the new computed blocks to the request. @@ -593,7 +597,10 @@ class MambaManager(SingleTypeKVCacheManager): return 0 def get_num_blocks_to_allocate( - self, request_id: str, num_tokens: int, new_computed_blocks: list[KVCacheBlock] + self, + request_id: str, + num_tokens: int, + new_computed_blocks: Sequence[KVCacheBlock], ) -> int: # Allocate extra `num_speculative_blocks` blocks for # speculative decoding (MTP/EAGLE) with linear attention. @@ -625,7 +632,7 @@ class CrossAttentionManager(SingleTypeKVCacheManager): """Manager for cross-attention KV cache in encoder-decoder models.""" def save_new_computed_blocks( - self, request_id: str, new_computed_blocks: list[KVCacheBlock] + self, request_id: str, new_computed_blocks: Sequence[KVCacheBlock] ) -> None: # We do not cache blocks for cross-attention to be shared between # requests, so `new_computed_blocks` should always be empty. From b92ab3deda6aca2c0f05aba841f5edb05578af94 Mon Sep 17 00:00:00 2001 From: HDCharles <39544797+HDCharles@users.noreply.github.com> Date: Tue, 14 Oct 2025 16:39:59 -0400 Subject: [PATCH 49/92] Notice for deprecation of AutoAWQ (#26820) Signed-off-by: HDCharles <39544797+HDCharles@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/features/quantization/auto_awq.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index fc998387d29aa..182127bc91cc8 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -1,5 +1,9 @@ # AutoAWQ +> ⚠️ **Warning:** + The `AutoAWQ` library is deprecated. This functionality has been adopted by the vLLM project in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). + For the recommended quantization workflow, please see the AWQ examples in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). For more details on the deprecation, refer to the original [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ). 
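Whichever tool produced the checkpoint, AWQ models load in vLLM the same way; a minimal sketch (the model name below is only an illustrative AWQ checkpoint, substitute your own):

```python
from vllm import LLM, SamplingParams

# quantization="awq" tells vLLM to use the AWQ weight format for this model.
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="awq")
outputs = llm.generate(
    ["What is AWQ quantization?"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```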
+ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint. The main benefits are lower latency and memory usage. From 380f17527c273c04372bb8a7a6acbb5efb132e51 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 14 Oct 2025 14:03:21 -0700 Subject: [PATCH 50/92] [Perf] Cache vllm.env.__getattr__ result to avoid recomputation (#26146) Signed-off-by: Jialin Ouyang --- tests/test_envs.py | 49 +++++++++++++++++++++++++- vllm/envs.py | 27 +++++++++++++- vllm/v1/engine/core.py | 5 +++ vllm/v1/executor/multiproc_executor.py | 5 +++ 4 files changed, 84 insertions(+), 2 deletions(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index 62d529c363608..023767505f108 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -6,7 +6,54 @@ from unittest.mock import patch import pytest -from vllm.envs import env_list_with_choices, env_with_choices +import vllm.envs as envs +from vllm.envs import ( + enable_envs_cache, + env_list_with_choices, + env_with_choices, + environment_variables, +) + + +def test_getattr_without_cache(monkeypatch: pytest.MonkeyPatch): + assert envs.VLLM_HOST_IP == "" + assert envs.VLLM_PORT is None + monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1") + monkeypatch.setenv("VLLM_PORT", "1234") + assert envs.VLLM_HOST_IP == "1.1.1.1" + assert envs.VLLM_PORT == 1234 + # __getattr__ is not decorated with functools.cache + assert not hasattr(envs.__getattr__, "cache_info") + + +def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1") + monkeypatch.setenv("VLLM_PORT", "1234") + # __getattr__ is not decorated with functools.cache + assert not hasattr(envs.__getattr__, "cache_info") + + # Enable envs cache and ignore ongoing environment changes + enable_envs_cache() + + # __getattr__ is not decorated with functools.cache + assert hasattr(envs.__getattr__, "cache_info") + start_hits = envs.__getattr__.cache_info().hits + + # 2 more hits due to VLLM_HOST_IP and VLLM_PORT accesses + assert envs.VLLM_HOST_IP == "1.1.1.1" + assert envs.VLLM_PORT == 1234 + assert envs.__getattr__.cache_info().hits == start_hits + 2 + + # All environment variables are cached + for environment_variable in environment_variables: + envs.__getattr__(environment_variable) + assert envs.__getattr__.cache_info().hits == start_hits + 2 + len( + environment_variables + ) + + # Reset envs.__getattr__ back to none-cached version to + # avoid affecting other tests + envs.__getattr__ = envs.__getattr__.__wrapped__ class TestEnvWithChoices: diff --git a/vllm/envs.py b/vllm/envs.py index 8d8c96297d914..b5c7f325f670d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools import hashlib import json import os @@ -1408,12 +1409,36 @@ environment_variables: dict[str, Callable[[], Any]] = { def __getattr__(name: str): - # lazy evaluation of environment variables + """ + Gets environment variables lazily. + + NOTE: After enable_envs_cache() invocation (which triggered after service + initialization), all environment variables will be cached. + """ if name in environment_variables: return environment_variables[name]() raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +def enable_envs_cache() -> None: + """ + Enables caching of environment variables. 
This is useful for performance + reasons, as it avoids the need to re-evaluate environment variables on + every call. + + NOTE: Currently, it's invoked after service initialization to reduce + runtime overhead. This also means that environment variables should NOT + be updated after the service is initialized. + """ + # Tag __getattr__ with functools.cache + global __getattr__ + __getattr__ = functools.cache(__getattr__) + + # Cache all environment variables + for key in environment_variables: + __getattr__(key) + + def __dir__(): return list(environment_variables.keys()) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2c5d0fdc752ed..a21f0715704ad 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -20,6 +20,7 @@ import zmq from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.distributed.parallel_state import is_global_first_rank +from vllm.envs import enable_envs_cache from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest @@ -601,6 +602,10 @@ class EngineCoreProc(EngineCore): # If enable, attach GC debugger after static variable freeze. maybe_attach_gc_debug_callback() + # Enable environment variable cache (e.g. assume no more + # environment variable overrides after this point) + enable_envs_cache() + @contextmanager def _perform_handshakes( self, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index e28d29c19a9c6..38e8f4ab85d9b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -33,6 +33,7 @@ from vllm.distributed.parallel_state import ( get_pp_group, get_tp_group, ) +from vllm.envs import enable_envs_cache from vllm.logger import init_logger from vllm.utils import ( _maybe_force_spawn, @@ -455,6 +456,10 @@ class WorkerProc: # Load model self.worker.load_model() + # Enable environment variable cache (e.g. 
assume no more + # environment variable overrides after this point) + enable_envs_cache() + @staticmethod def make_worker_process( vllm_config: VllmConfig, From 0e65818910c4e202d2ea8d7ece8b49c959ae83dc Mon Sep 17 00:00:00 2001 From: Dhruvil Bhatt <43905073+Dhruvilbhatt@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:21:03 -0700 Subject: [PATCH 51/92] Added MoE configs for llama 4, H200 device with tp=4/8 tuning (#26837) Signed-off-by: Dhruvil Bhatt --- ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=128,N=1024,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=16,N=1024,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../E=16,N=2048,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++ 6 files changed, 876 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..86b49127f9bf2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + 
"256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..ea1ce9ad2cdc4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..2a626ac47b8d1 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json 
b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..371e87f946829 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8b94452197b0f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json new file mode 100644 index 0000000000000..48f19df24cc9e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} \ No newline at end of file From 9d6964926e78e3a35da49234b9b87153239d33c4 Mon Sep 17 00:00:00 2001 From: Nan Qin Date: Tue, 14 Oct 2025 16:23:22 -0500 Subject: [PATCH 52/92] fix: response_format for completion (#23212) Signed-off-by: Nan2018 --- vllm/entrypoints/openai/protocol.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 1f2c40e703834..f41fa196acd81 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1197,6 +1197,10 @@ class CompletionRequest(OpenAIBaseModel): "Please pass `grammar` to `structured_outputs` instead." 
), ) + structural_tag: str | None = Field( + default=None, + description=("If specified, the output will follow the structural tag schema."), + ) guided_decoding_backend: str | None = Field( default=None, description=( @@ -1357,10 +1361,27 @@ class CompletionRequest(OpenAIBaseModel): echo_without_generation = self.echo and self.max_tokens == 0 + guided_json_object = None + if self.response_format is not None: + if self.response_format.type == "json_object": + guided_json_object = True + elif self.response_format.type == "json_schema": + json_schema = self.response_format.json_schema + assert json_schema is not None + self.guided_json = json_schema.json_schema + elif self.response_format.type == "structural_tag": + structural_tag = self.response_format + assert structural_tag is not None and isinstance( + structural_tag, StructuralTagResponseFormat + ) + s_tag_obj = structural_tag.model_dump(by_alias=True) + self.structural_tag = json.dumps(s_tag_obj) + # Forward deprecated guided_* parameters to structured_outputs if self.structured_outputs is None: kwargs = dict[str, Any]( json=self.guided_json, + json_object=guided_json_object, regex=self.guided_regex, choice=self.guided_choice, grammar=self.guided_grammar, @@ -1370,13 +1391,6 @@ class CompletionRequest(OpenAIBaseModel): if len(kwargs) > 0: self.structured_outputs = StructuredOutputsParams(**kwargs) - if ( - self.structured_outputs is not None - and self.response_format is not None - and self.response_format.type == "json_object" - ): - self.structured_outputs.json_object = True - extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: # Pass in kv_transfer_params via extra_args From ff4810ba73168237f090b0195fbe0ae4d64e0730 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 14 Oct 2025 14:46:37 -0700 Subject: [PATCH 53/92] [Minor] Group async_scheduling related fields in model runner init (#26736) Signed-off-by: Nick Hill --- vllm/v1/worker/gpu_model_runner.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 67a93fed52749..bbb63d28289c4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -375,9 +375,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) self.use_async_scheduling = self.scheduler_config.async_scheduling - self.async_output_copy_stream = ( - torch.cuda.Stream() if self.use_async_scheduling else None - ) + # Separate cuda stream for overlapping transfer of sampled token ids from + # GPU to CPU when async scheduling is enabled. + self.async_output_copy_stream: torch.cuda.Stream | None = None + # cuda event to synchronize use of reused CPU tensors between steps + # when async scheduling is enabled. + self.prepare_inputs_event: torch.cuda.Event | None = None + if self.use_async_scheduling: + self.async_output_copy_stream = torch.cuda.Stream() + self.prepare_inputs_event = torch.cuda.Event() # TODO(woosuk): Provide an option to tune the max cudagraph batch size. # The convention is different. @@ -444,14 +450,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): (3, self.max_num_tokens + 1), dtype=torch.int64 ) - # CUDA event to synchronize use of reused CPU tensors between steps - # when async scheduling is enabled. 
- self.prepare_inputs_event: torch.cuda.Event | None = None - if self.use_async_scheduling: - self.prepare_inputs_event = torch.cuda.Event() - # Start in a completed state. - self.prepare_inputs_event.record(torch.cuda.default_stream()) - # None in the first PP rank. The rest are set after load_model. self.intermediate_tensors: IntermediateTensors | None = None From a86b4c58e8f72f4903d873d25510f53f7577366f Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 14 Oct 2025 15:53:10 -0700 Subject: [PATCH 54/92] remove attn output view kernel (#26680) Signed-off-by: Boyuan Feng Signed-off-by: Boyuan Feng Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/attention/layer.py | 6 +++--- vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/flex_attention.py | 2 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +- vllm/v1/attention/backends/rocm_aiter_unified_attn.py | 2 +- vllm/v1/attention/backends/rocm_attn.py | 2 +- vllm/v1/attention/backends/tree_attn.py | 2 +- vllm/v1/attention/backends/triton_attn.py | 2 +- vllm/v1/attention/backends/xformers.py | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 929c3b6a4906b..fe9de65b52c66 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -346,7 +346,7 @@ class Attention(nn.Module, AttentionLayerBase): if self.use_output: output_shape = output_shape if output_shape is not None else query.shape - output = torch.zeros(output_shape, dtype=output_dtype, device=query.device) + output = torch.empty(output_shape, dtype=output_dtype, device=query.device) hidden_size = output_shape[-1] # Reshape the query, key, and value tensors. # NOTE(woosuk): We do this outside the custom op to minimize the @@ -705,7 +705,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): self.calc_kv_scales(q, kv_c_normed, k_pe) if self.attn_backend.accept_output_buffer: - output = torch.zeros(output_shape, dtype=q.dtype, device=q.device) + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) self.impl.forward( self, q, @@ -722,7 +722,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): ) else: if self.attn_backend.accept_output_buffer: - output = torch.zeros(output_shape, dtype=q.dtype, device=q.device) + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) torch.ops.vllm.unified_mla_attention_with_output( q, kv_c_normed, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index fa4e34536135d..9e0c125d9edb7 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -530,7 +530,7 @@ class FlashAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) attn_type = self.attn_type diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 0fa71afa62eef..ee32f7e2904f7 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -857,7 +857,7 @@ class FlashInferImpl(AttentionImpl): if attn_metadata is None: # Profiling run. 
- return output + return output.fill_(0) if self.bmm1_scale is None: self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 2595851e5042d..902872bb25b33 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -767,7 +767,7 @@ class FlexAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) # query = self.view_as_4d(query).permute(0, 2, 1, 3) # return torch.empty_like(query) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index cce43b220da77..7c73611d4a58a 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -485,7 +485,7 @@ class AiterFlashAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) # IMPORTANT! # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py index 14184944934fa..27b072106268b 100644 --- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py +++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py @@ -130,7 +130,7 @@ class RocmAiterUnifiedAttentionImpl(RocmAttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) assert attn_metadata.use_cascade is False diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index 5245c7f449259..8b7ce90a3ccae 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -299,7 +299,7 @@ class RocmAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) assert attn_metadata.use_cascade is False diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index aab90cfd1fe0d..ee6ead9ad9b35 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -379,7 +379,7 @@ class TreeAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) # Cache the input KVs. key_cache, value_cache = kv_cache.unbind(0) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 9d1d007a08e4c..9746a0eb58bd2 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -298,7 +298,7 @@ class TritonAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) assert attn_metadata.use_cascade is False diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index 41c543c18adcc..457b15ebdd82f 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -354,7 +354,7 @@ class XFormersAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) # Cache the input KVs. 
key_cache, value_cache = kv_cache.unbind(0) From 4aed506b6538ec4f284c480bf4449e9dc5f72054 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 14 Oct 2025 16:27:44 -0700 Subject: [PATCH 55/92] [Core] Streamline some structured output related code (#26737) Signed-off-by: Nick Hill --- tests/v1/core/test_scheduler.py | 18 +++-- .../unit/test_kv_connector_lifecyle.py | 2 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 24 +++---- tests/v1/worker/test_gpu_model_runner.py | 24 +++---- vllm/v1/core/sched/output.py | 5 +- vllm/v1/core/sched/scheduler.py | 65 +++++++++---------- vllm/v1/request.py | 18 +++-- vllm/v1/structured_output/__init__.py | 36 +++++----- vllm/v1/structured_output/backend_guidance.py | 2 +- vllm/v1/structured_output/request.py | 44 +++++++------ vllm/v1/structured_output/utils.py | 9 +-- vllm/v1/worker/gpu_model_runner.py | 6 +- vllm/v1/worker/tpu_model_runner.py | 6 +- 13 files changed, 121 insertions(+), 138 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 76408fba2e169..aaac2deb12ac2 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -30,7 +30,6 @@ from vllm.v1.kv_cache_interface import ( from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm.v1.structured_output.request import StructuredOutputRequest from .utils import EOS_TOKEN_ID, create_requests, create_scheduler @@ -335,10 +334,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [], requests[1].request_id: [10], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -383,10 +382,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [10, 42], requests[1].request_id: [13], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -429,10 +428,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [10, 11], requests[1].request_id: [], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -470,10 +469,10 @@ def test_stop_via_update_from_output(): total_num_scheduled_tokens=3, scheduled_encoder_inputs={}, scheduled_spec_decode_tokens={requests[0].request_id: [EOS_TOKEN_ID, 10]}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -1941,7 +1940,6 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): sampling_params=sampling_params, pooling_params=None, eos_token_id=EOS_TOKEN_ID, - structured_output_request=StructuredOutputRequest(sampling_params), ) scheduler.add_request(request) output = scheduler.schedule() diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py index 0bb67b574fa14..b5c8f378be182 100644 --- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -26,7 +26,7 @@ def 
_make_empty_scheduler_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, kv_connector_metadata=SharedStorageConnectorMetadata(), ) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index df9fcdc37fa37..e471174ef6744 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -89,10 +89,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -168,10 +168,10 @@ def test_update_states_request_finished(model_runner): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -198,10 +198,10 @@ def test_update_states_request_resumed(model_runner): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -225,10 +225,10 @@ def test_update_states_request_resumed(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -256,10 +256,10 @@ def test_update_states_no_changes(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -291,10 +291,10 @@ def test_update_states_request_unscheduled(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 817cd7f10c1c6..fe52f565c8a86 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -146,10 +146,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -212,10 +212,10 @@ def test_update_states_request_finished(model_runner, dist_init): 
total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -244,10 +244,10 @@ def test_update_states_request_resumed(model_runner, dist_init): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -273,10 +273,10 @@ def test_update_states_request_resumed(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -366,10 +366,10 @@ def test_update_states_no_changes(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -403,10 +403,10 @@ def test_update_states_request_unscheduled(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index bce15e1a476fd..619dcd178a13a 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -165,9 +165,8 @@ class SchedulerOutput: # freed from the encoder cache. free_encoder_mm_hashes: list[str] - # Dict of request ids to their index within the batch - # for filling the next token bitmask - structured_output_request_ids: dict[str, int] + # ids of structured outputs requests included in the bitmask, in order. 
+ structured_output_request_ids: list[str] # the bitmask for the whole batch grammar_bitmask: "npt.NDArray[np.int32] | None" diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 9a1d31268ab7c..08368b7d99efe 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -5,7 +5,7 @@ import itertools import time from collections import defaultdict from collections.abc import Iterable -from typing import Any +from typing import TYPE_CHECKING, Any from vllm.config import VllmConfig from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch @@ -34,6 +34,10 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager +if TYPE_CHECKING: + import numpy as np + import numpy.typing as npt + logger = init_logger(__name__) @@ -608,11 +612,8 @@ class Scheduler(SchedulerInterface): scheduled_spec_decode_tokens, req_to_new_blocks, ) - scheduled_requests = ( - scheduled_new_reqs + scheduled_running_reqs + scheduled_resumed_reqs - ) structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask( - scheduled_requests, scheduled_spec_decode_tokens + num_scheduled_tokens.keys(), scheduled_spec_decode_tokens ) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, @@ -876,32 +877,28 @@ class Scheduler(SchedulerInterface): def get_grammar_bitmask( self, - requests: list[Request], + scheduled_request_ids: Iterable[str], scheduled_spec_decode_tokens: dict[str, list[int]], - ): - # NOTE: structured_output_request_ids maps - # a request's (request that uses structured output) - # request_id to its index in the batch. - # This will help us determine to slice the grammar bitmask - # and only applies valid mask for requests that - # uses structured decoding. - structured_output_request_ids: dict[str, int] = {} - for i, req in enumerate(requests): - if req.use_structured_output: - # PERF: in case of chunked prefill, - # request might not include any new tokens. - # Therefore, we might introduce some additional - # cycle to fill in the bitmask, which could be a big no-op. - structured_output_request_ids[req.request_id] = i - + ) -> tuple[list[str], "npt.NDArray[np.int32] | None"]: + # Collect list of scheduled request ids that use structured output. + # The corresponding rows of the bitmask will be in this order. + # PERF: in case of chunked prefill, + # request might not include any new tokens. + # Therefore, we might introduce some additional + # cycle to fill in the bitmask, which could be a big no-op. 
+ structured_output_request_ids = [ + req_id + for req_id in scheduled_request_ids + if (req := self.requests.get(req_id)) and req.use_structured_output + ] if not structured_output_request_ids: - bitmask = None - else: - bitmask = self.structured_output_manager.grammar_bitmask( - self.requests, - structured_output_request_ids, - scheduled_spec_decode_tokens, - ) + return structured_output_request_ids, None + + bitmask = self.structured_output_manager.grammar_bitmask( + self.requests, + structured_output_request_ids, + scheduled_spec_decode_tokens, + ) return structured_output_request_ids, bitmask def update_from_output( @@ -1013,12 +1010,10 @@ class Scheduler(SchedulerInterface): new_logprobs = logprobs.slice(req_index, req_index + 1) if new_token_ids and self.structured_output_manager.should_advance(request): - # NOTE: structured_output_request - # should not be None if use_structured_output, we have - # checked above, so safe to ignore type warning - request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr] - req_id, new_token_ids - ) + struct_output_request = request.structured_output_request + assert struct_output_request is not None + assert struct_output_request.grammar is not None + struct_output_request.grammar.accept_tokens(req_id, new_token_ids) if num_nans_in_logits is not None and req_id in num_nans_in_logits: request.num_nans_in_logits = num_nans_in_logits[req_id] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 5926bf5b46ee9..864b0eb7fa410 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -40,7 +40,6 @@ class Request: prompt_embeds: torch.Tensor | None = None, mm_features: list[MultiModalFeatureSpec] | None = None, lora_request: Optional["LoRARequest"] = None, - structured_output_request: Optional["StructuredOutputRequest"] = None, cache_salt: str | None = None, priority: int = 0, trace_headers: Mapping[str, str] | None = None, @@ -54,11 +53,12 @@ class Request: # Because of LoRA, the eos token id can be different for each request. self.eos_token_id = eos_token_id self.lora_request = lora_request - self.structured_output_request = structured_output_request + self.structured_output_request = StructuredOutputRequest.from_sampling_params( + sampling_params + ) self.arrival_time = arrival_time if arrival_time is not None else time.time() self.status = RequestStatus.WAITING - self.use_structured_output = False self.events: list[EngineCoreEvent] = [] self.stop_reason: int | str | None = None @@ -72,9 +72,8 @@ class Request: # Generative models. 
assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - if sampling_params.structured_outputs is not None: + if self.structured_output_request is not None: self.status = RequestStatus.WAITING_FOR_FSM - self.use_structured_output = True if sampling_params.extra_args is not None: self.kv_transfer_params = sampling_params.extra_args.get( @@ -145,11 +144,6 @@ class Request: eos_token_id=request.eos_token_id, arrival_time=request.arrival_time, lora_request=request.lora_request, - structured_output_request=StructuredOutputRequest( - sampling_params=request.sampling_params - ) - if request.sampling_params - else None, cache_salt=request.cache_salt, priority=request.priority, trace_headers=request.trace_headers, @@ -170,6 +164,10 @@ class Request: if self.get_hash_new_full_blocks is not None: self.block_hashes.extend(self.get_hash_new_full_blocks()) + @property + def use_structured_output(self) -> bool: + return self.structured_output_request is not None + @property def is_output_corrupted(self) -> bool: return self.num_nans_in_logits > 0 diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 336a0eb98682a..8d7f4b5d68961 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -167,7 +167,7 @@ class StructuredOutputManager: def grammar_bitmask( self, requests: dict[str, Request], - structured_output_request_ids: dict[str, int], + structured_output_request_ids: list[str], scheduled_spec_decode_tokens: dict[str, list[int]], ) -> "npt.NDArray[np.int32] | None": # Prepare the structured output bitmask for this batch. @@ -196,17 +196,16 @@ class StructuredOutputManager: # masks for each request, one for each possible bonus token position. # These are stored inline in the tensor and unpacked by the gpu runner. cumulative_index = 0 - ordered_seq = sorted(structured_output_request_ids.items(), key=lambda x: x[1]) # Optimized parallel filling of bitmasks for # non-spec, large-batch-size cases if ( - len(ordered_seq) > self.fill_bitmask_parallel_threshold + len(structured_output_request_ids) > self.fill_bitmask_parallel_threshold and max_num_spec_tokens == 0 ): promises = [] batch = [] - for req_id, _ in ordered_seq: + for req_id in structured_output_request_ids: request = requests[req_id] structured_output_request = request.structured_output_request if TYPE_CHECKING: @@ -230,7 +229,7 @@ class StructuredOutputManager: promise.result() else: # Fallback to serial filling of bitmasks for small-batch-size cases - for req_id, _ in ordered_seq: + for req_id in structured_output_request_ids: request = requests[req_id] structured_output_request = request.structured_output_request @@ -295,22 +294,21 @@ class StructuredOutputManager: assert request.structured_output_request.grammar is not None # by default, we should always advance # for cases that don't use thinking mode. 
- if self.reasoner is not None: - structured_req = request.structured_output_request - - if structured_req.reasoning_ended: - return True - - # Check if reasoning ends in *this* step - if self.reasoner.is_reasoning_end(request.all_token_ids): - # Reasoning just ended, so we shouldn't advance til - # next pass - structured_req.reasoning_ended = True - - return False - else: + if self.reasoner is None: return True + structured_req = request.structured_output_request + if structured_req.reasoning_ended: + return True + + # Check if reasoning ends in *this* step + if self.reasoner.is_reasoning_end(request.all_token_ids): + # Reasoning just ended, so we shouldn't advance til + # next pass + structured_req.reasoning_ended = True + + return False + def clear_backend(self) -> None: if self.backend is not None: self.backend.destroy() diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index c37193e667aab..8e75b99f8481f 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -252,7 +252,7 @@ def serialize_guidance_grammar( def validate_guidance_grammar( sampling_params: SamplingParams, tokenizer: llguidance.LLTokenizer | None = None ) -> None: - tp, grm = get_structured_output_key(sampling_params) + tp, grm = get_structured_output_key(sampling_params.structured_outputs) guidance_grm = serialize_guidance_grammar(tp, grm) err = llguidance.LLMatcher.validate_grammar(guidance_grm, tokenizer) if err: diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index 9e149b186c639..afe0e4b3f3a7f 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -7,7 +7,7 @@ from concurrent.futures import Future from concurrent.futures._base import TimeoutError from typing import cast -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import SamplingParams, StructuredOutputsParams from vllm.v1.structured_output.backend_types import ( StructuredOutputGrammar, StructuredOutputKey, @@ -17,10 +17,19 @@ from vllm.v1.structured_output.backend_types import ( @dataclasses.dataclass class StructuredOutputRequest: - sampling_params: SamplingParams + params: StructuredOutputsParams _grammar: Future[StructuredOutputGrammar] | StructuredOutputGrammar | None = None reasoning_ended: bool | None = None + @staticmethod + def from_sampling_params( + sampling_params: SamplingParams | None, + ) -> "StructuredOutputRequest | None": + if sampling_params is None: + return None + params = sampling_params.structured_outputs + return StructuredOutputRequest(params=params) if params else None + def _check_grammar_completion(self) -> bool: # NOTE: We have to lazy import to gate circular imports from vllm.v1.request import RequestStatus @@ -53,31 +62,28 @@ class StructuredOutputRequest: @functools.cached_property def structured_output_key(self) -> StructuredOutputKey: - return get_structured_output_key(self.sampling_params) + return get_structured_output_key(self.params) -def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.structured_outputs - assert params is not None, "params can't be None." 
+def get_structured_output_key(params: StructuredOutputsParams) -> StructuredOutputKey: if params.json is not None: if not isinstance(params.json, str): json_str = json.dumps(params.json) else: json_str = params.json - return (StructuredOutputOptions.JSON, json_str) - elif params.json_object: - return (StructuredOutputOptions.JSON_OBJECT, "") - elif params.regex is not None: - return (StructuredOutputOptions.REGEX, params.regex) - elif params.choice is not None: + return StructuredOutputOptions.JSON, json_str + if params.json_object: + return StructuredOutputOptions.JSON_OBJECT, "" + if params.regex is not None: + return StructuredOutputOptions.REGEX, params.regex + if params.choice is not None: if not isinstance(params.choice, str): json_str = json.dumps(params.choice) else: json_str = params.choice - return (StructuredOutputOptions.CHOICE, json_str) - elif params.grammar is not None: - return (StructuredOutputOptions.GRAMMAR, params.grammar) - elif params.structural_tag is not None: - return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag) - else: - raise ValueError("No valid structured output parameter found") + return StructuredOutputOptions.CHOICE, json_str + if params.grammar is not None: + return StructuredOutputOptions.GRAMMAR, params.grammar + if params.structural_tag is not None: + return StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag + raise ValueError("No valid structured output parameter found") diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 2520dc217c798..4b793b9a72fd7 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -47,7 +47,6 @@ def apply_grammar_bitmask( scheduler_output: SchedulerOutput, input_batch: InputBatch, logits: torch.Tensor, - device: torch.device, ) -> None: """ Apply grammar bitmask to output logits of the model with xgrammar function. @@ -56,7 +55,6 @@ def apply_grammar_bitmask( scheduler_output (SchedulerOutput): The result of engine scheduling. input_batch (InputBatch): The input of model runner. logits (torch.Tensor): The output logits of model forward. - device (torch.device): The device that model runner running on. 
""" grammar_bitmask = scheduler_output.grammar_bitmask if grammar_bitmask is None: @@ -91,10 +89,7 @@ def apply_grammar_bitmask( dtype=grammar_bitmask.dtype, ) cumulative_index = 0 - seq = sorted( - scheduler_output.structured_output_request_ids.items(), key=lambda x: x[1] - ) - for req_id, _ in seq: + for req_id in scheduler_output.structured_output_request_ids: num_spec_tokens = len( scheduler_output.scheduled_spec_decode_tokens.get(req_id, []) ) @@ -117,7 +112,7 @@ def apply_grammar_bitmask( xgr.apply_token_bitmask_inplace( logits, - grammar_bitmask.to(device, non_blocking=True), + grammar_bitmask.to(logits.device, non_blocking=True), indices=out_indices if not skip_out_indices else None, ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bbb63d28289c4..72f8824e20054 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2568,10 +2568,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits = model_output_broadcast_data["logits"] # Apply structured output bitmasks if present - if scheduler_output.grammar_bitmask is not None: - apply_grammar_bitmask( - scheduler_output, self.input_batch, logits, self.device - ) + if scheduler_output.structured_output_request_ids: + apply_grammar_bitmask(scheduler_output, self.input_batch, logits) with record_function_or_nullcontext("Sample"): sampler_output = self._sample(logits, spec_decode_metadata) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 828f09cbc8d8d..2107df5fc1032 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1963,12 +1963,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.grammar_bitmask_cpu.zero_() self.require_structured_out_cpu.zero_() - sorted_struct_requests = sorted( - scheduler_output.structured_output_request_ids.items(), - key=lambda item: item[1], - ) cumulative_mask_idx = 0 - for req_id, _ in sorted_struct_requests: + for req_id in scheduler_output.structured_output_request_ids: if req_id not in self.input_batch.req_id_to_index: continue batch_index = self.input_batch.req_id_to_index[req_id] From 7e0ef4084affa9de84904ba7726c46f53f4f6379 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 14 Oct 2025 19:41:43 -0400 Subject: [PATCH 56/92] [CI Failure] Fix torchao dep failure for Quantization Test (#26824) Signed-off-by: mgoin --- .buildkite/test-amd.yaml | 3 ++- .buildkite/test-pipeline.yaml | 3 ++- tests/quantization/test_compressed_tensors.py | 3 ++- vllm/model_executor/layers/quantization/rtn.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index b2a3a0a775baa..91f0b850575c4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -603,7 +603,8 @@ steps: # since torchao nightly is only compatible with torch nightly currently # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved - - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - label: LM Eval Small Models # 53min diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ebe0602a1b5db..94c0944c838ce 100644 --- a/.buildkite/test-pipeline.yaml +++ 
b/.buildkite/test-pipeline.yaml @@ -527,7 +527,8 @@ steps: # since torchao nightly is only compatible with torch nightly currently # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved - - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - label: LM Eval Small Models # 53min diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index ef7164c8813da..5aeb002238cf9 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -697,7 +697,8 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): @pytest.mark.parametrize( "args", [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), + # TODO: Enable once model is available again + # ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4), ], ) diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index c041d2fd0ba48..e4f7ff8339569 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase from vllm.model_executor.layers.linear import ( LinearBase, @@ -396,7 +397,7 @@ class RTNMoEMethod(FusedMoEMethodBase): indices_type=self.topk_indices_dtype, ) - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_weight, layer.w2_weight, From 0512c04aee408367a068b5960e7857c722ed204d Mon Sep 17 00:00:00 2001 From: Ye Hu Date: Tue, 14 Oct 2025 16:48:13 -0700 Subject: [PATCH 57/92] [frontend][gptoss] Add per turn stats into Harmony Context (#25061) Signed-off-by: lacora Co-authored-by: Ye Hu --- tests/entrypoints/test_context.py | 93 ++++++++++++++++++-- vllm/entrypoints/context.py | 65 +++++++++----- vllm/entrypoints/openai/protocol.py | 4 + vllm/entrypoints/openai/serving_responses.py | 88 +++++++++++------- 4 files changed, 188 insertions(+), 62 deletions(-) diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py index b0faa870a9272..31ea856224f90 100644 --- a/tests/entrypoints/test_context.py +++ b/tests/entrypoints/test_context.py @@ -6,7 +6,11 @@ from unittest.mock import MagicMock, patch import pytest from openai_harmony import Author, Message, Role, StreamState, TextContent -from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext +from vllm.entrypoints.context import ( + HarmonyContext, + StreamingHarmonyContext, + TurnMetrics, +) from vllm.outputs import CompletionOutput, RequestOutput @@ -101,8 +105,12 @@ def test_single_turn_token_counting(): # Verify internal state tracking assert not context.is_first_turn - assert context.previous_turn.input_tokens == 5 - assert context.previous_turn.output_tokens == 3 + assert len(context.all_turn_metrics) == 1 + previous_turn = context.all_turn_metrics[0] + assert previous_turn.input_tokens == 5 + assert 
previous_turn.output_tokens == 3 + assert previous_turn.cached_input_tokens == 2 + assert previous_turn.tool_output_tokens == 0 @pytest.mark.asyncio @@ -156,6 +164,15 @@ async def test_multi_turn_token_counting(): assert context.num_tool_output_tokens == expected_tool_output assert context.num_cached_tokens == 5 + 15 + # Validate all turn metrics + assert len(context.all_turn_metrics) == 3 + for i, turn in enumerate(context.all_turn_metrics): + assert turn.input_tokens == prompt_token_counts[i] + assert turn.output_tokens == output_token_counts[i] + assert turn.cached_input_tokens == cached_token_counts[i] + assert context.all_turn_metrics[1].tool_output_tokens == 7 + assert context.all_turn_metrics[2].tool_output_tokens == 1 + def test_empty_output_tokens(): """Test behavior when RequestOutput has empty output tokens.""" @@ -314,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser): # Create a streaming context context = StreamingHarmonyContext(messages=[], available_tools=["browser"]) + num_prompt_tokens = [3, 8, 13] + num_output_tokens = [3, 3, 2] + num_cached_tokens = [0, 3, 8] + # Simulate three turns of conversation: # Turn 1: stream tokens one by one, then finish the message # Turn 2: new prompt, stream more tokens with a reasoning segment @@ -325,7 +346,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): create_mock_request_output( prompt_token_ids=[1, 2, 3], # 3 prompt tokens output_token_ids=[101], # Single token - num_cached_tokens=0, + num_cached_tokens=num_cached_tokens[0], finished=False, # Not end of message yet ) ) @@ -370,7 +391,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): 5, ], # 8 tokens (includes previous) output_token_ids=[201], - num_cached_tokens=3, # Some tokens cached + num_cached_tokens=num_cached_tokens[1], # Some tokens cached finished=False, ) ) @@ -422,7 +443,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): 7, ], # 13 tokens output_token_ids=[301], - num_cached_tokens=8, # More cached tokens + num_cached_tokens=num_cached_tokens[2], # More cached tokens finished=False, ) ) @@ -435,10 +456,12 @@ async def test_streaming_multi_turn_token_counting(mock_parser): ) # Final token counts check - assert context.num_prompt_tokens == 3 + 8 + 13 # All prompts - assert context.num_output_tokens == 3 + 3 + 2 # All outputs + assert context.num_prompt_tokens == sum(num_prompt_tokens) # All prompts + assert context.num_output_tokens == sum(num_output_tokens) # All outputs assert context.num_reasoning_tokens == 3 # Unchanged from second turn - assert context.num_cached_tokens == 3 + 8 # Accumulated cached tokens + assert context.num_cached_tokens == sum( + num_cached_tokens + ) # Accumulated cached tokens # Additional tool tokens from third turn # Formula: this turn prompt - last turn prompt - last turn output @@ -447,6 +470,15 @@ async def test_streaming_multi_turn_token_counting(mock_parser): context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens ) + # Validate all turn metrics + assert len(context.all_turn_metrics) == 3 + for i, turn in enumerate(context.all_turn_metrics): + assert turn.input_tokens == num_prompt_tokens[i] + assert turn.output_tokens == num_output_tokens[i] + assert turn.cached_input_tokens == num_cached_tokens[i] + assert context.all_turn_metrics[1].tool_output_tokens == 2 + assert context.all_turn_metrics[2].tool_output_tokens == 2 + @pytest.mark.asyncio async def test_streaming_message_synchronization(mock_parser): @@ -522,3 +554,46 @@ async 
def test_streaming_message_synchronization(mock_parser): assert len(context._messages) == 3 assert context.num_init_messages == 1 assert context._messages[2].content[0].text == "Response 4" + + +def test_turn_metrics_copy_and_reset(): + """Test TurnMetrics copy and reset methods work correctly.""" + # Create a TurnMetrics with specific values + original_metrics = TurnMetrics( + input_tokens=10, + output_tokens=20, + cached_input_tokens=5, + tool_output_tokens=3, + ) + + # Test copy functionality + copied_metrics = original_metrics.copy() + + # Verify copy has same values + assert copied_metrics.input_tokens == 10 + assert copied_metrics.output_tokens == 20 + assert copied_metrics.cached_input_tokens == 5 + assert copied_metrics.tool_output_tokens == 3 + + # Verify they are separate objects + assert copied_metrics is not original_metrics + + # Modify copy to ensure independence + copied_metrics.input_tokens = 999 + assert original_metrics.input_tokens == 10 # Original unchanged + assert copied_metrics.input_tokens == 999 + + # Test reset functionality + original_metrics.reset() + + # Verify all fields are reset to zero + assert original_metrics.input_tokens == 0 + assert original_metrics.output_tokens == 0 + assert original_metrics.cached_input_tokens == 0 + assert original_metrics.tool_output_tokens == 0 + + # Verify copied metrics are unaffected by reset + assert copied_metrics.input_tokens == 999 + assert copied_metrics.output_tokens == 20 + assert copied_metrics.cached_input_tokens == 5 + assert copied_metrics.tool_output_tokens == 3 diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index c694bcfaaa756..8f94880e431be 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -45,21 +45,36 @@ def _map_tool_name_to_tool_type(tool_name: str) -> str: return _TOOL_NAME_TO_TYPE_MAP[tool_name] -class TurnTokens: - """Tracks token counts for a single conversation turn.""" +class TurnMetrics: + """Tracks token and toolcall details for a single conversation turn.""" - def __init__(self, input_tokens=0, output_tokens=0): + def __init__( + self, + input_tokens=0, + output_tokens=0, + cached_input_tokens=0, + tool_output_tokens=0, + ): self.input_tokens = input_tokens self.output_tokens = output_tokens + self.cached_input_tokens = cached_input_tokens + self.tool_output_tokens = tool_output_tokens def reset(self): """Reset counters for a new turn.""" self.input_tokens = 0 self.output_tokens = 0 + self.cached_input_tokens = 0 + self.tool_output_tokens = 0 def copy(self): """Create a copy of this turn's token counts.""" - return TurnTokens(self.input_tokens, self.output_tokens) + return TurnMetrics( + self.input_tokens, + self.output_tokens, + self.cached_input_tokens, + self.tool_output_tokens, + ) class ConversationContext(ABC): @@ -102,6 +117,8 @@ class SimpleContext(ConversationContext): self.num_cached_tokens = 0 # todo num_reasoning_tokens is not implemented yet. 
self.num_reasoning_tokens = 0 + # not implemented yet for SimpleContext + self.all_turn_metrics = [] def append_output(self, output) -> None: self.last_output = output @@ -154,8 +171,9 @@ class HarmonyContext(ConversationContext): self.num_tool_output_tokens = 0 # Turn tracking - replaces multiple individual tracking variables - self.current_turn = TurnTokens() - self.previous_turn = TurnTokens() + self.current_turn_metrics = TurnMetrics() + # Track metrics for all turns + self.all_turn_metrics: list[TurnMetrics] = [] self.is_first_turn = True self.first_tok_of_message = True # For streaming support @@ -173,11 +191,10 @@ class HarmonyContext(ConversationContext): # Check if the current token is part of reasoning content self._update_num_reasoning_tokens() self._update_prefill_token_usage(output) - # Reset current turn output tokens for this turn - self.current_turn.output_tokens = 0 self._update_decode_token_usage(output) - # Move current turn to previous turn for next turn's calculations - self.previous_turn = self.current_turn.copy() + # Append current turn to all turn list for next turn's calculations + self.all_turn_metrics.append(self.current_turn_metrics.copy()) + self.current_turn_metrics.reset() # append_output is called only once before tool calling # in non-streaming case # so we can append all the parser messages to _messages @@ -213,20 +230,21 @@ class HarmonyContext(ConversationContext): logger.error("RequestOutput appended contains no prompt_token_ids.") # Update current turn input tokens - self.current_turn.input_tokens = this_turn_input_tokens + self.current_turn_metrics.input_tokens = this_turn_input_tokens self.num_prompt_tokens += this_turn_input_tokens # Calculate tool tokens (except on first turn) if self.is_first_turn: self.is_first_turn = False else: + previous_turn = self.all_turn_metrics[-1] # start counting tool after first turn # tool tokens = this turn prefill - last turn prefill - # last turn decode this_turn_tool_tokens = ( - self.current_turn.input_tokens - - self.previous_turn.input_tokens - - self.previous_turn.output_tokens + self.current_turn_metrics.input_tokens + - previous_turn.input_tokens + - previous_turn.output_tokens ) # Handle negative tool token counts (shouldn't happen in normal @@ -237,17 +255,20 @@ class HarmonyContext(ConversationContext): "(current_input=%d, previous_input=%d, " "previous_output=%d). Setting to 0.", this_turn_tool_tokens, - self.current_turn.input_tokens, - self.previous_turn.input_tokens, - self.previous_turn.output_tokens, + self.current_turn_metrics.input_tokens, + previous_turn.input_tokens, + previous_turn.output_tokens, ) this_turn_tool_tokens = 0 self.num_tool_output_tokens += this_turn_tool_tokens + self.current_turn_metrics.tool_output_tokens = this_turn_tool_tokens # Update cached tokens - if output.num_cached_tokens is not None: - self.num_cached_tokens += output.num_cached_tokens + num_cached_token = output.num_cached_tokens + if num_cached_token is not None: + self.num_cached_tokens += num_cached_token + self.current_turn_metrics.cached_input_tokens = num_cached_token def _update_decode_token_usage(self, output: RequestOutput) -> int: """Update token usage statistics for the decode phase of generation. 
@@ -272,7 +293,7 @@ class HarmonyContext(ConversationContext): # only keep last round updated_output_token_count += len(completion_output.token_ids) self.num_output_tokens += updated_output_token_count - self.current_turn.output_tokens += updated_output_token_count + self.current_turn_metrics.output_tokens += updated_output_token_count return updated_output_token_count @property @@ -452,7 +473,6 @@ class StreamingHarmonyContext(HarmonyContext): # so we only want to add the prompt tokens once for each message. if self.first_tok_of_message: self._update_prefill_token_usage(output) - self.current_turn.output_tokens = 0 # Reset self.first_tok_of_message if needed: # if the current token is the last one of the current message # (finished=True), then the next token processed will mark the @@ -464,7 +484,8 @@ class StreamingHarmonyContext(HarmonyContext): # For streaming, update previous turn when message is complete if output.finished: - self.previous_turn = self.current_turn.copy() + self.all_turn_metrics.append(self.current_turn_metrics.copy()) + self.current_turn_metrics.reset() # Check if the current token is part of reasoning content self._update_num_reasoning_tokens() self.last_tok = tok diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f41fa196acd81..86e1e62ff437b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2103,11 +2103,15 @@ class TranscriptionStreamResponse(OpenAIBaseModel): class InputTokensDetails(OpenAIBaseModel): cached_tokens: int + input_tokens_per_turn: list[int] = Field(default_factory=list) + cached_tokens_per_turn: list[int] = Field(default_factory=list) class OutputTokensDetails(OpenAIBaseModel): reasoning_tokens: int = 0 tool_output_tokens: int = 0 + output_tokens_per_turn: list[int] = Field(default_factory=list) + tool_output_tokens_per_turn: list[int] = Field(default_factory=list) class ResponseUsage(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 51e2856a5a9dd..6cdabff6e709b 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -589,10 +589,24 @@ class OpenAIServingResponses(OpenAIServing): input_tokens=num_prompt_tokens, output_tokens=num_generated_tokens, total_tokens=num_prompt_tokens + num_generated_tokens, - input_tokens_details=InputTokensDetails(cached_tokens=num_cached_tokens), + input_tokens_details=InputTokensDetails( + cached_tokens=num_cached_tokens, + input_tokens_per_turn=[ + turn.input_tokens for turn in context.all_turn_metrics + ], + cached_tokens_per_turn=[ + turn.cached_input_tokens for turn in context.all_turn_metrics + ], + ), output_tokens_details=OutputTokensDetails( reasoning_tokens=num_reasoning_tokens, tool_output_tokens=num_tool_output_tokens, + output_tokens_per_turn=[ + turn.output_tokens for turn in context.all_turn_metrics + ], + tool_output_tokens_per_turn=[ + turn.tool_output_tokens for turn in context.all_turn_metrics + ], ), ) response = ResponsesResponse.from_request( @@ -665,11 +679,13 @@ class OpenAIServingResponses(OpenAIServing): token=text, logprob=max(token_logprob.logprob, -9999.0), bytes=list(text.encode("utf-8", errors="replace")), - top_logprobs=self._topk_logprobs( - logprob, top_logprobs=top_logprobs, tokenizer=tokenizer - ) - if top_logprobs - else [], + top_logprobs=( + self._topk_logprobs( + logprob, top_logprobs=top_logprobs, tokenizer=tokenizer + ) + if top_logprobs + else [] + ), ) ) 
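        # Worked example for the per-turn usage fields wired up above (numbers are
        # the ones used in test_streaming_multi_turn_token_counting; shown here as
        # an illustration, not computed in this helper):
        #   prompts per turn = [3, 8, 13], outputs per turn = [3, 3, 2],
        #   cached per turn  = [0, 3, 8]
        #   tool output tokens per turn (this prefill - prev prefill - prev decode):
        #     turn 1: first turn, so 0
        #     turn 2: 8 - 3 - 3 = 2
        #     turn 3: 13 - 8 - 3 = 2
        # so the resulting usage would carry roughly:
        #   input_tokens_per_turn=[3, 8, 13], cached_tokens_per_turn=[0, 3, 8],
        #   output_tokens_per_turn=[3, 3, 2], tool_output_tokens_per_turn=[0, 2, 2]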
return out @@ -758,14 +774,16 @@ class OpenAIServingResponses(OpenAIServing): text=content, annotations=[], # TODO type="output_text", - logprobs=self._create_response_logprobs( - token_ids=final_output.token_ids, - logprobs=final_output.logprobs, - tokenizer=tokenizer, - top_logprobs=request.top_logprobs, - ) - if request.is_include_output_logprobs() - else None, + logprobs=( + self._create_response_logprobs( + token_ids=final_output.token_ids, + logprobs=final_output.logprobs, + tokenizer=tokenizer, + top_logprobs=request.top_logprobs, + ) + if request.is_include_output_logprobs() + else None + ), ) message = ResponseOutputMessage( id=f"msg_{random_uuid()}", @@ -870,15 +888,21 @@ class OpenAIServingResponses(OpenAIServing): with_custom_tools = has_custom_tools(tool_types) sys_msg = get_system_message( reasoning_effort=reasoning_effort, - browser_description=self.tool_server.get_tool_description("browser") - if enable_browser and self.tool_server is not None - else None, - python_description=self.tool_server.get_tool_description("python") - if enable_code_interpreter and self.tool_server is not None - else None, - container_description=self.tool_server.get_tool_description("container") - if enable_container and self.tool_server is not None - else None, + browser_description=( + self.tool_server.get_tool_description("browser") + if enable_browser and self.tool_server is not None + else None + ), + python_description=( + self.tool_server.get_tool_description("python") + if enable_code_interpreter and self.tool_server is not None + else None + ), + container_description=( + self.tool_server.get_tool_description("container") + if enable_container and self.tool_server is not None + else None + ), instructions=request.instructions, with_custom_tools=with_custom_tools, ) @@ -1283,14 +1307,16 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, delta=delta_message.content, - logprobs=self._create_stream_response_logprobs( - token_ids=output.token_ids, - logprobs=output.logprobs, - tokenizer=tokenizer, - top_logprobs=request.top_logprobs, - ) - if request.is_include_output_logprobs() - else [], + logprobs=( + self._create_stream_response_logprobs( + token_ids=output.token_ids, + logprobs=output.logprobs, + tokenizer=tokenizer, + top_logprobs=request.top_logprobs, + ) + if request.is_include_output_logprobs() + else [] + ), ) ) current_content_index += 1 From 579d2e5458b19c442f48e0cba0ba71c5d4abf6ea Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 14 Oct 2025 19:51:54 -0400 Subject: [PATCH 58/92] [WideEP][P/D] Add usage stats for DP+EP and KV Connector (#26836) Signed-off-by: Tyler Michael Smith --- vllm/v1/utils.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index f03efe21098bf..6aebe295b5ce5 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -345,13 +345,17 @@ def report_usage_stats( parallel_config = vllm_config.parallel_config + # Prepare KV connector string if applicable + kv_connector = None + if vllm_config.kv_transfer_config is not None: + kv_connector = vllm_config.kv_transfer_config.kv_connector + usage_message.report_usage( get_architecture_class_name(vllm_config.model_config), usage_context, extra_kvs={ # Common configuration "dtype": str(vllm_config.model_config.dtype), - "tensor_parallel_size": parallel_config.tensor_parallel_size, "block_size": vllm_config.cache_config.block_size, "gpu_memory_utilization": 
vllm_config.cache_config.gpu_memory_utilization, "kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes, @@ -363,6 +367,15 @@ def report_usage_stats( "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching, "enforce_eager": vllm_config.model_config.enforce_eager, "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce, + # Distributed parallelism settings + "tensor_parallel_size": parallel_config.tensor_parallel_size, + "data_parallel_size": parallel_config.data_parallel_size, + "pipeline_parallel_size": parallel_config.pipeline_parallel_size, + "enable_expert_parallel": parallel_config.enable_expert_parallel, + # All2All backend for MoE expert parallel + "all2all_backend": parallel_config.all2all_backend, + # KV connector used + "kv_connector": kv_connector, }, ) From 2dcd12d3571b070432ad1cd321a67b840b4a34b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Tue, 14 Oct 2025 19:55:02 -0400 Subject: [PATCH 59/92] [torch.compile] Fix tests for torch==2.9 inductor partition (#26116) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ProExpertProg Signed-off-by: Luka Govedič --- .../compile/piecewise/test_full_cudagraph.py | 29 +++-- .../compile/piecewise/test_multiple_graphs.py | 38 ++++-- tests/compile/piecewise/test_toy_llama.py | 117 +++++++++++------- tests/compile/silly_attention.py | 1 - tests/compile/test_decorator.py | 3 + vllm/attention/layer.py | 6 - vllm/compilation/partition_rules.py | 13 +- vllm/config/compilation.py | 3 +- 8 files changed, 138 insertions(+), 72 deletions(-) diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 84194f3ed01e8..e01b58220959f 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -11,6 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM, SamplingParams from vllm.config import CompilationConfig from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer @contextlib.contextmanager @@ -32,13 +33,13 @@ def temporary_environ(env_vars): os.environ[k] = v -test_params_full_cudagraph = [] +model_backends_full_cudagraph = [] # deepseek-ai/DeepSeek-V2-Lite with MLA MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"] for mla_backend in MLA_backends: - test_params_full_cudagraph.append( - pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])) + model_backends_full_cudagraph.append( + ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]) ) # Qwen/Qwen2-1.5B-Instruct with other backends @@ -46,14 +47,18 @@ other_backend_configs = [ backend_configs[c] for c in backend_configs if c not in MLA_backends ] for backend_config in other_backend_configs: - test_params_full_cudagraph.append( - pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config)) - ) + model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config)) @pytest.fixture(scope="class") def llm_pair(request): - model, backend_config = request.param + model, backend_config, use_inductor_graph_partition = request.param + backend_config.comp_config["use_inductor_graph_partition"] = ( + use_inductor_graph_partition + ) + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition only supported in torch>=2.9") # Dynamically skip test if GPU capability is not met if ( 
@@ -104,7 +109,15 @@ def llm_pair(request): ) -@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True) +@pytest.mark.parametrize( + "llm_pair", + [ + pytest.param((model, backend_config, use_inductor_graph_partition)) + for model, backend_config in model_backends_full_cudagraph + for use_inductor_graph_partition in [True, False] + ], + indirect=True, +) class TestFullCUDAGraph: """ Use a class such that an llm pair is constructed once for all diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index d88645e3bfd62..0d265bc596386 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules are compiled and graph captured separately. """ +import pytest import torch from torch import nn @@ -190,7 +191,12 @@ def run_model( return output.cpu() -def test_multi_graph_piecewise_compile_outputs_equal(): +@pytest.mark.parametrize("use_inductor_graph_partition", [False, True]) +def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): + if use_inductor_graph_partition: + # FIXME(luka/boyuan): this currently fails + pytest.skip("Inductor graph partition not supported with multi-graph") + outputs = [] # piecewise compile @@ -200,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -220,16 +227,24 @@ def test_multi_graph_piecewise_compile_outputs_equal(): # static tensor addresses inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda() - with compilation_counter.expect( - num_graphs_seen=2, # two graphs for the model - num_piecewise_graphs_seen=6, + if use_inductor_graph_partition: + # Splitting happens at Inductor lowering level, + # total piecewise fx graphs is equal to total graphs + num_piecewise_fx = 2 + num_piecewise_capturable_fx = 2 + else: # attn_one, attn_two each has 3 piecewise graphs # (pre attn, post attn, silly_attention) each - num_piecewise_capturable_graphs_seen=4, + num_piecewise_fx = 6 # attn_one, attn_two has pre attn and post attn each, total=4 - num_backend_compilations=4, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_capturable_fx = 4 + + with compilation_counter.expect( + num_graphs_seen=2, # two graphs for the model + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, + num_cudagraph_captured=8, # num_cudagraph_sizes * num_partitions ): outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) @@ -268,6 +283,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): level=CompilationLevel.PIECEWISE, use_cudagraph=False, splitting_ops=["silly::attention"], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal(): with compilation_counter.expect( num_graphs_seen=2, - num_piecewise_graphs_seen=6, - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_graphs_seen=num_piecewise_fx, + 
num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, num_cudagraph_captured=0, # no cudagraph captured ): outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index eaf0a15479e97..7ab610fa78115 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -9,6 +9,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are initialized randomly with a fixed seed. """ +from copy import deepcopy from dataclasses import dataclass from typing import Any @@ -26,6 +27,7 @@ from vllm.config import ( set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from .. import silly_attention # noqa: F401 @@ -257,27 +259,13 @@ def tractable_computation( @torch.inference_mode -def run_model( - llama_config, use_compile: bool, backend: str, split_attn: bool = False -) -> torch.Tensor: - if use_compile: - compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - backend=backend, - cudagraph_capture_sizes=[1, 2], - ) - if split_attn: - compilation_config.splitting_ops = ["silly::attention"] - cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE - else: - compilation_config = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, - ) - cudagraph_runtime_mode = CUDAGraphMode.NONE +def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor: + # Start with a fresh copy to make sure there's no cache dir sharing + compile_config = deepcopy(compile_config) + cudagraph_runtime_mode = compile_config.cudagraph_mode vllm_config = VllmConfig( - compilation_config=compilation_config, additional_config=llama_config + compilation_config=compile_config, additional_config=llama_config ) with set_current_vllm_config(vllm_config): model = ( @@ -338,8 +326,25 @@ def run_model( return output.cpu() -@pytest.mark.parametrize("backend", ["inductor", "eager"]) -def test_toy_llama(backend: str): +@pytest.mark.parametrize( + "backend, use_inductor_graph_partition", + [ + ("eager", False), # No inductor + ("inductor", False), # Inductor, Dynamo partition + ("inductor", True), # Inductor, Inductor partition + ], +) +def test_toy_llama( + backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path +): + # We disable the vLLM compile cache into a new tmp dir for 2 reasons: + # 1. To make sure we can properly track the number of Inductor compilations. + # 2. 
Inductor partitioning does not play nicely with Autograd cache (below) + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition only supported in torch>=2.9") + # compare output with and without piecewise compilation llama_config = LlamaConfig( @@ -350,6 +355,32 @@ def test_toy_llama(backend: str): hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True ) + compile_config_no_compile = CompilationConfig( + level=CompilationLevel.NO_COMPILATION, + cudagraph_mode=CUDAGraphMode.NONE, + backend="eager", + ) + + compile_config_no_split = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor_graph_partition=use_inductor_graph_partition, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + backend=backend, + cudagraph_capture_sizes=[1, 2], + ) + + # FIXME(luka/boyuan): the graph from the previous test case + # (no inductor partition) gets cached by AotAutograd so then the + # compilation with inductor partitioning incorrectly loads an unpartitioned + # graph and never partitions. I think this is a bug with custom inductor + # partitioning but does not affect vLLM more generally as vLLM uses its own + # cache (which takes inductor partitioning into account). + if use_inductor_graph_partition: + compile_config_no_split.inductor_compile_config["force_disable_caches"] = True + + compile_config_split = deepcopy(compile_config_no_split) + compile_config_split.splitting_ops = ["silly::attention"] + outputs = [] with compilation_counter.expect( num_graphs_seen=0, @@ -358,8 +389,9 @@ def test_toy_llama(backend: str): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(llama_config, backend="eager", use_compile=False)) - run_model(tractable_config, backend="eager", use_compile=False) + outputs.append(run_model(llama_config, compile_config_no_compile)) + + run_model(tractable_config, compile_config_no_compile) if backend == "inductor": kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0} @@ -367,35 +399,34 @@ def test_toy_llama(backend: str): kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} with compilation_counter.expect( - # One graph for the model - num_graphs_seen=1, + num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, num_piecewise_capturable_graphs_seen=1, - # num_piecewise_capturable_graphs_seen - num_backend_compilations=1, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_backend_compilations=1, # num_piecewise_capturable_graphs_seen num_cudagraph_captured=2, **kwargs, ): - outputs.append(run_model(llama_config, backend=backend, use_compile=True)) - run_model(tractable_config, backend=backend, use_compile=True) + outputs.append(run_model(llama_config, compile_config_no_split)) + + run_model(tractable_config, compile_config_no_split) + + if use_inductor_graph_partition: + num_piecewise_fx = 1 + num_piecewise_capturable_fx = 1 + else: + num_piecewise_fx = 2 * llama_config.num_layers + 1 + num_piecewise_capturable_fx = 1 + llama_config.num_layers with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=2 * llama_config.num_layers + 1, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=1 - + llama_config.num_layers, # 1 + num_layers - num_backend_compilations=1 - + llama_config.num_layers, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=2 - * ( - 1 + llama_config.num_layers - ), # 
num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, + # num_cudagraph_sizes * num_partitions + num_cudagraph_captured=2 * (1 + llama_config.num_layers), ): - outputs.append( - run_model(llama_config, backend=backend, use_compile=True, split_attn=True) - ) - run_model(tractable_config, backend=backend, use_compile=True, split_attn=True) + outputs.append(run_model(llama_config, compile_config_split)) + run_model(tractable_config, compile_config_split) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index c0d3f908149f6..f33c5772906a6 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -62,5 +62,4 @@ direct_register_custom_op( mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, - tags=(torch._C.Tag.cudagraph_unsafe,), ) diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 6b050207ec41b..63cb266094a12 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -73,6 +73,7 @@ def test_ignore_torch_compile_decorator(): use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=False, # TODO test both? ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -188,6 +189,7 @@ def test_conditional_compile_enable_if(): use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=False, # TODO test both ), ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -220,6 +222,7 @@ def test_conditional_compile_enable_if(): use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=False, # TODO test both? 
), ) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index fe9de65b52c66..8b5b87cba4044 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -38,10 +38,6 @@ from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None -try: - tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe,) -except AttributeError: - tag_cudagraph_unsafe = () # type: ignore[assignment] def check_xformers_availability(): @@ -879,7 +875,6 @@ direct_register_custom_op( op_name="unified_attention", op_func=unified_attention, fake_impl=unified_attention_fake, - tags=tag_cudagraph_unsafe, ) @@ -931,7 +926,6 @@ direct_register_custom_op( op_func=unified_attention_with_output, mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, - tags=tag_cudagraph_unsafe, ) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index 5ea1b30860f59..cea4f9a816377 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib +import logging from typing import TYPE_CHECKING from torch._library.utils import lookup_op @@ -38,8 +39,16 @@ def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]: resolved.append(lookup_op(op_name)) except Exception: # Skip operators that don't exist (e.g., model-specific ops) - logger.warning( - "Failed to resolve operator for Inductor partition: %s", op_name + # Do not warn for attention ops, warn for others + # (most likely manually specified) + from vllm.config import CompilationConfig + + logger.log( + logging.DEBUG + if op_name in CompilationConfig._attention_ops + else logging.WARNING, + "Failed to resolve operator for CUDAGraph partition: %s", + op_name, ) continue diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 60aef2f6f7e1c..fb80835ba48a1 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -201,7 +201,7 @@ class CompilationConfig: (it sees a part of the graph). The backend can not be custom for compilation level 3, i.e. the backend must be either eager or inductor. Furthermore, compilation is only piecewise if splitting ops is set accordingly and - use_inductor_cudagraphs_partition is off. Note that the default options for + use_inductor_graph_partition is off. Note that the default options for splitting ops are sufficient for piecewise compilation. 
""" custom_ops: list[str] = field(default_factory=list) @@ -431,6 +431,7 @@ class CompilationConfig: factors.append(self.custom_ops) factors.append(self.splitting_ops) factors.append(self.use_inductor) + factors.append(self.use_inductor_graph_partition) factors.append(self.inductor_compile_config) factors.append(self.inductor_passes) factors.append(self.pass_config.uuid()) From 07ca70af8d8a0d0e20727d8de6972a7ad87cf996 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 14 Oct 2025 18:41:18 -0700 Subject: [PATCH 60/92] [Core][Easy] Use envs.__getattr__ for all Unify to environment variable access (#26810) Signed-off-by: Jialin Ouyang --- vllm/multimodal/cache.py | 6 +++--- vllm/transformers_utils/utils.py | 4 ++-- vllm/utils/gc_utils.py | 6 +++--- vllm/v1/engine/async_llm.py | 5 ++--- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index f6ef675aa7c29..a29da2a56afc1 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -10,12 +10,12 @@ from typing import TYPE_CHECKING, Generic, TypeAlias, TypeVar, cast import torch from typing_extensions import override +import vllm.envs as envs from vllm.distributed.device_communicators.shm_object_storage import ( MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer, ) -from vllm.envs import VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME from vllm.logger import init_logger from vllm.utils import GiB_bytes, MiB_bytes from vllm.utils.cache import CacheInfo, LRUCache @@ -436,7 +436,7 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache): ring_buffer = SingleWriterShmRingBuffer( data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes), - name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, + name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, create=True, # sender is the writer ) self._shm_cache = SingleWriterShmObjectStorage( @@ -678,7 +678,7 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache): ring_buffer = SingleWriterShmRingBuffer( data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes), - name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, + name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, create=False, # Server is a reader ) self._shm_cache = SingleWriterShmObjectStorage( diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index b87414d79df0f..58c754dbd3974 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -8,7 +8,7 @@ from os import PathLike from pathlib import Path from typing import Any -from vllm.envs import VLLM_MODEL_REDIRECT_PATH +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -86,7 +86,7 @@ def maybe_model_redirect(model: str) -> str: :return: maybe redirect to a local folder """ - model_redirect_path = VLLM_MODEL_REDIRECT_PATH + model_redirect_path = envs.VLLM_MODEL_REDIRECT_PATH if not model_redirect_path: return model diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py index 99c19c9db28e9..6894ccff11d93 100644 --- a/vllm/utils/gc_utils.py +++ b/vllm/utils/gc_utils.py @@ -7,7 +7,7 @@ from collections import Counter from contextlib import suppress from typing import Any -from vllm.envs import VLLM_GC_DEBUG +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -36,7 +36,7 @@ class GCDebugConfig: self.top_objects = json_conf.get("top_objects", -1) except Exception: self.enabled = False - logger.error("Failed to parse VLLM_GC_DEBUG(%s)", VLLM_GC_DEBUG) + logger.error("Failed 
to parse VLLM_GC_DEBUG(%s)", envs.VLLM_GC_DEBUG) logger.info("GC Debug Config. %s", str(self)) def __repr__(self) -> str: @@ -93,7 +93,7 @@ def maybe_attach_gc_debug_callback() -> None: """ Attached a callback for GC debug when VLLM_GC_DEBUG is enabled. """ - config = GCDebugConfig(VLLM_GC_DEBUG) + config = GCDebugConfig(envs.VLLM_GC_DEBUG) if config.enabled: debugger: GCDebugger = GCDebugger(config) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 39cd1d97c280a..0ec153e233161 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,7 +16,6 @@ from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.utils import _validate_truncation_size -from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -483,12 +482,12 @@ class AsyncLLM(EngineClient): # Split outputs into chunks of at most # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the # event loop for too long. - if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: + if num_outputs <= envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: slices = (outputs.outputs,) else: slices = np.array_split( outputs.outputs, - cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), + cdiv(num_outputs, envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), ) for i, outputs_slice in enumerate(slices): From 9354660036dff11a81433f0695c71dfee75cce50 Mon Sep 17 00:00:00 2001 From: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com> Date: Wed, 15 Oct 2025 09:50:30 +0800 Subject: [PATCH 61/92] [Bugfix]fix Qwen3 xml tool parser (#26345) Signed-off-by: Zhikaiiii <1658973216@qq.com> --- tests/tool_use/test_qwen3coder_tool_parser.py | 88 ++++++++++++- .../tool_parsers/qwen3xml_tool_parser.py | 117 ++++++++++++++---- 2 files changed, 179 insertions(+), 26 deletions(-) diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index b4f0989b1b19c..93ef1049fc07e 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -40,7 +40,7 @@ def qwen3_xml_tool_parser(qwen3_tokenizer): return Qwen3XMLToolParser(qwen3_tokenizer) -@pytest.fixture(params=["original", "xml"]) +@pytest.fixture(params=["xml"]) def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request): """Parameterized fixture that provides both parser types for testing""" if request.param == "original": @@ -664,6 +664,9 @@ def test_extract_tool_calls_streaming( # Verify we got all expected tool calls assert len(tool_states) == len(expected_tool_calls) + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len( + expected_tool_calls + ) # Verify each tool call for idx, expected_tool in enumerate(expected_tool_calls): @@ -780,9 +783,10 @@ fahrenheit # Verify content was streamed assert "Let me check the weather for you:" in other_content - # Verify we got the tool call assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + state = tool_states[0] assert state["id"] is not None assert state["type"] == "function" @@ -892,3 +896,83 @@ def test_extract_tool_calls_complex_type_with_single_quote( args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) assert args["obj_param"] == {"key": "value"} + + +def test_extract_tool_calls_streaming_missing_opening_tag( + qwen3_tool_parser_parametrized, 
qwen3_tokenizer, sample_tools +): + """Test streaming with missing opening tag + + This tests that the streaming parser correctly handles + tool calls that start directly with + """ + model_output = """I'll check the weather for you. + + + +Dallas + + +TX + + +fahrenheit + + +""" + + request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) + + other_content = "" + tool_states = {} + + for delta_message in stream_delta_message_generator( + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request + ): + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += tool_call.function.arguments + + # Verify content was streamed + assert "I'll check the weather for you." in other_content + + # Verify we got the tool call + assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + + # Verify arguments were parsed correctly despite missing opening tag + assert state["arguments"] is not None + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py index 2c5b0b6a85f76..9964d1ac25c40 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import json -import uuid from collections.abc import Sequence from typing import Any from xml.parsers.expat import ParserCreate import regex as re +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, @@ -375,14 +375,21 @@ class StreamingXMLToolCallParser: return buffer[: tag_end2 + 1], start_pos + tag_end2 + 1 else: # If currently not parsing tool calls (entering a tool_call), - # check if starts with + # check if starts with or if buffer == ""[: len(buffer)]: # Might be start of , wait for more data return None, start_pos + elif ( + buffer.startswith(" str | None: """Extract function name from various formats""" if attrs and "name" in attrs: @@ -1168,6 +1171,10 @@ class Qwen3XMLToolParser(ToolParser): super().__init__(tokenizer) self.parser = StreamingXMLToolCallParser() + # Add missing attributes for compatibility with serving_chat.py + self.prev_tool_call_arr: list[dict] = [] + self.streamed_args_for_tool: list[str] = [] + logger.info( "vLLM Successfully import tool parser %s !", self.__class__.__name__ ) @@ -1178,6 +1185,9 @@ class Qwen3XMLToolParser(ToolParser): request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: self.parser.reset_streaming_state() + 
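        # Illustrative note (values taken from the weather examples in the tests
        # above, not produced at this point in the code): after a successful parse,
        # the compatibility attributes initialized in __init__ hold one entry per
        # tool call, e.g.
        #   prev_tool_call_arr     -> [{"name": "get_current_weather",
        #                               "arguments": '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'}]
        #   streamed_args_for_tool -> ['{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}']
        # These are the attributes added above for serving_chat.py compatibility.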
# Reset tool call tracking arrays for new extraction + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [] if request: self.parser.set_tools(request.tools) result = self.parser.parse_single_streaming_chunks(model_output) @@ -1201,6 +1211,34 @@ class Qwen3XMLToolParser(ToolParser): ), ) ) + + # Update tool call tracking arrays for compatibility + tool_index = ( + tool_call.index + if tool_call.index is not None + else len(self.prev_tool_call_arr) - 1 + ) + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= tool_index: + self.prev_tool_call_arr.append({"name": "", "arguments": ""}) + while len(self.streamed_args_for_tool) <= tool_index: + self.streamed_args_for_tool.append("") + + # Update tool call information + self.prev_tool_call_arr[tool_index]["name"] = ( + tool_call.function.name + ) + self.prev_tool_call_arr[tool_index]["arguments"] = ( + tool_call.function.arguments + ) + + # Update streamed arguments + if tool_call.function.arguments: + self.streamed_args_for_tool[tool_index] = ( + tool_call.function.arguments + ) + return ExtractedToolCallInformation( tool_calls=tool_calls, tools_called=len(tool_calls) > 0, @@ -1219,6 +1257,9 @@ class Qwen3XMLToolParser(ToolParser): ) -> DeltaMessage | None: if not previous_text: self.parser.reset_streaming_state() + # Reset tool call tracking arrays for new streaming session + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [] if request: self.parser.set_tools(request.tools) @@ -1230,20 +1271,48 @@ class Qwen3XMLToolParser(ToolParser): open_calls = current_text.count( self.parser.tool_call_start_token ) - current_text.count(self.parser.tool_call_end_token) - if open_calls == 0 and self.parser.tool_call_index > 0: - # If current_call_id is None, use last_completed_call_id - call_id = ( - self.parser.current_call_id or self.parser.last_completed_call_id - ) - return DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.parser.tool_call_index - 1, - id=call_id, - function=DeltaFunctionCall(arguments=""), - type="function", - ) - ] - ) + if ( + open_calls == 0 + and self.parser.tool_call_index > 0 + or not self.parser.tool_call_index + and current_text + ): + return DeltaMessage(content="") + return None - return self.parser.parse_single_streaming_chunks(delta_text) + # Parse the delta text and get the result + result = self.parser.parse_single_streaming_chunks(delta_text) + + # Update tool call tracking arrays based on incremental parsing results + if result and result.tool_calls: + for tool_call in result.tool_calls: + if tool_call.function: + tool_index = ( + tool_call.index + if tool_call.index is not None + else len(self.prev_tool_call_arr) - 1 + ) + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= tool_index: + self.prev_tool_call_arr.append({"name": "", "arguments": ""}) + while len(self.streamed_args_for_tool) <= tool_index: + self.streamed_args_for_tool.append("") + + # Update tool name if provided + if tool_call.function.name: + self.prev_tool_call_arr[tool_index]["name"] = ( + tool_call.function.name + ) + + # Update arguments incrementally + if tool_call.function.arguments is not None: + # Concatenate the incremental arguments + # to the existing streamed arguments + self.prev_tool_call_arr[tool_index]["arguments"] += ( + tool_call.function.arguments + ) + self.streamed_args_for_tool[tool_index] += ( + tool_call.function.arguments + ) + return result From bfad142e257be6699868f7816ca64c408bc32916 Mon Sep 17 00:00:00 
2001 From: "Chendi.Xue" Date: Tue, 14 Oct 2025 21:33:25 -0500 Subject: [PATCH 62/92] [BUGFIX][NIXL] quick fix for 'assert self.connector_worker is not None' in get_kv_connector_stats (#26851) Signed-off-by: Chendi Xue --- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 490f209373db3..6a2434ddce8be 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -241,7 +241,8 @@ class NixlConnector(KVConnectorBase_V1): return self.connector_worker.get_block_ids_with_load_errors() def get_kv_connector_stats(self) -> KVConnectorStats | None: - assert self.connector_worker is not None + if self.connector_worker is None: + return None return self.connector_worker.get_kv_connector_stats() @classmethod From e66d787bce22c56f995f4e2974e31ac020bc57ea Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 14 Oct 2025 22:35:18 -0400 Subject: [PATCH 63/92] Disable FlashInfer sampler by default (#26859) Signed-off-by: mgoin --- vllm/v1/sample/ops/topk_topp_sampler.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index ed8bc55a3cf2f..43a40bce6847d 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -46,23 +46,15 @@ class TopKTopPSampler(nn.Module): "Falling back to default sampling implementation." ) self.forward = self.forward_native - elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False: - # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for - # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by - # default it is unused). For backward compatibility, we set - # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and - # interpret it differently in V0 and V1 samplers: In V0, - # None means False, while in V1, None means True. This is - # why we use the condition - # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here. + elif envs.VLLM_USE_FLASHINFER_SAMPLER: + # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1. logger.info_once("Using FlashInfer for top-p & top-k sampling.") self.forward = self.forward_cuda else: - logger.warning_once( - "FlashInfer is available, but it is not enabled. " - "Falling back to the PyTorch-native implementation of " - "top-p & top-k sampling. For the best performance, " - "please set VLLM_USE_FLASHINFER_SAMPLER=1." + logger.debug_once( + "FlashInfer top-p/top-k sampling is available but disabled " + "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in " + "after verifying accuracy for your workloads." 
) self.forward = self.forward_native else: From 96b9aa5aa076e64c68765232aec343e4d0006e2a Mon Sep 17 00:00:00 2001 From: Morrison Turnansky Date: Tue, 14 Oct 2025 22:51:16 -0400 Subject: [PATCH 64/92] [Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: morrison-turnansky Signed-off-by: Morrison Turnansky Co-authored-by: Luka Govedič --- docs/configuration/conserving_memory.md | 4 +- docs/design/cuda_graphs.md | 4 +- examples/offline_inference/data_parallel.py | 2 +- .../compile/piecewise/test_multiple_graphs.py | 10 +- tests/compile/piecewise/test_simple.py | 4 +- tests/compile/piecewise/test_toy_llama.py | 10 +- tests/compile/test_aot_compile.py | 4 +- tests/compile/test_async_tp.py | 3 +- tests/compile/test_basic_correctness.py | 30 +++-- tests/compile/test_config.py | 20 ++-- tests/compile/test_decorator.py | 10 +- tests/compile/test_full_graph.py | 29 ++--- tests/compile/test_fusion.py | 4 +- tests/compile/test_fusion_all_reduce.py | 4 +- tests/compile/test_fusion_attn.py | 4 +- tests/compile/test_noop_elimination.py | 6 +- tests/compile/test_wrapper.py | 4 +- tests/distributed/test_sequence_parallel.py | 3 +- tests/engine/test_arg_utils.py | 20 ++-- tests/tpu/test_custom_dispatcher.py | 6 +- tests/utils_/test_utils.py | 10 +- tests/v1/cudagraph/test_cudagraph_dispatch.py | 22 ++-- tests/v1/cudagraph/test_cudagraph_mode.py | 39 +++---- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 6 +- vllm/compilation/backends.py | 4 +- vllm/compilation/compiler_interface.py | 2 +- vllm/compilation/counter.py | 4 +- vllm/compilation/decorators.py | 10 +- vllm/compilation/monitor.py | 6 +- vllm/compilation/wrapper.py | 8 +- vllm/config/__init__.py | 4 +- vllm/config/compilation.py | 106 ++++++++++++------ vllm/config/vllm.py | 50 ++++----- vllm/entrypoints/llm.py | 6 +- .../layers/quantization/utils/w8a8_utils.py | 4 +- vllm/platforms/cpu.py | 8 +- vllm/platforms/tpu.py | 11 +- vllm/platforms/xpu.py | 4 +- vllm/utils/__init__.py | 10 +- vllm/v1/cudagraph_dispatcher.py | 4 +- vllm/v1/spec_decode/eagle.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 15 +-- 42 files changed, 270 insertions(+), 248 deletions(-) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 2b0654fa6d463..85906d23dee33 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc ```python from vllm import LLM - from vllm.config import CompilationConfig, CompilationLevel + from vllm.config import CompilationConfig, CompilationMode llm = LLM( model="meta-llama/Llama-3.1-8B-Instruct", compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # By default, it goes up to max_num_seqs cudagraph_capture_sizes=[1, 2, 4, 8, 16], ), diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index 315746b0ef674..c6d71589be985 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md @@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum): """NO CUDA Graphs support""" ``` -Suppose we have hybrid attention backends (e.g., in mamba mixer models). 
In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. +Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. The following table lists backends that support full CUDA Graphs at the time of writing. @@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG") import vllm from vllm.config import CUDAGraphMode -compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} +compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} model = vllm.LLM( model="meta-llama/Llama-3.1-8B-Instruct", dtype="auto", diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 0076d4d30ee8e..a3e671a0f4cca 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -95,7 +95,7 @@ def parse_args(): parser.add_argument( "--compilation-config", type=int, - help=("Compilation optimization (O) level 0-3."), + help=("Compilation optimization (O) mode 0-3."), ) parser.add_argument( "--quantization", diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index 0d265bc596386..d1f741479acf4 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -14,7 +14,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): outputs = [] - # piecewise compile + # vllmcompile compile vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], @@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): # no compile or cudagraph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.NO_COMPILATION, + mode=CompilationMode.NONE, ) ) cudagraph_runtime_mode = CUDAGraphMode.NONE @@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): # piecewise compile without CUDA graph vllm_config = VllmConfig( 
compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=False, splitting_ops=["silly::attention"], use_inductor_graph_partition=use_inductor_graph_partition, diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index bc65e3da0ae74..f61a0a4eb740d 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -61,7 +61,7 @@ def _run_simple_model( ): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, use_inductor=use_inductor, splitting_ops=splitting_ops, diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 7ab610fa78115..75a89d692fa8f 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -21,7 +21,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -356,13 +356,13 @@ def test_toy_llama( ) compile_config_no_compile = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, + level=CompilationMode.NONE, cudagraph_mode=CUDAGraphMode.NONE, backend="eager", ) compile_config_no_split = CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_mode=CUDAGraphMode.PIECEWISE, backend=backend, @@ -458,14 +458,14 @@ def benchmark(): for piecewise in [False, True]: if piecewise: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=cudagraph_sizes, ) else: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_capture_sizes=cudagraph_sizes, ) diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index 08f79d90cd367..1701d85fe84e7 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -10,7 +10,7 @@ import torch from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, VllmConfig, set_current_vllm_config, ) @@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module): def make_vllm_config() -> VllmConfig: return VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, ) ) diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 102a929bf2409..60856f5a58067 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -10,6 +10,7 @@ import vllm.envs as envs from vllm.compilation.collective_fusion import AsyncTPPass from vllm.config import ( CompilationConfig, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -400,7 +401,7 @@ def test_async_tp_pass_correctness( common_args.append("--enforce-eager") 
compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "compile_sizes": [2, 4, 8], "splitting_ops": [], "pass_config": {"enable_async_tp": async_tp_enabled}, diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index ab6a17e149fcd..954774a8e3983 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -4,7 +4,7 @@ import dataclasses import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from vllm.utils import cuda_device_count_stateless from ..utils import compare_all_settings @@ -21,7 +21,7 @@ class TestSetting: # we cannot afford testing the full Cartesian product -# of all models and all levels +# of all models and all modes @pytest.mark.parametrize( "test_setting", [ @@ -121,15 +121,13 @@ def test_compile_correctness( all_args: list[list[str]] = [] all_envs: list[dict[str, str] | None] = [] - for comp_level in [ - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - CompilationLevel.PIECEWISE, + for comp_mode in [ + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - for level in [CompilationLevel.NO_COMPILATION, comp_level]: - all_args.append( - final_args + [f"-O.level={level}", "-O.backend=inductor"] - ) + for mode in [CompilationMode.NONE, comp_mode]: + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"]) # inductor will change the output, so we only compare if the output # is close, not exactly the same. @@ -142,13 +140,13 @@ def test_compile_correctness( all_envs.clear() all_args.clear() - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - CompilationLevel.PIECEWISE, + for mode in [ + CompilationMode.NONE, + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"]) + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"]) all_envs.append({}) all_envs.append({}) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index ae8b0b226c313..7f51c763da73c 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -4,7 +4,7 @@ import pytest from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig -from vllm.config.compilation import CompilationLevel +from vllm.config.compilation import CompilationMode from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer @@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked -def test_dynamo_as_is(vllm_runner, monkeypatch): +def test_stock_torch_compile(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(dynamo_as_is_count=1), + compilation_counter.expect(stock_torch_compile_count=1), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 1}, + compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE}, gpu_memory_utilization=0.4, ) as _, ): @@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch): # Disable multiprocessing so that the 
counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 0}, + compilation_config={"mode": CompilationMode.NONE}, gpu_memory_utilization=0.4, ) as _, ): @@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch): monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4 @@ -151,7 +151,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, splitting_ops=["vllm::unified_attention"], ) @@ -163,7 +163,7 @@ def test_splitting_ops_dynamic(): # When attn_fusion pass enabled, splitting_ops now default to attention ops. config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, @@ -178,7 +178,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 63cb266094a12..4d60899a628a9 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -8,7 +8,7 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -66,10 +66,10 @@ def run_model( def test_ignore_torch_compile_decorator(): - # piecewise + # vllmcompile vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], @@ -185,7 +185,7 @@ def test_conditional_compile_enable_if(): kv_sharing_fast_prefill=True, ), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], @@ -218,7 +218,7 @@ def test_conditional_compile_enable_if(): kv_sharing_fast_prefill=False, ), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 2f3794c90b204..2d290771f9ad7 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -12,7 +12,7 @@ from 
tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams from vllm.attention.backends.registry import _Backend from vllm.attention.selector import global_force_attn_backend_context_manager -from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig +from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils import is_torch_equal_or_newer @@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE], ) @pytest.mark.parametrize("model_info", models_list(all=True)) @create_new_process_for_each_test() def test_full_graph( monkeypatch: pytest.MonkeyPatch, model_info: tuple[str, dict[str, Any]], - optimization_level: int, + compilation_mode: int, ): model, model_kwargs = model_info with monkeypatch.context(): print(f"MODEL={model}") - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) # TODO(luka) add other supported compilation config scenarios here @@ -104,7 +104,7 @@ def test_full_graph( [ # additional compile sizes, only some of the models ( - CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]), + CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]), model, ) for model in models_list(all=False) @@ -113,7 +113,7 @@ def test_full_graph( # RMSNorm + quant fusion, only 8-bit quant models ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ), @@ -125,7 +125,8 @@ def test_full_graph( # Test depyf integration works ( CompilationConfig( - level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir() + mode=CompilationMode.VLLM_COMPILE, + debug_dump_path=tempfile.gettempdir(), ), ("facebook/opt-125m", {}), ), @@ -134,7 +135,7 @@ def test_full_graph( # graph inductor partition ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # inductor graph partition uses # torch._C.Tag.cudagraph_unsafe to specify splitting ops use_inductor_graph_partition=True, @@ -164,10 +165,10 @@ def test_custom_compile_config( @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.NONE, CompilationMode.VLLM_COMPILE], ) -def test_fp8_kv_scale_compile(optimization_level: int): +def test_fp8_kv_scale_compile(compilation_mode: int): model = "Qwen/Qwen2-0.5B" model_kwargs = { "quantization": "fp8", @@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int): "calculate_kv_scales": True, "max_model_len": 512, } - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) def test_inductor_graph_partition_attn_fusion(caplog_vllm): @@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm): model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, cudagraph_mode=CUDAGraphMode.PIECEWISE, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_fusion.py 
b/tests/compile/test_fusion.py index 7c22336432299..1a5eaf2639b36 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -13,7 +13,7 @@ from vllm.compilation.fusion import ( ) from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass -from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, @@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 455d1bb039057..fbcd6c71fb723 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"] + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"] ) ) vllm_config.compilation_config.pass_config = PassConfig( diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index d1ab85cfb875c..a8d78daa32a1d 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, ModelConfig, PassConfig, SchedulerConfig, @@ -321,7 +321,7 @@ def test_attention_quant_pattern( ), scheduler_config=SchedulerConfig(max_num_seqs=1024), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+quant_fp8"], use_inductor_graph_partition=use_inductor_graph_partition, ), diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py index 188f4514dda5f..0ccc1a0161629 100644 --- a/tests/compile/test_noop_elimination.py +++ b/tests/compile/test_noop_elimination.py @@ -6,7 +6,7 @@ import torch import vllm from vllm.compilation.noop_elimination import NoOpEliminationPass -from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from .backend import TestBackend @@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) @@ -98,7 +98,7 @@ def test_non_noop_slice_preserved(): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) diff --git 
a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index b2fff822bbbb5..da0afd9eaa49f 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -5,7 +5,7 @@ import torch from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel +from vllm.config import CompilationMode class MyMod(torch.nn.Module): @@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") super().__init__( - compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE + compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE ) def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index a431bf30fc890..362e9daf5ae04 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -15,6 +15,7 @@ from typing import Literal, NamedTuple import pytest +from vllm.config.compilation import CompilationMode from vllm.config.model import RunnerOption from vllm.logger import init_logger @@ -234,7 +235,7 @@ def _compare_sp( common_args.append("--skip-tokenizer-init") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "custom_ops": ["+rms_norm"], "compile_sizes": [4, 8], "pass_config": { diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 78928a53942f9..c73083b0b5ef6 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -226,30 +226,30 @@ def test_compilation_config(): # set to O3 args = parser.parse_args(["-O0"]) - assert args.compilation_config.level == 0 + assert args.compilation_config.mode == 0 # set to O 3 (space) args = parser.parse_args(["-O", "1"]) - assert args.compilation_config.level == 1 + assert args.compilation_config.mode == 1 # set to O 3 (equals) args = parser.parse_args(["-O=2"]) - assert args.compilation_config.level == 2 + assert args.compilation_config.mode == 2 - # set to O.level 3 - args = parser.parse_args(["-O.level", "3"]) - assert args.compilation_config.level == 3 + # set to O.mode 3 + args = parser.parse_args(["-O.mode", "3"]) + assert args.compilation_config.mode == 3 # set to string form of a dict args = parser.parse_args( [ "-O", - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": false}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and not args.compilation_config.use_inductor ) @@ -258,12 +258,12 @@ def test_compilation_config(): args = parser.parse_args( [ "--compilation-config=" - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": true}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.use_inductor ) diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 102e5ddf16d6d..cf455ff3edbd3 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from ..utils import 
compare_two_settings @@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", + f"-O{CompilationMode.DYNAMO_TRACE_ONCE}", ], arg2=[ "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_AS_IS}", + f"-O{CompilationMode.STOCK_TORCH_COMPILE}", ], env1={}, env2={}, diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 308629ab05834..af5fc758f2c26 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -299,7 +299,7 @@ def test_dict_args(parser): "val2", "--hf-overrides.key2.key4", "val3", - # Test compile config and compilation level + # Test compile config and compilation mode "-O.use_inductor=true", "-O.backend", "custom", @@ -352,7 +352,7 @@ def test_dict_args(parser): }, } assert parsed_args.compilation_config == { - "level": 1, + "mode": 1, "use_inductor": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], @@ -367,7 +367,7 @@ def test_duplicate_dict_args(caplog_vllm, parser): "--hf-overrides.key1", "val2", "-O1", - "-O.level", + "-O.mode", "2", "-O3", ] @@ -375,12 +375,12 @@ def test_duplicate_dict_args(caplog_vllm, parser): parsed_args = parser.parse_args(args) # Should be the last value assert parsed_args.hf_overrides == {"key1": "val2"} - assert parsed_args.compilation_config == {"level": 3} + assert parsed_args.compilation_config == {"mode": 3} assert len(caplog_vllm.records) == 1 assert "duplicate" in caplog_vllm.text assert "--hf-overrides.key1" in caplog_vllm.text - assert "-O.level" in caplog_vllm.text + assert "-O.mode" in caplog_vllm.text @pytest.mark.parametrize( diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 59841a446db3e..02fa27e3f05f7 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -11,7 +11,7 @@ from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, ParallelConfig, SchedulerConfig, @@ -42,7 +42,7 @@ def _create_vllm_config( mock_config.parallel_config = ParallelConfig() # Mimic the behavior of VllmConfig.__post_init__() - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: compilation_config.set_splitting_ops_for_v1() return mock_config @@ -50,23 +50,23 @@ def _create_vllm_config( class TestCudagraphDispatcher: @pytest.mark.parametrize( - "case_id,cudagraph_mode_str,compilation_level", + "case_id,cudagraph_mode_str,compilation_mode", [ # Test case 0: Full CG for mixed batches, no separate routine - (0, "FULL", CompilationLevel.NO_COMPILATION), + (0, "FULL", CompilationMode.NONE), # Test case 1: Full CG for uniform batches, piecewise for mixed - (1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION), + (1, "FULL_AND_PIECEWISE", CompilationMode.NONE), # Test case 2: Full CG for uniform batches, no CG for mixed - (2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION), - # Test case 3: Piecewise for all - (3, "PIECEWISE", CompilationLevel.PIECEWISE), + (2, "FULL_DECODE_ONLY", CompilationMode.NONE), + # Test case 3: PIECEWISE for all + (3, "PIECEWISE", CompilationMode.VLLM_COMPILE), ], ) - def test_dispatcher(self, cudagraph_mode_str, compilation_level): + def 
test_dispatcher(self, cudagraph_mode_str, compilation_mode): # Setup dispatcher comp_config = CompilationConfig( cudagraph_mode=cudagraph_mode_str, - level=compilation_level, + mode=compilation_mode, cudagraph_capture_sizes=[1, 8], ) @@ -242,7 +242,7 @@ class TestCudagraphIntegration: def setup_method(self): # only FULL mode for non-uniform batches self.comp_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode="FULL", cudagraph_capture_sizes=[10, 20], ) diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 8c8148ae20948..818ae1d7ba677 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -10,7 +10,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, CompilationMode from vllm.platforms import current_platform @@ -73,7 +73,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte gpu_memory_utilization=0.45, max_model_len=1024, compilation_config=CompilationConfig( - level=3, cudagraph_mode=cudagraph_mode + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) @@ -90,32 +90,27 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ) -# test cudagraph_mode with different compilation level. -# (backend_name, cudagraph_mode, compilation_level, supported) +# test cudagraph_mode with different compilation mode. +# (backend_name, cudagraph_mode, compilation_mode, supported) combo_cases_2 = [ - ("FA2", "FULL", 0, True), # no compilation + full cudagraph - ("FA2", "FULL", 3, True), # piecewise compilation + full cudagraph - ("FA2", "PIECEWISE", 0, False), # no compilation + piecewise cudagraph - ("FA2", "PIECEWISE", 3, True), # piecewise compilation + piecewise cudagraph - ( - "FA2", - "FULL_AND_PIECEWISE", - 0, - False, - ), # piecewise cudagraph not supported without piecewise compilation - ("FA2", "FULL_AND_PIECEWISE", 3, True), - ("FA2", "FULL_DECODE_ONLY", 0, True), - ("FA2", "FULL_DECODE_ONLY", 3, True), - ("FA2", "NONE", 0, True), # no compilation + no cudagraph - ("FA2", "NONE", 3, True), # piecewise compilation + no cudagraph + ("FA2", "FULL", CompilationMode.NONE, True), + ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), + ("FA2", "PIECEWISE", CompilationMode.NONE, False), + ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), + ("FA2", "NONE", CompilationMode.NONE, True), + ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True), ] @pytest.mark.parametrize( - "backend_name,cudagraph_mode,compilation_level,supported", combo_cases_2 + "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2 ) def test_cudagraph_compilation_combo(combo_case): - backend_name, cudagraph_mode, compilation_level, supported = combo_case + backend_name, cudagraph_mode, compilation_mode, supported = combo_case env_vars = backend_configs[backend_name].env_vars @@ -130,7 +125,7 @@ def test_cudagraph_compilation_combo(combo_case): gpu_memory_utilization=0.45, 
max_model_len=1024, compilation_config=CompilationConfig( - level=compilation_level, cudagraph_mode=cudagraph_mode + mode=compilation_mode, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index 89e5f26ac627f..f2c6d1c1fd1a4 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -7,7 +7,7 @@ import pytest import torch from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationMode from vllm.distributed import cleanup_dist_env_and_memory from ...utils import fork_new_process_for_each_test @@ -75,9 +75,9 @@ def test_kv_sharing_fast_prefill( # This allows vLLM compilation backend to handle allocating and # managing buffers for cudagraph cudagraph_copy_inputs=True, - level=CompilationLevel.PIECEWISE + mode=CompilationMode.VLLM_COMPILE if not enforce_eager - else CompilationLevel.NO_COMPILATION, + else CompilationMode.NONE, ) with monkeypatch.context() as m: diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 46c433fe6aefb..91be7e85af518 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -56,7 +56,7 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: return InductorAdaptor() else: assert compilation_config.backend == "eager", ( - "Custom backends not supported with CompilationLevel.PIECEWISE" + "Custom backends not supported with CompilationMode.VLLM_COMPILE" ) logger.debug("Using EagerAdaptor") @@ -481,7 +481,7 @@ def set_model_tag(tag: str): class VllmBackend: """The compilation backend for `torch.compile` with vLLM. - It is used for compilation level of `CompilationLevel.PIECEWISE`, + It is used for compilation mode of `CompilationMode.VLLM_COMPILE`, where we customize the compilation. The major work of this backend is to split the graph into diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 4553007027e39..e2369a635ad1f 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -575,7 +575,7 @@ class InductorAdaptor(CompilerInterface): Because it is re-entrant, we always set it (even if entering via Dynamo and the context was already entered). We might want to revisit if it - should be set at a different level of compilation. + should be set at a different mode of compilation. This is likely a bug in PyTorch: public APIs should not rely on manually setting up internal contexts. 
But we also rely on non-public diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 9e8de831bcb29..20918099f169d 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -27,8 +27,8 @@ class CompilationCounter: num_cache_entries_updated: int = 0 # The number of standalone_compile compiled artifacts saved num_compiled_artifacts_saved: int = 0 - # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS - dynamo_as_is_count: int = 0 + # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE + stock_torch_compile_count: int = 0 def clone(self) -> "CompilationCounter": return copy.deepcopy(self) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index fe19d4e851294..20d4681e2c789 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -18,7 +18,7 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel, VllmConfig, set_current_vllm_config +from vllm.config import CompilationMode, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import resolve_obj_by_qualname, supports_dynamo @@ -233,11 +233,11 @@ def _support_torch_compile( old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config enable_compile = enable_if is None or enable_if(vllm_config) - # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner # will handle the compilation, so we don't need to do anything here. 
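The counter rename above is what the updated expectations in tests/compile/test_config.py key on. The shape of such a check, with the model load elided (a sketch only; as written the expectation would not be satisfied, since nothing is loaded inside the block):

```python
from vllm.compilation.counter import compilation_counter

# Formerly dynamo_as_is_count: incremented once per model loaded with
# CompilationMode.STOCK_TORCH_COMPILE.
with compilation_counter.expect(stock_torch_compile_count=1):
    ...  # vllm_runner(..., compilation_config={"mode": 1}) would go here
```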
self.do_not_compile = ( - vllm_config.compilation_config.level - in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS] + vllm_config.compilation_config.mode + in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE] or not supports_dynamo() or _should_ignore_torch_compile(self.__class__) or not enable_compile @@ -247,7 +247,7 @@ def _support_torch_compile( compilation_counter.num_models_seen += 1 TorchCompileWrapperWithCustomDispatcher.__init__( - self, compilation_level=vllm_config.compilation_config.level + self, compilation_mode=vllm_config.compilation_config.mode ) cls.__init__ = __init__ diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index d3c437795fabb..1e6d0e79228b0 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -3,7 +3,7 @@ import time -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -18,7 +18,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config path = vllm_config.compile_debug_dump_path() - if compilation_config.level == CompilationLevel.PIECEWISE and path: + if compilation_config.mode == CompilationMode.VLLM_COMPILE and path: import depyf path.mkdir(parents=True, exist_ok=True) @@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): def end_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: logger.info( "torch.compile takes %.2f s in total", compilation_config.compilation_time ) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index b4a0d89af0d6d..4b10c85209f63 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -11,7 +11,7 @@ from types import CodeType import torch import vllm.envs as envs -from vllm.config import CompilationLevel, CUDAGraphMode, get_current_vllm_config +from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config from vllm.logger import init_logger logger = init_logger(__name__) @@ -31,7 +31,7 @@ class TorchCompileWrapperWithCustomDispatcher: """ def __init__( - self, compiled_callable: Callable | None = None, compilation_level: int = 0 + self, compiled_callable: Callable | None = None, compilation_mode: int = 0 ): vllm_config = get_current_vllm_config() self.vllm_config = vllm_config @@ -72,7 +72,7 @@ class TorchCompileWrapperWithCustomDispatcher: # subclasses can use this to switch between the custom dispatcher # and the default Dynamo guard mechanism. self.use_custom_dispatcher: bool = ( - compilation_level >= CompilationLevel.DYNAMO_ONCE + compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE ) def aot_compile(self, *args, **kwargs): @@ -85,7 +85,7 @@ class TorchCompileWrapperWithCustomDispatcher: return self.compiled_callable.aot_compile((args, kwargs)) def __call__(self, *args, **kwargs): - """Implement the dispatch logic here, beyond the torch.compile level. + """Implement the dispatch logic here, beyond the torch.compile mode. NOTE: this function can have additional arguments beyond the forward method, for directly dispatching to the compiled code. 
""" diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6a0197d044dcd..7f1cc52024205 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -4,7 +4,7 @@ from vllm.config.cache import CacheConfig from vllm.config.compilation import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, PassConfig, ) @@ -49,7 +49,7 @@ __all__ = [ "CacheConfig", # From vllm.config.compilation "CompilationConfig", - "CompilationLevel", + "CompilationMode", "CUDAGraphMode", "PassConfig", # From vllm.config.device diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index fb80835ba48a1..a34fb0bf920c0 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -26,12 +26,20 @@ else: logger = init_logger(__name__) -class CompilationLevel: - # constants for the levels of the compilation process - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 +class CompilationMode: + """The compilation approach used for torch.compile-based compilation of the + model.""" + + NONE = 0 + """No torch.compile compilation is applied, model runs in fully eager pytorch mode. + The model runs as-is.""" + STOCK_TORCH_COMPILE = 1 + """The standard `torch.compile` compilation pipeline.""" + DYNAMO_TRACE_ONCE = 2 + """Single Dynamo trace through the model, avoiding recompilation.""" + VLLM_COMPILE = 3 + """Custom vLLM Inductor-based backend with caching, piecewise compilation, + shape specialization, and custom passes.""" class CUDAGraphMode(enum.Enum): @@ -134,7 +142,7 @@ class CompilationConfig: """Configuration for compilation. It has three parts: - Top-level Compilation control: - - [`level`][vllm.config.CompilationConfig.level] + - [`mode`][vllm.config.CompilationConfig.mode] - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] - [`backend`][vllm.config.CompilationConfig.backend] @@ -171,14 +179,26 @@ class CompilationConfig: # Top-level Compilation control level: int | None = None - """The level of compilation: + """ + Level is deprecated and will be removed in the next release, + either 0.12.0 or 0.11.2 whichever is soonest. + Please use mode. Currently all levels are mapped to mode. + """ + # Top-level Compilation control + mode: int | None = None + """The compilation approach used for torch.compile-based compilation of the + model. - - None: If None, we will select the default compilation level. - For V1 engine this is 3, for V0 engine this is 0. - - 0: no compilation. - - 1: dynamo as is. - - 2: dynamo once. - - 3: piecewise compilation.""" + - None: If None, we will select the default compilation mode. + For V1 engine this is 3. + - 0: NONE: No torch.compile compilation is applied, model runs in fully + eager pytorch mode. The model runs as-is. + - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline. + - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding + recompilation by removing guards. + Requires no dynamic-shape-dependent control-flow. + - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching, + piecewise compilation, shape specialization, and custom passes.""" debug_dump_path: Path | None = None """The path to dump the debug information.""" cache_dir: str = "" @@ -195,11 +215,11 @@ class CompilationConfig: backend function. We use string to avoid serialization issues when using compilation in a - distributed setting. When the compilation level is 1 or 2, the backend is + distributed setting. 
When the compilation mode is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the - compilation level is 3, the backend is used for the piecewise compilation + compilation mode is 3, the backend is used for the piecewise compilation (it sees a part of the graph). The backend can not be custom for compilation - level 3, i.e. the backend must be either eager or inductor. Furthermore, + mode 3, i.e. the backend must be either eager or inductor. Furthermore, compilation is only piecewise if splitting ops is set accordingly and use_inductor_graph_partition is off. Note that the default options for splitting ops are sufficient for piecewise compilation. @@ -214,7 +234,7 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. + disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None """A list of ops to exclude from cudagraphs, used in piecewise compilation. @@ -249,7 +269,7 @@ class CompilationConfig: One graph for symbolic shape and one graph per size in compile_sizes are compiled using configurations in inductor_compile_config. - This setting is ignored if level