From a86b4c58e8f72f4903d873d25510f53f7577366f Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 14 Oct 2025 15:53:10 -0700 Subject: [PATCH 01/51] remove attn output view kernel (#26680) Signed-off-by: Boyuan Feng Signed-off-by: Boyuan Feng Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/attention/layer.py | 6 +++--- vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/flex_attention.py | 2 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +- vllm/v1/attention/backends/rocm_aiter_unified_attn.py | 2 +- vllm/v1/attention/backends/rocm_attn.py | 2 +- vllm/v1/attention/backends/tree_attn.py | 2 +- vllm/v1/attention/backends/triton_attn.py | 2 +- vllm/v1/attention/backends/xformers.py | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 929c3b6a4906b..fe9de65b52c66 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -346,7 +346,7 @@ class Attention(nn.Module, AttentionLayerBase): if self.use_output: output_shape = output_shape if output_shape is not None else query.shape - output = torch.zeros(output_shape, dtype=output_dtype, device=query.device) + output = torch.empty(output_shape, dtype=output_dtype, device=query.device) hidden_size = output_shape[-1] # Reshape the query, key, and value tensors. 
# NOTE(woosuk): We do this outside the custom op to minimize the @@ -705,7 +705,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): self.calc_kv_scales(q, kv_c_normed, k_pe) if self.attn_backend.accept_output_buffer: - output = torch.zeros(output_shape, dtype=q.dtype, device=q.device) + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) self.impl.forward( self, q, @@ -722,7 +722,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): ) else: if self.attn_backend.accept_output_buffer: - output = torch.zeros(output_shape, dtype=q.dtype, device=q.device) + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) torch.ops.vllm.unified_mla_attention_with_output( q, kv_c_normed, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index fa4e34536135d..9e0c125d9edb7 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -530,7 +530,7 @@ class FlashAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) attn_type = self.attn_type diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 0fa71afa62eef..ee32f7e2904f7 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -857,7 +857,7 @@ class FlashInferImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) if self.bmm1_scale is None: self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 2595851e5042d..902872bb25b33 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -767,7 +767,7 @@ class FlexAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. 
- return output + return output.fill_(0) # query = self.view_as_4d(query).permute(0, 2, 1, 3) # return torch.empty_like(query) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index cce43b220da77..7c73611d4a58a 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -485,7 +485,7 @@ class AiterFlashAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) # IMPORTANT! # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py index 14184944934fa..27b072106268b 100644 --- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py +++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py @@ -130,7 +130,7 @@ class RocmAiterUnifiedAttentionImpl(RocmAttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) assert attn_metadata.use_cascade is False diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index 5245c7f449259..8b7ce90a3ccae 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -299,7 +299,7 @@ class RocmAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) assert attn_metadata.use_cascade is False diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index aab90cfd1fe0d..ee6ead9ad9b35 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -379,7 +379,7 @@ class TreeAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) # Cache the input KVs. 
key_cache, value_cache = kv_cache.unbind(0) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 9d1d007a08e4c..9746a0eb58bd2 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -298,7 +298,7 @@ class TritonAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) assert attn_metadata.use_cascade is False diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index 41c543c18adcc..457b15ebdd82f 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -354,7 +354,7 @@ class XFormersAttentionImpl(AttentionImpl): if attn_metadata is None: # Profiling run. - return output + return output.fill_(0) # Cache the input KVs. key_cache, value_cache = kv_cache.unbind(0) From 4aed506b6538ec4f284c480bf4449e9dc5f72054 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 14 Oct 2025 16:27:44 -0700 Subject: [PATCH 02/51] [Core] Streamline some structured output related code (#26737) Signed-off-by: Nick Hill --- tests/v1/core/test_scheduler.py | 18 +++-- .../unit/test_kv_connector_lifecyle.py | 2 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 24 +++---- tests/v1/worker/test_gpu_model_runner.py | 24 +++---- vllm/v1/core/sched/output.py | 5 +- vllm/v1/core/sched/scheduler.py | 65 +++++++++---------- vllm/v1/request.py | 18 +++-- vllm/v1/structured_output/__init__.py | 36 +++++----- vllm/v1/structured_output/backend_guidance.py | 2 +- vllm/v1/structured_output/request.py | 44 +++++++------ vllm/v1/structured_output/utils.py | 9 +-- vllm/v1/worker/gpu_model_runner.py | 6 +- vllm/v1/worker/tpu_model_runner.py | 6 +- 13 files changed, 121 insertions(+), 138 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 76408fba2e169..aaac2deb12ac2 100644 --- a/tests/v1/core/test_scheduler.py +++ 
b/tests/v1/core/test_scheduler.py @@ -30,7 +30,6 @@ from vllm.v1.kv_cache_interface import ( from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm.v1.structured_output.request import StructuredOutputRequest from .utils import EOS_TOKEN_ID, create_requests, create_scheduler @@ -335,10 +334,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [], requests[1].request_id: [10], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -383,10 +382,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [10, 42], requests[1].request_id: [13], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -429,10 +428,10 @@ def test_stop_via_update_from_output(): requests[0].request_id: [10, 11], requests[1].request_id: [], }, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -470,10 +469,10 @@ def test_stop_via_update_from_output(): total_num_scheduled_tokens=3, scheduled_encoder_inputs={}, scheduled_spec_decode_tokens={requests[0].request_id: [EOS_TOKEN_ID, 10]}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -1941,7 +1940,6 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): sampling_params=sampling_params, pooling_params=None, eos_token_id=EOS_TOKEN_ID, - 
structured_output_request=StructuredOutputRequest(sampling_params), ) scheduler.add_request(request) output = scheduler.schedule() diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py index 0bb67b574fa14..b5c8f378be182 100644 --- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -26,7 +26,7 @@ def _make_empty_scheduler_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, kv_connector_metadata=SharedStorageConnectorMetadata(), ) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index df9fcdc37fa37..e471174ef6744 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -89,10 +89,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -168,10 +168,10 @@ def test_update_states_request_finished(model_runner): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -198,10 +198,10 @@ def test_update_states_request_resumed(model_runner): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], 
finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -225,10 +225,10 @@ def test_update_states_request_resumed(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -256,10 +256,10 @@ def test_update_states_no_changes(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -291,10 +291,10 @@ def test_update_states_request_unscheduled(model_runner): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 817cd7f10c1c6..fe52f565c8a86 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -146,10 +146,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -212,10 +212,10 @@ def test_update_states_request_finished(model_runner, 
dist_init): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -244,10 +244,10 @@ def test_update_states_request_resumed(model_runner, dist_init): total_num_scheduled_tokens=0, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -273,10 +273,10 @@ def test_update_states_request_resumed(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -366,10 +366,10 @@ def test_update_states_no_changes(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) @@ -403,10 +403,10 @@ def test_update_states_request_unscheduled(model_runner, dist_init): total_num_scheduled_tokens=1, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, - num_common_prefix_blocks=0, + num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids={}, + structured_output_request_ids=[], grammar_bitmask=None, ) diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index bce15e1a476fd..619dcd178a13a 100644 --- 
a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -165,9 +165,8 @@ class SchedulerOutput: # freed from the encoder cache. free_encoder_mm_hashes: list[str] - # Dict of request ids to their index within the batch - # for filling the next token bitmask - structured_output_request_ids: dict[str, int] + # ids of structured outputs requests included in the bitmask, in order. + structured_output_request_ids: list[str] # the bitmask for the whole batch grammar_bitmask: "npt.NDArray[np.int32] | None" diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 9a1d31268ab7c..08368b7d99efe 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -5,7 +5,7 @@ import itertools import time from collections import defaultdict from collections.abc import Iterable -from typing import Any +from typing import TYPE_CHECKING, Any from vllm.config import VllmConfig from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch @@ -34,6 +34,10 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager +if TYPE_CHECKING: + import numpy as np + import numpy.typing as npt + logger = init_logger(__name__) @@ -608,11 +612,8 @@ class Scheduler(SchedulerInterface): scheduled_spec_decode_tokens, req_to_new_blocks, ) - scheduled_requests = ( - scheduled_new_reqs + scheduled_running_reqs + scheduled_resumed_reqs - ) structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask( - scheduled_requests, scheduled_spec_decode_tokens + num_scheduled_tokens.keys(), scheduled_spec_decode_tokens ) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, @@ -876,32 +877,28 @@ class Scheduler(SchedulerInterface): def get_grammar_bitmask( self, - requests: list[Request], + scheduled_request_ids: Iterable[str], scheduled_spec_decode_tokens: dict[str, list[int]], - ): - # NOTE: 
structured_output_request_ids maps - # a request's (request that uses structured output) - # request_id to its index in the batch. - # This will help us determine to slice the grammar bitmask - # and only applies valid mask for requests that - # uses structured decoding. - structured_output_request_ids: dict[str, int] = {} - for i, req in enumerate(requests): - if req.use_structured_output: - # PERF: in case of chunked prefill, - # request might not include any new tokens. - # Therefore, we might introduce some additional - # cycle to fill in the bitmask, which could be a big no-op. - structured_output_request_ids[req.request_id] = i - + ) -> tuple[list[str], "npt.NDArray[np.int32] | None"]: + # Collect list of scheduled request ids that use structured output. + # The corresponding rows of the bitmask will be in this order. + # PERF: in case of chunked prefill, + # request might not include any new tokens. + # Therefore, we might introduce some additional + # cycle to fill in the bitmask, which could be a big no-op. 
+ structured_output_request_ids = [ + req_id + for req_id in scheduled_request_ids + if (req := self.requests.get(req_id)) and req.use_structured_output + ] if not structured_output_request_ids: - bitmask = None - else: - bitmask = self.structured_output_manager.grammar_bitmask( - self.requests, - structured_output_request_ids, - scheduled_spec_decode_tokens, - ) + return structured_output_request_ids, None + + bitmask = self.structured_output_manager.grammar_bitmask( + self.requests, + structured_output_request_ids, + scheduled_spec_decode_tokens, + ) return structured_output_request_ids, bitmask def update_from_output( @@ -1013,12 +1010,10 @@ class Scheduler(SchedulerInterface): new_logprobs = logprobs.slice(req_index, req_index + 1) if new_token_ids and self.structured_output_manager.should_advance(request): - # NOTE: structured_output_request - # should not be None if use_structured_output, we have - # checked above, so safe to ignore type warning - request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr] - req_id, new_token_ids - ) + struct_output_request = request.structured_output_request + assert struct_output_request is not None + assert struct_output_request.grammar is not None + struct_output_request.grammar.accept_tokens(req_id, new_token_ids) if num_nans_in_logits is not None and req_id in num_nans_in_logits: request.num_nans_in_logits = num_nans_in_logits[req_id] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 5926bf5b46ee9..864b0eb7fa410 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -40,7 +40,6 @@ class Request: prompt_embeds: torch.Tensor | None = None, mm_features: list[MultiModalFeatureSpec] | None = None, lora_request: Optional["LoRARequest"] = None, - structured_output_request: Optional["StructuredOutputRequest"] = None, cache_salt: str | None = None, priority: int = 0, trace_headers: Mapping[str, str] | None = None, @@ -54,11 +53,12 @@ class Request: # Because of LoRA, the eos token id can be 
different for each request. self.eos_token_id = eos_token_id self.lora_request = lora_request - self.structured_output_request = structured_output_request + self.structured_output_request = StructuredOutputRequest.from_sampling_params( + sampling_params + ) self.arrival_time = arrival_time if arrival_time is not None else time.time() self.status = RequestStatus.WAITING - self.use_structured_output = False self.events: list[EngineCoreEvent] = [] self.stop_reason: int | str | None = None @@ -72,9 +72,8 @@ class Request: # Generative models. assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - if sampling_params.structured_outputs is not None: + if self.structured_output_request is not None: self.status = RequestStatus.WAITING_FOR_FSM - self.use_structured_output = True if sampling_params.extra_args is not None: self.kv_transfer_params = sampling_params.extra_args.get( @@ -145,11 +144,6 @@ class Request: eos_token_id=request.eos_token_id, arrival_time=request.arrival_time, lora_request=request.lora_request, - structured_output_request=StructuredOutputRequest( - sampling_params=request.sampling_params - ) - if request.sampling_params - else None, cache_salt=request.cache_salt, priority=request.priority, trace_headers=request.trace_headers, @@ -170,6 +164,10 @@ class Request: if self.get_hash_new_full_blocks is not None: self.block_hashes.extend(self.get_hash_new_full_blocks()) + @property + def use_structured_output(self) -> bool: + return self.structured_output_request is not None + @property def is_output_corrupted(self) -> bool: return self.num_nans_in_logits > 0 diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 336a0eb98682a..8d7f4b5d68961 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -167,7 +167,7 @@ class StructuredOutputManager: def grammar_bitmask( self, requests: dict[str, Request], - structured_output_request_ids: 
dict[str, int], + structured_output_request_ids: list[str], scheduled_spec_decode_tokens: dict[str, list[int]], ) -> "npt.NDArray[np.int32] | None": # Prepare the structured output bitmask for this batch. @@ -196,17 +196,16 @@ class StructuredOutputManager: # masks for each request, one for each possible bonus token position. # These are stored inline in the tensor and unpacked by the gpu runner. cumulative_index = 0 - ordered_seq = sorted(structured_output_request_ids.items(), key=lambda x: x[1]) # Optimized parallel filling of bitmasks for # non-spec, large-batch-size cases if ( - len(ordered_seq) > self.fill_bitmask_parallel_threshold + len(structured_output_request_ids) > self.fill_bitmask_parallel_threshold and max_num_spec_tokens == 0 ): promises = [] batch = [] - for req_id, _ in ordered_seq: + for req_id in structured_output_request_ids: request = requests[req_id] structured_output_request = request.structured_output_request if TYPE_CHECKING: @@ -230,7 +229,7 @@ class StructuredOutputManager: promise.result() else: # Fallback to serial filling of bitmasks for small-batch-size cases - for req_id, _ in ordered_seq: + for req_id in structured_output_request_ids: request = requests[req_id] structured_output_request = request.structured_output_request @@ -295,22 +294,21 @@ class StructuredOutputManager: assert request.structured_output_request.grammar is not None # by default, we should always advance # for cases that don't use thinking mode. 
- if self.reasoner is not None: - structured_req = request.structured_output_request - - if structured_req.reasoning_ended: - return True - - # Check if reasoning ends in *this* step - if self.reasoner.is_reasoning_end(request.all_token_ids): - # Reasoning just ended, so we shouldn't advance til - # next pass - structured_req.reasoning_ended = True - - return False - else: + if self.reasoner is None: return True + structured_req = request.structured_output_request + if structured_req.reasoning_ended: + return True + + # Check if reasoning ends in *this* step + if self.reasoner.is_reasoning_end(request.all_token_ids): + # Reasoning just ended, so we shouldn't advance til + # next pass + structured_req.reasoning_ended = True + + return False + def clear_backend(self) -> None: if self.backend is not None: self.backend.destroy() diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index c37193e667aab..8e75b99f8481f 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -252,7 +252,7 @@ def serialize_guidance_grammar( def validate_guidance_grammar( sampling_params: SamplingParams, tokenizer: llguidance.LLTokenizer | None = None ) -> None: - tp, grm = get_structured_output_key(sampling_params) + tp, grm = get_structured_output_key(sampling_params.structured_outputs) guidance_grm = serialize_guidance_grammar(tp, grm) err = llguidance.LLMatcher.validate_grammar(guidance_grm, tokenizer) if err: diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index 9e149b186c639..afe0e4b3f3a7f 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -7,7 +7,7 @@ from concurrent.futures import Future from concurrent.futures._base import TimeoutError from typing import cast -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import SamplingParams, StructuredOutputsParams from 
vllm.v1.structured_output.backend_types import ( StructuredOutputGrammar, StructuredOutputKey, @@ -17,10 +17,19 @@ from vllm.v1.structured_output.backend_types import ( @dataclasses.dataclass class StructuredOutputRequest: - sampling_params: SamplingParams + params: StructuredOutputsParams _grammar: Future[StructuredOutputGrammar] | StructuredOutputGrammar | None = None reasoning_ended: bool | None = None + @staticmethod + def from_sampling_params( + sampling_params: SamplingParams | None, + ) -> "StructuredOutputRequest | None": + if sampling_params is None: + return None + params = sampling_params.structured_outputs + return StructuredOutputRequest(params=params) if params else None + def _check_grammar_completion(self) -> bool: # NOTE: We have to lazy import to gate circular imports from vllm.v1.request import RequestStatus @@ -53,31 +62,28 @@ class StructuredOutputRequest: @functools.cached_property def structured_output_key(self) -> StructuredOutputKey: - return get_structured_output_key(self.sampling_params) + return get_structured_output_key(self.params) -def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.structured_outputs - assert params is not None, "params can't be None." 
+def get_structured_output_key(params: StructuredOutputsParams) -> StructuredOutputKey: if params.json is not None: if not isinstance(params.json, str): json_str = json.dumps(params.json) else: json_str = params.json - return (StructuredOutputOptions.JSON, json_str) - elif params.json_object: - return (StructuredOutputOptions.JSON_OBJECT, "") - elif params.regex is not None: - return (StructuredOutputOptions.REGEX, params.regex) - elif params.choice is not None: + return StructuredOutputOptions.JSON, json_str + if params.json_object: + return StructuredOutputOptions.JSON_OBJECT, "" + if params.regex is not None: + return StructuredOutputOptions.REGEX, params.regex + if params.choice is not None: if not isinstance(params.choice, str): json_str = json.dumps(params.choice) else: json_str = params.choice - return (StructuredOutputOptions.CHOICE, json_str) - elif params.grammar is not None: - return (StructuredOutputOptions.GRAMMAR, params.grammar) - elif params.structural_tag is not None: - return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag) - else: - raise ValueError("No valid structured output parameter found") + return StructuredOutputOptions.CHOICE, json_str + if params.grammar is not None: + return StructuredOutputOptions.GRAMMAR, params.grammar + if params.structural_tag is not None: + return StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag + raise ValueError("No valid structured output parameter found") diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 2520dc217c798..4b793b9a72fd7 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -47,7 +47,6 @@ def apply_grammar_bitmask( scheduler_output: SchedulerOutput, input_batch: InputBatch, logits: torch.Tensor, - device: torch.device, ) -> None: """ Apply grammar bitmask to output logits of the model with xgrammar function. 
@@ -56,7 +55,6 @@ def apply_grammar_bitmask( scheduler_output (SchedulerOutput): The result of engine scheduling. input_batch (InputBatch): The input of model runner. logits (torch.Tensor): The output logits of model forward. - device (torch.device): The device that model runner running on. """ grammar_bitmask = scheduler_output.grammar_bitmask if grammar_bitmask is None: @@ -91,10 +89,7 @@ def apply_grammar_bitmask( dtype=grammar_bitmask.dtype, ) cumulative_index = 0 - seq = sorted( - scheduler_output.structured_output_request_ids.items(), key=lambda x: x[1] - ) - for req_id, _ in seq: + for req_id in scheduler_output.structured_output_request_ids: num_spec_tokens = len( scheduler_output.scheduled_spec_decode_tokens.get(req_id, []) ) @@ -117,7 +112,7 @@ def apply_grammar_bitmask( xgr.apply_token_bitmask_inplace( logits, - grammar_bitmask.to(device, non_blocking=True), + grammar_bitmask.to(logits.device, non_blocking=True), indices=out_indices if not skip_out_indices else None, ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bbb63d28289c4..72f8824e20054 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2568,10 +2568,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits = model_output_broadcast_data["logits"] # Apply structured output bitmasks if present - if scheduler_output.grammar_bitmask is not None: - apply_grammar_bitmask( - scheduler_output, self.input_batch, logits, self.device - ) + if scheduler_output.structured_output_request_ids: + apply_grammar_bitmask(scheduler_output, self.input_batch, logits) with record_function_or_nullcontext("Sample"): sampler_output = self._sample(logits, spec_decode_metadata) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 828f09cbc8d8d..2107df5fc1032 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1963,12 +1963,8 @@ class 
TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.grammar_bitmask_cpu.zero_() self.require_structured_out_cpu.zero_() - sorted_struct_requests = sorted( - scheduler_output.structured_output_request_ids.items(), - key=lambda item: item[1], - ) cumulative_mask_idx = 0 - for req_id, _ in sorted_struct_requests: + for req_id in scheduler_output.structured_output_request_ids: if req_id not in self.input_batch.req_id_to_index: continue batch_index = self.input_batch.req_id_to_index[req_id] From 7e0ef4084affa9de84904ba7726c46f53f4f6379 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 14 Oct 2025 19:41:43 -0400 Subject: [PATCH 03/51] [CI Failure] Fix torchao dep failure for Quantization Test (#26824) Signed-off-by: mgoin --- .buildkite/test-amd.yaml | 3 ++- .buildkite/test-pipeline.yaml | 3 ++- tests/quantization/test_compressed_tensors.py | 3 ++- vllm/model_executor/layers/quantization/rtn.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index b2a3a0a775baa..91f0b850575c4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -603,7 +603,8 @@ steps: # since torchao nightly is only compatible with torch nightly currently # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved - - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - label: LM Eval Small Models # 53min diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ebe0602a1b5db..94c0944c838ce 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -527,7 +527,8 @@ steps: # since torchao nightly is only compatible with torch nightly currently # 
https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved - - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ - label: LM Eval Small Models # 53min diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index ef7164c8813da..5aeb002238cf9 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -697,7 +697,8 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): @pytest.mark.parametrize( "args", [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), + # TODO: Enable once model is available again + # ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4), ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4), ], ) diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index c041d2fd0ba48..e4f7ff8339569 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase from vllm.model_executor.layers.linear import ( LinearBase, @@ -396,7 +397,7 @@ class RTNMoEMethod(FusedMoEMethodBase): indices_type=self.topk_indices_dtype, ) - return torch.ops.vllm.fused_marlin_moe( + return fused_marlin_moe( x, layer.w13_weight, layer.w2_weight, From 0512c04aee408367a068b5960e7857c722ed204d Mon Sep 17 00:00:00 2001 From: Ye Hu Date: 
Tue, 14 Oct 2025 16:48:13 -0700 Subject: [PATCH 04/51] [frontend][gptoss] Add per turn stats into Harmony Context (#25061) Signed-off-by: lacora Co-authored-by: Ye Hu --- tests/entrypoints/test_context.py | 93 ++++++++++++++++++-- vllm/entrypoints/context.py | 65 +++++++++----- vllm/entrypoints/openai/protocol.py | 4 + vllm/entrypoints/openai/serving_responses.py | 88 +++++++++++------- 4 files changed, 188 insertions(+), 62 deletions(-) diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py index b0faa870a9272..31ea856224f90 100644 --- a/tests/entrypoints/test_context.py +++ b/tests/entrypoints/test_context.py @@ -6,7 +6,11 @@ from unittest.mock import MagicMock, patch import pytest from openai_harmony import Author, Message, Role, StreamState, TextContent -from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext +from vllm.entrypoints.context import ( + HarmonyContext, + StreamingHarmonyContext, + TurnMetrics, +) from vllm.outputs import CompletionOutput, RequestOutput @@ -101,8 +105,12 @@ def test_single_turn_token_counting(): # Verify internal state tracking assert not context.is_first_turn - assert context.previous_turn.input_tokens == 5 - assert context.previous_turn.output_tokens == 3 + assert len(context.all_turn_metrics) == 1 + previous_turn = context.all_turn_metrics[0] + assert previous_turn.input_tokens == 5 + assert previous_turn.output_tokens == 3 + assert previous_turn.cached_input_tokens == 2 + assert previous_turn.tool_output_tokens == 0 @pytest.mark.asyncio @@ -156,6 +164,15 @@ async def test_multi_turn_token_counting(): assert context.num_tool_output_tokens == expected_tool_output assert context.num_cached_tokens == 5 + 15 + # Validate all turn metrics + assert len(context.all_turn_metrics) == 3 + for i, turn in enumerate(context.all_turn_metrics): + assert turn.input_tokens == prompt_token_counts[i] + assert turn.output_tokens == output_token_counts[i] + assert turn.cached_input_tokens == 
cached_token_counts[i] + assert context.all_turn_metrics[1].tool_output_tokens == 7 + assert context.all_turn_metrics[2].tool_output_tokens == 1 + def test_empty_output_tokens(): """Test behavior when RequestOutput has empty output tokens.""" @@ -314,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser): # Create a streaming context context = StreamingHarmonyContext(messages=[], available_tools=["browser"]) + num_prompt_tokens = [3, 8, 13] + num_output_tokens = [3, 3, 2] + num_cached_tokens = [0, 3, 8] + # Simulate three turns of conversation: # Turn 1: stream tokens one by one, then finish the message # Turn 2: new prompt, stream more tokens with a reasoning segment @@ -325,7 +346,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): create_mock_request_output( prompt_token_ids=[1, 2, 3], # 3 prompt tokens output_token_ids=[101], # Single token - num_cached_tokens=0, + num_cached_tokens=num_cached_tokens[0], finished=False, # Not end of message yet ) ) @@ -370,7 +391,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): 5, ], # 8 tokens (includes previous) output_token_ids=[201], - num_cached_tokens=3, # Some tokens cached + num_cached_tokens=num_cached_tokens[1], # Some tokens cached finished=False, ) ) @@ -422,7 +443,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser): 7, ], # 13 tokens output_token_ids=[301], - num_cached_tokens=8, # More cached tokens + num_cached_tokens=num_cached_tokens[2], # More cached tokens finished=False, ) ) @@ -435,10 +456,12 @@ async def test_streaming_multi_turn_token_counting(mock_parser): ) # Final token counts check - assert context.num_prompt_tokens == 3 + 8 + 13 # All prompts - assert context.num_output_tokens == 3 + 3 + 2 # All outputs + assert context.num_prompt_tokens == sum(num_prompt_tokens) # All prompts + assert context.num_output_tokens == sum(num_output_tokens) # All outputs assert context.num_reasoning_tokens == 3 # Unchanged from second turn - 
assert context.num_cached_tokens == 3 + 8 # Accumulated cached tokens + assert context.num_cached_tokens == sum( + num_cached_tokens + ) # Accumulated cached tokens # Additional tool tokens from third turn # Formula: this turn prompt - last turn prompt - last turn output @@ -447,6 +470,15 @@ async def test_streaming_multi_turn_token_counting(mock_parser): context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens ) + # Validate all turn metrics + assert len(context.all_turn_metrics) == 3 + for i, turn in enumerate(context.all_turn_metrics): + assert turn.input_tokens == num_prompt_tokens[i] + assert turn.output_tokens == num_output_tokens[i] + assert turn.cached_input_tokens == num_cached_tokens[i] + assert context.all_turn_metrics[1].tool_output_tokens == 2 + assert context.all_turn_metrics[2].tool_output_tokens == 2 + @pytest.mark.asyncio async def test_streaming_message_synchronization(mock_parser): @@ -522,3 +554,46 @@ async def test_streaming_message_synchronization(mock_parser): assert len(context._messages) == 3 assert context.num_init_messages == 1 assert context._messages[2].content[0].text == "Response 4" + + +def test_turn_metrics_copy_and_reset(): + """Test TurnMetrics copy and reset methods work correctly.""" + # Create a TurnMetrics with specific values + original_metrics = TurnMetrics( + input_tokens=10, + output_tokens=20, + cached_input_tokens=5, + tool_output_tokens=3, + ) + + # Test copy functionality + copied_metrics = original_metrics.copy() + + # Verify copy has same values + assert copied_metrics.input_tokens == 10 + assert copied_metrics.output_tokens == 20 + assert copied_metrics.cached_input_tokens == 5 + assert copied_metrics.tool_output_tokens == 3 + + # Verify they are separate objects + assert copied_metrics is not original_metrics + + # Modify copy to ensure independence + copied_metrics.input_tokens = 999 + assert original_metrics.input_tokens == 10 # Original unchanged + assert copied_metrics.input_tokens == 999 
+ + # Test reset functionality + original_metrics.reset() + + # Verify all fields are reset to zero + assert original_metrics.input_tokens == 0 + assert original_metrics.output_tokens == 0 + assert original_metrics.cached_input_tokens == 0 + assert original_metrics.tool_output_tokens == 0 + + # Verify copied metrics are unaffected by reset + assert copied_metrics.input_tokens == 999 + assert copied_metrics.output_tokens == 20 + assert copied_metrics.cached_input_tokens == 5 + assert copied_metrics.tool_output_tokens == 3 diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index c694bcfaaa756..8f94880e431be 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -45,21 +45,36 @@ def _map_tool_name_to_tool_type(tool_name: str) -> str: return _TOOL_NAME_TO_TYPE_MAP[tool_name] -class TurnTokens: - """Tracks token counts for a single conversation turn.""" +class TurnMetrics: + """Tracks token and toolcall details for a single conversation turn.""" - def __init__(self, input_tokens=0, output_tokens=0): + def __init__( + self, + input_tokens=0, + output_tokens=0, + cached_input_tokens=0, + tool_output_tokens=0, + ): self.input_tokens = input_tokens self.output_tokens = output_tokens + self.cached_input_tokens = cached_input_tokens + self.tool_output_tokens = tool_output_tokens def reset(self): """Reset counters for a new turn.""" self.input_tokens = 0 self.output_tokens = 0 + self.cached_input_tokens = 0 + self.tool_output_tokens = 0 def copy(self): """Create a copy of this turn's token counts.""" - return TurnTokens(self.input_tokens, self.output_tokens) + return TurnMetrics( + self.input_tokens, + self.output_tokens, + self.cached_input_tokens, + self.tool_output_tokens, + ) class ConversationContext(ABC): @@ -102,6 +117,8 @@ class SimpleContext(ConversationContext): self.num_cached_tokens = 0 # todo num_reasoning_tokens is not implemented yet. 
self.num_reasoning_tokens = 0 + # not implemented yet for SimpleContext + self.all_turn_metrics = [] def append_output(self, output) -> None: self.last_output = output @@ -154,8 +171,9 @@ class HarmonyContext(ConversationContext): self.num_tool_output_tokens = 0 # Turn tracking - replaces multiple individual tracking variables - self.current_turn = TurnTokens() - self.previous_turn = TurnTokens() + self.current_turn_metrics = TurnMetrics() + # Track metrics for all turns + self.all_turn_metrics: list[TurnMetrics] = [] self.is_first_turn = True self.first_tok_of_message = True # For streaming support @@ -173,11 +191,10 @@ class HarmonyContext(ConversationContext): # Check if the current token is part of reasoning content self._update_num_reasoning_tokens() self._update_prefill_token_usage(output) - # Reset current turn output tokens for this turn - self.current_turn.output_tokens = 0 self._update_decode_token_usage(output) - # Move current turn to previous turn for next turn's calculations - self.previous_turn = self.current_turn.copy() + # Append current turn to all turn list for next turn's calculations + self.all_turn_metrics.append(self.current_turn_metrics.copy()) + self.current_turn_metrics.reset() # append_output is called only once before tool calling # in non-streaming case # so we can append all the parser messages to _messages @@ -213,20 +230,21 @@ class HarmonyContext(ConversationContext): logger.error("RequestOutput appended contains no prompt_token_ids.") # Update current turn input tokens - self.current_turn.input_tokens = this_turn_input_tokens + self.current_turn_metrics.input_tokens = this_turn_input_tokens self.num_prompt_tokens += this_turn_input_tokens # Calculate tool tokens (except on first turn) if self.is_first_turn: self.is_first_turn = False else: + previous_turn = self.all_turn_metrics[-1] # start counting tool after first turn # tool tokens = this turn prefill - last turn prefill - # last turn decode this_turn_tool_tokens = ( - 
self.current_turn.input_tokens - - self.previous_turn.input_tokens - - self.previous_turn.output_tokens + self.current_turn_metrics.input_tokens + - previous_turn.input_tokens + - previous_turn.output_tokens ) # Handle negative tool token counts (shouldn't happen in normal @@ -237,17 +255,20 @@ class HarmonyContext(ConversationContext): "(current_input=%d, previous_input=%d, " "previous_output=%d). Setting to 0.", this_turn_tool_tokens, - self.current_turn.input_tokens, - self.previous_turn.input_tokens, - self.previous_turn.output_tokens, + self.current_turn_metrics.input_tokens, + previous_turn.input_tokens, + previous_turn.output_tokens, ) this_turn_tool_tokens = 0 self.num_tool_output_tokens += this_turn_tool_tokens + self.current_turn_metrics.tool_output_tokens = this_turn_tool_tokens # Update cached tokens - if output.num_cached_tokens is not None: - self.num_cached_tokens += output.num_cached_tokens + num_cached_token = output.num_cached_tokens + if num_cached_token is not None: + self.num_cached_tokens += num_cached_token + self.current_turn_metrics.cached_input_tokens = num_cached_token def _update_decode_token_usage(self, output: RequestOutput) -> int: """Update token usage statistics for the decode phase of generation. @@ -272,7 +293,7 @@ class HarmonyContext(ConversationContext): # only keep last round updated_output_token_count += len(completion_output.token_ids) self.num_output_tokens += updated_output_token_count - self.current_turn.output_tokens += updated_output_token_count + self.current_turn_metrics.output_tokens += updated_output_token_count return updated_output_token_count @property @@ -452,7 +473,6 @@ class StreamingHarmonyContext(HarmonyContext): # so we only want to add the prompt tokens once for each message. 
if self.first_tok_of_message: self._update_prefill_token_usage(output) - self.current_turn.output_tokens = 0 # Reset self.first_tok_of_message if needed: # if the current token is the last one of the current message # (finished=True), then the next token processed will mark the @@ -464,7 +484,8 @@ class StreamingHarmonyContext(HarmonyContext): # For streaming, update previous turn when message is complete if output.finished: - self.previous_turn = self.current_turn.copy() + self.all_turn_metrics.append(self.current_turn_metrics.copy()) + self.current_turn_metrics.reset() # Check if the current token is part of reasoning content self._update_num_reasoning_tokens() self.last_tok = tok diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f41fa196acd81..86e1e62ff437b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2103,11 +2103,15 @@ class TranscriptionStreamResponse(OpenAIBaseModel): class InputTokensDetails(OpenAIBaseModel): cached_tokens: int + input_tokens_per_turn: list[int] = Field(default_factory=list) + cached_tokens_per_turn: list[int] = Field(default_factory=list) class OutputTokensDetails(OpenAIBaseModel): reasoning_tokens: int = 0 tool_output_tokens: int = 0 + output_tokens_per_turn: list[int] = Field(default_factory=list) + tool_output_tokens_per_turn: list[int] = Field(default_factory=list) class ResponseUsage(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 51e2856a5a9dd..6cdabff6e709b 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -589,10 +589,24 @@ class OpenAIServingResponses(OpenAIServing): input_tokens=num_prompt_tokens, output_tokens=num_generated_tokens, total_tokens=num_prompt_tokens + num_generated_tokens, - input_tokens_details=InputTokensDetails(cached_tokens=num_cached_tokens), + 
input_tokens_details=InputTokensDetails( + cached_tokens=num_cached_tokens, + input_tokens_per_turn=[ + turn.input_tokens for turn in context.all_turn_metrics + ], + cached_tokens_per_turn=[ + turn.cached_input_tokens for turn in context.all_turn_metrics + ], + ), output_tokens_details=OutputTokensDetails( reasoning_tokens=num_reasoning_tokens, tool_output_tokens=num_tool_output_tokens, + output_tokens_per_turn=[ + turn.output_tokens for turn in context.all_turn_metrics + ], + tool_output_tokens_per_turn=[ + turn.tool_output_tokens for turn in context.all_turn_metrics + ], ), ) response = ResponsesResponse.from_request( @@ -665,11 +679,13 @@ class OpenAIServingResponses(OpenAIServing): token=text, logprob=max(token_logprob.logprob, -9999.0), bytes=list(text.encode("utf-8", errors="replace")), - top_logprobs=self._topk_logprobs( - logprob, top_logprobs=top_logprobs, tokenizer=tokenizer - ) - if top_logprobs - else [], + top_logprobs=( + self._topk_logprobs( + logprob, top_logprobs=top_logprobs, tokenizer=tokenizer + ) + if top_logprobs + else [] + ), ) ) return out @@ -758,14 +774,16 @@ class OpenAIServingResponses(OpenAIServing): text=content, annotations=[], # TODO type="output_text", - logprobs=self._create_response_logprobs( - token_ids=final_output.token_ids, - logprobs=final_output.logprobs, - tokenizer=tokenizer, - top_logprobs=request.top_logprobs, - ) - if request.is_include_output_logprobs() - else None, + logprobs=( + self._create_response_logprobs( + token_ids=final_output.token_ids, + logprobs=final_output.logprobs, + tokenizer=tokenizer, + top_logprobs=request.top_logprobs, + ) + if request.is_include_output_logprobs() + else None + ), ) message = ResponseOutputMessage( id=f"msg_{random_uuid()}", @@ -870,15 +888,21 @@ class OpenAIServingResponses(OpenAIServing): with_custom_tools = has_custom_tools(tool_types) sys_msg = get_system_message( reasoning_effort=reasoning_effort, - browser_description=self.tool_server.get_tool_description("browser") - if 
enable_browser and self.tool_server is not None - else None, - python_description=self.tool_server.get_tool_description("python") - if enable_code_interpreter and self.tool_server is not None - else None, - container_description=self.tool_server.get_tool_description("container") - if enable_container and self.tool_server is not None - else None, + browser_description=( + self.tool_server.get_tool_description("browser") + if enable_browser and self.tool_server is not None + else None + ), + python_description=( + self.tool_server.get_tool_description("python") + if enable_code_interpreter and self.tool_server is not None + else None + ), + container_description=( + self.tool_server.get_tool_description("container") + if enable_container and self.tool_server is not None + else None + ), instructions=request.instructions, with_custom_tools=with_custom_tools, ) @@ -1283,14 +1307,16 @@ class OpenAIServingResponses(OpenAIServing): output_index=current_output_index, item_id=current_item_id, delta=delta_message.content, - logprobs=self._create_stream_response_logprobs( - token_ids=output.token_ids, - logprobs=output.logprobs, - tokenizer=tokenizer, - top_logprobs=request.top_logprobs, - ) - if request.is_include_output_logprobs() - else [], + logprobs=( + self._create_stream_response_logprobs( + token_ids=output.token_ids, + logprobs=output.logprobs, + tokenizer=tokenizer, + top_logprobs=request.top_logprobs, + ) + if request.is_include_output_logprobs() + else [] + ), ) ) current_content_index += 1 From 579d2e5458b19c442f48e0cba0ba71c5d4abf6ea Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 14 Oct 2025 19:51:54 -0400 Subject: [PATCH 05/51] [WideEP][P/D] Add usage stats for DP+EP and KV Connector (#26836) Signed-off-by: Tyler Michael Smith --- vllm/v1/utils.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index f03efe21098bf..6aebe295b5ce5 100644 --- a/vllm/v1/utils.py +++ 
b/vllm/v1/utils.py @@ -345,13 +345,17 @@ def report_usage_stats( parallel_config = vllm_config.parallel_config + # Prepare KV connector string if applicable + kv_connector = None + if vllm_config.kv_transfer_config is not None: + kv_connector = vllm_config.kv_transfer_config.kv_connector + usage_message.report_usage( get_architecture_class_name(vllm_config.model_config), usage_context, extra_kvs={ # Common configuration "dtype": str(vllm_config.model_config.dtype), - "tensor_parallel_size": parallel_config.tensor_parallel_size, "block_size": vllm_config.cache_config.block_size, "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization, "kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes, @@ -363,6 +367,15 @@ def report_usage_stats( "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching, "enforce_eager": vllm_config.model_config.enforce_eager, "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce, + # Distributed parallelism settings + "tensor_parallel_size": parallel_config.tensor_parallel_size, + "data_parallel_size": parallel_config.data_parallel_size, + "pipeline_parallel_size": parallel_config.pipeline_parallel_size, + "enable_expert_parallel": parallel_config.enable_expert_parallel, + # All2All backend for MoE expert parallel + "all2all_backend": parallel_config.all2all_backend, + # KV connector used + "kv_connector": kv_connector, }, ) From 2dcd12d3571b070432ad1cd321a67b840b4a34b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Tue, 14 Oct 2025 19:55:02 -0400 Subject: [PATCH 06/51] [torch.compile] Fix tests for torch==2.9 inductor partition (#26116) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ProExpertProg Signed-off-by: Luka Govedič --- .../compile/piecewise/test_full_cudagraph.py | 29 +++-- .../compile/piecewise/test_multiple_graphs.py | 38 ++++-- tests/compile/piecewise/test_toy_llama.py | 117 
+++++++++++------- tests/compile/silly_attention.py | 1 - tests/compile/test_decorator.py | 3 + vllm/attention/layer.py | 6 - vllm/compilation/partition_rules.py | 13 +- vllm/config/compilation.py | 3 +- 8 files changed, 138 insertions(+), 72 deletions(-) diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 84194f3ed01e8..e01b58220959f 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -11,6 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM, SamplingParams from vllm.config import CompilationConfig from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer @contextlib.contextmanager @@ -32,13 +33,13 @@ def temporary_environ(env_vars): os.environ[k] = v -test_params_full_cudagraph = [] +model_backends_full_cudagraph = [] # deepseek-ai/DeepSeek-V2-Lite with MLA MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"] for mla_backend in MLA_backends: - test_params_full_cudagraph.append( - pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])) + model_backends_full_cudagraph.append( + ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]) ) # Qwen/Qwen2-1.5B-Instruct with other backends @@ -46,14 +47,18 @@ other_backend_configs = [ backend_configs[c] for c in backend_configs if c not in MLA_backends ] for backend_config in other_backend_configs: - test_params_full_cudagraph.append( - pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config)) - ) + model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config)) @pytest.fixture(scope="class") def llm_pair(request): - model, backend_config = request.param + model, backend_config, use_inductor_graph_partition = request.param + backend_config.comp_config["use_inductor_graph_partition"] = ( + use_inductor_graph_partition + ) + + if 
use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition only supported in torch>=2.9") # Dynamically skip test if GPU capability is not met if ( @@ -104,7 +109,15 @@ def llm_pair(request): ) -@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True) +@pytest.mark.parametrize( + "llm_pair", + [ + pytest.param((model, backend_config, use_inductor_graph_partition)) + for model, backend_config in model_backends_full_cudagraph + for use_inductor_graph_partition in [True, False] + ], + indirect=True, +) class TestFullCUDAGraph: """ Use a class such that an llm pair is constructed once for all diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index d88645e3bfd62..0d265bc596386 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules are compiled and graph captured separately. 
""" +import pytest import torch from torch import nn @@ -190,7 +191,12 @@ def run_model( return output.cpu() -def test_multi_graph_piecewise_compile_outputs_equal(): +@pytest.mark.parametrize("use_inductor_graph_partition", [False, True]) +def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): + if use_inductor_graph_partition: + # FIXME(luka/boyuan): this currently fails + pytest.skip("Inductor graph partition not supported with multi-graph") + outputs = [] # piecewise compile @@ -200,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -220,16 +227,24 @@ def test_multi_graph_piecewise_compile_outputs_equal(): # static tensor addresses inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda() - with compilation_counter.expect( - num_graphs_seen=2, # two graphs for the model - num_piecewise_graphs_seen=6, + if use_inductor_graph_partition: + # Splitting happens at Inductor lowering level, + # total piecewise fx graphs is equal to total graphs + num_piecewise_fx = 2 + num_piecewise_capturable_fx = 2 + else: # attn_one, attn_two each has 3 piecewise graphs # (pre attn, post attn, silly_attention) each - num_piecewise_capturable_graphs_seen=4, + num_piecewise_fx = 6 # attn_one, attn_two has pre attn and post attn each, total=4 - num_backend_compilations=4, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=8, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_capturable_fx = 4 + + with compilation_counter.expect( + num_graphs_seen=2, # two graphs for the model + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, + num_cudagraph_captured=8, # num_cudagraph_sizes * num_partitions ): 
outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) @@ -268,6 +283,7 @@ def test_multi_graph_piecewise_compile_outputs_equal(): level=CompilationLevel.PIECEWISE, use_cudagraph=False, splitting_ops=["silly::attention"], + use_inductor_graph_partition=use_inductor_graph_partition, ) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal(): with compilation_counter.expect( num_graphs_seen=2, - num_piecewise_graphs_seen=6, - num_piecewise_capturable_graphs_seen=4, - num_backend_compilations=4, + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, num_cudagraph_captured=0, # no cudagraph captured ): outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode)) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index eaf0a15479e97..7ab610fa78115 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -9,6 +9,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are initialized randomly with a fixed seed. """ +from copy import deepcopy from dataclasses import dataclass from typing import Any @@ -26,6 +27,7 @@ from vllm.config import ( set_current_vllm_config, ) from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from .. 
import silly_attention # noqa: F401 @@ -257,27 +259,13 @@ def tractable_computation( @torch.inference_mode -def run_model( - llama_config, use_compile: bool, backend: str, split_attn: bool = False -) -> torch.Tensor: - if use_compile: - compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, - use_cudagraph=True, - backend=backend, - cudagraph_capture_sizes=[1, 2], - ) - if split_attn: - compilation_config.splitting_ops = ["silly::attention"] - cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE - else: - compilation_config = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, - ) - cudagraph_runtime_mode = CUDAGraphMode.NONE +def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor: + # Start with a fresh copy to make sure there's no cache dir sharing + compile_config = deepcopy(compile_config) + cudagraph_runtime_mode = compile_config.cudagraph_mode vllm_config = VllmConfig( - compilation_config=compilation_config, additional_config=llama_config + compilation_config=compile_config, additional_config=llama_config ) with set_current_vllm_config(vllm_config): model = ( @@ -338,8 +326,25 @@ def run_model( return output.cpu() -@pytest.mark.parametrize("backend", ["inductor", "eager"]) -def test_toy_llama(backend: str): +@pytest.mark.parametrize( + "backend, use_inductor_graph_partition", + [ + ("eager", False), # No inductor + ("inductor", False), # Inductor, Dynamo partition + ("inductor", True), # Inductor, Inductor partition + ], +) +def test_toy_llama( + backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path +): + # We disable the vLLM compile cache into a new tmp dir for 2 reasons: + # 1. To make sure we can properly track the number of Inductor compilations. + # 2. 
Inductor partitioning does not play nicely with Autograd cache (below) + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition only supported in torch>=2.9") + # compare output with and without piecewise compilation llama_config = LlamaConfig( @@ -350,6 +355,32 @@ def test_toy_llama(backend: str): hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True ) + compile_config_no_compile = CompilationConfig( + level=CompilationLevel.NO_COMPILATION, + cudagraph_mode=CUDAGraphMode.NONE, + backend="eager", + ) + + compile_config_no_split = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor_graph_partition=use_inductor_graph_partition, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + backend=backend, + cudagraph_capture_sizes=[1, 2], + ) + + # FIXME(luka/boyuan): the graph from the previous test case + # (no inductor partition) gets cached by AotAutograd so then the + # compilation with inductor partitioning incorrectly loads an unpartitioned + # graph and never partitions. I think this is a bug with custom inductor + # partitioning but does not affect vLLM more generally as vLLM uses its own + # cache (which takes inductor partitioning into account). 
+ if use_inductor_graph_partition: + compile_config_no_split.inductor_compile_config["force_disable_caches"] = True + + compile_config_split = deepcopy(compile_config_no_split) + compile_config_split.splitting_ops = ["silly::attention"] + outputs = [] with compilation_counter.expect( num_graphs_seen=0, @@ -358,8 +389,9 @@ def test_toy_llama(backend: str): num_backend_compilations=0, num_cudagraph_captured=0, ): - outputs.append(run_model(llama_config, backend="eager", use_compile=False)) - run_model(tractable_config, backend="eager", use_compile=False) + outputs.append(run_model(llama_config, compile_config_no_compile)) + + run_model(tractable_config, compile_config_no_compile) if backend == "inductor": kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0} @@ -367,35 +399,34 @@ def test_toy_llama(backend: str): kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} with compilation_counter.expect( - # One graph for the model - num_graphs_seen=1, + num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, num_piecewise_capturable_graphs_seen=1, - # num_piecewise_capturable_graphs_seen - num_backend_compilations=1, - # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_backend_compilations=1, # num_piecewise_capturable_graphs_seen num_cudagraph_captured=2, **kwargs, ): - outputs.append(run_model(llama_config, backend=backend, use_compile=True)) - run_model(tractable_config, backend=backend, use_compile=True) + outputs.append(run_model(llama_config, compile_config_no_split)) + + run_model(tractable_config, compile_config_no_split) + + if use_inductor_graph_partition: + num_piecewise_fx = 1 + num_piecewise_capturable_fx = 1 + else: + num_piecewise_fx = 2 * llama_config.num_layers + 1 + num_piecewise_capturable_fx = 1 + llama_config.num_layers with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=2 * llama_config.num_layers + 1, # 2 * num_layers + 1 - 
num_piecewise_capturable_graphs_seen=1 - + llama_config.num_layers, # 1 + num_layers - num_backend_compilations=1 - + llama_config.num_layers, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured=2 - * ( - 1 + llama_config.num_layers - ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=num_piecewise_fx, + num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx, + num_backend_compilations=num_piecewise_capturable_fx, + # num_cudagraph_sizes * num_partitions + num_cudagraph_captured=2 * (1 + llama_config.num_layers), ): - outputs.append( - run_model(llama_config, backend=backend, use_compile=True, split_attn=True) - ) - run_model(tractable_config, backend=backend, use_compile=True, split_attn=True) + outputs.append(run_model(llama_config, compile_config_split)) + run_model(tractable_config, compile_config_split) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index c0d3f908149f6..f33c5772906a6 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -62,5 +62,4 @@ direct_register_custom_op( mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, - tags=(torch._C.Tag.cudagraph_unsafe,), ) diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 6b050207ec41b..63cb266094a12 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -73,6 +73,7 @@ def test_ignore_torch_compile_decorator(): use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=False, # TODO test both? 
) ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -188,6 +189,7 @@ def test_conditional_compile_enable_if(): use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=False, # TODO test both ), ) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE @@ -220,6 +222,7 @@ def test_conditional_compile_enable_if(): use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], + use_inductor_graph_partition=False, # TODO test both? ), ) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index fe9de65b52c66..8b5b87cba4044 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -38,10 +38,6 @@ from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None -try: - tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe,) -except AttributeError: - tag_cudagraph_unsafe = () # type: ignore[assignment] def check_xformers_availability(): @@ -879,7 +875,6 @@ direct_register_custom_op( op_name="unified_attention", op_func=unified_attention, fake_impl=unified_attention_fake, - tags=tag_cudagraph_unsafe, ) @@ -931,7 +926,6 @@ direct_register_custom_op( op_func=unified_attention_with_output, mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, - tags=tag_cudagraph_unsafe, ) diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py index 5ea1b30860f59..cea4f9a816377 100644 --- a/vllm/compilation/partition_rules.py +++ b/vllm/compilation/partition_rules.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib +import logging from typing import TYPE_CHECKING from torch._library.utils import lookup_op @@ -38,8 +39,16 @@ def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]: resolved.append(lookup_op(op_name)) except Exception: # Skip operators that don't exist (e.g., model-specific ops) 
- logger.warning( - "Failed to resolve operator for Inductor partition: %s", op_name + # Do not warn for attention ops, warn for others + # (most likely manually specified) + from vllm.config import CompilationConfig + + logger.log( + logging.DEBUG + if op_name in CompilationConfig._attention_ops + else logging.WARNING, + "Failed to resolve operator for CUDAGraph partition: %s", + op_name, ) continue diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 60aef2f6f7e1c..fb80835ba48a1 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -201,7 +201,7 @@ class CompilationConfig: (it sees a part of the graph). The backend can not be custom for compilation level 3, i.e. the backend must be either eager or inductor. Furthermore, compilation is only piecewise if splitting ops is set accordingly and - use_inductor_cudagraphs_partition is off. Note that the default options for + use_inductor_graph_partition is off. Note that the default options for splitting ops are sufficient for piecewise compilation. 
""" custom_ops: list[str] = field(default_factory=list) @@ -431,6 +431,7 @@ class CompilationConfig: factors.append(self.custom_ops) factors.append(self.splitting_ops) factors.append(self.use_inductor) + factors.append(self.use_inductor_graph_partition) factors.append(self.inductor_compile_config) factors.append(self.inductor_passes) factors.append(self.pass_config.uuid()) From 07ca70af8d8a0d0e20727d8de6972a7ad87cf996 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 14 Oct 2025 18:41:18 -0700 Subject: [PATCH 07/51] [Core][Easy] Use envs.__getattr__ for all Unify to environment variable access (#26810) Signed-off-by: Jialin Ouyang --- vllm/multimodal/cache.py | 6 +++--- vllm/transformers_utils/utils.py | 4 ++-- vllm/utils/gc_utils.py | 6 +++--- vllm/v1/engine/async_llm.py | 5 ++--- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index f6ef675aa7c29..a29da2a56afc1 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -10,12 +10,12 @@ from typing import TYPE_CHECKING, Generic, TypeAlias, TypeVar, cast import torch from typing_extensions import override +import vllm.envs as envs from vllm.distributed.device_communicators.shm_object_storage import ( MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer, ) -from vllm.envs import VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME from vllm.logger import init_logger from vllm.utils import GiB_bytes, MiB_bytes from vllm.utils.cache import CacheInfo, LRUCache @@ -436,7 +436,7 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache): ring_buffer = SingleWriterShmRingBuffer( data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes), - name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, + name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, create=True, # sender is the writer ) self._shm_cache = SingleWriterShmObjectStorage( @@ -678,7 +678,7 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache): ring_buffer = 
SingleWriterShmRingBuffer( data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes), - name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, + name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME, create=False, # Server is a reader ) self._shm_cache = SingleWriterShmObjectStorage( diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index b87414d79df0f..58c754dbd3974 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -8,7 +8,7 @@ from os import PathLike from pathlib import Path from typing import Any -from vllm.envs import VLLM_MODEL_REDIRECT_PATH +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -86,7 +86,7 @@ def maybe_model_redirect(model: str) -> str: :return: maybe redirect to a local folder """ - model_redirect_path = VLLM_MODEL_REDIRECT_PATH + model_redirect_path = envs.VLLM_MODEL_REDIRECT_PATH if not model_redirect_path: return model diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py index 99c19c9db28e9..6894ccff11d93 100644 --- a/vllm/utils/gc_utils.py +++ b/vllm/utils/gc_utils.py @@ -7,7 +7,7 @@ from collections import Counter from contextlib import suppress from typing import Any -from vllm.envs import VLLM_GC_DEBUG +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -36,7 +36,7 @@ class GCDebugConfig: self.top_objects = json_conf.get("top_objects", -1) except Exception: self.enabled = False - logger.error("Failed to parse VLLM_GC_DEBUG(%s)", VLLM_GC_DEBUG) + logger.error("Failed to parse VLLM_GC_DEBUG(%s)", envs.VLLM_GC_DEBUG) logger.info("GC Debug Config. %s", str(self)) def __repr__(self) -> str: @@ -93,7 +93,7 @@ def maybe_attach_gc_debug_callback() -> None: """ Attached a callback for GC debug when VLLM_GC_DEBUG is enabled. 
""" - config = GCDebugConfig(VLLM_GC_DEBUG) + config = GCDebugConfig(envs.VLLM_GC_DEBUG) if config.enabled: debugger: GCDebugger = GCDebugger(config) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 39cd1d97c280a..0ec153e233161 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -16,7 +16,6 @@ from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.utils import _validate_truncation_size -from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -483,12 +482,12 @@ class AsyncLLM(EngineClient): # Split outputs into chunks of at most # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the # event loop for too long. - if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: + if num_outputs <= envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: slices = (outputs.outputs,) else: slices = np.array_split( outputs.outputs, - cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), + cdiv(num_outputs, envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE), ) for i, outputs_slice in enumerate(slices): From 9354660036dff11a81433f0695c71dfee75cce50 Mon Sep 17 00:00:00 2001 From: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com> Date: Wed, 15 Oct 2025 09:50:30 +0800 Subject: [PATCH 08/51] [Bugfix]fix Qwen3 xml tool parser (#26345) Signed-off-by: Zhikaiiii <1658973216@qq.com> --- tests/tool_use/test_qwen3coder_tool_parser.py | 88 ++++++++++++- .../tool_parsers/qwen3xml_tool_parser.py | 117 ++++++++++++++---- 2 files changed, 179 insertions(+), 26 deletions(-) diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index b4f0989b1b19c..93ef1049fc07e 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -40,7 +40,7 @@ def 
qwen3_xml_tool_parser(qwen3_tokenizer): return Qwen3XMLToolParser(qwen3_tokenizer) -@pytest.fixture(params=["original", "xml"]) +@pytest.fixture(params=["xml"]) def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request): """Parameterized fixture that provides both parser types for testing""" if request.param == "original": @@ -664,6 +664,9 @@ def test_extract_tool_calls_streaming( # Verify we got all expected tool calls assert len(tool_states) == len(expected_tool_calls) + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len( + expected_tool_calls + ) # Verify each tool call for idx, expected_tool in enumerate(expected_tool_calls): @@ -780,9 +783,10 @@ fahrenheit # Verify content was streamed assert "Let me check the weather for you:" in other_content - # Verify we got the tool call assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + state = tool_states[0] assert state["id"] is not None assert state["type"] == "function" @@ -892,3 +896,83 @@ def test_extract_tool_calls_complex_type_with_single_quote( args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) assert args["obj_param"] == {"key": "value"} + + +def test_extract_tool_calls_streaming_missing_opening_tag( + qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools +): + """Test streaming with missing opening <tool_call> tag + + This tests that the streaming parser correctly handles + tool calls that start directly with <function=...> + """ + model_output = """I'll check the weather for you.
+ +<function=get_current_weather> +<parameter=city> +Dallas +</parameter> +<parameter=state> +TX +</parameter> +<parameter=unit> +fahrenheit +</parameter> +</function> +</tool_call>""" + + request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools) + + other_content = "" + tool_states = {} + + for delta_message in stream_delta_message_generator( + qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request + ): + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += tool_call.function.arguments + + # Verify content was streamed + assert "I'll check the weather for you."
in other_content + + # Verify we got the tool call + assert len(tool_states) == 1 + assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 + + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + + # Verify arguments were parsed correctly despite missing opening tag + assert state["arguments"] is not None + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py index 2c5b0b6a85f76..9964d1ac25c40 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import json -import uuid from collections.abc import Sequence from typing import Any from xml.parsers.expat import ParserCreate import regex as re +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, @@ -375,14 +375,21 @@ class StreamingXMLToolCallParser: return buffer[: tag_end2 + 1], start_pos + tag_end2 + 1 else: # If currently not parsing tool calls (entering a tool_call), - # check if starts with + # check if starts with or if buffer == ""[: len(buffer)]: # Might be start of , wait for more data return None, start_pos + elif ( + buffer.startswith(" str | None: """Extract function name from various formats""" if attrs and "name" in attrs: @@ -1168,6 +1171,10 @@ class Qwen3XMLToolParser(ToolParser): super().__init__(tokenizer) self.parser = StreamingXMLToolCallParser() + # Add missing attributes for compatibility with serving_chat.py + self.prev_tool_call_arr: list[dict] = [] + self.streamed_args_for_tool: list[str] = 
[] + logger.info( "vLLM Successfully import tool parser %s !", self.__class__.__name__ ) @@ -1178,6 +1185,9 @@ class Qwen3XMLToolParser(ToolParser): request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: self.parser.reset_streaming_state() + # Reset tool call tracking arrays for new extraction + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [] if request: self.parser.set_tools(request.tools) result = self.parser.parse_single_streaming_chunks(model_output) @@ -1201,6 +1211,34 @@ class Qwen3XMLToolParser(ToolParser): ), ) ) + + # Update tool call tracking arrays for compatibility + tool_index = ( + tool_call.index + if tool_call.index is not None + else len(self.prev_tool_call_arr) - 1 + ) + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= tool_index: + self.prev_tool_call_arr.append({"name": "", "arguments": ""}) + while len(self.streamed_args_for_tool) <= tool_index: + self.streamed_args_for_tool.append("") + + # Update tool call information + self.prev_tool_call_arr[tool_index]["name"] = ( + tool_call.function.name + ) + self.prev_tool_call_arr[tool_index]["arguments"] = ( + tool_call.function.arguments + ) + + # Update streamed arguments + if tool_call.function.arguments: + self.streamed_args_for_tool[tool_index] = ( + tool_call.function.arguments + ) + return ExtractedToolCallInformation( tool_calls=tool_calls, tools_called=len(tool_calls) > 0, @@ -1219,6 +1257,9 @@ class Qwen3XMLToolParser(ToolParser): ) -> DeltaMessage | None: if not previous_text: self.parser.reset_streaming_state() + # Reset tool call tracking arrays for new streaming session + self.prev_tool_call_arr = [] + self.streamed_args_for_tool = [] if request: self.parser.set_tools(request.tools) @@ -1230,20 +1271,48 @@ class Qwen3XMLToolParser(ToolParser): open_calls = current_text.count( self.parser.tool_call_start_token ) - current_text.count(self.parser.tool_call_end_token) - if open_calls == 0 and 
self.parser.tool_call_index > 0: - # If current_call_id is None, use last_completed_call_id - call_id = ( - self.parser.current_call_id or self.parser.last_completed_call_id - ) - return DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.parser.tool_call_index - 1, - id=call_id, - function=DeltaFunctionCall(arguments=""), - type="function", - ) - ] - ) + if ( + open_calls == 0 + and self.parser.tool_call_index > 0 + or not self.parser.tool_call_index + and current_text + ): + return DeltaMessage(content="") + return None - return self.parser.parse_single_streaming_chunks(delta_text) + # Parse the delta text and get the result + result = self.parser.parse_single_streaming_chunks(delta_text) + + # Update tool call tracking arrays based on incremental parsing results + if result and result.tool_calls: + for tool_call in result.tool_calls: + if tool_call.function: + tool_index = ( + tool_call.index + if tool_call.index is not None + else len(self.prev_tool_call_arr) - 1 + ) + + # Ensure we have enough entries in our tracking arrays + while len(self.prev_tool_call_arr) <= tool_index: + self.prev_tool_call_arr.append({"name": "", "arguments": ""}) + while len(self.streamed_args_for_tool) <= tool_index: + self.streamed_args_for_tool.append("") + + # Update tool name if provided + if tool_call.function.name: + self.prev_tool_call_arr[tool_index]["name"] = ( + tool_call.function.name + ) + + # Update arguments incrementally + if tool_call.function.arguments is not None: + # Concatenate the incremental arguments + # to the existing streamed arguments + self.prev_tool_call_arr[tool_index]["arguments"] += ( + tool_call.function.arguments + ) + self.streamed_args_for_tool[tool_index] += ( + tool_call.function.arguments + ) + return result From bfad142e257be6699868f7816ca64c408bc32916 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 14 Oct 2025 21:33:25 -0500 Subject: [PATCH 09/51] [BUGFIX][NIXL] quick fix for 'assert self.connector_worker is not None' in 
get_kv_connector_stats (#26851) Signed-off-by: Chendi Xue --- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 490f209373db3..6a2434ddce8be 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -241,7 +241,8 @@ class NixlConnector(KVConnectorBase_V1): return self.connector_worker.get_block_ids_with_load_errors() def get_kv_connector_stats(self) -> KVConnectorStats | None: - assert self.connector_worker is not None + if self.connector_worker is None: + return None return self.connector_worker.get_kv_connector_stats() @classmethod From e66d787bce22c56f995f4e2974e31ac020bc57ea Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 14 Oct 2025 22:35:18 -0400 Subject: [PATCH 10/51] Disable FlashInfer sampler by default (#26859) Signed-off-by: mgoin --- vllm/v1/sample/ops/topk_topp_sampler.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index ed8bc55a3cf2f..43a40bce6847d 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -46,23 +46,15 @@ class TopKTopPSampler(nn.Module): "Falling back to default sampling implementation." ) self.forward = self.forward_native - elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False: - # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for - # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by - # default it is unused). For backward compatibility, we set - # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and - # interpret it differently in V0 and V1 samplers: In V0, - # None means False, while in V1, None means True. 
This is - # why we use the condition - # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here. + elif envs.VLLM_USE_FLASHINFER_SAMPLER: + # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1. logger.info_once("Using FlashInfer for top-p & top-k sampling.") self.forward = self.forward_cuda else: - logger.warning_once( - "FlashInfer is available, but it is not enabled. " - "Falling back to the PyTorch-native implementation of " - "top-p & top-k sampling. For the best performance, " - "please set VLLM_USE_FLASHINFER_SAMPLER=1." + logger.debug_once( + "FlashInfer top-p/top-k sampling is available but disabled " + "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in " + "after verifying accuracy for your workloads." ) self.forward = self.forward_native else: From 96b9aa5aa076e64c68765232aec343e4d0006e2a Mon Sep 17 00:00:00 2001 From: Morrison Turnansky Date: Tue, 14 Oct 2025 22:51:16 -0400 Subject: [PATCH 11/51] [Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: morrison-turnansky Signed-off-by: Morrison Turnansky Co-authored-by: Luka Govedič --- docs/configuration/conserving_memory.md | 4 +- docs/design/cuda_graphs.md | 4 +- examples/offline_inference/data_parallel.py | 2 +- .../compile/piecewise/test_multiple_graphs.py | 10 +- tests/compile/piecewise/test_simple.py | 4 +- tests/compile/piecewise/test_toy_llama.py | 10 +- tests/compile/test_aot_compile.py | 4 +- tests/compile/test_async_tp.py | 3 +- tests/compile/test_basic_correctness.py | 30 +++-- tests/compile/test_config.py | 20 ++-- tests/compile/test_decorator.py | 10 +- tests/compile/test_full_graph.py | 29 ++--- tests/compile/test_fusion.py | 4 +- tests/compile/test_fusion_all_reduce.py | 4 +- tests/compile/test_fusion_attn.py | 4 +- tests/compile/test_noop_elimination.py | 6 +- 
tests/compile/test_wrapper.py | 4 +- tests/distributed/test_sequence_parallel.py | 3 +- tests/engine/test_arg_utils.py | 20 ++-- tests/tpu/test_custom_dispatcher.py | 6 +- tests/utils_/test_utils.py | 10 +- tests/v1/cudagraph/test_cudagraph_dispatch.py | 22 ++-- tests/v1/cudagraph/test_cudagraph_mode.py | 39 +++---- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 6 +- vllm/compilation/backends.py | 4 +- vllm/compilation/compiler_interface.py | 2 +- vllm/compilation/counter.py | 4 +- vllm/compilation/decorators.py | 10 +- vllm/compilation/monitor.py | 6 +- vllm/compilation/wrapper.py | 8 +- vllm/config/__init__.py | 4 +- vllm/config/compilation.py | 106 ++++++++++++------ vllm/config/vllm.py | 50 ++++----- vllm/entrypoints/llm.py | 6 +- .../layers/quantization/utils/w8a8_utils.py | 4 +- vllm/platforms/cpu.py | 8 +- vllm/platforms/tpu.py | 11 +- vllm/platforms/xpu.py | 4 +- vllm/utils/__init__.py | 10 +- vllm/v1/cudagraph_dispatcher.py | 4 +- vllm/v1/spec_decode/eagle.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 15 +-- 42 files changed, 270 insertions(+), 248 deletions(-) diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 2b0654fa6d463..85906d23dee33 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc ```python from vllm import LLM - from vllm.config import CompilationConfig, CompilationLevel + from vllm.config import CompilationConfig, CompilationMode llm = LLM( model="meta-llama/Llama-3.1-8B-Instruct", compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # By default, it goes up to max_num_seqs cudagraph_capture_sizes=[1, 2, 4, 8, 16], ), diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md index 315746b0ef674..c6d71589be985 100644 --- a/docs/design/cuda_graphs.md +++ b/docs/design/cuda_graphs.md 
@@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum): """NO CUDA Graphs support""" ``` -Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. +Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture]. The following table lists backends that support full CUDA Graphs at the time of writing. 
@@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG") import vllm from vllm.config import CUDAGraphMode -compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} +compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} model = vllm.LLM( model="meta-llama/Llama-3.1-8B-Instruct", dtype="auto", diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 0076d4d30ee8e..a3e671a0f4cca 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -95,7 +95,7 @@ def parse_args(): parser.add_argument( "--compilation-config", type=int, - help=("Compilation optimization (O) level 0-3."), + help=("Compilation optimization (O) mode 0-3."), ) parser.add_argument( "--quantization", diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index 0d265bc596386..d1f741479acf4 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -14,7 +14,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): outputs = [] - # piecewise compile + # VLLM_COMPILE mode with piecewise splitting vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], @@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): # no compile or cudagraph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.NO_COMPILATION, + 
mode=CompilationMode.NONE, ) ) cudagraph_runtime_mode = CUDAGraphMode.NONE @@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool): # piecewise compile without CUDA graph vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=False, splitting_ops=["silly::attention"], use_inductor_graph_partition=use_inductor_graph_partition, diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index bc65e3da0ae74..f61a0a4eb740d 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -61,7 +61,7 @@ def _run_simple_model( ): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, use_inductor=use_inductor, splitting_ops=splitting_ops, diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 7ab610fa78115..75a89d692fa8f 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -21,7 +21,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -356,13 +356,13 @@ def test_toy_llama( ) compile_config_no_compile = CompilationConfig( - level=CompilationLevel.NO_COMPILATION, + level=CompilationMode.NONE, cudagraph_mode=CUDAGraphMode.NONE, backend="eager", ) compile_config_no_split = CompilationConfig( - 
level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_mode=CUDAGraphMode.PIECEWISE, backend=backend, @@ -458,14 +458,14 @@ def benchmark(): for piecewise in [False, True]: if piecewise: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=cudagraph_sizes, ) else: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_capture_sizes=cudagraph_sizes, ) diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index 08f79d90cd367..1701d85fe84e7 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -10,7 +10,7 @@ import torch from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, VllmConfig, set_current_vllm_config, ) @@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module): def make_vllm_config() -> VllmConfig: return VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, ) ) diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 102a929bf2409..60856f5a58067 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -10,6 +10,7 @@ import vllm.envs as envs from vllm.compilation.collective_fusion import AsyncTPPass from vllm.config import ( CompilationConfig, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -400,7 +401,7 @@ def test_async_tp_pass_correctness( common_args.append("--enforce-eager") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "compile_sizes": [2, 4, 8], "splitting_ops": [], "pass_config": {"enable_async_tp": async_tp_enabled}, diff --git 
a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index ab6a17e149fcd..954774a8e3983 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -4,7 +4,7 @@ import dataclasses import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from vllm.utils import cuda_device_count_stateless from ..utils import compare_all_settings @@ -21,7 +21,7 @@ class TestSetting: # we cannot afford testing the full Cartesian product -# of all models and all levels +# of all models and all modes @pytest.mark.parametrize( "test_setting", [ @@ -121,15 +121,13 @@ def test_compile_correctness( all_args: list[list[str]] = [] all_envs: list[dict[str, str] | None] = [] - for comp_level in [ - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - CompilationLevel.PIECEWISE, + for comp_mode in [ + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - for level in [CompilationLevel.NO_COMPILATION, comp_level]: - all_args.append( - final_args + [f"-O.level={level}", "-O.backend=inductor"] - ) + for mode in [CompilationMode.NONE, comp_mode]: + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"]) # inductor will change the output, so we only compare if the output # is close, not exactly the same. 
@@ -142,13 +140,13 @@ def test_compile_correctness( all_envs.clear() all_args.clear() - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - CompilationLevel.PIECEWISE, + for mode in [ + CompilationMode.NONE, + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, ]: - all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"]) + all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"]) all_envs.append({}) all_envs.append({}) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index ae8b0b226c313..7f51c763da73c 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -4,7 +4,7 @@ import pytest from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig -from vllm.config.compilation import CompilationLevel +from vllm.config.compilation import CompilationMode from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer @@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked -def test_dynamo_as_is(vllm_runner, monkeypatch): +def test_stock_torch_compile(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(dynamo_as_is_count=1), + compilation_counter.expect(stock_torch_compile_count=1), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 1}, + compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE}, gpu_memory_utilization=0.4, ) as _, ): @@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch): # Disable multiprocessing so that the counter is in the same 
process monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", - compilation_config={"level": 0}, + compilation_config={"mode": CompilationMode.NONE}, gpu_memory_utilization=0.4, ) as _, ): @@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch): monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") with ( - compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0), + compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0), # loading the model causes compilation (if enabled) to happen vllm_runner( "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4 @@ -151,7 +151,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, splitting_ops=["vllm::unified_attention"], ) @@ -163,7 +163,7 @@ def test_splitting_ops_dynamic(): # When attn_fusion pass enabled, splitting_ops now default to attention ops. 
config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, @@ -178,7 +178,7 @@ def test_splitting_ops_dynamic(): if is_torch_equal_or_newer("2.9.0.dev"): config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + level=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, pass_config={"enable_attn_fusion": True, "enable_noop": True}, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py index 63cb266094a12..4d60899a628a9 100644 --- a/tests/compile/test_decorator.py +++ b/tests/compile/test_decorator.py @@ -8,7 +8,7 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, VllmConfig, set_current_vllm_config, @@ -66,10 +66,10 @@ def run_model( def test_ignore_torch_compile_decorator(): - # piecewise + # vllmcompile vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], @@ -185,7 +185,7 @@ def test_conditional_compile_enable_if(): kv_sharing_fast_prefill=True, ), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], @@ -218,7 +218,7 @@ def test_conditional_compile_enable_if(): kv_sharing_fast_prefill=False, ), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_cudagraph=True, splitting_ops=["silly::attention"], cudagraph_capture_sizes=[1, 2], diff --git 
a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 2f3794c90b204..2d290771f9ad7 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams from vllm.attention.backends.registry import _Backend from vllm.attention.selector import global_force_attn_backend_context_manager -from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig +from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils import is_torch_equal_or_newer @@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE], ) @pytest.mark.parametrize("model_info", models_list(all=True)) @create_new_process_for_each_test() def test_full_graph( monkeypatch: pytest.MonkeyPatch, model_info: tuple[str, dict[str, Any]], - optimization_level: int, + compilation_mode: int, ): model, model_kwargs = model_info with monkeypatch.context(): print(f"MODEL={model}") - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) # TODO(luka) add other supported compilation config scenarios here @@ -104,7 +104,7 @@ def test_full_graph( [ # additional compile sizes, only some of the models ( - CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]), + CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]), model, ) for model in models_list(all=False) @@ -113,7 +113,7 @@ def test_full_graph( # RMSNorm + quant fusion, only 8-bit quant models ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, 
custom_ops=["+rms_norm"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ), @@ -125,7 +125,8 @@ def test_full_graph( # Test depyf integration works ( CompilationConfig( - level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir() + mode=CompilationMode.VLLM_COMPILE, + debug_dump_path=tempfile.gettempdir(), ), ("facebook/opt-125m", {}), ), @@ -134,7 +135,7 @@ def test_full_graph( # graph inductor partition ( CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, # inductor graph partition uses # torch._C.Tag.cudagraph_unsafe to specify splitting ops use_inductor_graph_partition=True, @@ -164,10 +165,10 @@ def test_custom_compile_config( @pytest.mark.parametrize( - "optimization_level", - [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE], + "compilation_mode", + [CompilationMode.NONE, CompilationMode.VLLM_COMPILE], ) -def test_fp8_kv_scale_compile(optimization_level: int): +def test_fp8_kv_scale_compile(compilation_mode: int): model = "Qwen/Qwen2-0.5B" model_kwargs = { "quantization": "fp8", @@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int): "calculate_kv_scales": True, "max_model_len": 512, } - run_model(optimization_level, model, model_kwargs) + run_model(compilation_mode, model, model_kwargs) def test_inductor_graph_partition_attn_fusion(caplog_vllm): @@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm): model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, use_inductor_graph_partition=True, cudagraph_mode=CUDAGraphMode.PIECEWISE, custom_ops=["+quant_fp8"], diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 7c22336432299..1a5eaf2639b36 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -13,7 +13,7 @@ from vllm.compilation.fusion import ( ) from 
vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass -from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, @@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"], pass_config=PassConfig(enable_fusion=True, enable_noop=True), ) diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index 455d1bb039057..fbcd6c71fb723 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, DeviceConfig, ModelConfig, PassConfig, @@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model( vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"] + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"] ) ) vllm_config.compilation_config.pass_config = PassConfig( diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index d1ab85cfb875c..a8d78daa32a1d 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( CacheConfig, CompilationConfig, - CompilationLevel, + CompilationMode, ModelConfig, PassConfig, SchedulerConfig, @@ -321,7 +321,7 @@ def 
test_attention_quant_pattern( ), scheduler_config=SchedulerConfig(max_num_seqs=1024), compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, custom_ops=["+quant_fp8"], use_inductor_graph_partition=use_inductor_graph_partition, ), diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py index 188f4514dda5f..0ccc1a0161629 100644 --- a/tests/compile/test_noop_elimination.py +++ b/tests/compile/test_noop_elimination.py @@ -6,7 +6,7 @@ import torch import vllm from vllm.compilation.noop_elimination import NoOpEliminationPass -from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig from .backend import TestBackend @@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) @@ -98,7 +98,7 @@ def test_non_noop_slice_preserved(): vllm_config = VllmConfig( compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig(enable_noop=True), ) ) diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index b2fff822bbbb5..da0afd9eaa49f 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -5,7 +5,7 @@ import torch from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import CompilationLevel +from vllm.config import CompilationMode class MyMod(torch.nn.Module): @@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") super().__init__( - compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE + 
compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE ) def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index a431bf30fc890..362e9daf5ae04 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -15,6 +15,7 @@ from typing import Literal, NamedTuple import pytest +from vllm.config.compilation import CompilationMode from vllm.config.model import RunnerOption from vllm.logger import init_logger @@ -234,7 +235,7 @@ def _compare_sp( common_args.append("--skip-tokenizer-init") compilation_config = { - "level": 3, + "mode": CompilationMode.VLLM_COMPILE, "custom_ops": ["+rms_norm"], "compile_sizes": [4, 8], "pass_config": { diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 78928a53942f9..c73083b0b5ef6 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -226,30 +226,30 @@ def test_compilation_config(): # set to O3 args = parser.parse_args(["-O0"]) - assert args.compilation_config.level == 0 + assert args.compilation_config.mode == 0 # set to O 3 (space) args = parser.parse_args(["-O", "1"]) - assert args.compilation_config.level == 1 + assert args.compilation_config.mode == 1 # set to O 3 (equals) args = parser.parse_args(["-O=2"]) - assert args.compilation_config.level == 2 + assert args.compilation_config.mode == 2 - # set to O.level 3 - args = parser.parse_args(["-O.level", "3"]) - assert args.compilation_config.level == 3 + # set to O.mode 3 + args = parser.parse_args(["-O.mode", "3"]) + assert args.compilation_config.mode == 3 # set to string form of a dict args = parser.parse_args( [ "-O", - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": false}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 
3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and not args.compilation_config.use_inductor ) @@ -258,12 +258,12 @@ def test_compilation_config(): args = parser.parse_args( [ "--compilation-config=" - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '"use_inductor": true}', ] ) assert ( - args.compilation_config.level == 3 + args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.use_inductor ) diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 102e5ddf16d6d..cf455ff3edbd3 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import CompilationLevel +from vllm.config import CompilationMode from ..utils import compare_two_settings @@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", + f"-O{CompilationMode.DYNAMO_TRACE_ONCE}", ], arg2=[ "--max-model-len=256", "--max-num-seqs=32", "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_AS_IS}", + f"-O{CompilationMode.STOCK_TORCH_COMPILE}", ], env1={}, env2={}, diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 308629ab05834..af5fc758f2c26 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -299,7 +299,7 @@ def test_dict_args(parser): "val2", "--hf-overrides.key2.key4", "val3", - # Test compile config and compilation level + # Test compile config and compilation mode "-O.use_inductor=true", "-O.backend", "custom", @@ -352,7 +352,7 @@ def test_dict_args(parser): }, } assert parsed_args.compilation_config == { - "level": 1, + "mode": 1, "use_inductor": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], @@ -367,7 +367,7 @@ def 
test_duplicate_dict_args(caplog_vllm, parser): "--hf-overrides.key1", "val2", "-O1", - "-O.level", + "-O.mode", "2", "-O3", ] @@ -375,12 +375,12 @@ def test_duplicate_dict_args(caplog_vllm, parser): parsed_args = parser.parse_args(args) # Should be the last value assert parsed_args.hf_overrides == {"key1": "val2"} - assert parsed_args.compilation_config == {"level": 3} + assert parsed_args.compilation_config == {"mode": 3} assert len(caplog_vllm.records) == 1 assert "duplicate" in caplog_vllm.text assert "--hf-overrides.key1" in caplog_vllm.text - assert "-O.level" in caplog_vllm.text + assert "-O.mode" in caplog_vllm.text @pytest.mark.parametrize( diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 59841a446db3e..02fa27e3f05f7 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -11,7 +11,7 @@ from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.compilation.monitor import set_cudagraph_capturing_enabled from vllm.config import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, ParallelConfig, SchedulerConfig, @@ -42,7 +42,7 @@ def _create_vllm_config( mock_config.parallel_config = ParallelConfig() # Mimic the behavior of VllmConfig.__post_init__() - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: compilation_config.set_splitting_ops_for_v1() return mock_config @@ -50,23 +50,23 @@ def _create_vllm_config( class TestCudagraphDispatcher: @pytest.mark.parametrize( - "case_id,cudagraph_mode_str,compilation_level", + "case_id,cudagraph_mode_str,compilation_mode", [ # Test case 0: Full CG for mixed batches, no separate routine - (0, "FULL", CompilationLevel.NO_COMPILATION), + (0, "FULL", CompilationMode.NONE), # Test case 1: Full CG for uniform batches, piecewise for mixed - (1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION), + (1, 
"FULL_AND_PIECEWISE", CompilationMode.NONE), # Test case 2: Full CG for uniform batches, no CG for mixed - (2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION), - # Test case 3: Piecewise for all - (3, "PIECEWISE", CompilationLevel.PIECEWISE), + (2, "FULL_DECODE_ONLY", CompilationMode.NONE), + # Test case 3: PIECEWISE for all + (3, "PIECEWISE", CompilationMode.VLLM_COMPILE), ], ) - def test_dispatcher(self, cudagraph_mode_str, compilation_level): + def test_dispatcher(self, cudagraph_mode_str, compilation_mode): # Setup dispatcher comp_config = CompilationConfig( cudagraph_mode=cudagraph_mode_str, - level=compilation_level, + mode=compilation_mode, cudagraph_capture_sizes=[1, 8], ) @@ -242,7 +242,7 @@ class TestCudagraphIntegration: def setup_method(self): # only FULL mode for non-uniform batches self.comp_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode="FULL", cudagraph_capture_sizes=[10, 20], ) diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 8c8148ae20948..818ae1d7ba677 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -10,7 +10,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.attention.utils import full_cg_backend_configs as backend_configs from vllm import LLM -from vllm.config import CompilationConfig +from vllm.config import CompilationConfig, CompilationMode from vllm.platforms import current_platform @@ -73,7 +73,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte gpu_memory_utilization=0.45, max_model_len=1024, compilation_config=CompilationConfig( - level=3, cudagraph_mode=cudagraph_mode + mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) @@ -90,32 +90,27 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ) -# 
test cudagraph_mode with different compilation level. -# (backend_name, cudagraph_mode, compilation_level, supported) +# test cudagraph_mode with different compilation mode. +# (backend_name, cudagraph_mode, compilation_mode, supported) combo_cases_2 = [ - ("FA2", "FULL", 0, True), # no compilation + full cudagraph - ("FA2", "FULL", 3, True), # piecewise compilation + full cudagraph - ("FA2", "PIECEWISE", 0, False), # no compilation + piecewise cudagraph - ("FA2", "PIECEWISE", 3, True), # piecewise compilation + piecewise cudagraph - ( - "FA2", - "FULL_AND_PIECEWISE", - 0, - False, - ), # piecewise cudagraph not supported without piecewise compilation - ("FA2", "FULL_AND_PIECEWISE", 3, True), - ("FA2", "FULL_DECODE_ONLY", 0, True), - ("FA2", "FULL_DECODE_ONLY", 3, True), - ("FA2", "NONE", 0, True), # no compilation + no cudagraph - ("FA2", "NONE", 3, True), # piecewise compilation + no cudagraph + ("FA2", "FULL", CompilationMode.NONE, True), + ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), + ("FA2", "PIECEWISE", CompilationMode.NONE, False), + ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), + ("FA2", "NONE", CompilationMode.NONE, True), + ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True), ] @pytest.mark.parametrize( - "backend_name,cudagraph_mode,compilation_level,supported", combo_cases_2 + "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2 ) def test_cudagraph_compilation_combo(combo_case): - backend_name, cudagraph_mode, compilation_level, supported = combo_case + backend_name, cudagraph_mode, compilation_mode, supported = combo_case env_vars = backend_configs[backend_name].env_vars @@ -130,7 +125,7 @@ def test_cudagraph_compilation_combo(combo_case): 
gpu_memory_utilization=0.45, max_model_len=1024, compilation_config=CompilationConfig( - level=compilation_level, cudagraph_mode=cudagraph_mode + mode=compilation_mode, cudagraph_mode=cudagraph_mode ), ) llm.generate(["Hello, my name is"] * 10) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index 89e5f26ac627f..f2c6d1c1fd1a4 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -7,7 +7,7 @@ import pytest import torch from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig, CompilationMode from vllm.distributed import cleanup_dist_env_and_memory from ...utils import fork_new_process_for_each_test @@ -75,9 +75,9 @@ def test_kv_sharing_fast_prefill( # This allows vLLM compilation backend to handle allocating and # managing buffers for cudagraph cudagraph_copy_inputs=True, - level=CompilationLevel.PIECEWISE + mode=CompilationMode.VLLM_COMPILE if not enforce_eager - else CompilationLevel.NO_COMPILATION, + else CompilationMode.NONE, ) with monkeypatch.context() as m: diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 46c433fe6aefb..91be7e85af518 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -56,7 +56,7 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: return InductorAdaptor() else: assert compilation_config.backend == "eager", ( - "Custom backends not supported with CompilationLevel.PIECEWISE" + "Custom backends not supported with CompilationMode.VLLM_COMPILE" ) logger.debug("Using EagerAdaptor") @@ -481,7 +481,7 @@ def set_model_tag(tag: str): class VllmBackend: """The compilation backend for `torch.compile` with vLLM. 
- It is used for compilation level of `CompilationLevel.PIECEWISE`, + It is used for compilation mode of `CompilationMode.VLLM_COMPILE`, where we customize the compilation. The major work of this backend is to split the graph into diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 4553007027e39..e2369a635ad1f 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -575,7 +575,7 @@ class InductorAdaptor(CompilerInterface): Because it is re-entrant, we always set it (even if entering via Dynamo and the context was already entered). We might want to revisit if it - should be set at a different level of compilation. + should be set at a different mode of compilation. This is likely a bug in PyTorch: public APIs should not rely on manually setting up internal contexts. But we also rely on non-public diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 9e8de831bcb29..20918099f169d 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -27,8 +27,8 @@ class CompilationCounter: num_cache_entries_updated: int = 0 # The number of standalone_compile compiled artifacts saved num_compiled_artifacts_saved: int = 0 - # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS - dynamo_as_is_count: int = 0 + # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE + stock_torch_compile_count: int = 0 def clone(self) -> "CompilationCounter": return copy.deepcopy(self) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index fe19d4e851294..20d4681e2c789 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -18,7 +18,7 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from 
vllm.config import CompilationLevel, VllmConfig, set_current_vllm_config +from vllm.config import CompilationMode, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import resolve_obj_by_qualname, supports_dynamo @@ -233,11 +233,11 @@ def _support_torch_compile( old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) self.vllm_config = vllm_config enable_compile = enable_if is None or enable_if(vllm_config) - # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner # will handle the compilation, so we don't need to do anything here. self.do_not_compile = ( - vllm_config.compilation_config.level - in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS] + vllm_config.compilation_config.mode + in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE] or not supports_dynamo() or _should_ignore_torch_compile(self.__class__) or not enable_compile @@ -247,7 +247,7 @@ def _support_torch_compile( compilation_counter.num_models_seen += 1 TorchCompileWrapperWithCustomDispatcher.__init__( - self, compilation_level=vllm_config.compilation_config.level + self, compilation_mode=vllm_config.compilation_config.mode ) cls.__init__ = __init__ diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index d3c437795fabb..1e6d0e79228b0 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -3,7 +3,7 @@ import time -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.config import CompilationConfig, CompilationMode, VllmConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -18,7 +18,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config path = vllm_config.compile_debug_dump_path() - if compilation_config.level == 
CompilationLevel.PIECEWISE and path: + if compilation_config.mode == CompilationMode.VLLM_COMPILE and path: import depyf path.mkdir(parents=True, exist_ok=True) @@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig): def end_monitoring_torch_compile(vllm_config: VllmConfig): compilation_config: CompilationConfig = vllm_config.compilation_config - if compilation_config.level == CompilationLevel.PIECEWISE: + if compilation_config.mode == CompilationMode.VLLM_COMPILE: logger.info( "torch.compile takes %.2f s in total", compilation_config.compilation_time ) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index b4a0d89af0d6d..4b10c85209f63 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -11,7 +11,7 @@ from types import CodeType import torch import vllm.envs as envs -from vllm.config import CompilationLevel, CUDAGraphMode, get_current_vllm_config +from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config from vllm.logger import init_logger logger = init_logger(__name__) @@ -31,7 +31,7 @@ class TorchCompileWrapperWithCustomDispatcher: """ def __init__( - self, compiled_callable: Callable | None = None, compilation_level: int = 0 + self, compiled_callable: Callable | None = None, compilation_mode: int = 0 ): vllm_config = get_current_vllm_config() self.vllm_config = vllm_config @@ -72,7 +72,7 @@ class TorchCompileWrapperWithCustomDispatcher: # subclasses can use this to switch between the custom dispatcher # and the default Dynamo guard mechanism. self.use_custom_dispatcher: bool = ( - compilation_level >= CompilationLevel.DYNAMO_ONCE + compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE ) def aot_compile(self, *args, **kwargs): @@ -85,7 +85,7 @@ class TorchCompileWrapperWithCustomDispatcher: return self.compiled_callable.aot_compile((args, kwargs)) def __call__(self, *args, **kwargs): - """Implement the dispatch logic here, beyond the torch.compile level. 
+ """Implement the dispatch logic here, beyond the torch.compile mode. NOTE: this function can have additional arguments beyond the forward method, for directly dispatching to the compiled code. """ diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 6a0197d044dcd..7f1cc52024205 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -4,7 +4,7 @@ from vllm.config.cache import CacheConfig from vllm.config.compilation import ( CompilationConfig, - CompilationLevel, + CompilationMode, CUDAGraphMode, PassConfig, ) @@ -49,7 +49,7 @@ __all__ = [ "CacheConfig", # From vllm.config.compilation "CompilationConfig", - "CompilationLevel", + "CompilationMode", "CUDAGraphMode", "PassConfig", # From vllm.config.device diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index fb80835ba48a1..a34fb0bf920c0 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -26,12 +26,20 @@ else: logger = init_logger(__name__) -class CompilationLevel: - # constants for the levels of the compilation process - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 +class CompilationMode: + """The compilation approach used for torch.compile-based compilation of the + model.""" + + NONE = 0 + """No torch.compile compilation is applied, model runs in fully eager pytorch mode. + The model runs as-is.""" + STOCK_TORCH_COMPILE = 1 + """The standard `torch.compile` compilation pipeline.""" + DYNAMO_TRACE_ONCE = 2 + """Single Dynamo trace through the model, avoiding recompilation.""" + VLLM_COMPILE = 3 + """Custom vLLM Inductor-based backend with caching, piecewise compilation, + shape specialization, and custom passes.""" class CUDAGraphMode(enum.Enum): @@ -134,7 +142,7 @@ class CompilationConfig: """Configuration for compilation. 
It has three parts: - Top-level Compilation control: - - [`level`][vllm.config.CompilationConfig.level] + - [`mode`][vllm.config.CompilationConfig.mode] - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] - [`backend`][vllm.config.CompilationConfig.backend] @@ -171,14 +179,26 @@ class CompilationConfig: # Top-level Compilation control level: int | None = None - """The level of compilation: + """ + Level is deprecated and will be removed in the next release, + either 0.12.0 or 0.11.2 whichever is soonest. + Please use mode. Currently all levels are mapped to mode. + """ + # Top-level Compilation control + mode: int | None = None + """The compilation approach used for torch.compile-based compilation of the + model. - - None: If None, we will select the default compilation level. - For V1 engine this is 3, for V0 engine this is 0. - - 0: no compilation. - - 1: dynamo as is. - - 2: dynamo once. - - 3: piecewise compilation.""" + - None: If None, we will select the default compilation mode. + For V1 engine this is 3. + - 0: NONE: No torch.compile compilation is applied, model runs in fully + eager pytorch mode. The model runs as-is. + - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline. + - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding + recompilation by removing guards. + Requires no dynamic-shape-dependent control-flow. + - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching, + piecewise compilation, shape specialization, and custom passes.""" debug_dump_path: Path | None = None """The path to dump the debug information.""" cache_dir: str = "" @@ -195,11 +215,11 @@ class CompilationConfig: backend function. We use string to avoid serialization issues when using compilation in a - distributed setting. When the compilation level is 1 or 2, the backend is + distributed setting. 
When the compilation mode is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the - compilation level is 3, the backend is used for the piecewise compilation + compilation mode is 3, the backend is used for the piecewise compilation (it sees a part of the graph). The backend can not be custom for compilation - level 3, i.e. the backend must be either eager or inductor. Furthermore, + mode 3, i.e. the backend must be either eager or inductor. Furthermore, compilation is only piecewise if splitting ops is set accordingly and use_inductor_graph_partition is off. Note that the default options for splitting ops are sufficient for piecewise compilation. @@ -214,7 +234,7 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. + disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None """A list of ops to exclude from cudagraphs, used in piecewise compilation. @@ -249,7 +269,7 @@ class CompilationConfig: One graph for symbolic shape and one graph per size in compile_sizes are compiled using configurations in inductor_compile_config. - This setting is ignored if level