From c32a18cbe7342ac0700802b94ae98bbf928a00f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> Date: Tue, 25 Nov 2025 20:23:36 +0100 Subject: [PATCH 1/5] Attempt to fix GPU OOM in a spec-decoding test (#29419) Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> --- examples/offline_inference/spec_decode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 67a0732459709..29b2e95d262f8 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -133,7 +133,7 @@ def main(args): tensor_parallel_size=args.tp, enable_chunked_prefill=args.enable_chunked_prefill, enforce_eager=args.enforce_eager, - gpu_memory_utilization=0.8, + gpu_memory_utilization=0.9, speculative_config=speculative_config, disable_log_stats=False, max_model_len=args.max_model_len, From e7d776273de379bb6c9fc11ce070c57e0fcd84f9 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Tue, 25 Nov 2025 20:58:56 +0100 Subject: [PATCH 2/5] [Compile] Refactor. Move PostGradPassManager out of Compilation config (#29340) Signed-off-by: ilmarkov --- vllm/compilation/backends.py | 33 ++++++++++++++++----------- vllm/compilation/piecewise_backend.py | 2 +- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 2d8dd4c51c7ef..1773913d0b6c6 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -11,6 +11,7 @@ import pprint import time from collections.abc import Callable, Sequence from contextlib import contextmanager +from copy import deepcopy from functools import partial from typing import Any @@ -429,7 +430,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): self.vllm_backend.compiler_manager.compile( submod, args, - self.compilation_config.inductor_compile_config, + self.vllm_backend.inductor_config, self.compilation_config, graph_index=index, num_graphs=len(self.compile_submod_names), @@ -531,6 +532,9 @@ class VllmBackend: sym_tensor_indices: list[int] input_buffers: list[torch.Tensor] compiler_manager: CompilerManager + # Copy of CompilationConfig.inductor_compile_config + + # an entry for PostGradPassManager + inductor_config: dict[str, Any] def __init__( self, @@ -561,25 +565,30 @@ class VllmBackend: self.compilation_config ) + # Deepcopy the inductor config to detach the post-grad custom pass + # from CompilationConfig. + # We want to avoid PostGradPassManager in CompilationConfig because + # in future we need PostGradPassManager.uuid() to be executed + # only at compile time. + self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config) # `torch.compile` is JIT compiled, so we don't need to # do anything here def configure_post_pass(self): - config = self.compilation_config self.pass_manager.configure(self.vllm_config) # Post-grad custom passes are run using the post_grad_custom_post_pass # hook. If a pass for that hook exists, add it to the pass manager. - inductor_config = config.inductor_compile_config - if self.pass_key in inductor_config: - if isinstance(inductor_config[self.pass_key], PostGradPassManager): - # PassManager already added to config, make sure it's correct - assert inductor_config[self.pass_key].uuid() == self.pass_manager.uuid() + if self.pass_key in self.inductor_config: + if isinstance(self.inductor_config[self.pass_key], PostGradPassManager): + raise ValueError( + "PostGradPassManager can not be kept in CompilationConfig." + ) else: # Config should automatically wrap all inductor passes - assert isinstance(inductor_config[self.pass_key], InductorPass) - self.pass_manager.add(inductor_config[self.pass_key]) - inductor_config[self.pass_key] = self.pass_manager + assert isinstance(self.inductor_config[self.pass_key], InductorPass) + self.pass_manager.add(self.inductor_config[self.pass_key]) + self.inductor_config[self.pass_key] = self.pass_manager def __call__( self, graph: fx.GraphModule, example_inputs @@ -638,9 +647,7 @@ class VllmBackend: self.compilation_config.local_cache_dir = local_cache_dir # Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE. - disable_cache = not is_compile_cache_enabled( - self.compilation_config.inductor_compile_config - ) + disable_cache = not is_compile_cache_enabled(self.inductor_config) if disable_cache: logger.info_once("vLLM's torch.compile cache is disabled.", scope="local") diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 2931580afbbb0..e535d2c461c6e 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -107,7 +107,7 @@ class PiecewiseBackend: entry.runnable = self.vllm_backend.compiler_manager.compile( self.graph, args, - self.compilation_config.inductor_compile_config, + self.vllm_backend.inductor_config, self.compilation_config, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, From 4e57c6587fe062211177f6b5d6785f00c3aea562 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 25 Nov 2025 12:55:24 -0800 Subject: [PATCH 3/5] [Core] Support logprobs with spec decode + async scheduling (#29223) Signed-off-by: Nick Hill --- tests/v1/e2e/test_async_scheduling.py | 7 ++++- vllm/v1/core/sched/scheduler.py | 2 -- vllm/v1/sample/rejection_sampler.py | 14 ++++++++-- vllm/v1/worker/gpu_model_runner.py | 37 ++++++++++++--------------- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 00d93e1ba0b53..945276376d665 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -87,6 +87,11 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): # Set small draft model len to force doesn't-fit-in-drafter case. spec_config_short = spec_config | {"max_model_len": 50} + test_sampling_params = [ + dict(), + dict(logprobs=2), + ] + # test_preemption, executor, async_scheduling, # spec_config, test_prefill_chunking test_configs = [ @@ -103,7 +108,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): (True, "uni", True, spec_config_short, True), ] - run_tests(monkeypatch, MTP_MODEL, test_configs, [{}]) + run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params) @dynamo_config.patch(cache_size_limit=16) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index bea2f865bad46..0304a8ec48bf7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1089,8 +1089,6 @@ class Scheduler(SchedulerInterface): and request.sampling_params.logprobs is not None and logprobs ): - # NOTE: once we support N tokens per step (spec decode), - # the outer lists can be of length > 1. new_logprobs = logprobs.slice(req_index, req_index + 1) if new_token_ids and self.structured_output_manager.should_advance(request): diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 926305d25f56b..ccaf07e18c468 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Sequence from dataclasses import replace import torch @@ -204,7 +205,9 @@ class RejectionSampler(nn.Module): def parse_output( output_token_ids: torch.Tensor, vocab_size: int, - ) -> list[list[int]]: + discard_req_indices: Sequence[int] = (), + return_cu_num_tokens: bool = False, + ) -> tuple[list[list[int]], list[int] | None]: """Parse the output of the rejection sampler. Args: output_token_ids: The sampled token IDs in shape @@ -212,6 +215,8 @@ class RejectionSampler(nn.Module): replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler and will be filtered out in this function. vocab_size: The size of the vocabulary. + discard_req_indices: Optional row indices to discard tokens in. + return_cu_num_tokens: Whether to also return cumulative token counts. Returns: A list of lists of token IDs. """ @@ -220,10 +225,15 @@ class RejectionSampler(nn.Module): valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( output_token_ids_np < vocab_size ) + cu_num_tokens = None + if return_cu_num_tokens: + cu_num_tokens = [0] + valid_mask.sum(axis=1).cumsum().tolist() + if len(discard_req_indices) > 0: + valid_mask[discard_req_indices] = False outputs = [ row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) ] - return outputs + return outputs, cu_num_tokens def apply_logits_processors( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e78d3c71af77a..bb44c5ad84cc1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -183,7 +183,7 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): self, model_runner_output: ModelRunnerOutput, sampled_token_ids: torch.Tensor, - logprobs_tensors: torch.Tensor | None, + logprobs_tensors: LogprobsTensors | None, invalid_req_indices: list[int], async_output_copy_stream: torch.cuda.Stream, vocab_size: int, @@ -219,28 +219,29 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): This function blocks until the copy is finished. """ + max_gen_len = self.sampled_token_ids_cpu.shape[-1] self.async_copy_ready_event.synchronize() # Release the device tensors once the copy has completed. del self._logprobs_tensors del self._sampled_token_ids - max_gen_len = self.sampled_token_ids_cpu.shape[-1] if max_gen_len == 1: valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() + for i in self._invalid_req_indices: + valid_sampled_token_ids[i].clear() + cu_num_tokens = None else: - valid_sampled_token_ids = RejectionSampler.parse_output( + valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output( self.sampled_token_ids_cpu, self.vocab_size, + self._invalid_req_indices, + return_cu_num_tokens=self._logprobs_tensors_cpu is not None, ) - for i in self._invalid_req_indices: - valid_sampled_token_ids[i].clear() output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids if self._logprobs_tensors_cpu: - # NOTE(nick): this will need to be updated to use cu_num_accepted_tokens - # for async sched + spec decode + logprobs compatibility. - output.logprobs = self._logprobs_tensors_cpu.tolists() + output.logprobs = self._logprobs_tensors_cpu.tolists(cu_num_tokens) return output @@ -2597,28 +2598,24 @@ class GPUModelRunner( sampled_token_ids = sampler_output.sampled_token_ids logprobs_tensors = sampler_output.logprobs_tensors invalid_req_indices = [] - cu_num_new_tokens: list[int] | None = None + cu_num_tokens: list[int] | None = None if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] if max_gen_len == 1: # No spec decode tokens. valid_sampled_token_ids = self._to_list(sampled_token_ids) + # Mask out the sampled tokens that should not be sampled. + for i in discard_sampled_tokens_req_indices: + valid_sampled_token_ids[int(i)].clear() else: # Includes spec decode tokens. - valid_sampled_token_ids = self.rejection_sampler.parse_output( + valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output( sampled_token_ids, self.input_batch.vocab_size, + discard_sampled_tokens_req_indices, + return_cu_num_tokens=logprobs_tensors is not None, ) - if logprobs_tensors: - # Needed for extracting logprobs when spec decoding. - # This must be done prior to discarding sampled tokens. - cu_num_new_tokens = [0] - for toks in valid_sampled_token_ids: - cu_num_new_tokens.append(cu_num_new_tokens[-1] + len(toks)) - # Mask out the sampled tokens that should not be sampled. - for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[int(i)].clear() else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() @@ -2672,7 +2669,7 @@ class GPUModelRunner( req_state.output_token_ids.extend(sampled_ids) logprobs_lists = ( - logprobs_tensors.tolists(cu_num_new_tokens) + logprobs_tensors.tolists(cu_num_tokens) if not self.use_async_scheduling and logprobs_tensors is not None else None ) From 0abc79482a6d476f58530907443c46134ba6e2e1 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Tue, 25 Nov 2025 16:46:41 -0500 Subject: [PATCH 4/5] [caching] Add enable_prompt_embeds and cpu_offload_gb to compile hashes. (#29435) Signed-off-by: zhxchen17 --- vllm/config/cache.py | 4 +--- vllm/config/model.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index ef6928d8ebd5c..00530846fce00 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -144,7 +144,7 @@ class CacheConfig: kv_offloading_backend: KVOffloadingBackend | None = None """The backend to use for KV cache offloading. Supported backends include - 'native' (vLLM native CPU offloading), 'lmcache' This option must be used + 'native' (vLLM native CPU offloading), 'lmcache' This option must be used together with kv_offloading_size.""" def compute_hash(self) -> str: @@ -167,8 +167,6 @@ class CacheConfig: "num_gpu_blocks_override", "enable_prefix_caching", "prefix_caching_hash_algo", - # `cpu_offload_gb` does not use `torch.compile` yet. - "cpu_offload_gb", "cpu_kvcache_space_bytes", "mamba_page_size_padded", # Post-init/derived counters diff --git a/vllm/config/model.py b/vllm/config/model.py index ce5e824da5c22..25972f097f53d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -345,7 +345,6 @@ class ModelConfig: "logprobs_mode", "disable_cascade_attn", "skip_tokenizer_init", - "enable_prompt_embeds", "served_model_name", "config_format", "hf_token", From 7df0289782ab500b2713b7521979c28de2b21cac Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 25 Nov 2025 17:52:31 -0500 Subject: [PATCH 5/5] Change warning logs to debug for unimplemented MXFP4 Linear/Attention (#29441) Signed-off-by: Michael Goin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/model_executor/layers/quantization/mxfp4.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 198feb03be3e4..d975131f7cff7 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -196,9 +196,10 @@ class Mxfp4Config(QuantizationConfig): # TODO: Add support for MXFP4 Linear Method. # MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation # if you are interested in enabling MXFP4 here. - logger.warning_once( + logger.debug_once( "MXFP4 linear layer is not implemented - falling back to " - "UnquantizedLinearMethod." + "UnquantizedLinearMethod.", + scope="local", ) return UnquantizedLinearMethod() elif isinstance(layer, FusedMoE): @@ -208,9 +209,10 @@ class Mxfp4Config(QuantizationConfig): return Mxfp4MoEMethod(layer.moe_config) elif isinstance(layer, Attention): # TODO: Add support for MXFP4 Attention. - logger.warning_once( + logger.debug_once( "MXFP4 attention layer is not implemented. " - "Skipping quantization for this layer." + "Skipping quantization for this layer.", + scope="local", ) return None