From 71b25b0d482e8c5e49e4b586bd36fd52cc9951dc Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Fri, 26 Sep 2025 01:29:51 +0800
Subject: [PATCH] [V0 deprecation] Clean up V0 fallback in compilation config (#25675)

Signed-off-by: Isotr0py
---
 vllm/config/__init__.py    | 90 +++++++++-----------------------------
 vllm/config/compilation.py |  5 +--
 2 files changed, 22 insertions(+), 73 deletions(-)

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index bf2cb325a23d..958df4c66955 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -384,19 +384,7 @@ class VllmConfig:
             else:
                 self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
-        if self.cache_config.cpu_offload_gb > 0 and \
-            self.compilation_config.level != CompilationLevel.NO_COMPILATION \
-                and not envs.VLLM_USE_V1:
-            logger.warning(
-                "CPU offload is not supported with `torch.compile` in v0 yet."
-                " Disabling `torch.compile`.")
-            self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
         if self.cache_config.kv_sharing_fast_prefill:
-            if not envs.VLLM_USE_V1:
-                raise NotImplementedError(
-                    "Fast prefill optimization for KV sharing is not supported "
-                    "in V0 currently.")
 
             if self.speculative_config is not None and \
                 self.speculative_config.use_eagle():
@@ -410,14 +398,6 @@ class VllmConfig:
                 "--kv-sharing-fast-prefill requires changes on model side for "
                 "correctness and to realize prefill savings. ")
 
-        if ((not envs.VLLM_USE_V1) and self.lora_config is not None
-                and self.compilation_config.level
-                != CompilationLevel.NO_COMPILATION):
-            logger.warning(
-                "LoRA for V0 is not supported with `torch.compile` yet. "
-                "Disabling `torch.compile`.")
-            self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
         disable_chunked_prefill_reasons: list[str] = []
 
         if self.model_config:
@@ -604,57 +584,27 @@ class VllmConfig:
         """
 
         # calculate the default `batch_size_capture_list`
-        if not envs.VLLM_USE_V1:
-            batch_size_capture_list = []
-            if self.scheduler_config is not None and \
-                self.model_config is not None and \
-                    not self.model_config.enforce_eager:
-
-                possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
-                if self.parallel_config.tensor_parallel_size > 1 and \
-                    self.compilation_config.pass_config.enable_sequence_parallelism:
-                    possible_sizes = self.update_sizes_for_sequence_parallelism(
-                        possible_sizes)
-
-                # find the minimum size that is larger than max_num_seqs,
-                # which then becomes the max_batchsize_to_capture
-                larger_sizes = [
-                    x for x in possible_sizes
-                    if x >= self.scheduler_config.max_num_seqs
-                ]
-                if larger_sizes:
-                    max_batchsize_to_capture = larger_sizes[0]
-                else:
-                    max_batchsize_to_capture = possible_sizes[-1]
-
-                # filter out the sizes that are
-                # larger than max_batchsize_to_capture
-                batch_size_capture_list = [
-                    size for size in possible_sizes
-                    if size <= max_batchsize_to_capture
-                ]
-        else:
-            batch_size_capture_list = []
-            if self.model_config is not None and \
-                not self.model_config.enforce_eager:
-                cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
-                if len(cuda_graph_sizes) == 1:
-                    batch_size_capture_list = [1, 2, 4] + [
-                        i for i in range(8, cuda_graph_sizes[0] + 1, 8)
-                    ]
-                elif len(cuda_graph_sizes) > 1:
-                    batch_size_capture_list = sorted(cuda_graph_sizes)
-                else:
-                    raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
-                if self.parallel_config.tensor_parallel_size > 1 and \
-                    self.compilation_config.pass_config.enable_sequence_parallelism:
-                    batch_size_capture_list = \
-                        self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
-                max_num_tokens = self.scheduler_config.max_num_batched_tokens
-                batch_size_capture_list = [
-                    size for size in batch_size_capture_list
-                    if size <= max_num_tokens
-                ]
+        batch_size_capture_list = []
+        if self.model_config is not None and \
+            not self.model_config.enforce_eager:
+            cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
+            if len(cuda_graph_sizes) == 1:
+                batch_size_capture_list = [1, 2, 4] + [
+                    i for i in range(8, cuda_graph_sizes[0] + 1, 8)
+                ]
+            elif len(cuda_graph_sizes) > 1:
+                batch_size_capture_list = sorted(cuda_graph_sizes)
+            else:
+                raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
+            if self.parallel_config.tensor_parallel_size > 1 and \
+                self.compilation_config.pass_config.enable_sequence_parallelism:
+                batch_size_capture_list = \
+                    self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
+            max_num_tokens = self.scheduler_config.max_num_batched_tokens
+            batch_size_capture_list = [
+                size for size in batch_size_capture_list
+                if size <= max_num_tokens
+            ]
 
         self.compilation_config.init_with_cudagraph_sizes(
             batch_size_capture_list)
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 0441745e8b36..50fde9461a13 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
-import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -75,11 +74,11 @@ class PassConfig:
     don't all have access to full configuration - that would create a cycle as
     the `PassManager` is set as a property of config."""
 
-    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_fusion: bool = False
    """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
     enable_attn_fusion: bool = False
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_noop: bool = False
     """Whether to enable the custom no-op elimination pass."""
     enable_sequence_parallelism: bool = False
     """Whether to enable sequence parallelism."""
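
For review context: the V1-only branch kept above derives the cudagraph capture sizes from scheduler_config.cuda_graph_sizes, then caps them by the token budget. Below is a minimal standalone sketch of that derivation, assuming plain integer inputs in place of the config objects; the helper name and example values are illustrative only, and the enforce_eager guard and sequence-parallelism adjustment are omitted.

def derive_capture_sizes(cuda_graph_sizes: list[int],
                         max_num_batched_tokens: int) -> list[int]:
    # A single value acts as an upper bound: capture 1, 2, 4, then
    # multiples of 8 up to and including that bound.
    if len(cuda_graph_sizes) == 1:
        sizes = [1, 2, 4] + list(range(8, cuda_graph_sizes[0] + 1, 8))
    # An explicit list of sizes is used as given, just sorted.
    elif len(cuda_graph_sizes) > 1:
        sizes = sorted(cuda_graph_sizes)
    else:
        raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
    # Drop any size that exceeds the per-step batched-token budget.
    return [size for size in sizes if size <= max_num_batched_tokens]


print(derive_capture_sizes([64], 8192))        # [1, 2, 4, 8, 16, ..., 64]
print(derive_capture_sizes([1, 8, 32], 8192))  # [1, 8, 32]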