[V0 deprecation] Clean up V0 fallback in compilation config (#25675)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Author: Isotr0py
Date: 2025-09-26 01:29:51 +08:00
Committed by: GitHub
parent 0ea80c87d9
commit 71b25b0d48
2 changed files with 22 additions and 73 deletions

File 1 of 2:

@@ -384,19 +384,7 @@ class VllmConfig:
         else:
             self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-        if self.cache_config.cpu_offload_gb > 0 and \
-            self.compilation_config.level != CompilationLevel.NO_COMPILATION \
-            and not envs.VLLM_USE_V1:
-            logger.warning(
-                "CPU offload is not supported with `torch.compile` in v0 yet."
-                " Disabling `torch.compile`.")
-            self.compilation_config.level = CompilationLevel.NO_COMPILATION
         if self.cache_config.kv_sharing_fast_prefill:
-            if not envs.VLLM_USE_V1:
-                raise NotImplementedError(
-                    "Fast prefill optimization for KV sharing is not supported "
-                    "in V0 currently.")
             if self.speculative_config is not None and \
                 self.speculative_config.use_eagle():
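With the fallback removed, enabling CPU offload no longer silently downgrades the compilation level on the remaining (V1) path. Below is a hedged usage sketch of the resulting behavior; it is not part of this diff, the keyword arguments come from vLLM's public `LLM` API, and the dict form of `compilation_config` is an assumption about the version in question.

# Hedged sketch, not from this commit: with the V0 branch gone, CPU offload
# and torch.compile are configured independently; nothing disables compilation.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",        # any small model, for illustration only
    cpu_offload_gb=2,                 # previously forced NO_COMPILATION on V0
    compilation_config={"level": 3},  # requested level is kept on V1
)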
@@ -410,14 +398,6 @@ class VllmConfig:
                 "--kv-sharing-fast-prefill requires changes on model side for "
                 "correctness and to realize prefill savings. ")
-        if ((not envs.VLLM_USE_V1) and self.lora_config is not None
-                and self.compilation_config.level
-                != CompilationLevel.NO_COMPILATION):
-            logger.warning(
-                "LoRA for V0 is not supported with `torch.compile` yet. "
-                "Disabling `torch.compile`.")
-            self.compilation_config.level = CompilationLevel.NO_COMPILATION
         disable_chunked_prefill_reasons: list[str] = []
         if self.model_config:
@@ -604,36 +584,6 @@ class VllmConfig:
         """
         # calculate the default `batch_size_capture_list`
-        if not envs.VLLM_USE_V1:
-            batch_size_capture_list = []
-            if self.scheduler_config is not None and \
-                self.model_config is not None and \
-                    not self.model_config.enforce_eager:
-                possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
-                if self.parallel_config.tensor_parallel_size > 1 and \
-                    self.compilation_config.pass_config.enable_sequence_parallelism:
-                    possible_sizes = self.update_sizes_for_sequence_parallelism(
-                        possible_sizes)
-                # find the minimum size that is larger than max_num_seqs,
-                # which then becomes the max_batchsize_to_capture
-                larger_sizes = [
-                    x for x in possible_sizes
-                    if x >= self.scheduler_config.max_num_seqs
-                ]
-                if larger_sizes:
-                    max_batchsize_to_capture = larger_sizes[0]
-                else:
-                    max_batchsize_to_capture = possible_sizes[-1]
-                # filter out the sizes that are
-                # larger than max_batchsize_to_capture
-                batch_size_capture_list = [
-                    size for size in possible_sizes
-                    if size <= max_batchsize_to_capture
-                ]
-        else:
+        batch_size_capture_list = []
+        if self.model_config is not None and \
+            not self.model_config.enforce_eager:
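For reference, the deleted V0 branch derived the CUDA graph capture sizes from `max_num_seqs`. Below is a minimal standalone sketch of that logic; the function name is hypothetical, and the sequence-parallelism adjustment is approximated as a divisibility filter (an assumption, the real code calls `update_sizes_for_sequence_parallelism`).

# Standalone sketch of the removed V0 capture-size computation.
def v0_capture_sizes(max_num_seqs: int,
                     tp_size: int = 1,
                     enable_sequence_parallelism: bool = False) -> list[int]:
    # candidate sizes: 1, 2, 4, then multiples of 8 up to 8192
    possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
    if tp_size > 1 and enable_sequence_parallelism:
        # assumption: keep only sizes divisible by the TP degree
        possible_sizes = [s for s in possible_sizes if s % tp_size == 0]
    # the smallest candidate >= max_num_seqs caps the capture list
    larger_sizes = [s for s in possible_sizes if s >= max_num_seqs]
    max_batchsize_to_capture = (larger_sizes[0]
                                if larger_sizes else possible_sizes[-1])
    return [s for s in possible_sizes if s <= max_batchsize_to_capture]

print(v0_capture_sizes(max_num_seqs=5))  # -> [1, 2, 4, 8]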

File 2 of 2:

@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
-import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -75,11 +74,11 @@ class PassConfig:
     don't all have access to full configuration - that would create a cycle as
     the `PassManager` is set as a property of config."""
-    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_fusion: bool = False
     """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
     enable_attn_fusion: bool = False
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_noop: bool = False
     """Whether to enable the custom no-op elimination pass."""
     enable_sequence_parallelism: bool = False
     """Whether to enable sequence parallelism."""