From 71b25b0d482e8c5e49e4b586bd36fd52cc9951dc Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Fri, 26 Sep 2025 01:29:51 +0800
Subject: [PATCH] [V0 deprecation] Clean up V0 fallback in compilation config (#25675)

Signed-off-by: Isotr0py
---
 vllm/config/__init__.py    | 90 +++++++++-----------------------------
 vllm/config/compilation.py |  5 +--
 2 files changed, 22 insertions(+), 73 deletions(-)

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index bf2cb325a23d..958df4c66955 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -384,19 +384,7 @@ class VllmConfig:
             else:
                 self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
-        if self.cache_config.cpu_offload_gb > 0 and \
-            self.compilation_config.level != CompilationLevel.NO_COMPILATION \
-                and not envs.VLLM_USE_V1:
-            logger.warning(
-                "CPU offload is not supported with `torch.compile` in v0 yet."
-                " Disabling `torch.compile`.")
-            self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
         if self.cache_config.kv_sharing_fast_prefill:
-            if not envs.VLLM_USE_V1:
-                raise NotImplementedError(
-                    "Fast prefill optimization for KV sharing is not supported "
-                    "in V0 currently.")
 
             if self.speculative_config is not None and \
                 self.speculative_config.use_eagle():
@@ -410,14 +398,6 @@ class VllmConfig:
                 "--kv-sharing-fast-prefill requires changes on model side for "
                 "correctness and to realize prefill savings. ")
 
-        if ((not envs.VLLM_USE_V1) and self.lora_config is not None
-                and self.compilation_config.level
-                != CompilationLevel.NO_COMPILATION):
-            logger.warning(
-                "LoRA for V0 is not supported with `torch.compile` yet. "
-                "Disabling `torch.compile`.")
-            self.compilation_config.level = CompilationLevel.NO_COMPILATION
-
         disable_chunked_prefill_reasons: list[str] = []
 
         if self.model_config:
@@ -604,57 +584,27 @@ class VllmConfig:
         """
 
         # calculate the default `batch_size_capture_list`
-        if not envs.VLLM_USE_V1:
-            batch_size_capture_list = []
-            if self.scheduler_config is not None and \
-                self.model_config is not None and \
-                    not self.model_config.enforce_eager:
-
-                possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
-                if self.parallel_config.tensor_parallel_size > 1 and \
-                    self.compilation_config.pass_config.enable_sequence_parallelism:
-                    possible_sizes = self.update_sizes_for_sequence_parallelism(
-                        possible_sizes)
-
-                # find the minimum size that is larger than max_num_seqs,
-                # which then becomes the max_batchsize_to_capture
-                larger_sizes = [
-                    x for x in possible_sizes
-                    if x >= self.scheduler_config.max_num_seqs
-                ]
-                if larger_sizes:
-                    max_batchsize_to_capture = larger_sizes[0]
-                else:
-                    max_batchsize_to_capture = possible_sizes[-1]
-
-                # filter out the sizes that are
-                # larger than max_batchsize_to_capture
-                batch_size_capture_list = [
-                    size for size in possible_sizes
-                    if size <= max_batchsize_to_capture
-                ]
-        else:
-            batch_size_capture_list = []
-            if self.model_config is not None and \
-                not self.model_config.enforce_eager:
-                cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
-                if len(cuda_graph_sizes) == 1:
-                    batch_size_capture_list = [1, 2, 4] + [
-                        i for i in range(8, cuda_graph_sizes[0] + 1, 8)
-                    ]
-                elif len(cuda_graph_sizes) > 1:
-                    batch_size_capture_list = sorted(cuda_graph_sizes)
-                else:
-                    raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
-                if self.parallel_config.tensor_parallel_size > 1 and \
-                    self.compilation_config.pass_config.enable_sequence_parallelism:
-                    batch_size_capture_list = \
-                        self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
-                max_num_tokens = self.scheduler_config.max_num_batched_tokens
-                batch_size_capture_list = [
-                    size for size in batch_size_capture_list
-                    if size <= max_num_tokens
-                ]
+        batch_size_capture_list = []
+        if self.model_config is not None and \
+            not self.model_config.enforce_eager:
+            cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
+            if len(cuda_graph_sizes) == 1:
+                batch_size_capture_list = [1, 2, 4] + [
+                    i for i in range(8, cuda_graph_sizes[0] + 1, 8)
+                ]
+            elif len(cuda_graph_sizes) > 1:
+                batch_size_capture_list = sorted(cuda_graph_sizes)
+            else:
+                raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
+            if self.parallel_config.tensor_parallel_size > 1 and \
+                self.compilation_config.pass_config.enable_sequence_parallelism:
+                batch_size_capture_list = \
+                    self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
+            max_num_tokens = self.scheduler_config.max_num_batched_tokens
+            batch_size_capture_list = [
+                size for size in batch_size_capture_list
+                if size <= max_num_tokens
+            ]
 
         self.compilation_config.init_with_cudagraph_sizes(
             batch_size_capture_list)
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 0441745e8b36..50fde9461a13 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
 
-import vllm.envs as envs
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -75,11 +74,11 @@ class PassConfig:
     don't all have access to full configuration - that would create a cycle as
     the `PassManager` is set as a property of config."""
 
-    enable_fusion: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_fusion: bool = False
    """Whether to enable the custom fusion (RMSNorm/SiluMul+quant) pass."""
     enable_attn_fusion: bool = False
     """Whether to enable the custom attention+quant fusion pass."""
-    enable_noop: bool = field(default_factory=lambda: not envs.VLLM_USE_V1)
+    enable_noop: bool = False
     """Whether to enable the custom no-op elimination pass."""
     enable_sequence_parallelism: bool = False
     """Whether to enable sequence parallelism."""
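
For review context: the V1-only branch kept above derives the cudagraph capture sizes from scheduler_config.cuda_graph_sizes, then caps them by the token budget. Below is a minimal standalone sketch of that derivation, assuming plain integer inputs in place of the config objects; the helper name and example values are illustrative only, and the enforce_eager guard and sequence-parallelism adjustment are omitted.

def derive_capture_sizes(cuda_graph_sizes: list[int],
                         max_num_batched_tokens: int) -> list[int]:
    # A single value acts as an upper bound: capture 1, 2, 4, then
    # multiples of 8 up to and including that bound.
    if len(cuda_graph_sizes) == 1:
        sizes = [1, 2, 4] + list(range(8, cuda_graph_sizes[0] + 1, 8))
    # An explicit list of sizes is used as given, just sorted.
    elif len(cuda_graph_sizes) > 1:
        sizes = sorted(cuda_graph_sizes)
    else:
        raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
    # Drop any size that exceeds the per-step batched-token budget.
    return [size for size in sizes if size <= max_num_batched_tokens]


print(derive_capture_sizes([64], 8192))        # [1, 2, 4, 8, 16, ..., 64]
print(derive_capture_sizes([1, 8, 32], 8192))  # [1, 8, 32]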