From af826e082045e8bcd3ab2ea3129bcf91da7d58de Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Thu, 30 Oct 2025 17:42:49 +0800
Subject: [PATCH] [V0 deprecation] Remove VLLM_USE_V1 usage in config module (#27784)

Signed-off-by: wangxiyuan
---
 vllm/config/lora.py        |  5 -----
 vllm/config/model.py       | 25 ++-----------------------
 vllm/config/speculative.py |  7 -------
 vllm/config/vllm.py        | 34 +++++++---------------------------
 4 files changed, 9 insertions(+), 62 deletions(-)

diff --git a/vllm/config/lora.py b/vllm/config/lora.py
index 2f9d638542b65..84e92eef40077 100644
--- a/vllm/config/lora.py
+++ b/vllm/config/lora.py
@@ -9,7 +9,6 @@ from pydantic import ConfigDict, Field, model_validator
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self
 
-import vllm.envs as envs
 from vllm.config.utils import config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -106,10 +105,6 @@ class LoRAConfig:
 
         return self
 
-    def verify_with_cache_config(self, cache_config: CacheConfig):
-        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
-            raise ValueError("V0 LoRA does not support CPU offload, please use V1.")
-
     def verify_with_model_config(self, model_config: ModelConfig):
         if self.lora_dtype in (None, "auto"):
             self.lora_dtype = model_config.dtype
diff --git a/vllm/config/model.py b/vllm/config/model.py
index e22c218c769da..2151939d5a9f6 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -32,7 +32,6 @@ from vllm.transformers_utils.config import (
     get_pooling_config,
     get_sentence_transformer_tokenizer_config,
     is_encoder_decoder,
-    is_interleaved,
     try_get_dense_modules,
     try_get_generation_config,
     try_get_safetensors_metadata,
@@ -442,15 +441,12 @@ class ModelConfig:
             self.enforce_eager = True
 
         # Set the default seed to 0 in V1.
-        # NOTE(woosuk): In V0, we set the default seed to None because the
-        # driver worker shares the same process as the user process, and thus
-        # setting a seed affects the user process as well.
-        # In V1, we use separate processes for workers (unless
+        # NOTE(woosuk): In V1, we use separate processes for workers (unless
         # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
         # doesn't affect the user process. However, without a consistent seed,
         # different tensor parallel workers would sample different tokens,
         # leading to inconsistent results.
-        if envs.VLLM_USE_V1 and self.seed is None:
+        if self.seed is None:
             self.seed = 0
             if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
                 logger.warning(
@@ -703,23 +699,6 @@ class ModelConfig:
             revision=self.revision,
         )
 
-        # Interleaved attention is not supported by some backends in V0
-        if (
-            not self.disable_sliding_window
-            and is_interleaved(self.hf_text_config)
-            and not envs.VLLM_USE_V1
-            and (backend := envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER")
-        ):
-            logger.warning_once(
-                "%s has interleaved attention, which is currently not "
-                "supported by the %s backend. Disabling sliding window and "
-                "capping the max length to the sliding window size (%d).",
-                self.hf_text_config.model_type,
-                backend,
-                self.hf_text_config.sliding_window,
-            )
-            self.disable_sliding_window = True
-
         self.original_max_model_len = self.max_model_len
         self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
         # Init multimodal config if needed
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index 4c7b7369ed4b5..903b9a26fab88 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -9,7 +9,6 @@ from pydantic import Field, SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self
 
-import vllm.envs as envs
 from vllm.config.parallel import ParallelConfig
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -366,12 +365,6 @@ class SpeculativeConfig:
 
             # Replace hf_config for EAGLE draft_model
             if self.method in ("eagle", "eagle3"):
-                if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
-                    raise ValueError(
-                        "Chunked prefill and EAGLE are not compatible "
-                        "when using V0."
-                    )
-
                 from vllm.transformers_utils.configs import SpeculatorsConfig
                 from vllm.transformers_utils.configs.eagle import EAGLEConfig
 
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index c46f409edab61..f592a708a02b5 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -130,7 +130,6 @@ class VllmConfig:
         from vllm import __version__
 
         vllm_factors.append(__version__)
-        vllm_factors.append(envs.VLLM_USE_V1)
         if self.model_config:
             vllm_factors.append(self.model_config.compute_hash())
         else:
@@ -306,7 +305,6 @@ class VllmConfig:
             self.cache_config.verify_with_parallel_config(self.parallel_config)
 
         if self.lora_config is not None:
-            self.lora_config.verify_with_cache_config(self.cache_config)
             self.lora_config.verify_with_model_config(self.model_config)
 
         if self.quant_config is None and self.model_config is not None:
@@ -332,18 +330,9 @@ class VllmConfig:
         # we use the default mode. The default mode depends on other
         # settings (see the below code).
         if self.compilation_config.mode is None:
-            if envs.VLLM_USE_V1:
-                if (
-                    self.model_config is not None
-                    and not self.model_config.enforce_eager
-                ):
-                    self.compilation_config.mode = CompilationMode.VLLM_COMPILE
-                else:
-                    self.compilation_config.mode = CompilationMode.NONE
-
+            if self.model_config is not None and not self.model_config.enforce_eager:
+                self.compilation_config.mode = CompilationMode.VLLM_COMPILE
             else:
-                # NB: Passing both --enforce-eager and a compilation mode
-                # in V0 means the compilation mode wins out.
                 self.compilation_config.mode = CompilationMode.NONE
         else:
             assert self.compilation_config.mode >= CompilationMode.NONE
@@ -371,10 +360,7 @@ class VllmConfig:
         # if cudagraph_mode is not explicitly set by users, set default
         # value
         if self.compilation_config.cudagraph_mode is None:
-            if (
-                envs.VLLM_USE_V1
-                and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
-            ):
+            if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
                 # default to full and piecewise for most models
                 self.compilation_config.cudagraph_mode = (
                     CUDAGraphMode.FULL_AND_PIECEWISE
@@ -428,7 +414,7 @@ class VllmConfig:
             # override related settings when enforce eager
             self.compilation_config.max_cudagraph_capture_size = 0
             self.compilation_config.cudagraph_capture_sizes = []
-        elif envs.VLLM_USE_V1:
+        else:
             self.compilation_config.cudagraph_num_of_warmups = 1
 
         self._set_cudagraph_sizes()
@@ -535,14 +521,11 @@ class VllmConfig:
             current_platform.check_and_update_config(self)
 
         # Do this after all the updates to compilation_config.mode
-        if (
-            envs.VLLM_USE_V1
-            and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
-        ):
+        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
             self.compilation_config.set_splitting_ops_for_v1()
 
         # final check of cudagraph mode after all possible updates
-        if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
+        if current_platform.is_cuda_alike():
             if (
                 self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                 and self.model_config is not None
@@ -587,10 +570,7 @@ class VllmConfig:
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]
 
-        if (
-            envs.VLLM_USE_V1
-            and not self.scheduler_config.disable_hybrid_kv_cache_manager
-        ):
+        if not self.scheduler_config.disable_hybrid_kv_cache_manager:
             # logger should only print warning message for hybrid models. As we
             # can't know whether the model is hybrid or not now, so we don't log
             # warning message here and will log it later.