From d7219bcda3e6508cb14881bec303e2d0ab68c898 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 21 Nov 2025 23:27:44 +0800
Subject: [PATCH] [Misc] Move dynamic seed initialization to `EngineArgs` (#29165)

Signed-off-by: DarkLight1337
---
 vllm/config/model.py         | 34 +++++++---------------------------
 vllm/config/speculative.py   |  7 +------
 vllm/engine/arg_utils.py     | 16 +++++++++++++++-
 vllm/v1/worker/tpu_worker.py |  3 ---
 4 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 97cba6ea7295e..8f59673f4e1c3 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -146,9 +146,12 @@ class ModelConfig:
     - "bfloat16" for a balance between precision and range.\n
    - "float" is shorthand for FP32 precision.\n
    - "float32" for FP32 precision."""
-    seed: int | None = None
-    """Random seed for reproducibility. Initialized to None in V0, but
-    initialized to 0 in V1."""
+    seed: int = 0
+    """Random seed for reproducibility.
+
+    We must set the global seed because otherwise,
+    different tensor parallel workers would sample different tokens,
+    leading to inconsistent results."""
     hf_config: PretrainedConfig = field(init=False)
     """The Hugging Face config of the model."""
     hf_text_config: PretrainedConfig = field(init=False)
@@ -415,7 +418,7 @@ class ModelConfig:
     def __post_init__(
         self,
         # Multimodal config init vars
-        limit_mm_per_prompt: dict[str, int] | None,
+        limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
         enable_mm_embeds: bool | None,
         media_io_kwargs: dict[str, dict[str, Any]] | None,
         mm_processor_kwargs: dict[str, Any] | None,
@@ -428,23 +431,6 @@ class ModelConfig:
         skip_mm_profiling: bool | None,
         video_pruning_rate: float | None,
     ) -> None:
-        # Set the default seed to 0 in V1.
-        # NOTE(woosuk): In V1, we use separate processes for workers (unless
-        # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
-        # doesn't affect the user process. However, without a consistent seed,
-        # different tensor parallel workers would sample different tokens,
-        # leading to inconsistent results.
-        if self.seed is None:
-            self.seed = 0
-            if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
-                logger.warning(
-                    "The global random seed is set to %d. Since "
-                    "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
-                    "affect the random state of the Python process that "
-                    "launched vLLM.",
-                    self.seed,
-                )
-
         # Keep set served_model_name before maybe_model_redirect(self.model)
         self.served_model_name = get_served_model_name(
             self.model, self.served_model_name
@@ -1151,12 +1137,6 @@ class ModelConfig:
         self,
         parallel_config: ParallelConfig,
     ) -> None:
-        if parallel_config.distributed_executor_backend == "external_launcher":
-            assert self.seed is not None, (
-                "Seed must be set when using external launcher backend to "
-                "make sure sampling results are the same across workers."
-            )
-
         total_num_attention_heads = getattr(
             self.hf_text_config, "num_attention_heads", 0
         )
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index a0c65b6049e1e..d7c019c73d598 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -9,6 +9,7 @@ from pydantic import Field, SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self
 
+from vllm.config.model import ModelConfig
 from vllm.config.parallel import ParallelConfig
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -18,10 +19,8 @@ if TYPE_CHECKING:
     from transformers import PretrainedConfig
 
     import vllm.model_executor.layers.quantization as me_quant
-    from vllm.config import ModelConfig
 else:
     PretrainedConfig = Any
-    ModelConfig = Any
 
     me_quant = LazyLoader(
         "model_executor", globals(), "vllm.model_executor.layers.quantization"
@@ -316,10 +315,6 @@ class SpeculativeConfig:
             self.prompt_lookup_min = 0
 
         if self.model is not None:
-            # TODO: Move this import to the top once `ModelConfig`
-            # lives in `vllm.config.model`.
-            from vllm.config import ModelConfig
-
             self.draft_model_config = ModelConfig(
                 model=self.model,
                 runner="draft",
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bcb90119f9b04..6eaf328eb1655 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -367,7 +367,7 @@ class EngineArgs:
     config_format: str = ModelConfig.config_format
     dtype: ModelDType = ModelConfig.dtype
     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
-    seed: int | None = ModelConfig.seed
+    seed: int | None = None
     max_model_len: int | None = ModelConfig.max_model_len
     cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes
     cudagraph_capture_sizes: list[int] | None = (
@@ -1188,6 +1188,20 @@
         if check_gguf_file(self.model):
             self.quantization = self.load_format = "gguf"
 
+        # NOTE(woosuk): In V1, we use separate processes for workers (unless
+        # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
+        # doesn't affect the user process.
+        if self.seed is None:
+            self.seed = 0
+            if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
+                logger.warning(
+                    "The global random seed is set to %d. Since "
+                    "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
+                    "affect the random state of the Python process that "
+                    "launched vLLM.",
+                    self.seed,
+                )
+
         if self.disable_mm_preprocessor_cache:
             logger.warning(
                 "`--disable-mm-preprocessor-cache` is deprecated "
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 569b2aaa766e4..e1a109eca0a88 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -106,9 +106,6 @@ class TPUWorker:
                 "Profiling enabled. Traces will be saved to: %s", self.profile_dir
             )
 
-        if self.model_config.seed is None:
-            self.model_config.seed = 0
-
     def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
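
For illustration, a minimal standalone sketch of the seed-resolution contract after this patch: `EngineArgs.seed` keeps the `int | None` type with a `None` default, is resolved to `0` in `EngineArgs` before `ModelConfig` is constructed, and `ModelConfig.seed` is therefore always a plain `int`. `EngineArgsSketch` and `resolve_seed` below are hypothetical stand-ins, not vLLM APIs.

    # Hypothetical sketch (not part of the patch) of the seed resolution
    # that this change moves from ModelConfig into EngineArgs.
    from dataclasses import dataclass


    @dataclass
    class EngineArgsSketch:
        # None means "seed not given by the user"; it is resolved to 0
        # before the model config is built, so all tensor-parallel workers
        # sample with the same global seed.
        seed: int | None = None

        def resolve_seed(self) -> int:
            if self.seed is None:
                # Mirrors the V1 default that EngineArgs now applies.
                self.seed = 0
            return self.seed


    assert EngineArgsSketch().resolve_seed() == 0          # unset -> default 0
    assert EngineArgsSketch(seed=42).resolve_seed() == 42  # explicit seed wins

Resolving the default in one place also lets `ModelConfig.seed` drop its `None` branch entirely, which is why the `external_launcher` assert and the TPU worker fallback become dead code and are removed.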