From 7e24e5d4d65abbe5ffc7e653fdfd670c7e300944 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 11:59:39 +0800 Subject: [PATCH] [Deprecation] Remove deprecated task, seed and MM settings (#30397) Signed-off-by: DarkLight1337 --- benchmarks/benchmark_ngram_proposer.py | 2 +- examples/offline_inference/audio_language.py | 2 +- .../encoder_decoder_multimodal.py | 2 +- .../qwen2_5_omni/only_thinker.py | 2 +- .../qwen3_omni/only_thinker.py | 2 +- examples/offline_inference/vision_language.py | 2 +- .../vision_language_multi_image.py | 6 +- .../plugin/prithvi_geospatial_mae_client.py | 2 +- .../pooling/vision_language_pooling.py | 6 +- tests/conftest.py | 2 +- tests/test_config.py | 58 -------- tests/utils.py | 4 +- vllm/config/model.py | 131 ------------------ vllm/engine/arg_utils.py | 73 ++-------- vllm/entrypoints/llm.py | 2 +- vllm/envs.py | 5 - 16 files changed, 25 insertions(+), 276 deletions(-) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index cac401456b62a..872a263318ff7 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -37,7 +37,7 @@ def benchmark_propose(args): tokenizer="facebook/opt-125m", tokenizer_mode="auto", dtype="auto", - seed=None, + seed=0, trust_remote_code=False, ) proposer = NgramProposer( diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index df6e96ca375fc..40462c78ae8c2 100755 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -422,7 +422,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) parser.add_argument( diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index c1d6c6db53dfb..857767ac3c628 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -77,7 +77,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) return parser.parse_args() diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index ed005e6a69b80..cee83519fadcc 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -158,7 +158,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/qwen3_omni/only_thinker.py b/examples/offline_inference/qwen3_omni/only_thinker.py index 88a61ed694c2e..62131633da8aa 100644 --- a/examples/offline_inference/qwen3_omni/only_thinker.py +++ b/examples/offline_inference/qwen3_omni/only_thinker.py @@ -158,7 +158,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 22802dddf7893..9142279140e56 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -2031,7 +2031,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 28c466c03dfa5..3c01806baa203 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1382,7 +1382,7 @@ def run_generate( model, question: str, image_urls: list[str], - seed: int | None, + seed: int, tensor_parallel_size: int | None, ): req_data = model_example_map[model](question, image_urls) @@ -1416,7 +1416,7 @@ def run_chat( model: str, question: str, image_urls: list[str], - seed: int | None, + seed: int, tensor_parallel_size: int | None, ): req_data = model_example_map[model](question, image_urls) @@ -1494,7 +1494,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) parser.add_argument( diff --git a/examples/pooling/plugin/prithvi_geospatial_mae_client.py b/examples/pooling/plugin/prithvi_geospatial_mae_client.py index a6246999c14d6..1ba1fd6a92ca4 100644 --- a/examples/pooling/plugin/prithvi_geospatial_mae_client.py +++ b/examples/pooling/plugin/prithvi_geospatial_mae_client.py @@ -16,7 +16,7 @@ import requests # - start vllm in serving mode with the below args # --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM' # --model-impl terratorch -# --task embed --trust-remote-code +# --trust-remote-code # --skip-tokenizer-init --enforce-eager # --io-processor-plugin terratorch_segmentation # --enable-mm-embeds diff --git a/examples/pooling/pooling/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py index 530aad4bc031c..dda56bc34df2e 100644 --- a/examples/pooling/pooling/vision_language_pooling.py +++ b/examples/pooling/pooling/vision_language_pooling.py @@ -305,7 +305,7 @@ def get_query(modality: QueryModality): raise ValueError(msg) -def run_encode(model: str, modality: QueryModality, seed: int | None): +def run_encode(model: str, modality: QueryModality, seed: int): query = get_query(modality) req_data = model_example_map[model](query) @@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None): print("-" * 50) -def run_score(model: str, modality: QueryModality, seed: int | None): +def run_score(model: str, modality: QueryModality, seed: int): query = get_query(modality) req_data = model_example_map[model](query) @@ -390,7 +390,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) return parser.parse_args() diff --git a/tests/conftest.py b/tests/conftest.py index 9f811d5d8db2a..5b26a02823c56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -741,7 +741,7 @@ class VllmRunner: tokenizer_name: str | None = None, tokenizer_mode: str = "auto", trust_remote_code: bool = True, - seed: int | None = 0, + seed: int = 0, max_model_len: int | None = 1024, dtype: str = "auto", disable_log_stats: bool = True, diff --git a/tests/test_config.py b/tests/test_config.py index 0768c6d2cddf5..ee706ab3d9c87 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -89,64 +89,6 @@ def test_update_config(): new_config3 = update_config(config3, {"a": "new_value"}) -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("distilbert/distilgpt2", "generate", "none", "generate"), - ("intfloat/multilingual-e5-small", "pooling", "none", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"), - ("openai/whisper-small", "generate", "none", "transcription"), - ], -) -def test_auto_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="auto") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("distilbert/distilgpt2", "pooling", "embed", "embed"), - ("intfloat/multilingual-e5-small", "pooling", "embed", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"), - ("openai/whisper-small", "pooling", "embed", "embed"), - ], -) -def test_score_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="score") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("openai/whisper-small", "generate", "none", "transcription"), - ], -) -def test_transcription_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="transcription") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - @pytest.mark.parametrize( ("model_id", "expected_runner_type", "expected_convert_type"), [ diff --git a/tests/utils.py b/tests/utils.py index ea3675b1461b8..d8102331b3612 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -119,7 +119,7 @@ class RemoteOpenAIServer: vllm_serve_args: list[str], *, env_dict: dict[str, str] | None = None, - seed: int | None = 0, + seed: int = 0, auto_port: bool = True, max_wait_seconds: float | None = None, override_hf_configs: dict[str, Any] | None = None, @@ -283,7 +283,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer): child_process_fxn: Callable[[dict[str, str] | None, str, list[str]], None], *, env_dict: dict[str, str] | None = None, - seed: int | None = 0, + seed: int = 0, auto_port: bool = True, max_wait_seconds: float | None = None, ) -> None: diff --git a/vllm/config/model.py b/vllm/config/model.py index 764bdf7000561..bd98111ffb5db 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -73,17 +73,6 @@ logger = init_logger(__name__) RunnerOption = Literal["auto", RunnerType] ConvertType = Literal["none", "embed", "classify", "reward"] ConvertOption = Literal["auto", ConvertType] -TaskOption = Literal[ - "auto", - "generate", - "embedding", - "embed", - "classify", - "score", - "reward", - "transcription", - "draft", -] TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ @@ -93,12 +82,6 @@ HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig] ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"] LayerBlockType = Literal["attention", "linear_attention", "mamba"] -_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { - "generate": ["generate", "transcription"], - "pooling": ["embedding", "embed", "classify", "score", "reward"], - "draft": ["draft"], -} - _RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { "generate": [], "pooling": ["embed", "classify", "reward"], @@ -126,12 +109,6 @@ class ModelConfig: """Convert the model using adapters defined in [vllm.model_executor.models.adapters][]. The most common use case is to adapt a text generation model to be used for pooling tasks.""" - task: TaskOption | None = None - """[DEPRECATED] The task to use the model for. If the model supports more - than one model runner, this is used to select which model runner to run. - - Note that the model may support other tasks using the same model runner. - """ tokenizer: SkipValidation[str] = None # type: ignore """Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.""" @@ -335,7 +312,6 @@ class ModelConfig: ignored_factors = { "runner", "convert", - "task", "tokenizer", "tokenizer_mode", "seed", @@ -510,97 +486,6 @@ class ModelConfig: is_generative_model = registry.is_text_generation_model(architectures, self) is_pooling_model = registry.is_pooling_model(architectures, self) - def _task_to_convert(task: TaskOption) -> ConvertType: - if task == "embedding" or task == "embed": - return "embed" - if task == "classify": - return "classify" - if task == "reward": - logger.warning( - "Pooling models now default support all pooling; " - "you can use it without any settings." - ) - return "embed" - if task == "score": - new_task = self._get_default_pooling_task(architectures) - return "classify" if new_task == "classify" else "embed" - - return "none" - - if self.task is not None: - runner: RunnerOption = "auto" - convert: ConvertOption = "auto" - msg_prefix = ( - "The 'task' option has been deprecated and will be " - "removed in v0.13.0 or v1.0, whichever comes first." - ) - msg_hint = "Please remove this option." - - is_generative_task = self.task in _RUNNER_TASKS["generate"] - is_pooling_task = self.task in _RUNNER_TASKS["pooling"] - - if is_generative_model and is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "generate` to continue using this model " - "as a generative model." - ) - elif is_pooling_task: - runner = "pooling" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "pooling` to continue using this model " - "as a pooling model." - ) - else: # task == "auto" - pass - elif is_generative_model or is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = "Please remove this option" - elif is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--convert " - f"{convert}` to continue using this model " - "as a pooling model." - ) - else: # task == "auto" - pass - else: - # Neither generative nor pooling model - try to convert if possible - if is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--runner pooling " - f"--convert {convert}` to continue using this model " - "as a pooling model." - ) - else: - debug_info = { - "architectures": architectures, - "is_generative_model": is_generative_model, - "is_pooling_model": is_pooling_model, - } - raise AssertionError( - "The model should be a generative or " - "pooling model when task is set to " - f"{self.task!r}. Found: {debug_info}" - ) - - self.runner = runner - self.convert = convert - - msg = f"{msg_prefix} {msg_hint}" - warnings.warn(msg, DeprecationWarning, stacklevel=2) - self.runner_type = self._get_runner_type(architectures, self.runner) self.convert_type = self._get_convert_type( architectures, self.runner_type, self.convert @@ -918,22 +803,6 @@ class ModelConfig: return convert_type - def _get_default_pooling_task( - self, - architectures: list[str], - ) -> Literal["embed", "classify", "reward"]: - if self.registry.is_cross_encoder_model(architectures, self): - return "classify" - - for arch in architectures: - match = try_match_architecture_defaults(arch, runner_type="pooling") - if match: - _, (_, convert_type) = match - assert convert_type != "none" - return convert_type - - return "embed" - def _parse_quant_hf_config(self, hf_config: PretrainedConfig): quant_cfg = getattr(hf_config, "quantization_config", None) if quant_cfg is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cbb4862434a98..f303bef17b6a9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -71,7 +71,6 @@ from vllm.config.model import ( LogprobsMode, ModelDType, RunnerOption, - TaskOption, TokenizerMode, ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode @@ -360,7 +359,6 @@ class EngineArgs: hf_config_path: str | None = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert - task: TaskOption | None = ModelConfig.task skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode @@ -373,7 +371,7 @@ class EngineArgs: config_format: str = ModelConfig.config_format dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype - seed: int | None = 0 + seed: int = ModelConfig.seed max_model_len: int | None = ModelConfig.max_model_len cudagraph_capture_sizes: list[int] | None = ( CompilationConfig.cudagraph_capture_sizes @@ -462,7 +460,6 @@ class EngineArgs: MultiModalConfig, "media_io_kwargs" ) mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs - disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb mm_processor_cache_type: MMCacheType | None = ( MultiModalConfig.mm_processor_cache_type @@ -558,9 +555,6 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location - # DEPRECATED - enable_multimodal_encoder_data_parallel: bool = False - logits_processors: list[str | type[LogitsProcessor]] | None = ( ModelConfig.logits_processors ) @@ -628,7 +622,6 @@ class EngineArgs: model_group.add_argument("--model", **model_kwargs["model"]) model_group.add_argument("--runner", **model_kwargs["runner"]) model_group.add_argument("--convert", **model_kwargs["convert"]) - model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) model_group.add_argument( @@ -882,11 +875,6 @@ class EngineArgs: parallel_group.add_argument( "--worker-extension-cls", **parallel_kwargs["worker_extension_cls"] ) - parallel_group.add_argument( - "--enable-multimodal-encoder-data-parallel", - action="store_true", - deprecated=True, - ) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -960,9 +948,6 @@ class EngineArgs: multimodal_group.add_argument( "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"] ) - multimodal_group.add_argument( - "--disable-mm-preprocessor-cache", action="store_true", deprecated=True - ) multimodal_group.add_argument( "--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"] ) @@ -1192,62 +1177,20 @@ class EngineArgs: if is_gguf(self.model): self.quantization = self.load_format = "gguf" - # NOTE(woosuk): In V1, we use separate processes for workers (unless - # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here - # doesn't affect the user process. - if self.seed is None: - logger.warning_once( - "`seed=None` is equivalent to `seed=0` in V1 Engine. " - "You will no longer be allowed to pass `None` in v0.13.", - scope="local", + if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: + logger.warning( + "The global random seed is set to %d. Since " + "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " + "affect the random state of the Python process that " + "launched vLLM.", + self.seed, ) - self.seed = 0 - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: - logger.warning( - "The global random seed is set to %d. Since " - "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " - "affect the random state of the Python process that " - "launched vLLM.", - self.seed, - ) - - if self.disable_mm_preprocessor_cache: - logger.warning_once( - "`--disable-mm-preprocessor-cache` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-processor-cache-gb 0` instead.", - scope="local", - ) - - self.mm_processor_cache_gb = 0 - elif envs.VLLM_MM_INPUT_CACHE_GIB != 4: - logger.warning_once( - "VLLM_MM_INPUT_CACHE_GIB` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-processor-cache-gb %d` instead.", - envs.VLLM_MM_INPUT_CACHE_GIB, - scope="local", - ) - - self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB - - if self.enable_multimodal_encoder_data_parallel: - logger.warning_once( - "--enable-multimodal-encoder-data-parallel` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-encoder-tp-mode data` instead.", - scope="local", - ) - - self.mm_encoder_tp_mode = "data" - return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, runner=self.runner, convert=self.convert, - task=self.task, tokenizer=self.tokenizer, tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3fce3338503ef..6440b702f4fa6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -198,7 +198,7 @@ class LLM: quantization: QuantizationMethods | None = None, revision: str | None = None, tokenizer_revision: str | None = None, - seed: int | None = None, + seed: int = 0, gpu_memory_utilization: float = 0.9, swap_space: float = 4, cpu_offload_gb: float = 0, diff --git a/vllm/envs.py b/vllm/envs.py index 230f2cf3450a9..0cf0408054063 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -72,7 +72,6 @@ if TYPE_CHECKING: VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MEDIA_CONNECTOR: str = "http" - VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" @@ -786,9 +785,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # imported at runtime. # If a non-existing backend is used, an AssertionError will be thrown. "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"), - # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache - # Default is 4 GiB per API process + 4 GiB per engine core process - "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), # Path to the XLA persistent cache directory. # Only used for XLA devices such as TPUs. "VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser( @@ -1681,7 +1677,6 @@ def compile_factors() -> dict[str, object]: "VLLM_MEDIA_CONNECTOR", "VLLM_ASSETS_CACHE", "VLLM_ASSETS_CACHE_MODEL_CLEAN", - "VLLM_MM_INPUT_CACHE_GIB", "VLLM_WORKER_MULTIPROC_METHOD", "VLLM_ENABLE_V1_MULTIPROCESSING", "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE",