diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py
index cb87c44cc3999..46f7d58c438cb 100644
--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@@ -32,10 +32,6 @@ def _test_stopping(llm: LLM,
     assert output.stop_reason == expected_reason
 
 
-def _set_async_mode(llm, is_async):
-    llm.llm_engine.scheduler[0].use_async_output_proc = is_async
-
-
 def _stop_basic(llm):
     _test_stopping(llm,
                    stop=["."],
@@ -103,40 +99,8 @@ def test_stop_strings():
     # async output processing below.
     llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
 
-    if envs.VLLM_USE_V1:
-        _stop_basic(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_basic(llm)
-
-        _set_async_mode(llm, False)
-        _stop_basic(llm)
-
-    if envs.VLLM_USE_V1:
-        _stop_multi_tokens(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_multi_tokens(llm)
-
-        _set_async_mode(llm, False)
-        _stop_multi_tokens(llm)
-
-    if envs.VLLM_USE_V1:
-        _stop_partial_token(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_partial_token(llm)
-
-        _set_async_mode(llm, False)
-        _stop_partial_token(llm)
-
-    if envs.VLLM_USE_V1:
-        # FIXME: this does not respect include_in_output=False
-        # _stop_token_id(llm)
-        pass
-    else:
-        _set_async_mode(llm, True)
-        _stop_token_id(llm)
-
-        _set_async_mode(llm, False)
-        _stop_token_id(llm)
+    _stop_basic(llm)
+    _stop_multi_tokens(llm)
+    _stop_partial_token(llm)
+    # FIXME: this does not respect include_in_output=False
+    # _stop_token_id(llm)
diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py
index bdd41eece2317..3a7bcb9571825 100644
--- a/tests/v1/engine/test_processor_multi_modal_uuids.py
+++ b/tests/v1/engine/test_processor_multi_modal_uuids.py
@@ -6,7 +6,6 @@ import pytest
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
-from vllm.platforms.interface import UnspecifiedPlatform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import processor as processor_mod
 from vllm.v1.engine.processor import Processor
@@ -33,15 +32,6 @@ def _mk_processor(monkeypatch,
                         "__post_init__",
                         lambda self, *args: None,
                         raising=True)
-    monkeypatch.setattr(UnspecifiedPlatform,
-                        "is_async_output_supported",
-                        classmethod(lambda cls, enforce_eager: True),
-                        raising=True)
-    monkeypatch.setattr(
-        ModelConfig,
-        "verify_async_output_proc",
-        lambda self, parallel_config, speculative_config, device_config: None,
-        raising=True)
     monkeypatch.setattr(ModelConfig,
                         "verify_with_parallel_config",
                         lambda self, parallel_config: None,
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index 28c24f62895ab..f6b8a18dd7c2c 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -29,24 +29,6 @@ def test_unsupported_configs(monkeypatch):
             },
         ).create_engine_config()
 
-    with pytest.raises(NotImplementedError):
-        AsyncEngineArgs(
-            model=MODEL,
-            preemption_mode="swap",
-        ).create_engine_config()
-
-    with pytest.raises(NotImplementedError):
-        AsyncEngineArgs(
-            model=MODEL,
-            disable_async_output_proc=True,
-        ).create_engine_config()
-
-    with pytest.raises(NotImplementedError):
-        AsyncEngineArgs(
-            model=MODEL,
-            scheduler_delay_factor=1.2,
-        ).create_engine_config()
-
 
 def test_enable_by_default_fallback(monkeypatch):
     with monkeypatch.context() as m:
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index ddd8de4324f6f..e31a78ba33baa 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -454,9 +454,6 @@ class VllmConfig:
         self.try_verify_and_update_config()
 
         if self.model_config is not None:
-            self.model_config.verify_async_output_proc(self.parallel_config,
-                                                       self.speculative_config,
-                                                       self.device_config)
             self.model_config.verify_with_parallel_config(self.parallel_config)
             self.model_config.verify_dual_chunk_attention_config(
                 self.load_config)
@@ -877,7 +874,6 @@ class VllmConfig:
             f"served_model_name={self.model_config.served_model_name}, "
             f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
             f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
-            f"use_async_output_proc={self.model_config.use_async_output_proc}, "
             f"pooler_config={self.model_config.pooler_config!r}, "
             f"compilation_config={self.compilation_config!r}")
 
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 921322bb475c5..b53029dc8c3ec 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -223,8 +223,6 @@ class ModelConfig:
     that this name(s) will also be used in `model_name` tag content of
     prometheus metrics, if multiple names provided, metrics tag will take the
     first one."""
-    use_async_output_proc: bool = True
-    """Whether to use async output processor."""
     config_format: Union[str, ConfigFormat] = "auto"
     """The format of the model config to load:\n
     - "auto" will try to load the config in hf format if available else it
@@ -1119,37 +1117,6 @@ class ModelConfig:
                 raise ValueError("please set VLLM_ATTENTION_BACKEND to "
                                  f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")
 
-    def verify_async_output_proc(self, parallel_config, speculative_config,
-                                 device_config) -> None:
-        if not self.use_async_output_proc:
-            # Nothing to check
-            return
-
-        if parallel_config.pipeline_parallel_size > 1:
-            self.use_async_output_proc = False
-            return
-
-        # Reminder: Please update docs/features/compatibility_matrix.md
-        # If the feature combo become valid
-        from vllm.platforms import current_platform
-        if not current_platform.is_async_output_supported(self.enforce_eager):
-            self.use_async_output_proc = False
-            return
-
-        if envs.VLLM_USE_RAY_SPMD_WORKER:
-            self.use_async_output_proc = False
-            return
-
-        # Async postprocessor is not necessary for pooling models
-        # since there is no token generation
-        if self.runner_type == "pooling":
-            self.use_async_output_proc = False
-
-        # Reminder: Please update docs/features/compatibility_matrix.md
-        # If the feature combo become valid
-        if speculative_config:
-            self.use_async_output_proc = False
-
     def verify_with_parallel_config(
         self,
         parallel_config: ParallelConfig,
@@ -1173,15 +1140,12 @@ class ModelConfig:
             self._verify_with_expert_parallelism()
 
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
-        if pipeline_parallel_size > 1:
-            if not self.registry.is_pp_supported_model(self.architectures,
-                                                       self):
-                raise NotImplementedError(
-                    "Pipeline parallelism is not supported for this model. "
-                    "Supported models implement the `SupportsPP` interface.")
-
-            if self.use_async_output_proc:
-                self.use_async_output_proc = False
+        if (pipeline_parallel_size > 1
+                and not self.registry.is_pp_supported_model(
+                    self.architectures, self)):
+            raise NotImplementedError(
+                "Pipeline parallelism is not supported for this model. "
+                "Supported models implement the `SupportsPP` interface.")
 
     def get_sliding_window(self) -> Optional[int]:
         """Get the sliding window size from the HF text config if present."""
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index f0f67bab9d6ff..daf094d2df5c8 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -3,7 +3,7 @@
 
 import hashlib
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, Union
 
 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -18,7 +17,6 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
 logger = init_logger(__name__)
 
 RunnerType = Literal["generate", "pooling", "draft"]
-PreemptionMode = Literal["swap", "recompute"]
 SchedulerPolicy = Literal["fcfs", "priority"]
 
 
@@ -78,10 +77,6 @@ class SchedulerConfig:
     3. more than one value (e.g. 1 2 128) is provided, then the capture list
     will follow the provided list."""
 
-    delay_factor: float = 0.0
-    """Apply a delay (of delay factor multiplied by previous
-    prompt latency) before scheduling next prompt."""
-
     enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
     """If True, prefill requests can be chunked based
    on the remaining max_num_batched_tokens."""
@@ -103,14 +98,6 @@ class SchedulerConfig:
    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""
 
-    preemption_mode: Optional[PreemptionMode] = None
-    """Whether to perform preemption by swapping or
-    recomputation. If not specified, we determine the mode as follows:
-    We use recomputation by default since it incurs lower overhead than
-    swapping. However, when the sequence group has multiple sequences
-    (e.g., beam search), recomputation is not currently supported. In
-    such a case, we use swapping instead."""
-
     send_delta_data: bool = False
     """Private API. If used, scheduler sends delta data to workers instead
     of an entire data. It should be enabled only
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 242fcf501bfc4..fef4177b3a333 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -409,9 +409,7 @@ class EngineArgs:
         get_field(LoadConfig, "model_loader_extra_config")
     ignore_patterns: Optional[Union[str,
                                     List[str]]] = LoadConfig.ignore_patterns
-    preemption_mode: Optional[str] = SchedulerConfig.preemption_mode
 
-    scheduler_delay_factor: float = SchedulerConfig.delay_factor
     enable_chunked_prefill: Optional[
         bool] = SchedulerConfig.enable_chunked_prefill
     disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
@@ -439,7 +437,6 @@ class EngineArgs:
         ObservabilityConfig.otlp_traces_endpoint
     collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
         ObservabilityConfig.collect_detailed_traces
-    disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
     scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
 
@@ -561,14 +558,6 @@ class EngineArgs:
                                  **model_kwargs["enable_prompt_embeds"])
         model_group.add_argument("--served-model-name",
                                  **model_kwargs["served_model_name"])
-        # This one is a special case because it is the
-        # opposite of ModelConfig.use_async_output_proc
-        model_group.add_argument(
-            "--disable-async-output-proc",
-            action="store_true",
-            default=EngineArgs.disable_async_output_proc,
-            help="Disable async output processing. This may result in "
-            "lower performance.")
         model_group.add_argument("--config-format",
                                  **model_kwargs["config_format"])
         # This one is a special case because it can bool
@@ -897,10 +886,6 @@ class EngineArgs:
             **scheduler_kwargs["long_prefill_token_threshold"])
         scheduler_group.add_argument("--num-lookahead-slots",
                                      **scheduler_kwargs["num_lookahead_slots"])
-        scheduler_group.add_argument("--scheduler-delay-factor",
-                                     **scheduler_kwargs["delay_factor"])
-        scheduler_group.add_argument("--preemption-mode",
-                                     **scheduler_kwargs["preemption_mode"])
         # multi-step scheduling has been removed; corresponding arguments
         # are no longer supported.
         scheduler_group.add_argument("--scheduling-policy",
@@ -1029,7 +1014,6 @@ class EngineArgs:
             interleave_mm_strings=self.interleave_mm_strings,
             media_io_kwargs=self.media_io_kwargs,
             skip_mm_profiling=self.skip_mm_profiling,
-            use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
             mm_processor_cache_gb=self.mm_processor_cache_gb,
@@ -1395,11 +1379,9 @@ class EngineArgs:
             max_model_len=model_config.max_model_len,
             cuda_graph_sizes=self.cuda_graph_sizes,
             num_lookahead_slots=num_lookahead_slots,
-            delay_factor=self.scheduler_delay_factor,
             enable_chunked_prefill=self.enable_chunked_prefill,
             disable_chunked_mm_input=self.disable_chunked_mm_input,
             is_multimodal_model=model_config.is_multimodal_model,
-            preemption_mode=self.preemption_mode,
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
             policy=self.scheduling_policy,
@@ -1492,22 +1474,6 @@ class EngineArgs:
                                recommend_to_remove=False)
             return False
 
-        if self.preemption_mode != SchedulerConfig.preemption_mode:
-            _raise_or_fallback(feature_name="--preemption-mode",
-                               recommend_to_remove=True)
-            return False
-
-        if (self.disable_async_output_proc
-                != EngineArgs.disable_async_output_proc):
-            _raise_or_fallback(feature_name="--disable-async-output-proc",
-                               recommend_to_remove=True)
-            return False
-
-        if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
-            _raise_or_fallback(feature_name="--scheduler-delay-factor",
-                               recommend_to_remove=True)
-            return False
-
         # No Mamba or Encoder-Decoder so far.
         if not model_config.is_v1_compatible:
             _raise_or_fallback(feature_name=model_config.architectures,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 0ab806fcb8b5c..092d3f276d1c5 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -137,8 +137,6 @@ class LLM:
             back to the eager mode.
         disable_custom_all_reduce: See
             [ParallelConfig][vllm.config.ParallelConfig].
-        disable_async_output_proc: Disable async output processing.
-            This may result in lower performance.
         hf_token: The token to use as HTTP bearer authorization for remote files
             . If `True`, will use the token generated when running
             `huggingface-cli login` (stored in `~/.huggingface`).
@@ -188,7 +186,6 @@ class LLM:
         enforce_eager: bool = False,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
-        disable_async_output_proc: bool = False,
         hf_token: Optional[Union[bool, str]] = None,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
@@ -286,7 +283,6 @@ class LLM:
             enforce_eager=enforce_eager,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
-            disable_async_output_proc=disable_async_output_proc,
             hf_token=hf_token,
             hf_overrides=hf_overrides,
             mm_processor_kwargs=mm_processor_kwargs,
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index 3b566e88a9ec2..7a753d608a43a 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -137,10 +137,6 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
     def _init_executor(self) -> None:
         """Initialize the worker and load the model.
         """
-        assert self.vllm_config.scheduler_config.delay_factor == 0.0, \
-            ("ExecutorWithExternalLauncher needs deterministic "
-             "execution, so it"
-             "does not support delay_factor in scheduling")
         if envs.VLLM_USE_V1:
             assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \
                 ("To get deterministic execution in V1, "
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 544e091491bf5..cd41832bc2ea4 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -126,10 +126,6 @@ class CpuPlatform(Platform):
         """
         torch.cpu.set_device(device)
 
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        return False
-
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 87d8f2b7481bb..c263e2afe83b0 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -96,16 +96,6 @@ class CudaPlatformBase(Platform):
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         raise NotImplementedError
 
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        if enforce_eager and not envs.VLLM_USE_V1:
-            logger.warning(
-                "To see benefits of async output processing, enable CUDA "
-                "graph. Since, enforce-eager is enabled, async output "
-                "processor cannot be used")
-            return False
-        return True
-
     @classmethod
     def is_fully_connected(cls, device_ids: list[int]) -> bool:
         raise NotImplementedError
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 53fc762dce540..c43580ac5da13 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -275,13 +275,6 @@ class Platform:
         """Get the total memory of a device in bytes."""
         raise NotImplementedError
 
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        """
-        Check if the current platform supports async output.
-        """
-        raise NotImplementedError
-
     @classmethod
     def inference_mode(cls):
         """A device-specific wrapper of `torch.inference_mode`.
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 4f540fe965e22..dce2924ac7a9d 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -310,16 +310,6 @@ class RocmPlatform(Platform):
         device_props = torch.cuda.get_device_properties(device_id)
         return device_props.total_memory
 
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        if enforce_eager and not envs.VLLM_USE_V1:
-            logger.warning(
-                "To see benefits of async output processing, enable CUDA "
-                "graph. Since, enforce-eager is enabled, async output "
-                "processor cannot be used")
-            return False
-        return True
-
     @classmethod
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         from vllm.config.compilation import CUDAGraphMode
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 4e4db116abca0..9852d948bc4b8 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -75,10 +75,6 @@ class TpuPlatform(Platform):
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         raise NotImplementedError
 
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        return False
-
     @classmethod
     def get_punica_wrapper(cls) -> str:
         return "vllm.lora.punica_wrapper.punica_tpu.PunicaWrapperTPU"
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 67ef058df10f1..4d3bef4b42947 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -98,10 +98,6 @@ class XPUPlatform(Platform):
         device_props = torch.xpu.get_device_properties(device_id)
         return device_props.total_memory
 
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        return True
-
     @classmethod
     def inference_mode(cls):
         return torch.no_grad()