diff --git a/docs/api/README.md b/docs/api/README.md
index 2b5142e0bcd0..245c925f7f50 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -8,7 +8,6 @@ API documentation for vLLM's configuration classes.
 
 - [vllm.config.ModelConfig][]
 - [vllm.config.CacheConfig][]
-- [vllm.config.TokenizerPoolConfig][]
 - [vllm.config.LoadConfig][]
 - [vllm.config.ParallelConfig][]
 - [vllm.config.SchedulerConfig][]
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 38ecaf2233d9..76c94bdf80ca 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -29,7 +29,7 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
+def api_server(distributed_executor_backend: str):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     commands = [
@@ -40,8 +40,6 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
         "facebook/opt-125m",
         "--host",
         "127.0.0.1",
-        "--tokenizer-pool-size",
-        str(tokenizer_pool_size),
         "--distributed-executor-backend",
         distributed_executor_backend,
     ]
@@ -54,10 +52,8 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
     uvicorn_process.terminate()
 
 
-@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
 @pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
-def test_api_server(api_server, tokenizer_pool_size: int,
-                    distributed_executor_backend: str):
+def test_api_server(api_server, distributed_executor_backend: str):
     """
     Run the API server and test it.
 
diff --git a/vllm/config.py b/vllm/config.py
index 2d84f6875cd9..766d7708625e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1730,35 +1730,6 @@ class CacheConfig:
             logger.warning("Possibly too large swap space. %s", msg)
 
 
-@config
-@dataclass
-class TokenizerPoolConfig:
-    """This config is deprecated and will be removed in a future release.
-
-    Passing these parameters will have no effect. Please remove them from your
-    configurations.
-    """
-
-    pool_size: int = 0
-    """This parameter is deprecated and will be removed in a future release.
-    Passing this parameter will have no effect. Please remove it from your
-    configurations."""
-    pool_type: str = "ray"
-    """This parameter is deprecated and will be removed in a future release.
-    Passing this parameter will have no effect. Please remove it from your
-    configurations."""
-    extra_config: dict = field(default_factory=dict)
-    """This parameter is deprecated and will be removed in a future release.
-    Passing this parameter will have no effect. Please remove it from your
-    configurations."""
-
-    def __post_init__(self) -> None:
-        logger.warning_once(
-            "TokenizerPoolConfig is deprecated and will be removed in a "
-            "future release. Passing this parameter will have no effect. "
-            "Please remove it from your configurations.")
-
-
 class LoadFormat(str, enum.Enum):
     AUTO = "auto"
     PT = "pt"
@@ -1922,10 +1893,6 @@ class ParallelConfig:
     disable_custom_all_reduce: bool = False
     """Disable the custom all-reduce kernel and fall back to NCCL."""
 
-    tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
-    """This parameter is deprecated and will be removed in a future release.
-    Please remove it from your configs"""
-
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see
     https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 269477c48481..998a352497f7 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -32,8 +32,8 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
                          ObservabilityConfig, ParallelConfig, PoolerConfig,
                          PrefixCachingHashAlgo, PromptAdapterConfig,
                          SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
-                         TaskOption, TokenizerMode, TokenizerPoolConfig,
-                         VllmConfig, get_attr_docs, get_field)
+                         TaskOption, TokenizerMode, VllmConfig, get_attr_docs,
+                         get_field)
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
@@ -373,13 +373,6 @@ class EngineArgs:
     enforce_eager: bool = ModelConfig.enforce_eager
     max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
-    # The following three fields are deprecated and will be removed in a future
-    # release. Setting them will have no effect. Please remove them from your
-    # configurations.
-    tokenizer_pool_size: int = TokenizerPoolConfig.pool_size
-    tokenizer_pool_type: str = TokenizerPoolConfig.pool_type
-    tokenizer_pool_extra_config: dict = \
-        get_field(TokenizerPoolConfig, "extra_config")
     limit_mm_per_prompt: dict[str, int] = \
         get_field(MultiModalConfig, "limit_per_prompt")
     interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
@@ -751,19 +744,6 @@ class EngineArgs:
         cache_group.add_argument("--calculate-kv-scales",
                                  **cache_kwargs["calculate_kv_scales"])
 
-        # Tokenizer arguments
-        tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
-        tokenizer_group = parser.add_argument_group(
-            title="TokenizerPoolConfig",
-            description=TokenizerPoolConfig.__doc__,
-        )
-        tokenizer_group.add_argument("--tokenizer-pool-size",
-                                     **tokenizer_kwargs["pool_size"])
-        tokenizer_group.add_argument("--tokenizer-pool-type",
-                                     **tokenizer_kwargs["pool_type"])
-        tokenizer_group.add_argument("--tokenizer-pool-extra-config",
-                                     **tokenizer_kwargs["extra_config"])
-
         # Multimodal related configs
         multimodal_kwargs = get_kwargs(MultiModalConfig)
         multimodal_group = parser.add_argument_group(
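For anyone still carrying the deprecated options in their own launch code, the user-facing effect of this diff is that the three tokenizer-pool knobs disappear entirely. The sketch below is illustrative only and not part of the change itself; it assumes the usual `EngineArgs` constructor and the `vllm serve` entry point, and its only point is that the options should be deleted rather than set to their defaults.

```python
# Hypothetical migration sketch, not part of this diff: the tokenizer-pool
# fields no longer exist on EngineArgs, so passing them now fails loudly
# instead of emitting a deprecation warning.
from vllm.engine.arg_utils import EngineArgs

# Before (now raises TypeError: unexpected keyword argument):
#     EngineArgs(model="facebook/opt-125m",
#                tokenizer_pool_size=2, tokenizer_pool_type="ray")
# After: simply drop the deprecated arguments.
args = EngineArgs(model="facebook/opt-125m")

# The same applies on the command line, where the flags have been removed:
#     before: vllm serve facebook/opt-125m --tokenizer-pool-size 2
#     after:  vllm serve facebook/opt-125m
```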