diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 20b8eb57f743..959a0342817c 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import enum from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Iterable, Mapping from typing import Any @@ -15,13 +16,17 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import Device from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.processor import Processor logger = init_logger(__name__) +class Device(enum.Enum): + GPU = enum.auto() + CPU = enum.auto() + + class EngineClient(ABC): """Protocol class for Clients to Engine""" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index c15b70a06809..ce5cf0aae3a3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -31,6 +31,7 @@ from vllm.config.model import ( TokenizerMode, ) from vllm.engine.arg_utils import EngineArgs +from vllm.engine.protocol import Device from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ChatTemplateContentFormatOption, @@ -75,8 +76,8 @@ from vllm.transformers_utils.tokenizer import ( get_cached_tokenizer, ) from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter, Device from vllm.utils.collection_utils import as_iter, is_list_of +from vllm.utils.counter import Counter from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.llm_engine import LLMEngine from vllm.v1.sample.logits_processor import LogitsProcessor @@ -1490,8 +1491,8 @@ class LLM: def stop_profile(self) -> None: self.llm_engine.stop_profile() - def reset_prefix_cache(self, device: Device | None = None) -> bool: - return self.llm_engine.reset_prefix_cache(device) + def reset_prefix_cache(self, device: Device | None = None) -> None: + self.llm_engine.reset_prefix_cache(device) def sleep(self, level: int = 1): """ diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 632bd741290b..71939d6c41df 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -40,7 +40,7 @@ from typing_extensions import assert_never import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.protocol import EngineClient +from vllm.engine.protocol import Device, EngineClient from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args @@ -108,7 +108,6 @@ from vllm.entrypoints.utils import ( from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.network_utils import is_valid_ipv6_address from vllm.utils.system_utils import decorate_logs, set_ulimit diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 9b7deb40b93f..24b9587010ca 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -19,7 +19,7 @@ from vllm.entrypoints.openai.protocol import ( from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry -from vllm.utils import AtomicCounter +from vllm.utils.counter import AtomicCounter logger = init_logger(__name__) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index eaa78839cf3f..549827a927d9 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3,7 +3,6 @@ import enum import inspect -import threading import uuid import warnings from functools import wraps @@ -68,54 +67,11 @@ STR_INVALID_VAL: str = "INVALID" T = TypeVar("T") -class Device(enum.Enum): - GPU = enum.auto() - CPU = enum.auto() - - class LayerBlockType(enum.Enum): attention = "attention" mamba = "mamba" -class Counter: - def __init__(self, start: int = 0) -> None: - self.counter = start - - def __next__(self) -> int: - i = self.counter - self.counter += 1 - return i - - def reset(self) -> None: - self.counter = 0 - - -class AtomicCounter: - """An atomic, thread-safe counter""" - - def __init__(self, initial=0): - """Initialize a new atomic counter to given initial value""" - self._value = initial - self._lock = threading.Lock() - - def inc(self, num=1): - """Atomically increment the counter by num and return the new value""" - with self._lock: - self._value += num - return self._value - - def dec(self, num=1): - """Atomically decrement the counter by num and return the new value""" - with self._lock: - self._value -= num - return self._value - - @property - def value(self): - return self._value - - def random_uuid() -> str: return str(uuid.uuid4().hex) diff --git a/vllm/utils/counter.py b/vllm/utils/counter.py new file mode 100644 index 000000000000..c2dce32e97e1 --- /dev/null +++ b/vllm/utils/counter.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import threading + + +class Counter: + def __init__(self, start: int = 0) -> None: + super().__init__() + + self.counter = start + + def __next__(self) -> int: + i = self.counter + self.counter += 1 + return i + + def reset(self) -> None: + self.counter = 0 + + +class AtomicCounter: + """An atomic, thread-safe counter""" + + def __init__(self, initial: int = 0) -> None: + """Initialize a new atomic counter to given initial value""" + super().__init__() + + self._value = initial + self._lock = threading.Lock() + + @property + def value(self) -> int: + return self._value + + def inc(self, num: int = 1) -> int: + """Atomically increment the counter by num and return the new value""" + with self._lock: + self._value += num + return self._value + + def dec(self, num: int = 1) -> int: + """Atomically decrement the counter by num and return the new value""" + with self._lock: + self._value -= num + return self._value diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fd0a9b395e5f..cf458a8f074c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -14,7 +14,7 @@ import torch import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.protocol import EngineClient +from vllm.engine.protocol import Device, EngineClient from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs import PromptType from vllm.logger import init_logger @@ -29,7 +29,6 @@ from vllm.tracing import init_tracer from vllm.transformers_utils.config import maybe_register_config_serialize_by_value from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device from vllm.utils.async_utils import cancel_task_threadsafe from vllm.utils.collection_utils import as_list from vllm.utils.func_utils import deprecate_kwargs diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 9d69ed93ed37..0cd5e1ff3944 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -14,6 +14,7 @@ from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.distributed.parallel_state import get_dp_group from vllm.engine.arg_utils import EngineArgs +from vllm.engine.protocol import Device from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -26,7 +27,6 @@ from vllm.tasks import SupportedTask from vllm.tracing import init_tracer from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor