diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 34f6e8c928ff..e3dc40fd0ec7 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -36,8 +36,10 @@ FILES = [
     "vllm/transformers_utils",
     "vllm/triton_utils",
     "vllm/usage",
+    "vllm/utils",
     "vllm/v1/core",
     "vllm/v1/engine",
+    "vllm/v1/pool",
     "vllm/v1/worker",
 ]
 
@@ -59,7 +61,6 @@ SEPARATE_GROUPS = [
     "vllm/v1/executor",
     "vllm/v1/kv_offload",
     "vllm/v1/metrics",
-    "vllm/v1/pool",
     "vllm/v1/sample",
     "vllm/v1/spec_decode",
     "vllm/v1/structured_output",
diff --git a/vllm/utils/async_utils.py b/vllm/utils/async_utils.py
index b6c24e1ceeee..77234cbd0c8c 100644
--- a/vllm/utils/async_utils.py
+++ b/vllm/utils/async_utils.py
@@ -12,7 +12,7 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task
 from collections.abc import AsyncGenerator, Awaitable, Callable
 from concurrent.futures import Executor, ThreadPoolExecutor
 from functools import partial
-from typing import TypeVar
+from typing import TYPE_CHECKING, TypeVar
 
 from transformers.tokenization_utils_base import BatchEncoding
 from typing_extensions import ParamSpec
@@ -257,6 +257,13 @@ def in_loop(event_loop: AbstractEventLoop) -> bool:
     return False
 
 
+# A hack to pass mypy
+if TYPE_CHECKING:
+
+    def anext(it: AsyncGenerator[T, None]):
+        return it.__anext__()
+
+
 async def merge_async_iterators(
     *iterators: AsyncGenerator[T, None],
 ) -> AsyncGenerator[tuple[int, T], None]:
diff --git a/vllm/utils/jsontree.py b/vllm/utils/jsontree.py
index cde9aa6ff901..fe757c2f3374 100644
--- a/vllm/utils/jsontree.py
+++ b/vllm/utils/jsontree.py
@@ -4,7 +4,7 @@
 
 from collections.abc import Callable, Iterable
 from functools import reduce
-from typing import TYPE_CHECKING, TypeAlias, TypeVar, cast, overload
+from typing import TYPE_CHECKING, Any, TypeAlias, TypeVar, overload
 
 if TYPE_CHECKING:
     import torch
@@ -82,16 +82,13 @@ def json_map_leaves(
     func: Callable[[_T], _U],
-    value: "BatchedTensorInputs" | _JSONTree[_T],
+    value: Any,
 ) -> "BatchedTensorInputs" | _JSONTree[_U]:
     """Apply a function to each leaf in a nested JSON structure."""
     if isinstance(value, dict):
-        return {
-            k: json_map_leaves(func, v)  # type: ignore[arg-type]
-            for k, v in value.items()
-        }
+        return {k: json_map_leaves(func, v) for k, v in value.items()}  # type: ignore
     elif isinstance(value, list):
-        return [json_map_leaves(func, v) for v in value]
+        return [json_map_leaves(func, v) for v in value]  # type: ignore
     elif isinstance(value, tuple):
         return tuple(json_map_leaves(func, v) for v in value)
     else:
         return func(value)
@@ -140,9 +137,9 @@
 
 
 def json_reduce_leaves(
-    func: Callable[..., _T | _U],
+    func: Callable[[_T, _T], _T] | Callable[[_U, _T], _U],
     value: _JSONTree[_T],
-    initial: _U = cast(_U, ...),  # noqa: B008
+    initial: _U = ...,  # type: ignore[assignment]
     /,
 ) -> _T | _U:
     """
@@ -151,13 +148,9 @@ def json_reduce_leaves(
     sequence to a single value.
     """
     if initial is ...:
-        return reduce(func, json_iter_leaves(value))  # type: ignore[arg-type]
+        return reduce(func, json_iter_leaves(value))  # type: ignore
 
-    return reduce(
-        func,  # type: ignore[arg-type]
-        json_iter_leaves(value),
-        initial,
-    )
+    return reduce(func, json_iter_leaves(value), initial)  # type: ignore
 
 
 def json_count_leaves(value: JSONTree[_T]) -> int:
diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index c6a6757bed3b..e2517b935bf2 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -68,11 +68,11 @@ class MemorySnapshot:
     timestamp: float = 0.0
     auto_measure: bool = True
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         if self.auto_measure:
             self.measure()
 
-    def measure(self):
+    def measure(self) -> None:
         from vllm.platforms import current_platform
 
         # we measure the torch peak memory usage via allocated_bytes,
diff --git a/vllm/utils/nccl.py b/vllm/utils/nccl.py
index b1459fcbd246..4807bc076f82 100644
--- a/vllm/utils/nccl.py
+++ b/vllm/utils/nccl.py
@@ -3,7 +3,7 @@
 
 from __future__ import annotations
 
-import importlib
+import importlib.util
 import os
 
 import torch
@@ -47,8 +47,8 @@ def find_nccl_include_paths() -> list[str] | None:
     try:
         spec = importlib.util.find_spec("nvidia.nccl")
-        if spec and getattr(spec, "submodule_search_locations", None):
-            for loc in spec.submodule_search_locations:
+        if spec and (locs := getattr(spec, "submodule_search_locations", None)):
+            for loc in locs:
                 inc_dir = os.path.join(loc, "include")
                 if os.path.exists(os.path.join(inc_dir, "nccl.h")):
                     paths.append(inc_dir)
diff --git a/vllm/utils/network_utils.py b/vllm/utils/network_utils.py
index 0a68e48ba5e7..80ff0df28c66 100644
--- a/vllm/utils/network_utils.py
+++ b/vllm/utils/network_utils.py
@@ -72,7 +72,7 @@ def get_ip() -> str:
     return "0.0.0.0"
 
 
-def test_loopback_bind(address, family):
+def test_loopback_bind(address: str, family: int) -> bool:
     try:
         s = socket.socket(family, socket.SOCK_DGRAM)
         s.bind((address, 0))  # Port 0 = auto assign
diff --git a/vllm/utils/registry.py b/vllm/utils/registry.py
index ac9b859159ea..a136d450e7b1 100644
--- a/vllm/utils/registry.py
+++ b/vllm/utils/registry.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any
+from typing import Any, TypeVar
+
+_T = TypeVar("_T", bound=type)
 
 
 class ExtensionManager:
@@ -34,7 +36,7 @@ class ExtensionManager:
         Decorator to register a class with the given name.
         """
 
-        def wrap(cls_to_register):
+        def wrap(cls_to_register: _T) -> _T:
             self.name2class[name] = cls_to_register
             return cls_to_register
 
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index 3661dfd09047..f5c49ac169f0 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -13,7 +13,7 @@ import numpy.typing as npt
 import torch
 from packaging import version
 from packaging.version import Version
-from torch.library import Library
+from torch.library import Library, infer_schema
 
 import vllm.envs as envs
 
@@ -78,7 +78,6 @@ def guard_cuda_initialization():
         yield
         return
 
-    had_key = "CUDA_VISIBLE_DEVICES" in os.environ
     old_value = os.environ.get("CUDA_VISIBLE_DEVICES")
     os.environ["CUDA_VISIBLE_DEVICES"] = ""
     try:
@@ -90,10 +89,10 @@
         err_msg = str(e)
         raise RuntimeError(err_msg) from e
     finally:
-        if had_key:
-            os.environ["CUDA_VISIBLE_DEVICES"] = old_value
+        if old_value is None:
+            del os.environ["CUDA_VISIBLE_DEVICES"]
         else:
-            os.environ.pop("CUDA_VISIBLE_DEVICES")
+            os.environ["CUDA_VISIBLE_DEVICES"] = old_value
 
 
 def get_dtype_size(dtype: torch.dtype) -> int:
@@ -525,8 +524,7 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
 
 # Helper function used in testing.
 def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
-    torch_version = version.parse(torch_version)
-    return torch_version >= version.parse(target)
+    return version.parse(torch_version) >= version.parse(target)
 
 
 def is_torch_equal_or_newer(target: str) -> bool:
@@ -640,15 +638,8 @@ def direct_register_custom_op(
     dispatch_key = current_platform.dispatch_key
 
-    import torch.library
+    schema_str = infer_schema(op_func, mutates_args=mutates_args)
 
-    if hasattr(torch.library, "infer_schema"):
-        schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
-    else:
-        # for pytorch 2.4
-        import torch._custom_op.impl
-
-        schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
     my_lib = target_lib or vllm_lib
     my_lib.define(op_name + schema_str, tags=tags)
     my_lib.impl(op_name, op_func, dispatch_key=dispatch_key)
diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py
index 9883ab8fb996..7bd2c7415daf 100644
--- a/vllm/v1/pool/metadata.py
+++ b/vllm/v1/pool/metadata.py
@@ -67,16 +67,16 @@ def build_pooling_cursor(
     n_seq = len(num_scheduled_tokens)
     index = list(range(n_seq))
-    num_scheduled_tokens = torch.tensor(num_scheduled_tokens, device="cpu")
+    num_scheduled_tokens_cpu = torch.tensor(num_scheduled_tokens, device="cpu")
     cumsum = torch.zeros(
         n_seq + 1, dtype=torch.int64, pin_memory=pin_memory, device="cpu"
     )
-    torch.cumsum(num_scheduled_tokens, dim=0, out=cumsum[1:])
+    torch.cumsum(num_scheduled_tokens_cpu, dim=0, out=cumsum[1:])
     cumsum = cumsum.to(device, non_blocking=True)
     return PoolingCursor(
         index=index,
         first_token_indices_gpu=cumsum[:n_seq],
         last_token_indices_gpu=cumsum[1:] - 1,
         prompt_lens_cpu=prompt_lens,
-        num_scheduled_tokens_cpu=num_scheduled_tokens,
+        num_scheduled_tokens_cpu=num_scheduled_tokens_cpu,
     )