diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py
index cce99d0c4f4c..71ee22878143 100644
--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -25,7 +25,7 @@ from vllm.distributed.parallel_state import (
     initialize_model_parallel,
 )
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import (
diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py
index 7688ba3d1b6c..6d0a0ed7d89d 100644
--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/test_fusion_all_reduce.py
@@ -31,7 +31,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     GroupShape,
 )
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 from ..utils import has_module_attribute, multi_gpu_test
 from .backend import TestBackend
diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py
index 31b6ddf3c698..e909cf7393ad 100644
--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -29,7 +29,7 @@ from vllm.distributed.parallel_state import (
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 from ..utils import multi_gpu_test
 from .backend import TestBackend
diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py
index 7ca3d3d27b56..7b45ae82c72d 100644
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -15,7 +15,7 @@ from vllm.distributed.parallel_state import (
     get_tp_group,
     init_distributed_environment,
 )
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 
 def distributed_run(fn, world_size):
diff --git a/tests/distributed/test_nccl_symm_mem_allreduce.py b/tests/distributed/test_nccl_symm_mem_allreduce.py
index 40dcf7567c92..eeb74bdf5357 100644
--- a/tests/distributed/test_nccl_symm_mem_allreduce.py
+++ b/tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -23,7 +23,7 @@ from vllm.distributed.parallel_state import (
     initialize_model_parallel,
 )
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 torch.manual_seed(42)
 random.seed(44)
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index 4bab709fb589..c3085beeb356 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -18,7 +18,7 @@ from vllm.distributed.parallel_state import (
     graph_capture,
     init_distributed_environment,
 )
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 
 def distributed_run(fn, world_size):
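For context on the helper being re-homed in all of these test imports: `update_environment_variables` applies a dict of values to `os.environ`, logging a warning whenever it overwrites an existing variable with a different value. A minimal usage sketch — the `MASTER_ADDR`/`MASTER_PORT` values are illustrative, not taken from these tests:

```python
from vllm.utils.system_utils import update_environment_variables

# Typical pattern in the distributed tests: point torch.distributed at a
# local rendezvous before spawning workers. Existing values that differ
# are overwritten, and the helper logs a warning when that happens.
update_environment_variables({
    "MASTER_ADDR": "localhost",  # illustrative values, not from this diff
    "MASTER_PORT": "29500",
})
```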
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
index eeb611ce54be..a7ace62e1b54 100644
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -10,8 +10,8 @@ import torch.distributed as dist
 
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import update_environment_variables
 from vllm.utils.network_utils import get_open_port
+from vllm.utils.system_utils import update_environment_variables
 
 
 def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
diff --git a/tests/distributed/test_symm_mem_allreduce.py b/tests/distributed/test_symm_mem_allreduce.py
index e669b81b04f0..b8f04cf8e62c 100644
--- a/tests/distributed/test_symm_mem_allreduce.py
+++ b/tests/distributed/test_symm_mem_allreduce.py
@@ -23,7 +23,7 @@ from vllm.distributed.parallel_state import (
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 torch.manual_seed(42)
 random.seed(44)
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 9ac637ee82b8..8289f697fea6 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -10,8 +10,8 @@ import torch
 import vllm.envs as envs
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import update_environment_variables
 from vllm.utils.network_utils import get_open_port
+from vllm.utils.system_utils import update_environment_variables
 from vllm.utils.torch_utils import cuda_device_count_stateless
 
 from ..utils import multi_gpu_test
diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py
index 25934c409744..6fca33acd48a 100644
--- a/tests/kernels/mamba/test_mamba_mixer2.py
+++ b/tests/kernels/mamba/test_mamba_mixer2.py
@@ -13,7 +13,7 @@ from vllm.distributed.parallel_state import (
 )
 from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 
 
 @multi_gpu_test(num_gpus=2)
diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py
index 5eb051381b13..82ba958a58c4 100644
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -19,8 +19,8 @@ from vllm.model_executor.models.vision import (
     run_dp_sharded_vision_model,
 )
 from vllm.platforms import current_platform
-from vllm.utils import update_environment_variables
 from vllm.utils.network_utils import get_open_port
+from vllm.utils.system_utils import update_environment_variables
 
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/utils_/test_system_utils.py b/tests/utils_/test_system_utils.py
new file mode 100644
index 000000000000..3d1b1fc4ce37
--- /dev/null
+++ b/tests/utils_/test_system_utils.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import tempfile
+from pathlib import Path
+
+from vllm.utils.system_utils import unique_filepath
+
+
+def test_unique_filepath():
+    temp_dir = tempfile.mkdtemp()
+    path_fn = lambda i: Path(temp_dir) / f"file_{i}.txt"
+    paths = set()
+    for i in range(10):
+        path = unique_filepath(path_fn)
+        path.write_text("test")
+        paths.add(path)
+    assert len(paths) == 10
+    assert len(list(Path(temp_dir).glob("*.txt"))) == 10
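The relocated test above exercises `unique_filepath` sequentially. As the function's docstring (reproduced later in this diff) notes, the probe-then-create pattern has a TOCTOU race, so concurrent callers should create the returned file atomically. A hedged sketch of that pattern — not code from this PR, and the dump directory is made up:

```python
from pathlib import Path
from vllm.utils.system_utils import unique_filepath

dump_dir = Path("/tmp/dumps")  # hypothetical location
dump_dir.mkdir(parents=True, exist_ok=True)
path = unique_filepath(lambda i: dump_dir / f"patterns.{i}.py")
# Open with 'x' so creation fails instead of clobbering if another
# process won the race between the exists() check and this open().
try:
    with path.open("x") as f:
        f.write("# dump contents\n")
except FileExistsError:
    pass  # retry with a fresh unique_filepath() if needed
```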
diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
index 9028c925b5ea..08dc7632b74b 100644
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -19,7 +19,6 @@ from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
 from vllm.utils import (
     FlexibleArgumentParser,
     bind_kv_cache,
-    unique_filepath,
 )
 
 from ..utils import create_new_process_for_each_test, flat_product
@@ -466,18 +465,6 @@ def test_load_config_file(tmp_path):
     os.remove(str(config_file_path))
 
 
-def test_unique_filepath():
-    temp_dir = tempfile.mkdtemp()
-    path_fn = lambda i: Path(temp_dir) / f"file_{i}.txt"
-    paths = set()
-    for i in range(10):
-        path = unique_filepath(path_fn)
-        path.write_text("test")
-        paths.add(path)
-    assert len(paths) == 10
-    assert len(list(Path(temp_dir).glob("*.txt"))) == 10
-
-
 def test_flat_product():
     # Check regular itertools.product behavior
     result1 = list(flat_product([1, 2, 3], ["a", "b"]))
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index e985578f05ec..c2c34ee95ad5 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -21,8 +21,8 @@ from vllm.distributed.parallel_state import (
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import update_environment_variables
 from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.system_utils import update_environment_variables
 from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs
 from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
 from vllm.v1.kv_cache_interface import (
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index a06956c7d6ea..3bc35a8f7198 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -8,7 +8,7 @@ from vllm import envs
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import set_env_var
+from vllm.utils.system_utils import set_env_var
 
 from .post_cleanup import PostCleanupPass
 from .vllm_inductor_pass import VllmInductorPass
diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py
index 8add14ebcc3c..08721e3ae4a2 100644
--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -114,7 +114,7 @@ class VllmPatternMatcherPass(VllmInductorPass):
 
             debug_dump_path.mkdir(parents=True, exist_ok=True)
 
-            from vllm.utils import unique_filepath
+            from vllm.utils.system_utils import unique_filepath
 
             file_path = unique_filepath(
                 lambda i: debug_dump_path / f"patterns.{self.pass_name}.{i}.py"
diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py
index 7ccc04cf55e0..ff2d7436b270 100644
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -22,7 +22,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
-from vllm.utils import update_environment_variables
+from vllm.utils.system_utils import update_environment_variables
 from vllm.utils.torch_utils import cuda_device_count_stateless
 
 logger = init_logger(__name__)
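`pass_manager.py` above now pulls `set_env_var` from the new module. It is a context manager that sets one environment variable for the duration of a block and restores the old value (or removes the variable entirely) on exit. A minimal sketch with a made-up variable name:

```python
import os

from vllm.utils.system_utils import set_env_var

os.environ.pop("EXAMPLE_FLAG", None)           # hypothetical variable
with set_env_var("EXAMPLE_FLAG", "1"):
    assert os.environ["EXAMPLE_FLAG"] == "1"   # visible inside the block
assert "EXAMPLE_FLAG" not in os.environ        # removed again on exit
```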
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index b96e0e7c860f..e4ba66024135 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -18,12 +18,9 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_se
 from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (
-    FlexibleArgumentParser,
-    decorate_logs,
-    set_process_title,
-)
+from vllm.utils import FlexibleArgumentParser
 from vllm.utils.network_utils import get_tcp_uri
+from vllm.utils.system_utils import decorate_logs, set_process_title
 from vllm.v1.engine.core import EngineCoreProc
 from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
 from vllm.v1.executor import Executor
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 9a66ab97437f..c455d5016623 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -108,13 +108,9 @@ from vllm.entrypoints.utils import (
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (
-    Device,
-    FlexibleArgumentParser,
-    decorate_logs,
-    set_ulimit,
-)
+from vllm.utils import Device, FlexibleArgumentParser, set_ulimit
 from vllm.utils.network_utils import is_valid_ipv6_address
+from vllm.utils.system_utils import decorate_logs
 from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
- """ - if suffix: - name = f"{name}_{suffix}" - setproctitle.setproctitle(f"{prefix}::{name}") + return importlib.util.find_spec(module_name) is not None -def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: - """Prepend each output line with process-specific prefix""" +def has_pplx() -> bool: + """Whether the optional `pplx_kernels` package is available.""" - prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " - file_write = file.write - - def write_with_prefix(s: str): - if not s: - return - if file.start_new_line: # type: ignore[attr-defined] - file_write(prefix) - idx = 0 - while (next_idx := s.find("\n", idx)) != -1: - next_idx += 1 - file_write(s[idx:next_idx]) - if next_idx == len(s): - file.start_new_line = True # type: ignore[attr-defined] - return - file_write(prefix) - idx = next_idx - file_write(s[idx:]) - file.start_new_line = False # type: ignore[attr-defined] - - file.start_new_line = True # type: ignore[attr-defined] - file.write = write_with_prefix # type: ignore[method-assign] + return _has_module("pplx_kernels") -def decorate_logs(process_name: str | None = None) -> None: - """ - Adds a process-specific prefix to each line of output written to stdout and - stderr. +def has_deep_ep() -> bool: + """Whether the optional `deep_ep` package is available.""" - This function is intended to be called before initializing the api_server, - engine_core, or worker classes, so that all subsequent output from the - process is prefixed with the process name and PID. This helps distinguish - log output from different processes in multi-process environments. + return _has_module("deep_ep") - Args: - process_name: Optional; the name of the process to use in the prefix. - If not provided, the current process name from the multiprocessing - context is used. - """ - if process_name is None: - process_name = get_mp_context().current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) + +def has_deep_gemm() -> bool: + """Whether the optional `deep_gemm` package is available.""" + + return _has_module("deep_gemm") + + +def has_triton_kernels() -> bool: + """Whether the optional `triton_kernels` package is available.""" + + return _has_module("triton_kernels") + + +def has_tilelang() -> bool: + """Whether the optional `tilelang` package is available.""" + + return _has_module("tilelang") def length_from_prompt_token_ids_or_embeds( @@ -1149,36 +1110,3 @@ def length_from_prompt_token_ids_or_embeds( f" prompt_embeds={prompt_embeds_len}" ) return prompt_token_len - - -@contextlib.contextmanager -def set_env_var(key, value): - old = os.environ.get(key) - os.environ[key] = value - try: - yield - finally: - if old is None: - del os.environ[key] - else: - os.environ[key] = old - - -def unique_filepath(fn: Callable[[int], Path]) -> Path: - """ - unique_filepath returns a unique path by trying - to include an integer in increasing order. - - fn should be a callable that returns a path that - includes the passed int at a fixed location. - - Note: This function has a TOCTOU race condition. - Caller should use atomic operations (e.g., open with 'x' mode) - when creating the file to ensure thread safety. 
- """ - i = 0 - while True: - p = fn(i) - if not p.exists(): - return p - i += 1 diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py new file mode 100644 index 000000000000..dd18adf55e1f --- /dev/null +++ b/vllm/utils/system_utils.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import contextlib +import os +import sys +from collections.abc import Callable, Iterator +from pathlib import Path +from typing import TextIO + +try: + import setproctitle +except ImportError: + setproctitle = None # type: ignore[assignment] + +import vllm.envs as envs +from vllm.logger import init_logger + +logger = init_logger(__name__) + +CYAN = "\033[1;36m" +RESET = "\033[0;0m" + + +# Environment variable utilities + + +def update_environment_variables(envs_dict: dict[str, str]): + """Update multiple environment variables with logging.""" + for k, v in envs_dict.items(): + if k in os.environ and os.environ[k] != v: + logger.warning( + "Overwriting environment variable %s from '%s' to '%s'", + k, + os.environ[k], + v, + ) + os.environ[k] = v + + +@contextlib.contextmanager +def set_env_var(key: str, value: str) -> Iterator[None]: + """Temporarily set an environment variable.""" + old = os.environ.get(key) + os.environ[key] = value + try: + yield + finally: + if old is None: + os.environ.pop(key, None) + else: + os.environ[key] = old + + +# File path utilities + + +def unique_filepath(fn: Callable[[int], Path]) -> Path: + """Generate a unique file path by trying incrementing integers. + + Note: This function has a TOCTOU race condition. + Caller should use atomic operations (e.g., open with 'x' mode) + when creating the file to ensure thread safety. 
+ """ + i = 0 + while True: + p = fn(i) + if not p.exists(): + return p + i += 1 + + +# Process management utilities + + +def set_process_title( + name: str, suffix: str = "", prefix: str = envs.VLLM_PROCESS_NAME_PREFIX +) -> None: + """Set the current process title with optional suffix.""" + if setproctitle is None: + return + if suffix: + name = f"{name}_{suffix}" + setproctitle.setproctitle(f"{prefix}::{name}") + + +def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: + """Add colored prefix to file output for log decoration.""" + prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " + file_write = file.write + + def write_with_prefix(s: str): + if not s: + return + if file.start_new_line: # type: ignore[attr-defined] + file_write(prefix) + idx = 0 + while (next_idx := s.find("\n", idx)) != -1: + next_idx += 1 + file_write(s[idx:next_idx]) + if next_idx == len(s): + file.start_new_line = True # type: ignore[attr-defined] + return + file_write(prefix) + idx = next_idx + file_write(s[idx:]) + file.start_new_line = False # type: ignore[attr-defined] + + file.start_new_line = True # type: ignore[attr-defined] + file.write = write_with_prefix # type: ignore[method-assign] + + +def decorate_logs(process_name: str | None = None) -> None: + """Decorate stdout/stderr with process name and PID prefix.""" + from vllm.utils import get_mp_context + + if process_name is None: + process_name = get_mp_context().current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index e946981e78e5..39d8655ff858 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -10,8 +10,9 @@ import zmq from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import get_mp_context, set_process_title +from vllm.utils import get_mp_context from vllm.utils.network_utils import make_zmq_socket +from vllm.utils.system_utils import set_process_title from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType from vllm.v1.serial_utils import MsgpackDecoder from vllm.v1.utils import get_engine_client_zmq_addr, shutdown diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 27cf2fbe8c35..f338ff01265d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -28,14 +28,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import engine_receiver_cache_from_config from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import maybe_register_config_serialize_by_value -from vllm.utils import ( - decorate_logs, - set_process_title, -) from vllm.utils.gc_utils import maybe_attach_gc_debug_callback from vllm.utils.hashing import get_hash_fn_by_name from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.network_utils import make_zmq_socket +from vllm.utils.system_utils import decorate_logs, set_process_title from vllm.v1.core.kv_cache_utils import ( BlockHash, generate_scheduler_kv_cache_config, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 8eb45d85fff1..1b4b9c4550f7 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -35,17 +35,13 @@ from vllm.distributed.parallel_state import ( ) from vllm.envs import enable_envs_cache from vllm.logger import init_logger -from vllm.utils import ( - _maybe_force_spawn, - 
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 8eb45d85fff1..1b4b9c4550f7 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -35,17 +35,13 @@ from vllm.distributed.parallel_state import (
 )
 from vllm.envs import enable_envs_cache
 from vllm.logger import init_logger
-from vllm.utils import (
-    _maybe_force_spawn,
-    decorate_logs,
-    get_mp_context,
-    set_process_title,
-)
+from vllm.utils import _maybe_force_spawn, get_mp_context
 from vllm.utils.network_utils import (
     get_distributed_init_method,
     get_loopback_ip,
     get_open_port,
 )
+from vllm.utils.system_utils import decorate_logs, set_process_title
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.executor.abstract import Executor, FailureCallback
 from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index 7032f3ef68b4..d912589ef73a 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -16,10 +16,10 @@ from vllm.multimodal.cache import worker_receiver_cache_from_config
 from vllm.utils import (
     enable_trace_function_call_for_thread,
     run_method,
-    update_environment_variables,
     warn_for_unimplemented_methods,
 )
 from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.utils.system_utils import update_environment_variables
 from vllm.v1.kv_cache_interface import KVCacheSpec
 
 if TYPE_CHECKING:
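Taken together, the call-site churn above is mechanical. As a summary (not part of the diff itself), the effective import map after this PR is:

```python
# Previously importable from vllm.utils, now re-homed in
# vllm.utils.system_utils:
from vllm.utils.system_utils import (
    decorate_logs,
    set_env_var,
    set_process_title,
    unique_filepath,
    update_environment_variables,
)
```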