From dad5f4d16d2feb8f52428af0aa7ec8db52a3ac63 Mon Sep 17 00:00:00 2001
From: Wenlong Wang
Date: Sat, 20 Sep 2025 04:45:18 -0700
Subject: [PATCH] [Docs] Fix warnings in mkdocs build (continued) (#25042)

Signed-off-by: wwl2755
Signed-off-by: yewentao256
---
 vllm/multimodal/__init__.py         |  2 +-
 vllm/utils/__init__.py              |  6 +++---
 vllm/v1/core/kv_cache_utils.py      |  4 ++--
 vllm/v1/sample/rejection_sampler.py | 10 +++++-----
 vllm/v1/worker/gpu_model_runner.py  |  2 +-
 vllm/v1/worker/utils.py             |  3 ++-
 vllm/worker/model_runner.py         | 12 ++++++++++--
 7 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index b7d4cd298e24f..7ffa732cf3708 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -15,7 +15,7 @@ is used by model runners to dispatch data processing according to the target
 model.
 
 Info:
-    [mm_processing](../../../design/mm_processing.html)
+    [mm_processing](../../../design/mm_processing.md)
 """
 
 __all__ = [
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index fd1c0af31269c..968bba664f0a9 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -3216,7 +3216,7 @@ def cprofile_context(save_file: Optional[str] = None):
 
     Args:
         save_file: path to save the profile result. "1" or
-        None will result in printing to stdout.
+            None will result in printing to stdout.
     """
 
     import cProfile
@@ -3273,7 +3273,7 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
             and getattr(cfg.attn_config, "alibi", False)))))
 
 
-def sha256(input) -> bytes:
+def sha256(input: Any) -> bytes:
     """Hash any picklable Python object using SHA-256.
 
     The input is serialized using pickle before hashing, which allows
@@ -3290,7 +3290,7 @@
     return hashlib.sha256(input_bytes).digest()
 
 
-def sha256_cbor(input) -> bytes:
+def sha256_cbor(input: Any) -> bytes:
     """
     Hash objects using CBOR serialization and SHA-256.
 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 3ccd00121f8ed..47a41322c423c 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1229,8 +1229,8 @@ def get_kv_cache_configs(vllm_config: VllmConfig,
     Args:
         vllm_config: The global VllmConfig
         kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker.
-        available_memory: Memory available for KV cache in bytes for each
-        worker.
+        available_memory: Memory available for KV cache in bytes for each
+            worker.
 
     Returns:
         The generated KVCacheConfigs for each worker.
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 3d5e59addfcfa..ced5c7a970388 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -351,17 +351,17 @@
     without a seed.
 
     Args:
-        num_tokens : int
+        num_tokens: int
             Total number of tokens.
-        num_draft_tokens : List[List[int]]
+        num_draft_tokens: List[List[int]]
             Number of draft tokens per request.
-        generators : Optional[Dict[int, torch.Generator]]
+        generators: Optional[Dict[int, torch.Generator]]
             A dictionary mapping indices in the batch to
             `torch.Generator` objects.
-        device : torch.device
+        device: torch.device
             The device on which to allocate the tensor.
     Returns:
-        uniform_rand : torch.Tensor
+        uniform_rand: torch.Tensor
             A tensor of shape `(num_tokens, )` containing
             uniform random values in the range [0, 1).
     """
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 233df8f1b0e9b..d0946e8c5d7d8 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1479,7 +1479,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         Args:
             scheduler_output: The scheduler output containing scheduled encoder
-            inputs.
+                inputs.
 
         Returns:
             A tuple of (mm_kwargs, req_ids_pos) where:
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 021d18b2500f0..af922f9979d1a 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -205,7 +205,8 @@ def gather_mm_placeholders(
     """
     Reconstructs the embeddings from the placeholder tokens.
 
-    This is the operation of [scatter_mm_placeholders][].
+    This is the operation of [`scatter_mm_placeholders`]
+    [vllm.v1.worker.utils.scatter_mm_placeholders].
     """
     if is_embed is None:
         return placeholders
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index f662f5a85eff6..bab89586b0f2c 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1810,7 +1810,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
 
         return [output]
 
-    def need_recv_kv(self, model_input, kv_caches) -> bool:
+    def need_recv_kv(self, model_input: ModelInputForGPUWithSamplingMetadata,
+                     kv_caches: List[torch.Tensor]) -> bool:
         """Check if we need to receive kv-cache from the other worker.
         We need to receive KV when
             1. current vLLM instance is KV cache consumer/decode vLLM instance
@@ -1825,6 +1826,9 @@
         if self.vllm_config.kv_transfer_config is None:
             return False
 
+        if model_input.attn_metadata is None:
+            raise ValueError("model_input.attn_metadata cannot be None")
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling
@@ -1835,7 +1839,8 @@
         return self.vllm_config.kv_transfer_config.is_kv_consumer and (
             not is_profile_run) and is_prefill_run
 
-    def need_send_kv(self, model_input, kv_caches) -> bool:
+    def need_send_kv(self, model_input: ModelInputForGPUWithSamplingMetadata,
+                     kv_caches: List[torch.Tensor]) -> bool:
         """Check if we need to send kv-cache to the other worker.
         We need to send KV when
             1. current vLLM instance is KV cache producer/prefill vLLM instance
@@ -1850,6 +1855,9 @@
         if self.vllm_config.kv_transfer_config is None:
            return False
 
+        if model_input.attn_metadata is None:
+            raise ValueError("model_input.attn_metadata cannot be None")
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling