From dad5f4d16d2feb8f52428af0aa7ec8db52a3ac63 Mon Sep 17 00:00:00 2001
From: Wenlong Wang
Date: Sat, 20 Sep 2025 04:45:18 -0700
Subject: [PATCH] [Docs] Fix warnings in mkdocs build (continued) (#25042)

Signed-off-by: wwl2755
Signed-off-by: yewentao256
---
 vllm/multimodal/__init__.py         |  2 +-
 vllm/utils/__init__.py              |  6 +++---
 vllm/v1/core/kv_cache_utils.py      |  4 ++--
 vllm/v1/sample/rejection_sampler.py | 10 +++++-----
 vllm/v1/worker/gpu_model_runner.py  |  2 +-
 vllm/v1/worker/utils.py             |  3 ++-
 vllm/worker/model_runner.py         | 12 ++++++++++--
 7 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index b7d4cd298e24f..7ffa732cf3708 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -15,7 +15,7 @@ is used by model runners to dispatch data processing according to the target
 model.
 
 Info:
-    [mm_processing](../../../design/mm_processing.html)
+    [mm_processing](../../../design/mm_processing.md)
 """
 
 __all__ = [
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index fd1c0af31269c..968bba664f0a9 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -3216,7 +3216,7 @@ def cprofile_context(save_file: Optional[str] = None):
 
     Args:
         save_file: path to save the profile result. "1" or
-        None will result in printing to stdout.
+            None will result in printing to stdout.
     """
 
     import cProfile
@@ -3273,7 +3273,7 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
             and getattr(cfg.attn_config, "alibi", False)))))
 
 
-def sha256(input) -> bytes:
+def sha256(input: Any) -> bytes:
     """Hash any picklable Python object using SHA-256.
 
     The input is serialized using pickle before hashing, which allows
@@ -3290,7 +3290,7 @@
     return hashlib.sha256(input_bytes).digest()
 
 
-def sha256_cbor(input) -> bytes:
+def sha256_cbor(input: Any) -> bytes:
     """
     Hash objects using CBOR serialization and SHA-256.
 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 3ccd00121f8ed..47a41322c423c 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1229,8 +1229,8 @@ def get_kv_cache_configs(vllm_config: VllmConfig,
     Args:
         vllm_config: The global VllmConfig
         kv_cache_specs: List of dict[layer_name, KVCacheSpec] for each worker.
-        available_memory: Memory available for KV cache in bytes for each
-        worker.
+        available_memory: Memory available for KV cache in bytes for each
+            worker.
 
     Returns:
         The generated KVCacheConfigs for each worker.
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 3d5e59addfcfa..ced5c7a970388 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -351,17 +351,17 @@
     without a seed.
 
     Args:
-        num_tokens : int
+        num_tokens: int
             Total number of tokens.
-        num_draft_tokens : List[List[int]]
+        num_draft_tokens: List[List[int]]
             Number of draft tokens per request.
-        generators : Optional[Dict[int, torch.Generator]]
+        generators: Optional[Dict[int, torch.Generator]]
             A dictionary mapping indices in the batch to
             `torch.Generator` objects.
-        device : torch.device
+        device: torch.device
             The device on which to allocate the tensor.
     Returns:
-        uniform_rand : torch.Tensor
+        uniform_rand: torch.Tensor
             A tensor of shape `(num_tokens, )` containing
             uniform random values in the range [0, 1).
     """
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 233df8f1b0e9b..d0946e8c5d7d8 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1479,7 +1479,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         Args:
             scheduler_output: The scheduler output containing scheduled encoder
-            inputs.
+                inputs.
 
         Returns:
             A tuple of (mm_kwargs, req_ids_pos) where:
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 021d18b2500f0..af922f9979d1a 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -205,7 +205,8 @@ def gather_mm_placeholders(
     """
     Reconstructs the embeddings from the placeholder tokens.
 
-    This is the operation of [scatter_mm_placeholders][].
+    This is the operation of [`scatter_mm_placeholders`]
+    [vllm.v1.worker.utils.scatter_mm_placeholders].
     """
     if is_embed is None:
         return placeholders
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index f662f5a85eff6..bab89586b0f2c 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1810,7 +1810,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
 
         return [output]
 
-    def need_recv_kv(self, model_input, kv_caches) -> bool:
+    def need_recv_kv(self, model_input: ModelInputForGPUWithSamplingMetadata,
+                     kv_caches: List[torch.Tensor]) -> bool:
         """Check if we need to receive kv-cache from the other worker.
         We need to receive KV when
             1. current vLLM instance is KV cache consumer/decode vLLM instance
@@ -1825,6 +1826,9 @@
         if self.vllm_config.kv_transfer_config is None:
             return False
 
+        if model_input.attn_metadata is None:
+            raise ValueError("model_input.attn_metadata cannot be None")
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling
@@ -1835,7 +1839,8 @@
         return self.vllm_config.kv_transfer_config.is_kv_consumer and (
             not is_profile_run) and is_prefill_run
 
-    def need_send_kv(self, model_input, kv_caches) -> bool:
+    def need_send_kv(self, model_input: ModelInputForGPUWithSamplingMetadata,
+                     kv_caches: List[torch.Tensor]) -> bool:
         """Check if we need to send kv-cache to the other worker.
         We need to send KV when
             1. current vLLM instance is KV cache producer/prefill vLLM instance
@@ -1850,6 +1855,9 @@
         if self.vllm_config.kv_transfer_config is None:
            return False
 
+        if model_input.attn_metadata is None:
+            raise ValueError("model_input.attn_metadata cannot be None")
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling