mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 15:14:53 +08:00)
[Docs] Fix warnings in mkdocs build (continued) (#23743)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>

This commit is contained in:
    parent dd58932280
    commit 4e4d017b6f
@@ -491,7 +491,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size * num_kv_heads * head_size]
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
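For orientation, a minimal sketch (not vLLM code) of how the flattened TorchSDPA-style cache shape documented above can be viewed; the key-before-value ordering on the first axis and the per-block token-major view are assumptions made for illustration:

    import torch

    num_blocks, block_size, num_kv_heads, head_size = 8, 16, 4, 64

    # First axis: assumed index 0 = keys, index 1 = values (illustrative only).
    kv_cache = torch.zeros(2, num_blocks, block_size * num_kv_heads * head_size)

    # Each block stores block_size token slots with num_kv_heads * head_size
    # features per token, so the flat last axis can be viewed back into 4-D.
    key_cache = kv_cache[0].view(num_blocks, block_size, num_kv_heads, head_size)
    value_cache = kv_cache[1].view(num_blocks, block_size, num_kv_heads, head_size)
    print(key_cache.shape)  # torch.Size([8, 16, 4, 64])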
@@ -438,7 +438,8 @@ class FlashAttentionImpl(AttentionImpl):
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
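A small sketch of the FlashAttention-backend cache layout described above, using made-up sizes; the unbind split and the flattened output reshape are illustrative, not vLLM's implementation:

    import torch

    num_blocks, block_size, num_kv_heads, head_size = 4, 16, 2, 64
    num_heads = 8  # query heads; with GQA, num_heads is a multiple of num_kv_heads

    kv_cache = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)
    # Split the leading axis into separate key and value caches.
    key_cache, value_cache = kv_cache.unbind(dim=0)

    # The documented return shape flattens heads back into the hidden dim.
    num_tokens = 3
    out = torch.zeros(num_tokens, num_heads, head_size)
    out = out.reshape(num_tokens, num_heads * head_size)
    print(key_cache.shape, out.shape)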
@@ -637,11 +637,9 @@ class FlashInferImpl(AttentionImpl):
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache: shape -
-            # NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
-            # HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
-
-
+            kv_cache: KV cache tensor with different possible shapes:
+                - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
+                - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
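The two FlashInfer layouts named above differ only in whether the token (block_size) or head axis comes first; a sketch with invented sizes, showing that a permute of those two axes converts one into the other:

    import torch

    num_blocks, block_size, num_kv_heads, head_size = 4, 16, 2, 64

    # NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
    kv_nhd = torch.zeros(num_blocks, 2, block_size, num_kv_heads, head_size)

    # HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
    kv_hnd = kv_nhd.permute(0, 1, 3, 2, 4).contiguous()
    print(kv_hnd.shape)  # torch.Size([4, 2, 2, 16, 64])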
@@ -689,7 +689,8 @@ class FlexAttentionImpl(AttentionImpl):
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
@@ -235,7 +235,8 @@ class PallasAttentionBackendImpl(AttentionImpl):
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
+            kv_cache: shape =
+                [num_blocks, block_size, num_kv_heads * 2, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
@@ -329,7 +330,7 @@ def write_to_kv_cache(
     Args:
         key: shape = [num_tokens, num_kv_heads, head_size]
         value: shape = [num_tokens, num_kv_heads, head_size]
-        kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
+        kv_cache: shape = [num_blocks, block_size, num_kv_heads * 2, head_size]
         num_slices_per_kv_cache_update_block: int
     """
     _, page_size, num_combined_kv_heads, head_size = kv_cache.shape
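Both Pallas hunks above document a cache whose head axis has size num_kv_heads * 2, i.e. key and value heads share one axis; a toy sketch with invented sizes, where the key-first split is an assumption rather than vLLM's actual convention:

    import torch

    num_blocks, block_size, num_kv_heads, head_size = 4, 16, 2, 64

    kv_cache = torch.zeros(num_blocks, block_size, num_kv_heads * 2, head_size)
    _, page_size, num_combined_kv_heads, _ = kv_cache.shape
    assert num_combined_kv_heads == num_kv_heads * 2

    # Assumed convention: first num_kv_heads entries are keys, the rest values.
    key_cache = kv_cache[:, :, :num_kv_heads, :]
    value_cache = kv_cache[:, :, num_kv_heads:, :]
    print(page_size, key_cache.shape)  # 16 torch.Size([4, 16, 2, 64])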
@@ -429,7 +429,8 @@ class AiterFlashAttentionImpl(AttentionImpl):
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
@@ -362,7 +362,8 @@ class TreeAttentionImpl(AttentionImpl):
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
@@ -285,7 +285,8 @@ class TritonAttentionImpl(AttentionImpl):
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
@@ -330,7 +330,8 @@ class XFormersAttentionImpl(AttentionImpl):
             query: shape = [num_tokens, num_heads, head_size]
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: shape =
+                [2, num_blocks, block_size, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         Returns:
             shape = [num_tokens, num_heads * head_size]
@@ -119,7 +119,8 @@ class KVCacheCoordinator(ABC):

         Args:
             request: The request.
-            num_tokens: The total number of tokens that need to be cached
+            num_computed_tokens: The total number of tokens
+                that need to be cached
                 (including tokens that are already cached).
         """
         for manager in self.single_type_managers:
@@ -54,14 +54,15 @@ class KVCacheBlocks:
     def get_block_ids(
         self,
         allow_none: bool = False,
-    ):
+    ) -> Optional[tuple[list[int], ...]]:
         """
         Converts the KVCacheBlocks instance to block_ids.

         Returns:
-            tuple[list[int], ...]: A tuple of lists where
-            * the outer tuple corresponds to KV cache groups
-            * each inner list contains the block_ids of the blocks in that group
+            tuple[list[int], ...]: A tuple of lists where:
+            - the outer tuple corresponds to KV cache groups
+            - each inner list contains the block_ids of the blocks in that
+              group
         """
         if allow_none and all(len(group) == 0 for group in self.blocks):
             return None
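A hypothetical caller sketch for the newly annotated get_block_ids(); the stand-in class below only mimics the documented return shape and None behaviour, not vLLM's real KVCacheBlocks:

    from typing import Optional


    class FakeKVCacheBlocks:
        """Stand-in that stores block ids directly, grouped per KV cache group."""

        def __init__(self, blocks: tuple[list[int], ...]) -> None:
            self.blocks = blocks

        def get_block_ids(
            self,
            allow_none: bool = False,
        ) -> Optional[tuple[list[int], ...]]:
            # Outer tuple = KV cache groups; inner lists = block ids per group.
            if allow_none and all(len(group) == 0 for group in self.blocks):
                return None
            return self.blocks


    print(FakeKVCacheBlocks(([1, 2, 3], [7])).get_block_ids())  # ([1, 2, 3], [7])
    print(FakeKVCacheBlocks(([], [])).get_block_ids(allow_none=True))  # None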
@@ -8,6 +8,7 @@ from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.executor.ray_distributed_executor import (  # noqa
     RayDistributedExecutor as RayDistributedExecutorV0)
 from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.outputs import ModelRunnerOutput
@@ -64,7 +65,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):

     def execute_model(
         self,
-        scheduler_output,
+        scheduler_output: SchedulerOutput,
     ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
         """Execute the model on the Ray workers.

@@ -36,7 +36,7 @@ def setup_multiprocess_prometheus():
         "and vLLM will properly handle cleanup.")


-def get_prometheus_registry():
+def get_prometheus_registry() -> CollectorRegistry:
     """Get the appropriate prometheus registry based on multiprocessing
     configuration.

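A hedged sketch (not vLLM's implementation) of what a CollectorRegistry-returning helper like the one annotated above typically does with prometheus_client: use a multiprocess collector when PROMETHEUS_MULTIPROC_DIR is set, otherwise the default registry:

    import os

    from prometheus_client import REGISTRY, CollectorRegistry, multiprocess


    def get_registry_sketch() -> CollectorRegistry:
        # Multiprocess mode: aggregate metrics written by worker processes.
        if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
            registry = CollectorRegistry()
            multiprocess.MultiProcessCollector(registry)
            return registry
        # Single-process mode: the default global registry.
        return REGISTRY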
@@ -91,7 +91,7 @@ class LogitsProcessor(ABC):
         to each forward pass.

         Args:
-            batch_update is non-None iff there have been
-            changes to the batch makeup.
+            batch_update: Non-None iff there have been changes
+                to the batch makeup.
         """
         raise NotImplementedError
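An invented illustration of the contract the corrected docstring states (the class and method names below are not vLLM's): the hook receives None when the batch is unchanged and a batch-update object only when its makeup changed:

    from typing import Any, Optional


    class NoopLogitsProcessorSketch:
        def handle_batch_update(self, batch_update: Optional[Any]) -> None:
            if batch_update is None:
                # Batch makeup unchanged since the last forward pass.
                return
            # React to added/removed/moved requests here.
            print("batch changed:", batch_update)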
@@ -68,7 +68,7 @@ class RejectionSampler(nn.Module):
             different requests are flattened into a single tensor because
             this is the shape of the output logits.
             NOTE: `target_logits` can be updated in place to save memory.
-        bonus_token_ids_tensor (torch.Tensor):
+        bonus_token_ids (torch.Tensor):
             A tensor containing bonus tokens. Shape is [batch_size, 1].
             Bonus tokens are added to the end of the sequence if all
             proposed tokens are accepted. We generate the bonus tokens
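A toy sketch (not vLLM's rejection sampler) of the bonus-token convention described above: one bonus token per request, shape [batch_size, 1], appended only when every proposed token is accepted; the acceptance mask here is made up:

    import torch

    proposed = torch.tensor([[11, 12, 13], [21, 22, 23]])               # draft tokens
    accepted = torch.tensor([[True, True, True], [True, False, False]])
    bonus_token_ids = torch.tensor([[99], [98]])                        # [batch_size, 1]

    for i in range(proposed.shape[0]):
        # Length of the accepted prefix for request i.
        n_ok = int(accepted[i].to(torch.int64).cumprod(dim=0).sum())
        out = proposed[i, :n_ok].tolist()
        if n_ok == proposed.shape[1]:          # all proposed tokens accepted
            out.append(int(bonus_token_ids[i, 0]))
        print(out)  # [11, 12, 13, 99] then [21]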
@@ -89,7 +89,7 @@ class Sampler(nn.Module):
         Gather logprobs for topk and sampled/prompt token.

         Args:
-            logits: (num tokens) x (vocab) tensor
+            logprobs: (num tokens) x (vocab) tensor
             num_logprobs: minimum number of logprobs to
                 retain per token
             token_ids: prompt tokens (if prompt logprobs)
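A minimal sketch of the gather the corrected docstring describes (not the Sampler's actual code): keep the top-k logprobs per token plus the logprob of each given token id:

    import torch

    num_tokens, vocab_size, num_logprobs = 3, 10, 2
    logprobs = torch.log_softmax(torch.randn(num_tokens, vocab_size), dim=-1)
    token_ids = torch.tensor([1, 4, 7])                    # sampled/prompt tokens

    topk_vals, topk_ids = logprobs.topk(num_logprobs, dim=-1)
    chosen = logprobs.gather(-1, token_ids.unsqueeze(-1))  # [num_tokens, 1]
    print(topk_vals.shape, chosen.shape)  # torch.Size([3, 2]) torch.Size([3, 1])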
@@ -525,9 +525,6 @@ class InputBatch:
         Any consecutive empty indices at the very end of the list are not
         filled.

-        Args:
-            empty_req_indices: empty indices which may be filled.
-
         Returns:
             swaps: list of (from,to) swap tuples for moved requests
             empty_req_indices: indices not filled by condensation
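A toy sketch of the condense idea this docstring refers to (not InputBatch itself): move requests from the tail of the batch into empty slots, record the (from, to) swaps, and leave trailing empty slots unfilled:

    occupied = ["req0", None, "req2", None, "req4"]   # None marks an empty index

    swaps: list[tuple[int, int]] = []
    last = len(occupied) - 1
    for empty_idx in [i for i, r in enumerate(occupied) if r is None]:
        while last > empty_idx and occupied[last] is None:
            last -= 1
        if last <= empty_idx:
            break  # only trailing empties remain; they are not filled
        occupied[empty_idx], occupied[last] = occupied[last], None
        swaps.append((last, empty_idx))

    print(occupied, swaps)  # ['req0', 'req4', 'req2', None, None] [(4, 1)]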
@@ -552,7 +552,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         return kv_cache_spec

     def _get_slot_mapping_metadata(self, num_reqs,
-                                   num_scheduled_tokens_per_req):
+                                   num_scheduled_tokens_per_req) -> np.ndarray:
         """
         Computes metadata for mapping slots to blocks in the key-value (KV)
         cache for a batch of requests.

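A hedged sketch of the slot-mapping idea behind the annotated method (the real method returns richer metadata): a token at position p of a request lands in slot block_table[p // block_size] * block_size + p % block_size; the block table and positions below are invented:

    import numpy as np

    block_size = 16
    block_table = np.array([3, 7, 1])     # blocks assigned to one request
    start_pos, num_scheduled = 30, 5      # tokens 30..34 are scheduled this step

    positions = np.arange(start_pos, start_pos + num_scheduled)
    slot_mapping = (block_table[positions // block_size] * block_size
                    + positions % block_size)
    print(slot_mapping)  # [126 127  16  17  18]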