[Docs] Fix warnings in mkdocs build (#23649)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Hyogeun Oh (오효근) 2025-08-27 03:19:23 +09:00 committed by GitHub
parent 9b0187003e
commit 730d0ac8b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 66 additions and 58 deletions

View File

@@ -805,14 +805,18 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
"""Forward pass with FlashAttention.
Args:
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
layer: Attention layer instance.
q: Query tensor with shape = [num_tokens, num_heads, head_size]
k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
output: Output tensor with shape [num_tokens, num_heads, head_size]
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
NOTE: It in-place updates the output tensor.
NOTE: FP8 quantization, flash-attn expect the size of
{q,k,v}_descale to be (num_sequences, num_kv_heads).

View File

@@ -605,7 +605,8 @@ class FlashAttentionImpl(AttentionImpl):
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
output: shape = [num_tokens, num_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size, num_kv_heads, head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
@@ -850,7 +851,7 @@ class FlashAttentionImpl(AttentionImpl):
def _get_query_key_seq_metadata(
attn_metadata,
attn_metadata: FlashAttentionMetadata,
is_prompt: bool,
attn_type: str,
) -> tuple:

View File

@@ -584,17 +584,18 @@ class ROCmFlashAttentionImpl(AttentionImpl):
use prefill sequence attributes
Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention,
decoder self-attention, or encoder/decoder cross-
attention. Defaults to decoder self-attention,
which is the vLLM default generally
output: Optional output tensor.
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
Returns:
shape = [num_tokens, num_heads * head_size]
"""

View File

@@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(
Raises:
AssertionError: If the number of encoder tokens in `attn_metadata`
is `None` when required for the calculations.
is `None` when required for the calculations.
"""
num_prefill_query_tokens = 0
num_decode_query_tokens = 0

View File

@@ -471,17 +471,18 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
max_encoder_seq_len)
Args:
layer: Attention layer instance.
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: KV cache tensor with shape
[2, num_blocks, block_size * num_kv_heads * head_size].
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
attn_type: Select attention type, between encoder attention,
decoder self-attention, or encoder/decoder cross-
attention. Defaults to decoder self-attention,
which is the vLLM default generally
output: Optional output tensor.
output_scale: Optional output scale tensor.
output_block_scale: Optional output block scale tensor.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
@@ -644,7 +645,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
for API spec.
Args:
output: shape = [num_prefill_tokens, num_heads, head_size]
query: shape = [num_prefill_tokens, num_heads, head_size]
key: shape = [num_prefill_tokens, num_kv_heads, head_size]
value: shape = [num_prefill_tokens, num_kv_heads, head_size]

View File

@@ -352,7 +352,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
with num_lookahead_slots.
Args:
sequence_group (SequenceGroup): The sequence group to swap in.
seq_group (SequenceGroup): The sequence group to swap in.
num_lookahead_slots (int): Number of lookahead slots used in
speculative decoding, default to 0.
@@ -405,8 +405,6 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
Args:
seq_group (SequenceGroup): The sequence group to swap out.
num_lookahead_slots (int): Number of lookahead slots used in
speculative decoding, default to 0.
Returns:
bool: Whether it's possible to swap out current sequence group.
@@ -420,7 +418,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
swapping out the given sequence_group with num_lookahead_slots.
Args:
sequence_group (SequenceGroup): The sequence group to swap out.
seq_group (SequenceGroup): The sequence group to swap out.
Returns:
List[Tuple[int, int]]: The mapping of swapping block from
@@ -473,7 +471,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
on to the 'device'.
Args:
sequence_group (SequenceGroup): The sequence group to swap in/out.
seq_group (SequenceGroup): The sequence group to swap in/out.
device (Device): device to swap the 'seq_group' on.
status (SequenceStatus): The status of sequence which is needed
for action. RUNNING for swap out and SWAPPED for swap in

View File

@@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
def __init__(self,
*args,
*args: Any,
log_requests: bool = True,
start_engine_loop: bool = True,
**kwargs) -> None:
**kwargs: Any) -> None:
if envs.VLLM_USE_V1:
raise ValueError(
"Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "

View File

@@ -644,10 +644,10 @@ class LLMEngine:
Details:
- Set arrival_time to the current time if it is None.
- Set prompt_token_ids to the encoded prompt if it is None.
- Create `n` number of [Sequence][vllm.Sequence] objects.
- Create a [SequenceGroup][vllm.SequenceGroup] object
from the list of [Sequence][vllm.Sequence].
- Add the [SequenceGroup][vllm.SequenceGroup] object to the
- Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
- Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
from the list of [Sequence][vllm.sequence.Sequence].
- Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
scheduler.
Example:

View File

@@ -186,7 +186,7 @@ class LLM:
CompilationConfig]] = None,
logits_processors: Optional[list[Union[str,
type[LogitsProcessor]]]] = None,
**kwargs,
**kwargs: Any,
) -> None:
"""LLM constructor."""
@@ -697,8 +697,8 @@ class LLM:
Generate responses for a chat conversation.
The chat conversation is converted into a text prompt using the
tokenizer and calls the [generate][] method to generate the
responses.
tokenizer and calls the [generate][vllm.LLM.generate] method to generate
the responses.
Multi-modal inputs can be passed in the same way you would pass them
to the OpenAI API.
@@ -1334,8 +1334,8 @@ class LLM:
def wake_up(self, tags: Optional[list[str]] = None):
"""
Wake up the engine from sleep mode. See the [sleep][] method
for more details.
Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
method for more details.
Args:
tags: An optional list of tags to reallocate the engine memory

View File

@@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser):
i += 1
return boundaries
def _extract_tool_args(self, tool_content: str, args_match) -> str:
def _extract_tool_args(self, tool_content: str,
args_match: re.Match[str]) -> str:
"""
Extract tool arguments from tool content.

View File

@@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
import torch
from einops import rearrange
@@ -453,7 +455,14 @@ class _attention(torch.autograd.Function):
lightning_attention_ = _attention.apply
def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
def lightning_attention(
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
ed: torch.Tensor,
block_size: int = 256,
kv_history: Optional[torch.Tensor] = None
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Apply lightning attention algorithm
to compute attention efficiently.

View File

@@ -233,10 +233,10 @@ class LinearBase(CustomOp):
Args:
input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
prefix: Prefix for parameter names.
return_bias: If true, return bias together with outputs in forward pass.
"""
@@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear):
Args:
input_size: input dimension of the linear layer.
output_size: output dimension of the linear layer.
output_sizes: list of output dimensions of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
"""
def __init__(

View File

@@ -409,7 +409,7 @@ class EmbeddingOutput:
Args:
embedding: The embedding vector, which is a list of floats.
Its length depends on the hidden dimension of the model.
Its length depends on the hidden dimension of the model.
"""
embedding: list[float]
@@ -447,7 +447,7 @@ class ClassificationOutput:
Args:
probs: The probability vector, which is a list of floats.
Its length depends on the number of classes.
Its length depends on the number of classes.
"""
probs: list[float]

View File

@@ -147,18 +147,7 @@ class SequenceDataDelta(
class SequenceData(msgspec.Struct,
omit_defaults=True): # type: ignore[call-arg]
"""Data associated with a sequence.
Args:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output. Set to an empty list if
None.
Attributes:
prompt_token_ids: The token IDs of the prompt.
output_token_ids: The token IDs of the output.
cumulative_logprob: The cumulative log probability of the output.
"""
"""Data associated with a sequence."""
# NOTE: we cannot use Union[list, array] because msgspec cannot support
# union of 2 list types.
_prompt_token_ids: array
@@ -256,10 +245,12 @@ class SequenceData(msgspec.Struct,
@property
def cumulative_logprob(self) -> float:
"""The cumulative log probability of the output."""
return self._cumulative_logprob
@property
def prompt_token_ids(self) -> tuple[int, ...]:
"""The token IDs of the prompt."""
return self._prompt_token_ids_tuple
@prompt_token_ids.setter
@@ -277,6 +268,7 @@ class SequenceData(msgspec.Struct,
@property
def output_token_ids(self) -> tuple[int, ...]:
"""The token IDs of the output."""
return tuple(self._output_token_ids)
@output_token_ids.setter
@@ -940,7 +932,7 @@ class SequenceGroupMetadata(
omit_defaults=True): # type: ignore[call-arg]
"""Metadata for a sequence group. Used to create `AttentionMetadata`.
Args:
Attributes:
request_id: The ID of the request.
is_prompt: Whether the request is at prompt stage.
seq_data: The sequence data. (Seq id -> sequence data)
@@ -950,14 +942,14 @@ class SequenceGroupMetadata(
do_sample: True if sampling is required. Sampling is not required when
e.g., prefill is chunked, and the current iteration only computes
query tokens for prefill, we don't need sampling.
token_chunk_size: The number of tokens to be processed (per sequence).
None if chunking is not required.
pooling_params: Pooling parameters.
lora_request: LoRA request.
computed_block_nums: The block numbers that are already computed,
used in prefix caching.
state: Internal state tied to this sequence group.
token_type_ids: Token type IDs.
multi_modal_data: Multi modal data.
mm_processor_kwargs: Multimodal input processor / mapper overrides.
multi_modal_placeholders: Multi modal placeholders.
encoder_seq_data: Optional sequence data for encoder prompt
(SequenceGroup.encoder_seq). Should be None
unless you are working with an encoder/decoder
@@ -1043,12 +1035,13 @@ class SequenceOutput(
array_like=True): # type: ignore[call-arg]
"""The model output associated with a sequence.
Args:
Attributes:
parent_seq_id: The ID of the parent sequence (for forking in beam
search).
output_token: The output token ID.
logprobs: The logprobs of the output token.
(Token id -> logP(x_i+1 | x_0, ..., x_i))
output_embed: Optional output embedding tensor.
"""
parent_seq_id: int
output_token: int