Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-03-25 05:29:12 +08:00)
[Docs] Fix warnings in mkdocs build (#23649)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent 9b0187003e
commit 730d0ac8b9
@@ -805,14 +805,18 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
         """Forward pass with FlashAttention.

         Args:
-            query: shape = [num_tokens, num_heads, head_size]
-            key: shape = [num_tokens, num_kv_heads, head_size]
-            value: shape = [num_tokens, num_kv_heads, head_size]
-            output: shape = [num_tokens, num_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            layer: Attention layer instance.
+            q: Query tensor with shape = [num_tokens, num_heads, head_size]
+            k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
+            v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size, num_kv_heads, head_size].
+                NOTE: kv_cache will be an empty tensor with shape [0]
+                for profiling run.
+            attn_metadata: Metadata for attention.
+            output: Output tensor with shape [num_tokens, num_heads, head_size]
+            output_scale: Optional output scale tensor.
+            output_block_scale: Optional output block scale tensor.
         NOTE: It in-place updates the output tensor.
         NOTE: FP8 quantization, flash-attn expect the size of
               {q,k,v}_descale to be (num_sequences, num_kv_heads).
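A quick shape sanity check for the arguments documented above; a standalone PyTorch sketch with illustrative sizes, not vLLM's actual call path (the key/value split of `kv_cache` along dim 0 is an assumption about the backend's layout):

```python
import torch

num_tokens, num_heads, num_kv_heads, head_size = 8, 32, 8, 128
num_blocks, block_size = 16, 16

q = torch.empty(num_tokens, num_heads, head_size)     # queries
k = torch.empty(num_tokens, num_kv_heads, head_size)  # keys
v = torch.empty(num_tokens, num_kv_heads, head_size)  # values
out = torch.empty(num_tokens, num_heads, head_size)   # updated in place

# Paged KV cache with the documented layout.
kv_cache = torch.empty(2, num_blocks, block_size, num_kv_heads, head_size)

# FP8 path: flash-attn expects per-(sequence, kv-head) descale factors.
num_sequences = 2
q_descale = torch.ones(num_sequences, num_kv_heads)
```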
@@ -605,7 +605,8 @@ class FlashAttentionImpl(AttentionImpl):
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
             output: shape = [num_tokens, num_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size, num_kv_heads, head_size].
+                NOTE: kv_cache will be an empty tensor with shape [0]
+                for profiling run.
             attn_metadata: Metadata for attention.
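The "empty tensor with shape [0] for profiling run" convention implies callers guard every cache access; a minimal sketch of that guard (illustrative, not vLLM's exact code):

```python
import torch

def cache_is_populated(kv_cache: torch.Tensor) -> bool:
    # During the memory-profiling run the cache is torch.empty(0),
    # so any cache read/write must be skipped.
    return kv_cache.numel() > 0

assert not cache_is_populated(torch.empty(0))               # profiling run
assert cache_is_populated(torch.empty(2, 16, 16, 8, 128))   # real run
```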
@@ -850,7 +851,7 @@ class FlashAttentionImpl(AttentionImpl):


 def _get_query_key_seq_metadata(
-    attn_metadata,
+    attn_metadata: FlashAttentionMetadata,
     is_prompt: bool,
     attn_type: str,
 ) -> tuple:
@@ -584,17 +584,18 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         use prefill sequence attributes

         Args:
+            layer: Attention layer instance.
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size * num_kv_heads * head_size].
+                NOTE: kv_cache will be an empty tensor with shape [0]
+                for profiling run.
             attn_metadata: Metadata for attention.
             attn_type: Select attention type, between encoder attention,
                        decoder self-attention, or encoder/decoder cross-
                        attention. Defaults to decoder self-attention,
                        which is the vLLM default generally
             output: Optional output tensor.
             output_scale: Optional output scale tensor.
             output_block_scale: Optional output block scale tensor.

         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
@@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(

     Raises:
         AssertionError: If the number of encoder tokens in `attn_metadata`
-        is `None` when required for the calculations.
+            is `None` when required for the calculations.
     """
     num_prefill_query_tokens = 0
     num_decode_query_tokens = 0
@@ -471,17 +471,18 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
                                    max_encoder_seq_len)

         Args:
+            layer: Attention layer instance.
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size * num_kv_heads * head_size].
+                NOTE: kv_cache will be an empty tensor with shape [0]
+                for profiling run.
             attn_metadata: Metadata for attention.
             attn_type: Select attention type, between encoder attention,
                        decoder self-attention, or encoder/decoder cross-
                        attention. Defaults to decoder self-attention,
                        which is the vLLM default generally
             output: Optional output tensor.
             output_scale: Optional output scale tensor.
             output_block_scale: Optional output block scale tensor.

         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
@@ -644,7 +645,6 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         for API spec.

         Args:
             output: shape = [num_prefill_tokens, num_heads, head_size]
             query: shape = [num_prefill_tokens, num_heads, head_size]
             key: shape = [num_prefill_tokens, num_kv_heads, head_size]
             value: shape = [num_prefill_tokens, num_kv_heads, head_size]
@@ -352,7 +352,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
         with num_lookahead_slots.

         Args:
-            sequence_group (SequenceGroup): The sequence group to swap in.
+            seq_group (SequenceGroup): The sequence group to swap in.
             num_lookahead_slots (int): Number of lookahead slots used in
                 speculative decoding, default to 0.

@@ -405,8 +405,6 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):

         Args:
             seq_group (SequenceGroup): The sequence group to swap out.
-            num_lookahead_slots (int): Number of lookahead slots used in
-                speculative decoding, default to 0.

         Returns:
             bool: Whether it's possible to swap out current sequence group.

@@ -420,7 +418,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
         swapping out the given sequence_group with num_lookahead_slots.

         Args:
-            sequence_group (SequenceGroup): The sequence group to swap out.
+            seq_group (SequenceGroup): The sequence group to swap out.

         Returns:
             List[Tuple[int, int]]: The mapping of swapping block from

@@ -473,7 +471,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
         on to the 'device'.

         Args:
-            sequence_group (SequenceGroup): The sequence group to swap in/out.
+            seq_group (SequenceGroup): The sequence group to swap in/out.
             device (Device): device to swap the 'seq_group' on.
             status (SequenceStatus): The status of sequence which is needed
                 for action. RUNNING for swap out and SWAPPED for swap in
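A hedged sketch of the swap call pattern these docstrings describe; the method names come from this diff, while the `AllocStatus` check is an assumption about the surrounding V0 scheduler code:

```python
from vllm.core.interfaces import AllocStatus

def try_swap_in(block_manager, seq_group, num_lookahead_slots: int = 0):
    # can_swap_in reports whether the target device can hold the group
    # plus any lookahead slots used by speculative decoding.
    if block_manager.can_swap_in(seq_group, num_lookahead_slots) != AllocStatus.OK:
        return None
    # swap_in returns the block-number mapping as (src, dst) tuples,
    # matching the List[Tuple[int, int]] return type documented above.
    return block_manager.swap_in(seq_group)
```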
@@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
     _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine

     def __init__(self,
-                 *args,
+                 *args: Any,
                  log_requests: bool = True,
                  start_engine_loop: bool = True,
-                 **kwargs) -> None:
+                 **kwargs: Any) -> None:
         if envs.VLLM_USE_V1:
             raise ValueError(
                 "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
@@ -644,10 +644,10 @@ class LLMEngine:
         Details:
             - Set arrival_time to the current time if it is None.
             - Set prompt_token_ids to the encoded prompt if it is None.
-            - Create `n` number of [Sequence][vllm.Sequence] objects.
-            - Create a [SequenceGroup][vllm.SequenceGroup] object
-              from the list of [Sequence][vllm.Sequence].
-            - Add the [SequenceGroup][vllm.SequenceGroup] object to the
+            - Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
+            - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
+              from the list of [Sequence][vllm.sequence.Sequence].
+            - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
               scheduler.

         Example:
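These replacements use mkdocstrings' cross-reference syntax, `[title][full.import.path]`, which only resolves when the bracketed path points at a documented object; a sketch of the pattern applied to a hypothetical docstring:

```python
def add_request() -> None:
    """Add a request to the engine.

    Details:
        - Create `n` [Sequence][vllm.sequence.Sequence] objects.

    A reference like `[Sequence][vllm.Sequence]` resolves only if the
    name is documented at that exact path; otherwise mkdocs logs the
    kind of warning this commit removes.
    """
```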
@@ -186,7 +186,7 @@ class LLM:
                                              CompilationConfig]] = None,
         logits_processors: Optional[list[Union[str,
                                                type[LogitsProcessor]]]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         """LLM constructor."""
@@ -697,8 +697,8 @@ class LLM:
         Generate responses for a chat conversation.

         The chat conversation is converted into a text prompt using the
-        tokenizer and calls the [generate][] method to generate the
-        responses.
+        tokenizer and calls the [generate][vllm.LLM.generate] method to generate
+        the responses.

         Multi-modal inputs can be passed in the same way you would pass them
         to the OpenAI API.
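For context, the documented flow in use (vLLM's public chat API; the model name is illustrative):

```python
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize paged attention in one line."},
]
# chat() renders the messages with the tokenizer's chat template,
# then delegates to generate() as the docstring describes.
outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```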
@@ -1334,8 +1334,8 @@ class LLM:

     def wake_up(self, tags: Optional[list[str]] = None):
         """
-        Wake up the engine from sleep mode. See the [sleep][] method
-        for more details.
+        Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
+        method for more details.

         Args:
             tags: An optional list of tags to reallocate the engine memory
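Usage of the sleep/wake-up pair being cross-referenced; a sketch assuming sleep mode is enabled and that "weights" and "kv_cache" are the supported tags, per vLLM's sleep-mode examples:

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)

llm.sleep(level=1)               # offload weights, drop KV cache
llm.wake_up(tags=["weights"])    # reallocate weights first...
llm.wake_up(tags=["kv_cache"])   # ...then the KV cache
```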
@@ -461,7 +461,8 @@ class MinimaxToolParser(ToolParser):
             i += 1
         return boundaries

-    def _extract_tool_args(self, tool_content: str, args_match) -> str:
+    def _extract_tool_args(self, tool_content: str,
+                           args_match: re.Match[str]) -> str:
         """
         Extract tool arguments from tool content.
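The new annotation in isolation: `re.Match` is subscriptable at runtime since Python 3.8, so `re.Match[str]` documents that the match's groups are `str` (a generic sketch; the regex is hypothetical, not the parser's real pattern):

```python
import re

def extract_tool_args(tool_content: str,
                      args_match: re.Match[str]) -> str:
    # group(1) is typed str because the match is parameterized on str.
    return args_match.group(1)

m = re.search(r'"arguments":\s*(\{.*\})', '{"arguments": {"x": 1}}')
if m is not None:
    print(extract_tool_args("...", m))
```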
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from typing import Optional
+
 import torch
 from einops import rearrange
@@ -453,7 +455,14 @@ class _attention(torch.autograd.Function):
 lightning_attention_ = _attention.apply


-def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
+def lightning_attention(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        ed: torch.Tensor,
+        block_size: int = 256,
+        kv_history: Optional[torch.Tensor] = None
+) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Apply lightning attention algorithm
     to compute attention efficiently.
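A call sketch matching the new signature; the import path, tensor layout, and meaning of `ed` are all assumptions (MiniMax-style lightning attention typically takes `(batch, heads, seq_len, head_dim)` tensors with per-head decay rates):

```python
import torch
from vllm.model_executor.layers.lightning_attn import lightning_attention

b, h, n, d = 1, 8, 128, 64
q = torch.randn(b, h, n, d, device="cuda", dtype=torch.float16)
k = torch.randn(b, h, n, d, device="cuda", dtype=torch.float16)
v = torch.randn(b, h, n, d, device="cuda", dtype=torch.float16)
ed = torch.rand(h, device="cuda")  # assumed: per-head decay rates

# Per the annotated return type: the attention output plus the
# updated KV history for the next chunk.
out, kv_history = lightning_attention(q, k, v, ed, block_size=256)
```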
@@ -233,10 +233,10 @@ class LinearBase(CustomOp):
     Args:
         input_size: input dimension of the linear layer.
         output_size: output dimension of the linear layer.
         bias: If true, add bias.
         skip_bias_add: If true, skip adding bias but instead return it.
         params_dtype: Data type for the parameters.
         quant_config: Quantization configure.
         prefix: Prefix for parameter names.
         return_bias: If true, return bias together with outputs in forward pass.
     """
@@ -378,13 +378,14 @@ class MergedReplicatedLinear(ReplicatedLinear):

     Args:
         input_size: input dimension of the linear layer.
-        output_size: output dimension of the linear layer.
+        output_sizes: list of output dimensions of the linear layer.
         bias: If true, add bias.
         skip_bias_add: If true, skip adding bias but instead return it.
         params_dtype: Data type for the parameters.
         quant_config: Quantization configure.
         prefix: The name of the layer in the state dict, including all parents
+            (e.g. model.layers.0.qkv_proj)
         return_bias: If true, return bias together with outputs in forward pass.
     """

     def __init__(
@@ -409,7 +409,7 @@ class EmbeddingOutput:

     Args:
         embedding: The embedding vector, which is a list of floats.
-        Its length depends on the hidden dimension of the model.
+            Its length depends on the hidden dimension of the model.
     """
     embedding: list[float]

@@ -447,7 +447,7 @@ class ClassificationOutput:

     Args:
         probs: The probability vector, which is a list of floats.
-        Its length depends on the number of classes.
+            Its length depends on the number of classes.
     """
     probs: list[float]
@@ -147,18 +147,7 @@ class SequenceDataDelta(

 class SequenceData(msgspec.Struct,
                    omit_defaults=True):  # type: ignore[call-arg]
-    """Data associated with a sequence.
-
-    Args:
-        prompt_token_ids: The token IDs of the prompt.
-        output_token_ids: The token IDs of the output. Set to an empty list if
-            None.
-
-    Attributes:
-        prompt_token_ids: The token IDs of the prompt.
-        output_token_ids: The token IDs of the output.
-        cumulative_logprob: The cumulative log probability of the output.
-    """
+    """Data associated with a sequence."""
     # NOTE: we cannot use Union[list, array] because msgspec cannot support
     # union of 2 list types.
     _prompt_token_ids: array

@@ -256,10 +245,12 @@ class SequenceData(msgspec.Struct,

     @property
     def cumulative_logprob(self) -> float:
+        """The cumulative log probability of the output."""
         return self._cumulative_logprob

     @property
     def prompt_token_ids(self) -> tuple[int, ...]:
+        """The token IDs of the prompt."""
         return self._prompt_token_ids_tuple

     @prompt_token_ids.setter

@@ -277,6 +268,7 @@ class SequenceData(msgspec.Struct,

     @property
     def output_token_ids(self) -> tuple[int, ...]:
+        """The token IDs of the output."""
         return tuple(self._output_token_ids)

     @output_token_ids.setter
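The pattern used here: rather than one class-level `Args:` block that mkdocstrings cannot match to `msgspec.Struct` fields, each property carries its own docstring, which the docs renderer picks up directly (generic sketch):

```python
class Sketch:
    def __init__(self) -> None:
        self._prompt_token_ids = (1, 2, 3)

    @property
    def prompt_token_ids(self) -> tuple[int, ...]:
        """The token IDs of the prompt."""
        return self._prompt_token_ids

print(Sketch.prompt_token_ids.__doc__)  # -> "The token IDs of the prompt."
```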
@@ -940,7 +932,7 @@ class SequenceGroupMetadata(
         omit_defaults=True):  # type: ignore[call-arg]
     """Metadata for a sequence group. Used to create `AttentionMetadata`.

-    Args:
+    Attributes:
         request_id: The ID of the request.
         is_prompt: Whether the request is at prompt stage.
         seq_data: The sequence data. (Seq id -> sequence data)

@@ -950,14 +942,14 @@ class SequenceGroupMetadata(
         do_sample: True if sampling is required. Sampling is not required when
             e.g., prefill is chunked, and the current iteration only computes
             query tokens for prefill, we don't need sampling.
         token_chunk_size: The number of tokens to be processed (per sequence).
             None if chunking is not required.
         pooling_params: Pooling parameters.
         lora_request: LoRA request.
         computed_block_nums: The block numbers that are already computed,
             used in prefix caching.
         state: Internal state tied to this sequence group.
         token_type_ids: Token type IDs.
         multi_modal_data: Multi modal data.
         mm_processor_kwargs: Multimodal input processor / mapper overrides.
         multi_modal_placeholders: Multi modal placeholders.
         encoder_seq_data: Optional sequence data for encoder prompt
             (SequenceGroup.encoder_seq). Should be None
             unless you are working with an encoder/decoder

@@ -1043,12 +1035,13 @@ class SequenceOutput(
         array_like=True):  # type: ignore[call-arg]
     """The model output associated with a sequence.

-    Args:
+    Attributes:
         parent_seq_id: The ID of the parent sequence (for forking in beam
             search).
         output_token: The output token ID.
         logprobs: The logprobs of the output token.
             (Token id -> logP(x_i+1 | x_0, ..., x_i))
+        output_embed: Optional output embedding tensor.
     """
     parent_seq_id: int
     output_token: int