Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-29 07:07:26 +08:00)
Correct position of docstring of class attributes (#31209)
Signed-off-by: Weida Hong <wdhongtw@google.com>
parent f32cfd7d97
commit 73cfb7a722
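For reference, the convention this commit enforces is the attribute-docstring placement from PEP 257 that Sphinx autodoc and most IDEs recognize: a string literal documents a class attribute only when it sits on the line after the assignment; placed before the assignment it is just a stray expression. A minimal sketch of the two patterns (the Example class and its fields are made up for illustration, not vLLM code):

from dataclasses import dataclass


@dataclass
class Example:
    """Class docstring: documents the class itself."""

    """Wrong spot: this string comes before the assignment below, so it is a
    stray expression that documentation tools do not attach to num_blocks."""
    num_blocks: int = 0

    kv_cache_tensors: int = 0
    """Right spot: a string on the line after the assignment is treated as the
    docstring of kv_cache_tensors by Sphinx autodoc and most IDEs."""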
@@ -186,6 +186,7 @@ class DPMetadata:
 class ForwardContext:
     # copy from vllm_config.compilation_config.static_forward_context
     no_compile_layers: dict[str, Any]
+    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     """
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
     attention layer to its attention metadata
@@ -193,7 +194,6 @@ class ForwardContext:
     for each microbatch.
     Set dynamically for each forward pass
     """
-    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
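The attn_metadata field moved above is typed as either a single mapping from layer name to AttentionMetadata or a list of such mappings, one per microbatch. A small hypothetical helper (get_layer_metadata is not part of vLLM) showing how a consumer might handle both shapes:

from typing import Any


def get_layer_metadata(
    attn_metadata: dict[str, Any] | list[dict[str, Any]],
    layer_name: str,
    ubatch_idx: int = 0,
) -> Any:
    # List form: one dict per microbatch; pick the requested microbatch
    # first, then look up the layer.
    if isinstance(attn_metadata, list):
        return attn_metadata[ubatch_idx][layer_name]
    # Dict form: a single mapping from layer_name to its attention metadata.
    return attn_metadata[layer_name]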
@@ -80,17 +80,20 @@ class AttentionSpec(KVCacheSpec):
 
 @dataclass(frozen=True)
 class FullAttentionSpec(AttentionSpec):
-    sliding_window: int | None = None
-    attention_chunk_size: int | None = None
     """
     When hybrid allocator is disabled and the model contains both full
     attention layers and sliding window attention layers, sliding
     window attention are regarded as full attention in KV cache manager
     (blocks are allocated for all tokens), while computed as sliding window
     attention in model runner.
     In this case, we use FullAttentionSpec and record the sliding window size.
-    Default to None for not using sliding window attention.
     """
 
+    sliding_window: int | None = None
+    """
+    Default to None for not using sliding window attention.
+    """
+    attention_chunk_size: int | None = None
+
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_model_len = vllm_config.model_config.max_model_len
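The FullAttentionSpec docstring above describes the fallback when the hybrid allocator is disabled: the KV cache manager allocates blocks for all tokens as if the layer were full attention, while the model runner still computes sliding-window attention. A rough, self-contained illustration of that split (the helper names are made up, not vLLM code):

def allocated_kv_tokens(num_tokens: int) -> int:
    # KV cache manager view under the fallback: blocks cover every token,
    # exactly as for a full-attention layer.
    return num_tokens


def attended_kv_tokens(num_tokens: int, sliding_window: int | None) -> int:
    # Model runner view: only the trailing window of tokens is attended;
    # None means the layer is plain full attention.
    if sliding_window is None:
        return num_tokens
    return min(num_tokens, sliding_window)


assert allocated_kv_tokens(4096) == 4096
assert attended_kv_tokens(4096, sliding_window=1024) == 1024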
@@ -390,10 +393,11 @@ class KVCacheConfig:
     The KV cache configuration of a model.
     """
 
-    """The number of KV cache blocks"""
     num_blocks: int
-    """How should model runner initialize the KV cache tensors for each layer"""
+    """The number of KV cache blocks"""
     kv_cache_tensors: list[KVCacheTensor]
+    """How should model runner initialize the KV cache tensors for each layer"""
+    kv_cache_groups: list[KVCacheGroupSpec]
     """
     The kv cache groups of the model.
     For models with only one type of attention, there is only one group that
@@ -401,4 +405,3 @@ class KVCacheConfig:
     For models with multiple types of attention, there will be multiple groups,
     see `_get_kv_cache_config_uniform_page_size` for more details.
     """
-    kv_cache_groups: list[KVCacheGroupSpec]
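To make the kv_cache_groups layout concrete: a model with a single attention type ends up with one group holding all layers, while a model mixing attention types gets one group per type (the real grouping is done by _get_kv_cache_config_uniform_page_size, referenced in the docstring above). The Mock* dataclasses below are a self-contained stand-in, not the actual vLLM classes:

from dataclasses import dataclass, field


@dataclass
class MockKVCacheGroupSpec:
    layer_names: list[str] = field(default_factory=list)
    """Names of the layers that share this group's KV cache layout."""


@dataclass
class MockKVCacheConfig:
    num_blocks: int
    """The number of KV cache blocks."""
    kv_cache_groups: list[MockKVCacheGroupSpec] = field(default_factory=list)
    """One group per attention type; a single group if the model is uniform."""


# Uniform model: every layer is full attention, so one group covers them all.
uniform = MockKVCacheConfig(
    num_blocks=1024,
    kv_cache_groups=[MockKVCacheGroupSpec(["layer.0", "layer.1", "layer.2"])],
)

# Hybrid model: full-attention and sliding-window layers go to separate groups.
hybrid = MockKVCacheConfig(
    num_blocks=1024,
    kv_cache_groups=[
        MockKVCacheGroupSpec(["layer.0", "layer.2"]),  # full attention
        MockKVCacheGroupSpec(["layer.1"]),             # sliding window
    ],
)

assert len(uniform.kv_cache_groups) == 1
assert len(hybrid.kv_cache_groups) == 2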