From 73cfb7a722e36c8429e7fdb5306be2d981c3f8dd Mon Sep 17 00:00:00 2001
From: Weida Hong
Date: Tue, 23 Dec 2025 18:08:58 +0800
Subject: [PATCH] Correct position of docstring of class attributes (#31209)

Signed-off-by: Weida Hong
---
 vllm/forward_context.py       |  2 +-
 vllm/v1/kv_cache_interface.py | 21 ++++++++++++---------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 033cc1f544b3b..7a569ec32eac9 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -186,6 +186,7 @@ class DPMetadata:
 class ForwardContext:
     # copy from vllm_config.compilation_config.static_forward_context
     no_compile_layers: dict[str, Any]
+    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     """
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
     attention layer to its attention metadata
@@ -193,7 +194,6 @@ class ForwardContext:
     for each microbatch.
     Set dynamically for each forward pass
     """
-    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 751862aa9c767..7370f0aefafb4 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -80,17 +80,20 @@ class AttentionSpec(KVCacheSpec):
 
 @dataclass(frozen=True)
 class FullAttentionSpec(AttentionSpec):
-    sliding_window: int | None = None
-    attention_chunk_size: int | None = None
     """
-    When hybrid allocator is disabled and the model contains both full 
-    attention layers and sliding window attention layers, sliding 
-    window attention are regarded as full attention in KV cache manager 
-    (blocks are allocated for all tokens), while computed as sliding window 
+    When hybrid allocator is disabled and the model contains both full
+    attention layers and sliding window attention layers, sliding
+    window attention are regarded as full attention in KV cache manager
+    (blocks are allocated for all tokens), while computed as sliding window
     attention in model runner.
     In this case, we use FullAttentionSpec and record the sliding window size.
+    """
+
+    sliding_window: int | None = None
+    """
     Default to None for not using sliding window attention.
     """
+    attention_chunk_size: int | None = None
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_model_len = vllm_config.model_config.max_model_len
@@ -390,10 +393,11 @@ class KVCacheConfig:
     The KV cache configuration of a model.
     """
 
-    """The number of KV cache blocks"""
     num_blocks: int
-    """How should model runner initialize the KV cache tensors for each layer"""
+    """The number of KV cache blocks"""
     kv_cache_tensors: list[KVCacheTensor]
+    """How should model runner initialize the KV cache tensors for each layer"""
+    kv_cache_groups: list[KVCacheGroupSpec]
     """
     The kv cache groups of the model.
     For models with only one type of attention, there is only one group that
@@ -401,4 +405,3 @@ class KVCacheConfig:
     For models with multiple types of attention, there will be multiple groups,
     see `_get_kv_cache_config_uniform_page_size` for more details.
     """
-    kv_cache_groups: list[KVCacheGroupSpec]
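
Below is a minimal, illustrative sketch of the convention this patch enforces (the `ExampleSpec` class is hypothetical, not vLLM code): the class docstring comes first, and each attribute's docstring is the bare string literal placed immediately after that attribute, which is where tools such as Sphinx autodoc and many IDEs look for it; a string literal placed before an attribute, as in the old layout, is just a discarded expression and documents nothing.

    # Sketch only: hypothetical ExampleSpec mirroring the corrected layout of
    # FullAttentionSpec and KVCacheConfig in the patch above.
    from dataclasses import dataclass


    @dataclass(frozen=True)
    class ExampleSpec:
        """Class docstring: describes the spec as a whole."""

        sliding_window: int | None = None
        """Attribute docstring: associated with `sliding_window` by doc tools
        because it immediately follows the attribute it documents."""
        attention_chunk_size: int | None = None


    # Only the class docstring is stored at runtime; attribute docstrings are a
    # source-level convention consumed by documentation tooling.
    print(ExampleSpec.__doc__)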