From 73cfb7a722e36c8429e7fdb5306be2d981c3f8dd Mon Sep 17 00:00:00 2001
From: Weida Hong
Date: Tue, 23 Dec 2025 18:08:58 +0800
Subject: [PATCH] Correct position of docstring of class attributes (#31209)

Signed-off-by: Weida Hong
---
 vllm/forward_context.py       |  2 +-
 vllm/v1/kv_cache_interface.py | 21 ++++++++++++---------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 033cc1f544b3b..7a569ec32eac9 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -186,6 +186,7 @@ class DPMetadata:
 class ForwardContext:
     # copy from vllm_config.compilation_config.static_forward_context
     no_compile_layers: dict[str, Any]
+    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     """
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
     attention layer to its attention metadata
@@ -193,7 +194,6 @@ class ForwardContext:
     for each microbatch.
     Set dynamically for each forward pass
     """
-    attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 751862aa9c767..7370f0aefafb4 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -80,17 +80,20 @@ class AttentionSpec(KVCacheSpec):
 
 @dataclass(frozen=True)
 class FullAttentionSpec(AttentionSpec):
-    sliding_window: int | None = None
-    attention_chunk_size: int | None = None
     """
-    When hybrid allocator is disabled and the model contains both full 
-    attention layers and sliding window attention layers, sliding 
-    window attention are regarded as full attention in KV cache manager 
-    (blocks are allocated for all tokens), while computed as sliding window 
+    When hybrid allocator is disabled and the model contains both full
+    attention layers and sliding window attention layers, sliding
+    window attention are regarded as full attention in KV cache manager
+    (blocks are allocated for all tokens), while computed as sliding window
     attention in model runner.
     In this case, we use FullAttentionSpec and record the sliding window size.
+    """
+
+    sliding_window: int | None = None
+    """
     Default to None for not using sliding window attention.
     """
+    attention_chunk_size: int | None = None
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_model_len = vllm_config.model_config.max_model_len
@@ -390,10 +393,11 @@ class KVCacheConfig:
     The KV cache configuration of a model.
     """
 
-    """The number of KV cache blocks"""
     num_blocks: int
-    """How should model runner initialize the KV cache tensors for each layer"""
+    """The number of KV cache blocks"""
     kv_cache_tensors: list[KVCacheTensor]
+    """How should model runner initialize the KV cache tensors for each layer"""
+    kv_cache_groups: list[KVCacheGroupSpec]
     """
     The kv cache groups of the model.
     For models with only one type of attention, there is only one group that
@@ -401,4 +405,3 @@ class KVCacheConfig:
     For models with multiple types of attention, there will be multiple groups,
     see `_get_kv_cache_config_uniform_page_size` for more details.
     """
-    kv_cache_groups: list[KVCacheGroupSpec]
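
Below is a minimal, illustrative sketch of the convention this patch enforces (the `ExampleSpec` class is hypothetical, not vLLM code): the class docstring comes first, and each attribute's docstring is the bare string literal placed immediately after that attribute, which is where tools such as Sphinx autodoc and many IDEs look for it; a string literal placed before an attribute, as in the old layout, is just a discarded expression and documents nothing.

    # Sketch only: hypothetical ExampleSpec mirroring the corrected layout of
    # FullAttentionSpec and KVCacheConfig in the patch above.
    from dataclasses import dataclass


    @dataclass(frozen=True)
    class ExampleSpec:
        """Class docstring: describes the spec as a whole."""

        sliding_window: int | None = None
        """Attribute docstring: associated with `sliding_window` by doc tools
        because it immediately follows the attribute it documents."""
        attention_chunk_size: int | None = None


    # Only the class docstring is stored at runtime; attribute docstrings are a
    # source-level convention consumed by documentation tooling.
    print(ExampleSpec.__doc__)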