diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 24f2a6372b451..c45c83a0707fd 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -47,6 +47,10 @@ class Attention(nn.Module):
         attn_type: str = AttentionType.DECODER,
         **extra_impl_args,
     ) -> None:
+        """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+        """
         super().__init__()
         if per_layer_sliding_window is not None:
             # per-layer sliding window
@@ -155,6 +159,15 @@ class Attention(nn.Module):
         key: torch.Tensor,
         value: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        The KV cache is stored inside this class and is accessed via
+        `self.kv_cache`.
+
+        Attention metadata (`attn_metadata`) is set using a context manager in
+        the model runner's `execute_model` method. It is accessed via forward
+        context using
+        `vllm.forward_context.get_forward_context().attn_metadata`.
+        """
         if self.calculate_kv_scales:
             attn_metadata = get_forward_context().attn_metadata
             if attn_metadata.enable_kv_scales_calculation:
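
The new docstring documents the access pattern that the `forward` body already uses: attention metadata is installed per step by the model runner's context manager and read back through the forward context. Below is a minimal sketch (not part of the patch) of how code inside that region can read it; the helper name `kv_scales_enabled` is made up for illustration, and the `None` check assumes `attn_metadata` may be absent for some steps (e.g. when no metadata was installed).

```python
from vllm.forward_context import get_forward_context


def kv_scales_enabled() -> bool:
    """Illustrative helper: report whether the current step's attention
    metadata requests KV-scale calculation.

    Must be called inside the model runner's forward-context region,
    i.e. while `execute_model` has the context manager active;
    `get_forward_context()` is only valid there.
    """
    attn_metadata = get_forward_context().attn_metadata
    # attn_metadata may be None if no metadata was set for this step,
    # so guard before touching its fields.
    return (attn_metadata is not None
            and attn_metadata.enable_kv_scales_calculation)
```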