Add comments on accessing kv_cache and attn_metadata (#13887)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor 2025-02-26 10:41:02 +00:00 committed by GitHub
parent 7b700ec8c8
commit 0ecdd98031
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -47,6 +47,10 @@ class Attention(nn.Module):
attn_type: str = AttentionType.DECODER,
**extra_impl_args,
) -> None:
"""
The KV cache is stored inside this class and is accessed via
`self.kv_cache`.
"""
super().__init__()
if per_layer_sliding_window is not None:
# per-layer sliding window
@@ -155,6 +159,15 @@ class Attention(nn.Module):
key: torch.Tensor,
value: torch.Tensor,
) -> torch.Tensor:
"""
The KV cache is stored inside this class and is accessed via
`self.kv_cache`.
Attention metadata (`attn_metadata`) is set using a context manager in
the model runner's `execute_model` method. It is accessed via the
forward context using
`vllm.forward_context.get_forward_context().attn_metadata`.
"""
if self.calculate_kv_scales:
attn_metadata = get_forward_context().attn_metadata
if attn_metadata.enable_kv_scales_calculation: