[BugFix] ChunkedLocalAttention is currently not CG compatible (#26034)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
This commit is contained in:
Lucas Wilkinson 2025-10-01 19:28:00 -04:00 committed by simon-mo
parent ebce361c07
commit c536881a7c

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
-from typing import List, Optional
+from typing import ClassVar, List, Optional
import torch
@@ -11,8 +11,8 @@ from vllm.attention.backends.abstract import (AttentionBackend,
from vllm.attention.selector import get_attn_backend
from vllm.config import CacheConfig, QuantizationConfig
from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata, make_local_attention_virtual_batches,
-    subclass_attention_backend)
+    AttentionCGSupport, CommonAttentionMetadata,
+    make_local_attention_virtual_batches, subclass_attention_backend)
from ..layer import Attention
@@ -28,6 +28,8 @@ def create_chunked_local_attention_backend(
underlying_builder = underlying_attn_backend.get_builder_cls()
class ChunkedLocalAttentionBuilder(underlying_builder): # type: ignore
+        cudagraph_support: ClassVar[AttentionCGSupport] = \
+            AttentionCGSupport.NEVER
def build(self,
common_prefix_len: int,