[Attention] Remove imports from vllm/attention/__init__.py (#29342)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Matthew Bonanni 2025-11-26 12:53:15 -05:00 committed by GitHub
parent c4c0354eec
commit 430dd4d9eb
GPG Key ID: B5690EEEBB952194
96 changed files with 120 additions and 121 deletions
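
The change is mechanical: symbols that `vllm/attention/__init__.py` used to re-export are now imported from the modules that define them. A minimal before/after sketch of the migration, using only module paths that appear in the hunks below (the grouping of symbols is illustrative):

```python
# Before: re-exports from the package __init__, removed by this commit.
# from vllm.attention import Attention, AttentionBackend, AttentionMetadata, AttentionType

# After: import each symbol from its defining module.
from vllm.attention.backends.abstract import (
    AttentionBackend,
    AttentionMetadata,
    AttentionType,
)
from vllm.attention.layer import Attention
from vllm.attention.selector import get_attn_backend
```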

View File

@@ -29,7 +29,7 @@ The initialization code should look like this:
```python
from torch import nn
from vllm.config import VllmConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
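
For reference, a hedged completion of the documentation snippet above. The positional call `Attention(32, 128, 0.1)` mirrors the `bind_kv_cache` test hunk later in this commit; the values are illustrative only, and a real model would derive them from `vllm_config`:

```python
from torch import nn

from vllm.attention.layer import Attention
from vllm.config import VllmConfig


class MyAttention(nn.Module):
    def __init__(self, vllm_config: VllmConfig, prefix: str):
        super().__init__()
        # Illustrative num_heads, head_size, and softmax scale.
        self.attn = Attention(32, 128, 0.1)
```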

View File

@@ -9,8 +9,9 @@ from tests.compile.backend import LazyInitPass, TestBackend
from tests.utils import flat_product
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.attention.layer import Attention
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
from vllm.compilation.fx_utils import find_op_nodes

View File

@@ -5,7 +5,8 @@ import pytest
import torch
from tests.compile.backend import TestBackend
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass

View File

@@ -14,7 +14,7 @@ import torch
from torch._prims_common import TensorLikeType
from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
from vllm.utils import (

View File

@@ -5,8 +5,8 @@ import numpy as np
import pytest
import torch
-from vllm.attention import Attention
from vllm.attention.backends.abstract import MultipleOf
+from vllm.attention.layer import Attention
from vllm.config import (
CacheConfig,
ModelConfig,

View File

@@ -7,7 +7,7 @@ from vllm.v1.worker.utils import bind_kv_cache
def test_bind_kv_cache():
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
ctx = {
"layers.0.self_attn": Attention(32, 128, 0.1),
@@ -35,7 +35,7 @@ def test_bind_kv_cache():
def test_bind_kv_cache_non_attention():
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
# example from Jamba PP=2
ctx = {

View File

@@ -1,19 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.attention.backends.abstract import (
-AttentionBackend,
-AttentionMetadata,
-AttentionType,
-)
-from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend, get_mamba_attn_backend
-__all__ = [
-"Attention",
-"AttentionBackend",
-"AttentionMetadata",
-"AttentionType",
-"get_attn_backend",
-"get_mamba_attn_backend",
-]

View File

@@ -178,7 +178,7 @@ class AttentionBackend(ABC):
By default, only supports decoder attention.
Backends should override this to support other attention types.
"""
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
return attn_type == AttentionType.DECODER
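
The docstring above describes the extension point; a hypothetical backend that serves more than decoder attention would override the classmethod the same way the CPU, FlashAttention, and FlexAttention backends do later in this commit (class name and supported set are assumptions for illustration):

```python
from vllm.attention.backends.abstract import AttentionBackend, AttentionType


class MyEncoderCapableBackend(AttentionBackend):
    """Hypothetical backend that also supports encoder-only attention."""

    @classmethod
    def supports_attn_type(cls, attn_type: str) -> bool:
        # Accept decoder and encoder-only attention; reject everything else.
        return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)
```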

View File

@@ -10,8 +10,11 @@ import torch.nn as nn
import torch.nn.functional as F
import vllm.envs as envs
-from vllm.attention import AttentionType
-from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
+from vllm.attention.backends.abstract import (
+AttentionBackend,
+AttentionType,
+MLAAttentionImpl,
+)
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import get_attn_backend
from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target

View File

@@ -10,7 +10,7 @@ from torch import fx
from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor.pattern_matcher import PatternMatcherPass
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.quant_utils import (

View File

@@ -9,7 +9,7 @@ from torch import fx
from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor.pattern_matcher import PatternMatcherPass
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

View File

@@ -20,7 +20,7 @@ import torch
import zmq
from vllm import envs
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import get_attn_backend
from vllm.config import VllmConfig

View File

@@ -8,7 +8,8 @@ from typing import Any, ClassVar
import torch
-from vllm.attention import Attention, AttentionBackend, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
+from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
from vllm.distributed.kv_transfer.kv_connector.v1 import (

View File

@@ -8,7 +8,7 @@ import torch.nn.functional as F
from einops import rearrange
from torch import nn
-from vllm.attention import AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
from vllm.distributed.parallel_state import (

View File

@@ -11,8 +11,7 @@ import torch
from torch import nn
from typing_extensions import assert_never
-from vllm.attention import Attention
-from vllm.attention.layer import MLAAttention
+from vllm.attention.layer import Attention, MLAAttention
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.base_config import (

View File

@@ -9,7 +9,8 @@ from itertools import islice
import torch
from torch import nn
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
from vllm.distributed import (

View File

@@ -32,7 +32,8 @@ import torch
from torch import nn
from transformers import ApertusConfig
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig

View File

@@ -8,7 +8,7 @@ from itertools import islice
import torch
from torch import nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -29,7 +29,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -32,7 +32,7 @@ import torch.nn.functional as F
from torch import nn
from transformers.configuration_utils import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -27,7 +27,7 @@ import torch
from torch import nn
from transformers import BloomConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -16,7 +16,7 @@ from transformers import (
ChameleonVQVAEConfig,
)
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -12,7 +12,7 @@ import torch
from torch import nn
from torch.nn import LayerNorm
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -14,8 +14,7 @@ from transformers import (
CLIPVisionConfig,
)
-from vllm.attention import Attention
-from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import divide, get_tensor_model_parallel_world_size

View File

@@ -30,7 +30,7 @@ import torch
from torch import nn
from transformers import Cohere2Config, CohereConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -8,7 +8,7 @@ import torch
import torch.nn as nn
from transformers import DbrxConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (
get_pp_group,

View File

@@ -33,8 +33,8 @@ from torch import nn
from transformers import DeepseekV2Config, DeepseekV3Config
from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention import Attention
from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.layer import Attention
from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config

View File

@@ -32,7 +32,7 @@ import torch
from torch import nn
from transformers import Dots1Config
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (

View File

@@ -32,7 +32,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
from vllm.distributed import (

View File

@@ -31,7 +31,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
# from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig

View File

@@ -32,7 +32,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -28,7 +28,7 @@ import torch
from torch import nn
from transformers import Exaone4Config
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -30,7 +30,7 @@ from torch import nn
from torch.nn import LayerNorm
from transformers import FalconConfig as HF_FalconConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -26,7 +26,7 @@ import torch
from torch import nn
from transformers import GemmaConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -23,7 +23,7 @@ import torch
from torch import nn
from transformers import Gemma2Config
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -23,7 +23,8 @@ import torch.nn.functional as F
from torch import nn
from transformers import Gemma3TextConfig
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -21,7 +21,7 @@ import torch
from torch import nn
from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size

View File

@@ -29,7 +29,8 @@ import torch
from torch import nn
from transformers import Glm4Config
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -31,7 +31,7 @@ import torch
from torch import nn
from transformers.models.glm4_moe import Glm4MoeConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
from vllm.distributed import (

View File

@@ -27,7 +27,7 @@ import torch
from torch import nn
from transformers import GPT2Config
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed.parallel_state import (

View File

@@ -28,7 +28,7 @@ import torch
from torch import nn
from transformers import GPTBigCodeConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -26,7 +26,7 @@ import torch
from torch import nn
from transformers import GPTJConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -26,7 +26,7 @@ import torch
from torch import nn
from transformers import GPTNeoXConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -7,7 +7,8 @@ import torch.distributed as dist
from torch import nn
from transformers import GptOssConfig
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -31,7 +31,7 @@ import torch
from torch import nn
from transformers import GraniteConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -31,7 +31,7 @@ from typing import Any
import torch
from torch import nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -31,7 +31,7 @@ import torch
import torch.nn.functional as F
from torch import nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -33,7 +33,8 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
from vllm.distributed import (

View File

@@ -10,7 +10,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -28,7 +28,7 @@ from itertools import islice
import torch
from torch import nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -7,7 +7,7 @@ import torch
import torch.nn as nn
from transformers import Lfm2Config
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -6,7 +6,7 @@ from itertools import islice
import torch
import torch.nn as nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config
from vllm.distributed import (

View File

@@ -31,7 +31,8 @@ import torch
from torch import nn
from transformers import LlamaConfig
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig

View File

@@ -24,7 +24,7 @@ import torch
from torch import nn
from transformers import Llama4TextConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig

View File

@@ -33,7 +33,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -29,7 +29,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.layernorm import RMSNorm

View File

@@ -30,7 +30,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (

View File

@@ -14,7 +14,8 @@ import torch
from torch import nn
from transformers import MiniMaxConfig
-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed.parallel_state import (

View File

@@ -32,7 +32,7 @@ import torch
from torch import nn
from transformers import MixtralConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
from vllm.distributed import (

View File

@@ -17,8 +17,7 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin, TensorT
from transformers.image_utils import ImageInput
from transformers.tokenization_utils_base import TextInput
-from vllm.attention import Attention
-from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions

View File

@@ -10,7 +10,7 @@ import torch
import torch.nn as nn
from transformers import MptConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (

View File

@@ -30,7 +30,7 @@ from itertools import islice
import torch
from torch import nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -31,7 +31,7 @@ import torch
from torch import nn
from transformers import LlamaConfig
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group

View File

@@ -31,7 +31,7 @@ import torch
from torch import nn
from transformers import OlmoConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -32,7 +32,7 @@ import torch
from torch import nn
from transformers import Olmo2Config
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -21,7 +21,7 @@ from itertools import islice
import torch
from torch import nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import (

View File

@@ -29,7 +29,8 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ParallelConfig, VllmConfig
from vllm.distributed import (

View File

@@ -27,7 +27,7 @@ import torch
from torch import nn
from transformers import OPTConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -15,7 +15,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -33,7 +33,8 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size

View File

@@ -30,7 +30,7 @@ import torch
from torch import nn
from transformers import PersimmonConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -45,7 +45,7 @@ import torch
from torch import nn
from transformers import PhiConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -31,7 +31,7 @@ import torch
from torch import nn
from transformers.configuration_utils import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -16,7 +16,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -33,7 +33,8 @@ import torch
from torch import nn
from transformers import Qwen2Config
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig

View File

@@ -34,7 +34,7 @@ import torch.nn.functional as F
from torch import nn
from transformers import Qwen2MoeConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -30,7 +30,8 @@ import torch
from torch import nn
from transformers import Qwen3Config
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -31,7 +31,7 @@ from typing import Any
import torch
from torch import nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
from vllm.distributed import (

View File

@@ -10,7 +10,8 @@ from einops import rearrange
from torch import nn
from transformers.activations import ACT2FN
-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionMetadata
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (
CacheConfig,

View File

@@ -30,7 +30,8 @@ import torch
from torch import nn
from transformers import PretrainedConfig as SeedOssConfig
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -30,7 +30,7 @@ import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -29,7 +29,7 @@ import torch
from torch import nn
from transformers import StableLmConfig
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul

View File

@@ -28,7 +28,7 @@ import torch
from torch import nn
from transformers import Starcoder2Config
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size

View File

@@ -9,7 +9,7 @@ from typing import Any
import torch
from torch import nn
-from vllm.attention import Attention
+from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (

View File

@@ -27,7 +27,8 @@ from torch import nn
from transformers import AutoModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
-from vllm.attention import Attention, AttentionType
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
from vllm.config.utils import getattr_iter
from vllm.distributed import get_pp_group, get_tp_group

View File

@@ -16,8 +16,8 @@ from transformers import (
)
from transformers.models.whisper.modeling_whisper import sinusoids
-from vllm.attention import Attention, AttentionType
-from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.attention.layers.cross_attention import CrossAttention
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions

View File

@@ -335,7 +335,7 @@ class CudaPlatformBase(Platform):
use_sparse: bool,
attn_type: str | None = None,
) -> str:
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
if attn_type is None:
attn_type = AttentionType.DECODER

View File

@@ -51,7 +51,7 @@ class CPUAttentionBackend(AttentionBackend):
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""CPU attention supports decoder and encoder-only attention."""
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
return attn_type in (
AttentionType.DECODER,

View File

@@ -84,7 +84,7 @@ class FlashAttentionBackend(AttentionBackend):
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""FlashAttention supports all attention types."""
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
return attn_type in (
AttentionType.DECODER,

View File

@@ -87,7 +87,7 @@ class FlexAttentionBackend(AttentionBackend):
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""FlexAttention supports both decoder and encoder-only attention."""
-from vllm.attention import AttentionType
+from vllm.attention.backends.abstract import AttentionType
return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY)

View File

@@ -4,7 +4,7 @@ from collections.abc import Iterator
import torch
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig
from vllm.platforms import current_platform
from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager

View File

@@ -11,7 +11,7 @@ from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
if TYPE_CHECKING:
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig
logger = init_logger(__name__)

View File

@@ -5,7 +5,7 @@ import numpy as np
import torch
from vllm import _custom_ops as ops
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec

View File

@@ -19,12 +19,13 @@ import torch.nn as nn
from tqdm import tqdm
import vllm.envs as envs
-from vllm.attention import Attention, AttentionType
from vllm.attention.backends.abstract import (
AttentionBackend,
AttentionMetadata,
+AttentionType,
MultipleOf,
)
+from vllm.attention.layer import Attention
from vllm.compilation.counter import compilation_counter
from vllm.compilation.cuda_graph import CUDAGraphWrapper
from vllm.compilation.monitor import set_cudagraph_capturing_enabled

View File

@@ -13,7 +13,7 @@ from typing import (
import torch
-from vllm.attention import AttentionBackend
+from vllm.attention.backends.abstract import AttentionBackend
from vllm.config import VllmConfig
from vllm.config.cache import CacheDType
from vllm.distributed.kv_transfer import (

View File

@@ -17,9 +17,8 @@ import torch_xla.distributed.spmd as xs
import torch_xla.runtime as xr
import vllm.envs as envs
-from vllm.attention import Attention
from vllm.attention.backends.abstract import AttentionType
-from vllm.attention.layer import MLAAttention
+from vllm.attention.layer import Attention, MLAAttention
from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
from vllm.config import (