From 65552b476b1c475ef433995d2699bb27428693b3 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Sat, 9 Aug 2025 14:10:51 +0800
Subject: [PATCH] [Misc] Use config definitions from Transformers library
 (#21913)

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/aimv2.py           | 22 +++++++++----------
 vllm/model_executor/models/commandr.py        |  8 +++----
 vllm/model_executor/models/dbrx.py            | 14 ++++++------
 vllm/model_executor/models/deepseek_v2.py     | 15 ++++++++-----
 vllm/model_executor/models/dots1.py           |  8 +++----
 vllm/model_executor/models/exaone4.py         |  6 ++---
 vllm/model_executor/models/glm4_moe.py        | 10 ++++-----
 vllm/model_executor/models/minimax_text_01.py |  6 ++---
 vllm/model_executor/models/olmoe.py           |  4 ++--
 vllm/model_executor/models/qwen2_moe.py       |  6 ++---
 vllm/model_executor/models/qwen3_moe.py       |  6 ++---
 11 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index d2307bb464bdb..b13d863ebb744 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -8,7 +8,6 @@ from typing import Optional
 
 import torch
 import torch.nn as nn
-from transformers import PretrainedConfig
 
 from vllm.attention.layer import MultiHeadAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
@@ -21,12 +20,13 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.transformers_utils.configs.ovis import AIMv2Config
 
 
 class AIMv2SwiGLUFFN(nn.Module):
 
-    def __init__(self, config: PretrainedConfig,
-                 quant_config: QuantizationConfig, prefix: str):
+    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
+                 prefix: str):
         super().__init__()
         hidden_features = config.intermediate_size
         in_features = config.hidden_size
@@ -57,7 +57,7 @@ class AIMv2SwiGLUFFN(nn.Module):
 
 class AIMv2PatchEmbed(nn.Module):
 
-    def __init__(self, config: PretrainedConfig):
+    def __init__(self, config: AIMv2Config):
         super().__init__()
         self.proj = nn.Conv2d(
             config.num_channels,
@@ -75,7 +75,7 @@ class AIMv2PatchEmbed(nn.Module):
 
 class AIMv2ViTPreprocessor(nn.Module):
 
-    def __init__(self, config: PretrainedConfig):
+    def __init__(self, config: AIMv2Config):
         super().__init__()
         num_patches = (config.image_size // config.patch_size)**2
 
@@ -93,8 +93,8 @@ class AIMv2ViTPreprocessor(nn.Module):
 
 class AIMv2Attention(nn.Module):
 
-    def __init__(self, config: PretrainedConfig,
-                 quant_config: QuantizationConfig, prefix: str):
+    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
+                 prefix: str):
         super().__init__()
         self.config = config
         self.embed_dim = config.hidden_size
@@ -141,8 +141,8 @@ class AIMv2Attention(nn.Module):
 
 class AIMv2Block(nn.Module):
 
-    def __init__(self, config: PretrainedConfig,
-                 quant_config: QuantizationConfig, prefix: str):
+    def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig,
+                 prefix: str):
         super().__init__()
         self.attn = AIMv2Attention(config,
                                    quant_config=quant_config,
@@ -163,7 +163,7 @@ class AIMv2Transformer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: AIMv2Config,
         quant_config: QuantizationConfig,
         *,
         require_post_norm: Optional[bool] = None,
@@ -193,7 +193,7 @@ class AIMv2Transformer(nn.Module):
 class AIMv2Model(torch.nn.Module):
 
     def __init__(self,
-                 config: PretrainedConfig,
+                 config: AIMv2Config,
                  quant_config: QuantizationConfig,
                  *,
                  require_post_norm: Optional[bool] = None,
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index c4f6144ed91f0..69281abf730aa 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -27,7 +27,7 @@ from typing import Optional, Union
 
 import torch
 from torch import nn
-from transformers import CohereConfig
+from transformers import Cohere2Config, CohereConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -89,7 +89,7 @@ class CohereMLP(nn.Module):
 
     def __init__(
        self,
-        config: CohereConfig,
+        config: Union[CohereConfig, Cohere2Config],
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
@@ -124,7 +124,7 @@ class CohereAttention(nn.Module):
 
     def __init__(
         self,
-        config: CohereConfig,
+        config: Union[CohereConfig, Cohere2Config],
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -242,7 +242,7 @@ class CohereAttention(nn.Module):
 class CohereDecoderLayer(nn.Module):
 
     def __init__(self,
-                 config: CohereConfig,
+                 config: Union[CohereConfig, Cohere2Config],
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = ""):
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 360c7e66bf5ce..e74d90e0b1d7d 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -6,7 +6,7 @@ from typing import Optional, Union
 
 import torch
 import torch.nn as nn
-from transformers import PretrainedConfig
+from transformers import DbrxConfig
 
 from vllm.attention import Attention
 from vllm.config import CacheConfig, VllmConfig
@@ -39,7 +39,7 @@ class DbrxRouter(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: DbrxConfig,
         params_dtype: Optional[torch.dtype] = None,
     ):
         super().__init__()
@@ -63,7 +63,7 @@ class DbrxExperts(FusedMoE):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: DbrxConfig,
         quant_config: Optional[QuantizationConfig] = None,
         params_dtype: Optional[torch.dtype] = None,
         prefix: str = "",
@@ -138,7 +138,7 @@ class DbrxMoE(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: DbrxConfig,
         quant_config: Optional[QuantizationConfig] = None,
         params_dtype: Optional[torch.dtype] = None,
         prefix: str = "",
@@ -169,7 +169,7 @@ class DbrxAttention(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: DbrxConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -249,7 +249,7 @@ class DbrxFusedNormAttention(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: DbrxConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -284,7 +284,7 @@ class DbrxBlock(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: DbrxConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index c2880c33cb65d..f199da135ec76 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -29,7 +29,7 @@ from typing import Any, Optional, Union
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import DeepseekV2Config, DeepseekV3Config
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -100,7 +100,7 @@ class DeepseekV2MoE(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Union[DeepseekV2Config, DeepseekV3Config],
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
         enable_eplb: bool = False,
@@ -221,7 +221,7 @@ class DeepseekV2Attention(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Union[DeepseekV2Config, DeepseekV3Config],
         hidden_size: int,
         num_heads: int,
         qk_nope_head_dim: int,
@@ -373,7 +373,7 @@ class DeepseekV2MLAAttention(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Union[DeepseekV2Config, DeepseekV3Config],
         hidden_size: int,
         num_heads: int,
         qk_nope_head_dim: int,
@@ -538,7 +538,7 @@ class DeepseekV2DecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Union[DeepseekV2Config, DeepseekV3Config],
         prefix: str,
         model_config: ModelConfig,
         cache_config: Optional[CacheConfig] = None,
@@ -973,7 +973,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
     pass
 
 
-def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
+# Compatibility with
+# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
+def get_spec_layer_idx_from_weight_name(config: Union[DeepseekV2Config,
+                                                      DeepseekV3Config],
                                         weight_name: str) -> Optional[int]:
     if (hasattr(config, "num_nextn_predict_layers")
             and config.num_nextn_predict_layers > 0):
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index 9b21a79446138..5f410c0ae5fb0 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -29,7 +29,7 @@ from typing import Any, Optional, Union
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import Dots1Config
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -99,7 +99,7 @@ class Dots1MoE(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Dots1Config,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
@@ -174,7 +174,7 @@ class Dots1Attention(nn.Module):
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        config: PretrainedConfig,
+        config: Dots1Config,
         rope_theta: float = 10000,
         rope_scaling: Optional[dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
@@ -260,7 +260,7 @@ class Dots1DecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Dots1Config,
         prefix: str,
         model_config: ModelConfig,
         cache_config: Optional[CacheConfig] = None,
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 3d6ce3e8895fb..ecd942a76aced 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -26,7 +26,7 @@ from typing import Any, Optional, Union
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import Exaone4Config
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -96,7 +96,7 @@ class Exaone4Attention(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Exaone4Config,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -224,7 +224,7 @@ class Exaone4DecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Exaone4Config,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 0053e4e6ffec9..624eef6cf1ae9 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -28,7 +28,7 @@ from typing import Any, Optional, Union
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers.models.glm4_moe import Glm4MoeConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -100,7 +100,7 @@ class Glm4MoE(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Glm4MoeConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
         enable_eplb: bool = False,
@@ -198,7 +198,7 @@ class Glm4MoeAttention(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Glm4MoeConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -297,7 +297,7 @@ class Glm4MoeDecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Glm4MoeConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -681,7 +681,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         return self.model.get_expert_mapping()
 
 
-def get_spec_layer_idx_from_weight_name(config: PretrainedConfig,
+def get_spec_layer_idx_from_weight_name(config: Glm4MoeConfig,
                                         weight_name: str) -> Optional[int]:
     if hasattr(config,
               "num_nextn_predict_layers") and (config.num_nextn_predict_layers
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 1f9f7f60cabf3..3d14a6ad5c3a4 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -12,7 +12,7 @@ import torch.distributed
 import torch.nn.functional as F
 from einops import rearrange
 from torch import nn
-from transformers.configuration_utils import PretrainedConfig
+from transformers import MiniMaxConfig
 
 from vllm import envs
 from vllm.attention import Attention, AttentionMetadata
@@ -656,7 +656,7 @@ class MiniMaxText01DecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: MiniMaxConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         expert_num: int = 1,
@@ -860,7 +860,7 @@ class MiniMaxText01Model(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: MiniMaxConfig,
         quant_config: Optional[QuantizationConfig] = None,
         cache_config: Optional[CacheConfig] = None,
         scheduler_config=None,
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 7552f64c423ea..a47c3bd416459 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -19,7 +19,7 @@ from typing import Any, Optional, Union
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import OlmoeConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -205,7 +205,7 @@ class OlmoeDecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: OlmoeConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index b061e2f69a6c6..5c4ad34246d66 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -30,7 +30,7 @@ from typing import Any, Optional, Union
 import torch
 import torch.nn.functional as F
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import Qwen2MoeConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -98,7 +98,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Qwen2MoeConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
@@ -256,7 +256,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Qwen2MoeConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index b2397c115d1d1..3d1e72299b4b8 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -28,7 +28,7 @@ from typing import Any, Optional, Union
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig
+from transformers import Qwen3MoeConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -101,7 +101,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
 
     def __init__(
         self,
-        config: PretrainedConfig,
+        config: Qwen3MoeConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
         enable_eplb: bool = False,
@@ -278,7 +278,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
 
     def __init__(
        self,
-        config: PretrainedConfig,
+        config: Qwen3MoeConfig,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
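
Note (illustration, not part of the patch): the sketch below shows the typing
pattern this change applies, namely annotating model components with a concrete
Transformers config class such as Qwen2MoeConfig instead of the generic
PretrainedConfig, so attribute access is backed by a real config definition.
The ExampleMLP module is hypothetical; only the config import mirrors the patch.

# Minimal sketch of the typed-config pattern; ExampleMLP is hypothetical.
import torch
import torch.nn as nn
from transformers import Qwen2MoeConfig  # concrete config class from Transformers


class ExampleMLP(nn.Module):

    def __init__(self, config: Qwen2MoeConfig):
        super().__init__()
        # Qwen2MoeConfig defines hidden_size and intermediate_size, so readers
        # and type checkers know these attributes exist.
        self.gate_up = nn.Linear(config.hidden_size,
                                 config.intermediate_size,
                                 bias=False)
        self.act = nn.SiLU()
        self.down = nn.Linear(config.intermediate_size,
                              config.hidden_size,
                              bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down(self.act(self.gate_up(x)))


if __name__ == "__main__":
    cfg = Qwen2MoeConfig(hidden_size=64, intermediate_size=128)
    print(ExampleMLP(cfg)(torch.randn(2, 64)).shape)  # torch.Size([2, 64])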