From 43ecd0a900e45e0c594bb428f18b8f2b14434391 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Thu, 6 Nov 2025 11:46:30 +0800
Subject: [PATCH] [Chore] Clean up deepseek v2/v3 config copy (#28055)

Signed-off-by: Isotr0py
---
 vllm/model_executor/models/deepseek.py        |   3 +-
 vllm/model_executor/models/deepseek_v2.py     |   3 +-
 vllm/model_executor/models/kimi_vl.py         |   3 +-
 vllm/transformers_utils/config.py             |  10 +-
 vllm/transformers_utils/configs/__init__.py   |   2 -
 .../transformers_utils/configs/deepseek_v3.py | 100 -----------------
 .../configs/deepseek_vl2.py                   | 102 +-----------------
 vllm/transformers_utils/configs/eagle.py      |  12 +--
 vllm/transformers_utils/configs/kimi_vl.py    |   2 +-
 9 files changed, 15 insertions(+), 222 deletions(-)
 delete mode 100644 vllm/transformers_utils/configs/deepseek_v3.py

diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index ac934abea45df..adba874a2cd56 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -292,6 +292,7 @@ class DeepseekDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        moe_layer_freq = getattr(config, "moe_layer_freq", 1)
         self.self_attn = DeepseekAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -306,7 +307,7 @@ class DeepseekDecoderLayer(nn.Module):
         if (
             config.n_routed_experts is not None
             and layer_idx >= config.first_k_dense_replace
-            and layer_idx % config.moe_layer_freq == 0
+            and layer_idx % moe_layer_freq == 0
         ):
             self.mlp = DeepseekMoE(
                 config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index a253cdffd9011..4858c30baab84 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -994,6 +994,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        moe_layer_freq = getattr(config, "moe_layer_freq", 1)
         # DecoderLayers are created with `make_layers` which passes the prefix
         # with the layer's index.
         layer_idx = int(prefix.split(sep=".")[-1])
@@ -1024,7 +1025,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         if (
             config.n_routed_experts is not None
             and layer_idx >= config.first_k_dense_replace
-            and layer_idx % config.moe_layer_freq == 0
+            and layer_idx % moe_layer_freq == 0
         ):
             self.mlp = DeepseekV2MoE(
                 config=config,
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index c2630fa6ac2b6..b54f53931d714 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -50,7 +50,7 @@ from typing import Annotated, Any, Literal

 import torch
 from torch import nn
-from transformers import BatchFeature
+from transformers import BatchFeature, DeepseekV2Config
 from transformers.activations import GELUActivation

 from vllm.config import VllmConfig
@@ -91,7 +91,6 @@ from vllm.multimodal.processing import (
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 771f16fe05106..14cae2b168e19 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -24,7 +24,7 @@ from huggingface_hub.utils import (
     RepositoryNotFoundError,
     RevisionNotFoundError,
 )
-from transformers import GenerationConfig, PretrainedConfig
+from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig
 from transformers.models.auto.image_processing_auto import get_image_processor_config
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -68,16 +68,18 @@ def _get_hf_token() -> str | None:

 class LazyConfigDict(dict):
     def __getitem__(self, key):
+        if isinstance(value := super().__getitem__(key), type):
+            return value
+
         import vllm.transformers_utils.configs as configs

-        return getattr(configs, super().__getitem__(key))
+        return getattr(configs, value)


 _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     chatglm="ChatGLMConfig",
     deepseek_vl_v2="DeepseekVLV2Config",
-    deepseek_v3="DeepseekV3Config",
-    deepseek_v32="DeepseekV3Config",
+    deepseek_v32=DeepseekV3Config,
     flex_olmo="FlexOlmoConfig",
     kimi_linear="KimiLinearConfig",
     kimi_vl="KimiVLConfig",
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 405a2f6b23954..ac612b255143c 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -8,7 +8,6 @@ Model configs may be defined in this directory for the following reasons:
 """

 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
-from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
 from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
 from vllm.transformers_utils.configs.eagle import EAGLEConfig
@@ -43,7 +42,6 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig
 __all__ = [
     "ChatGLMConfig",
     "DeepseekVLV2Config",
-    "DeepseekV3Config",
     "DotsOCRConfig",
     "EAGLEConfig",
     "FlexOlmoConfig",
diff --git a/vllm/transformers_utils/configs/deepseek_v3.py b/vllm/transformers_utils/configs/deepseek_v3.py
deleted file mode 100644
index 91fbed79dd021..0000000000000
--- a/vllm/transformers_utils/configs/deepseek_v3.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-
-class DeepseekV3Config(PretrainedConfig):
-    model_type = "deepseek_v3"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=129280,
-        hidden_size=7168,
-        intermediate_size=18432,
-        moe_intermediate_size=2048,
-        num_hidden_layers=61,
-        num_nextn_predict_layers=1,
-        num_attention_heads=128,
-        num_key_value_heads=128,
-        n_shared_experts=1,
-        n_routed_experts=256,
-        ep_size=1,
-        routed_scaling_factor=2.5,
-        kv_lora_rank=512,
-        q_lora_rank=1536,
-        qk_rope_head_dim=64,
-        v_head_dim=128,
-        qk_nope_head_dim=128,
-        topk_method="noaux_tc",
-        n_group=8,
-        topk_group=4,
-        num_experts_per_tok=8,
-        moe_layer_freq=1,
-        first_k_dense_replace=3,
-        norm_topk_prob=True,
-        scoring_func="sigmoid",
-        hidden_act="silu",
-        max_position_embeddings=4096,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=None,
-        bos_token_id=0,
-        eos_token_id=1,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.moe_intermediate_size = moe_intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_nextn_predict_layers = num_nextn_predict_layers
-        self.num_attention_heads = num_attention_heads
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.ep_size = ep_size
-        self.routed_scaling_factor = routed_scaling_factor
-        self.kv_lora_rank = kv_lora_rank
-        self.q_lora_rank = q_lora_rank
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.topk_method = topk_method
-        self.n_group = n_group
-        self.topk_group = topk_group
-        self.num_experts_per_tok = num_experts_per_tok
-        self.moe_layer_freq = moe_layer_freq
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.scoring_func = scoring_func
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
index 23b913157d6db..8b02a4ddd4bc7 100644
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -3,7 +3,7 @@

 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268

-from transformers.configuration_utils import PretrainedConfig
+from transformers import DeepseekV2Config, PretrainedConfig


 class VisionEncoderConfig(PretrainedConfig):
@@ -87,106 +87,6 @@ class MlpProjectorConfig(PretrainedConfig):
         super().__init__(**kwargs)


-class DeepseekV2Config(PretrainedConfig):
-    model_type = "deepseek_v2"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=102400,
-        hidden_size=4096,
-        intermediate_size=11008,
-        moe_intermediate_size=1407,
-        num_hidden_layers=30,
-        num_attention_heads=32,
-        num_key_value_heads=32,
-        n_shared_experts=None,
-        n_routed_experts=None,
-        ep_size=1,
-        routed_scaling_factor=1.0,
-        kv_lora_rank=512,
-        q_lora_rank=1536,
-        qk_rope_head_dim=64,
-        v_head_dim=128,
-        qk_nope_head_dim=128,
-        topk_method="gready",
-        n_group=None,
-        topk_group=None,
-        num_experts_per_tok=None,
-        moe_layer_freq=1,
-        first_k_dense_replace=0,
-        norm_topk_prob=False,
-        scoring_func="softmax",
-        aux_loss_alpha=0.001,
-        seq_aux=True,
-        hidden_act="silu",
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=None,
-        bos_token_id=100000,
-        eos_token_id=100001,
-        pretraining_tp=1,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        use_mla=True,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.moe_intermediate_size = moe_intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.ep_size = ep_size
-        self.routed_scaling_factor = routed_scaling_factor
-        self.kv_lora_rank = kv_lora_rank
-        self.q_lora_rank = q_lora_rank
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.topk_method = topk_method
-        self.n_group = n_group
-        self.topk_group = topk_group
-        self.num_experts_per_tok = num_experts_per_tok
-        self.moe_layer_freq = moe_layer_freq
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.scoring_func = scoring_func
-        self.aux_loss_alpha = aux_loss_alpha
-        self.seq_aux = seq_aux
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = float(rms_norm_eps)
-        self.pretraining_tp = pretraining_tp
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.use_mla = use_mla
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-
 class DeepseekVLV2Config(PretrainedConfig):
     model_type = "deepseek_vl_v2"
     vision_config: VisionEncoderConfig
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index 4da877f9e81f5..f5dc9ddfbc575 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -3,9 +3,7 @@

 import os

-from transformers import AutoConfig, PretrainedConfig
-
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
+from transformers import AutoConfig, DeepseekV2Config, PretrainedConfig


 class EAGLEConfig(PretrainedConfig):
@@ -20,13 +18,7 @@ class EAGLEConfig(PretrainedConfig):
     ):
         model_config: PretrainedConfig | DeepseekV2Config | None
         if isinstance(model, dict):
-            archs = model.get("architectures", [])
-            target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
-            if any(target_arch in archs for target_arch in target_archs):
-                # AutoConfig does not support DeepSeek MoE models yet
-                model_config = DeepseekV2Config(**model)
-            else:
-                model_config = AutoConfig.for_model(**model)
+            model_config = AutoConfig.for_model(**model)
         else:
             model_config = model

diff --git a/vllm/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py
index e8c19d0ec2ffe..6d992464cbe81 100644
--- a/vllm/transformers_utils/configs/kimi_vl.py
+++ b/vllm/transformers_utils/configs/kimi_vl.py
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from transformers import DeepseekV2Config
 from transformers.configuration_utils import PretrainedConfig

-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
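
For context only, a minimal sketch of the usage pattern this patch moves to: the DeepSeek config classes are consumed straight from transformers rather than from vLLM's local copies. This assumes a transformers release that exports both DeepseekV2Config and DeepseekV3Config; the default-constructed config below is purely illustrative and not part of the patch.

    # Minimal sketch, assuming transformers exports DeepseekV2Config and DeepseekV3Config.
    from transformers import DeepseekV2Config, DeepseekV3Config

    cfg = DeepseekV3Config()
    # Some config variants may not define moe_layer_freq, which is why the model code
    # above now guards the attribute access with a default of 1 via getattr().
    moe_layer_freq = getattr(cfg, "moe_layer_freq", 1)
    print(type(cfg).__name__, moe_layer_freq)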