From 2671334d45ea96ca57938cc765ba26cdb796d067 Mon Sep 17 00:00:00 2001
From: Asher
Date: Wed, 23 Jul 2025 18:54:08 +0800
Subject: [PATCH] [Model] add Hunyuan V1 Dense Model support. (#21368)

Signed-off-by: Asher Zhang
---
 docs/models/supported_models.md                    |  1 +
 tests/models/registry.py                           |  2 +
 .../{hunyuan_v1_moe.py => hunyuan_v1.py}           | 70 ++++++++++++++-----
 vllm/model_executor/models/registry.py             |  3 +-
 4 files changed, 57 insertions(+), 19 deletions(-)
 rename vllm/model_executor/models/{hunyuan_v1_moe.py => hunyuan_v1.py} (95%)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index bbb52f035c727..c8b6c6c861209 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -363,6 +363,7 @@ th {
 | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
 | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
 | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
+| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
 | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
 | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 1eb7f7b9d8296..84ca0bc60003e 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -199,6 +199,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                              trust_remote_code=True),
     "HunYuanMoEV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-A13B-Instruct",
                                                trust_remote_code=True),
+    "HunYuanDenseV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
+                                                 trust_remote_code=True),
     "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
                                            trust_remote_code=True),
     "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
diff --git a/vllm/model_executor/models/hunyuan_v1_moe.py b/vllm/model_executor/models/hunyuan_v1.py
similarity index 95%
rename from vllm/model_executor/models/hunyuan_v1_moe.py
rename to vllm/model_executor/models/hunyuan_v1.py
index b3baec98b0fc0..fbba849a76f23 100644
--- a/vllm/model_executor/models/hunyuan_v1_moe.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -61,6 +61,19 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
                     make_layers)
 
 
+def _is_moe(config: PretrainedConfig) -> bool:
+    num_experts = getattr(config, "num_experts", None)
+    if isinstance(num_experts, int):
+        return num_experts > 1
+    if isinstance(num_experts, list) and num_experts:
+        # Ensure all elements are integers before calling max.
+        if all(isinstance(e, int) for e in num_experts):
+            return max(num_experts) > 1
+        else:
+            return False
+    return False
+
+
 def _get_cla_factor(config: PretrainedConfig) -> int:
     if not getattr(config, "use_cla", False):
         return 1
@@ -140,8 +153,8 @@ class HunYuanAttention(nn.Module):
             # the KV heads across multiple tensor parallel GPUs.
             assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
-        if hasattr(config, "head_dim"):
+
+        if hasattr(config, "head_dim") and config.head_dim:
             self.head_dim = config.head_dim
         elif hasattr(config, "attention_head_dim"):
             self.head_dim = config.attention_head_dim
@@ -490,12 +503,23 @@ class HunYuanDecoderLayer(nn.Module):
         else:
             raise RuntimeError(f"Unsupported attention type: {attention_type}")
 
-        self.mlp = HunYuanSparseMoeBlock(
-            config=config,
-            quant_config=quant_config,
-            layer_id=layer_id,
-            prefix=f"{prefix}.mlp",
-        )
+        if _is_moe(config):
+            self.mlp = HunYuanSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                layer_id=layer_id,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = HunYuanMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=self.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                bias=getattr(config, "mlp_bias", False),
+                prefix=f"{prefix}.mlp",
+            )
+
         self.input_layernorm = RMSNorm(config.hidden_size,
                                        eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(config.hidden_size,
@@ -642,15 +666,17 @@ class HunYuanModel(nn.Module):
         return torch.concat((q, k, v))
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-
-        # Params for weights, fp8 weight scales, fp8 activation scales
-        # (param_name, weight_name, expert_id, shard_id)
-        return FusedMoE.make_expert_params_mapping(
-            ckpt_gate_proj_name="gate_proj",
-            ckpt_down_proj_name="down_proj",
-            ckpt_up_proj_name="up_proj",
-            num_experts=self.config.num_experts,
-        )
+        if _is_moe(self.config):
+            # Params for weights, fp8 weight scales, fp8 activation scales
+            # (param_name, weight_name, expert_id, shard_id)
+            return FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.num_experts,
+            )
+        else:
+            return []
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         cla_factor = _get_cla_factor(self.config)
@@ -815,7 +841,7 @@ class HunYuanModel(nn.Module):
         return loaded_params
 
 
-class HunYuanMoEV1ForCausalLM(nn.Module, SupportsLoRA):
+class HunYuanV1Base(nn.Module, SupportsLoRA):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -901,3 +927,11 @@ class HunYuanMoEV1ForCausalLM(nn.Module, SupportsLoRA):
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         return self.model.get_expert_mapping()
+
+
+class HunYuanDenseV1ForCausalLM(HunYuanV1Base):
+    pass
+
+
+class HunYuanMoEV1ForCausalLM(HunYuanV1Base):
+    pass
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 100532943c2bf..fafb6a704383b 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -79,7 +79,8 @@ _TEXT_GENERATION_MODELS = {
     "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),  # noqa: E501
     "GritLM": ("gritlm", "GritLM"),
     "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
-    "HunYuanMoEV1ForCausalLM": ("hunyuan_v1_moe", "HunYuanMoEV1ForCausalLM"),
+    "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
+    "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"),
     "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
     "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
     "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
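
A quick way to sanity-check the dense/MoE dispatch introduced above without downloading a checkpoint: the sketch below re-declares _is_moe verbatim from the diff and drives it with types.SimpleNamespace stand-ins for PretrainedConfig. The stand-in configs are illustrative only, not real Hunyuan configs.

    from types import SimpleNamespace


    def _is_moe(config) -> bool:
        # Copied from the patch: a config counts as MoE when it declares
        # more than one expert, either as a scalar or a per-layer list.
        num_experts = getattr(config, "num_experts", None)
        if isinstance(num_experts, int):
            return num_experts > 1
        if isinstance(num_experts, list) and num_experts:
            # Ensure all elements are integers before calling max.
            if all(isinstance(e, int) for e in num_experts):
                return max(num_experts) > 1
            else:
                return False
        return False


    dense_cfg = SimpleNamespace()                          # no num_experts -> HunYuanMLP
    moe_cfg = SimpleNamespace(num_experts=64)              # scalar count   -> HunYuanSparseMoeBlock
    per_layer_cfg = SimpleNamespace(num_experts=[64, 64])  # per-layer list -> HunYuanSparseMoeBlock

    assert not _is_moe(dense_cfg)
    assert _is_moe(moe_cfg)
    assert _is_moe(per_layer_cfg)
    print("dispatch checks passed")

Note that the same helper also gates get_expert_mapping(), so dense checkpoints skip the FusedMoE expert-parameter mapping entirely during weight loading.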
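With both registry entries in place, the dense checkpoint should load through vLLM's standard offline entry point. A minimal, untested sketch (it assumes a single GPU with enough memory for the 7B weights; trust_remote_code=True mirrors the test registry entry above):

    from vllm import LLM, SamplingParams

    # Load the newly supported dense model; the registry now maps
    # HunYuanDenseV1ForCausalLM to vllm/model_executor/models/hunyuan_v1.py.
    llm = LLM(
        model="tencent/Hunyuan-7B-Instruct-0124",
        trust_remote_code=True,
    )

    sampling_params = SamplingParams(temperature=0.7, max_tokens=128)
    outputs = llm.generate(["Briefly explain tensor parallelism."], sampling_params)
    print(outputs[0].outputs[0].text)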