mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-08 17:42:14 +08:00
[Model] add Hunyuan V1 Dense Model support. (#21368)
Signed-off-by: Asher Zhang <asherszhang@tencent.com>
This commit is contained in:
parent
2cc5016a19
commit
2671334d45
@ -363,6 +363,7 @@ th {
|
|||||||
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
|
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
|
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
|
||||||
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
|
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
|
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
|
||||||
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
|
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
|
||||||
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||||
|
|||||||
@ -199,6 +199,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
|||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
"HunYuanMoEV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-A13B-Instruct",
|
"HunYuanMoEV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-A13B-Instruct",
|
||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
|
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
|
||||||
|
trust_remote_code=True),
|
||||||
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
|
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
|
||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
|
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
|
||||||
|
|||||||
@ -61,6 +61,19 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
|
|||||||
make_layers)
|
make_layers)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_moe(config: PretrainedConfig) -> bool:
    """Report whether ``config`` describes a mixture-of-experts model.

    ``config.num_experts`` is read with ``getattr`` and may be absent.
    The result is ``True`` when that value is either

    * an ``int`` greater than one, or
    * a non-empty ``list`` consisting solely of ``int`` entries whose
      largest entry is greater than one.

    Any other value (missing attribute, empty list, list containing a
    non-integer, some other type) yields ``False``.
    """
    expert_spec = getattr(config, "num_experts", None)

    if isinstance(expert_spec, int):
        return expert_spec > 1

    if not isinstance(expert_spec, list) or not expert_spec:
        return False

    # max() is only applied once every entry is known to be an integer.
    if not all(isinstance(count, int) for count in expert_spec):
        return False
    return max(expert_spec) > 1
|
||||||
|
|
||||||
|
|
||||||
def _get_cla_factor(config: PretrainedConfig) -> int:
|
def _get_cla_factor(config: PretrainedConfig) -> int:
|
||||||
if not getattr(config, "use_cla", False):
|
if not getattr(config, "use_cla", False):
|
||||||
return 1
|
return 1
|
||||||
@ -140,8 +153,8 @@ class HunYuanAttention(nn.Module):
|
|||||||
# the KV heads across multiple tensor parallel GPUs.
|
# the KV heads across multiple tensor parallel GPUs.
|
||||||
assert tp_size % self.total_num_kv_heads == 0
|
assert tp_size % self.total_num_kv_heads == 0
|
||||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||||
# MistralConfig has an optional head_dim introduced by Mistral-Nemo
|
|
||||||
if hasattr(config, "head_dim"):
|
if hasattr(config, "head_dim") and config.head_dim:
|
||||||
self.head_dim = config.head_dim
|
self.head_dim = config.head_dim
|
||||||
elif hasattr(config, "attention_head_dim"):
|
elif hasattr(config, "attention_head_dim"):
|
||||||
self.head_dim = config.attention_head_dim
|
self.head_dim = config.attention_head_dim
|
||||||
@ -490,12 +503,23 @@ class HunYuanDecoderLayer(nn.Module):
|
|||||||
else:
|
else:
|
||||||
raise RuntimeError(f"Unsupported attention type: {attention_type}")
|
raise RuntimeError(f"Unsupported attention type: {attention_type}")
|
||||||
|
|
||||||
self.mlp = HunYuanSparseMoeBlock(
|
if _is_moe(config):
|
||||||
config=config,
|
self.mlp = HunYuanSparseMoeBlock(
|
||||||
quant_config=quant_config,
|
config=config,
|
||||||
layer_id=layer_id,
|
quant_config=quant_config,
|
||||||
prefix=f"{prefix}.mlp",
|
layer_id=layer_id,
|
||||||
)
|
prefix=f"{prefix}.mlp",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.mlp = HunYuanMLP(
|
||||||
|
hidden_size=self.hidden_size,
|
||||||
|
intermediate_size=self.intermediate_size,
|
||||||
|
hidden_act=config.hidden_act,
|
||||||
|
quant_config=quant_config,
|
||||||
|
bias=getattr(config, "mlp_bias", False),
|
||||||
|
prefix=f"{prefix}.mlp",
|
||||||
|
)
|
||||||
|
|
||||||
self.input_layernorm = RMSNorm(config.hidden_size,
|
self.input_layernorm = RMSNorm(config.hidden_size,
|
||||||
eps=config.rms_norm_eps)
|
eps=config.rms_norm_eps)
|
||||||
self.post_attention_layernorm = RMSNorm(config.hidden_size,
|
self.post_attention_layernorm = RMSNorm(config.hidden_size,
|
||||||
@ -642,15 +666,17 @@ class HunYuanModel(nn.Module):
|
|||||||
return torch.concat((q, k, v))
|
return torch.concat((q, k, v))
|
||||||
|
|
||||||
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
|
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
|
||||||
|
if _is_moe(self.config):
|
||||||
# Params for weights, fp8 weight scales, fp8 activation scales
|
# Params for weights, fp8 weight scales, fp8 activation scales
|
||||||
# (param_name, weight_name, expert_id, shard_id)
|
# (param_name, weight_name, expert_id, shard_id)
|
||||||
return FusedMoE.make_expert_params_mapping(
|
return FusedMoE.make_expert_params_mapping(
|
||||||
ckpt_gate_proj_name="gate_proj",
|
ckpt_gate_proj_name="gate_proj",
|
||||||
ckpt_down_proj_name="down_proj",
|
ckpt_down_proj_name="down_proj",
|
||||||
ckpt_up_proj_name="up_proj",
|
ckpt_up_proj_name="up_proj",
|
||||||
num_experts=self.config.num_experts,
|
num_experts=self.config.num_experts,
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|
||||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||||
cla_factor = _get_cla_factor(self.config)
|
cla_factor = _get_cla_factor(self.config)
|
||||||
@ -815,7 +841,7 @@ class HunYuanModel(nn.Module):
|
|||||||
return loaded_params
|
return loaded_params
|
||||||
|
|
||||||
|
|
||||||
class HunYuanMoEV1ForCausalLM(nn.Module, SupportsLoRA):
|
class HunYuanV1Base(nn.Module, SupportsLoRA):
|
||||||
packed_modules_mapping = {
|
packed_modules_mapping = {
|
||||||
"qkv_proj": [
|
"qkv_proj": [
|
||||||
"q_proj",
|
"q_proj",
|
||||||
@ -901,3 +927,11 @@ class HunYuanMoEV1ForCausalLM(nn.Module, SupportsLoRA):
|
|||||||
|
|
||||||
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
|
def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
|
||||||
return self.model.get_expert_mapping()
|
return self.model.get_expert_mapping()
|
||||||
|
|
||||||
|
|
||||||
|
class HunYuanDenseV1ForCausalLM(HunYuanV1Base):
    """Hunyuan V1 dense causal-LM class.

    Defines nothing of its own; every attribute and method is inherited
    from ``HunYuanV1Base``.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class HunYuanMoEV1ForCausalLM(HunYuanV1Base):
    """Hunyuan V1 mixture-of-experts causal-LM class.

    Defines nothing of its own; every attribute and method is inherited
    from ``HunYuanV1Base``.
    """
|
||||||
@ -79,7 +79,8 @@ _TEXT_GENERATION_MODELS = {
|
|||||||
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
|
"GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"), # noqa: E501
|
||||||
"GritLM": ("gritlm", "GritLM"),
|
"GritLM": ("gritlm", "GritLM"),
|
||||||
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
|
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
|
||||||
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1_moe", "HunYuanMoEV1ForCausalLM"),
|
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
|
||||||
|
"HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"),
|
||||||
"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
|
"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
|
||||||
"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
|
"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
|
||||||
"InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
|
"InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user