[Model][Bugfix] Fix ernie45 load failure due to ernie45 EPLB code (#26684)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
commit 01ad27faff
parent 481545b397
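This commit restores Ernie 4.5 MoE model loading after vLLM's EPLB (expert parallelism load balancer) settings moved into a dedicated eplb_config on ParallelConfig. The model code still read num_redundant_experts directly off ParallelConfig, which fails at load time; the fix reads it from parallel_config.eplb_config instead, and also makes the expert weight-loading loop check the loader's success flag so that experts with replicas on other ranks are not silently skipped.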
@@ -23,7 +23,8 @@
 # limitations under the License.
 """Inference-only ErineMoE model compatible with HuggingFace weights."""
 
-from collections.abc import Iterable
+import typing
+from collections.abc import Callable, Iterable
 from itertools import islice
 from typing import Any
 
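The new typing and Callable imports back the typing.cast(Callable[..., bool], param.weight_loader) call introduced in the weight-loading hunk further below.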
@@ -139,10 +140,10 @@ class Ernie4_5_MoeMoE(nn.Module):
 
         # Load balancing settings.
         vllm_config = get_current_vllm_config()
-        parallel_config = vllm_config.parallel_config
+        eplb_config = vllm_config.parallel_config.eplb_config
        self.enable_eplb = enable_eplb
 
-        self.n_redundant_experts = parallel_config.num_redundant_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
         self.n_logical_experts = self.n_routed_experts
         self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size
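To illustrate why the old attribute lookup breaks model construction, here is a minimal self-contained sketch. The dataclasses are toy stand-ins for vLLM's real config objects, not vLLM source; only the attribute names mirror the diff above.

from dataclasses import dataclass, field

@dataclass
class EPLBConfig:
    # Toy stand-in: only the field touched by this commit.
    num_redundant_experts: int = 0

@dataclass
class ParallelConfig:
    # Toy stand-in: EPLB settings now live on a sub-config.
    enable_eplb: bool = False
    eplb_config: EPLBConfig = field(default_factory=EPLBConfig)

parallel_config = ParallelConfig()

# Old lookup (pre-fix): AttributeError once the field has moved.
try:
    parallel_config.num_redundant_experts
except AttributeError as exc:
    print(f"old path fails: {exc}")

# New lookup, matching the hunk above:
print(parallel_config.eplb_config.num_redundant_experts)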
@@ -426,8 +427,10 @@ class Ernie4_5_MoeModel(nn.Module):
         self.vocab_size = config.vocab_size
         self.config = config
         parallel_config = vllm_config.parallel_config
+        eplb_config = parallel_config.eplb_config
         enable_eplb = parallel_config.enable_eplb
-        self.num_redundant_experts = parallel_config.num_redundant_experts
+        self.num_redundant_experts = eplb_config.num_redundant_experts
 
         if get_pp_group().is_first_rank:
             self.embed_tokens = VocabParallelEmbedding(
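The same relocation applies in Ernie4_5_MoeModel.__init__: parallel_config is still consulted for enable_eplb, while num_redundant_experts now comes from the new eplb_config handle.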
@@ -570,20 +573,27 @@ class Ernie4_5_MoeModel(nn.Module):
 
                     # Skip loading extra bias for GPTQ models.
                     if (
-                        name.endswith(".bias") or name.endswith("_bias")
-                    ) and name not in params_dict:
+                        name_mapped.endswith(".bias") or name_mapped.endswith("_bias")
+                    ) and name_mapped not in params_dict:
                         continue
-                    param = params_dict[name]
-                    weight_loader = param.weight_loader
-                    weight_loader(
+                    param = params_dict[name_mapped]
+                    # We should ask the weight loader to return success or not
+                    # here since otherwise we may skip experts with other
+                    # available replicas.
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
                         param,
                         loaded_weight,
-                        name,
+                        name_mapped,
                         shard_id=shard_id,
                         expert_id=expert_id,
+                        return_success=True,
                     )
-                    break
+                    if success:
+                        name = name_mapped
+                        break
                 else:
                     if is_expert_weight:
                         # We've checked that this is an expert weight
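The weight-loading change is the substantive part of the fix. Under EPLB a logical expert can have several physical replicas, so one checkpoint tensor can match several (expert_id, shard_id) mappings, not all of which live on the current rank. The old code called the loader and then hit break unconditionally, abandoning the remaining mappings. Below is a toy sketch of the fixed control flow; the loader and the LOCAL_EXPERTS set are assumptions for illustration, with only the return_success=True protocol mirroring the real call.

import typing
from collections.abc import Callable

LOCAL_EXPERTS = {0, 1}  # physical experts hosted on this rank (assumed)

def toy_weight_loader(param, loaded_weight, name, *, shard_id,
                      expert_id, return_success=False):
    # Pretend loading succeeds only for experts local to this rank.
    ok = expert_id in LOCAL_EXPERTS
    if ok:
        param.append((name, shard_id, expert_id))
    if return_success:
        return ok

param: list = []
# Two candidate mappings for the same checkpoint tensor; only the
# second replica is actually hosted on this rank.
for expert_id, shard_id in [(7, "w1"), (1, "w1")]:
    weight_loader = typing.cast(Callable[..., bool], toy_weight_loader)
    if weight_loader(param, b"...", "experts.w1", shard_id=shard_id,
                     expert_id=expert_id, return_success=True):
        break  # stop only once a replica actually loaded
else:
    print("no local replica for this tensor on this rank")

print(param)  # [('experts.w1', 'w1', 1)]
# Pre-fix code would have broken out after the first mapping
# and loaded nothing for this tensor.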