[Bugfix] Fix DeepSeek R1 MTP weight loading (#29545)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
Matthew Bonanni 2025-12-02 10:52:18 -05:00 committed by GitHub
parent 60c3d413af
commit 51c57b51dd

@@ -346,11 +346,16 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
# Use expert_params_mapping to locate the destination
# param and delegate to its expert-aware weight_loader
# with expert_id.
is_expert_weight = False
for mapping in expert_params_mapping:
    param_name, weight_name, expert_id, shard_id = mapping
    if weight_name not in chunk_name:
        continue
    # This is an expert weight and must not be
    # loaded again as a regular (non-expert) weight later
    is_expert_weight = True
    # Do not modify `name`, since the loop may continue here;
    # instead, create a new variable
    name_mapped = chunk_name.replace(weight_name, param_name)
@@ -377,6 +382,12 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
    loaded_params.add(name_mapped)
    break
else:
    if is_expert_weight:
        # We've confirmed this is an expert weight,
        # but it's not mapped locally to this rank,
        # so we simply skip it
        continue
    # Skip loading extra bias for GPTQ models.
    if name.endswith(".bias") and name not in params_dict:
        continue
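
For readers unfamiliar with the pattern, the fix hinges on combining the new `is_expert_weight` flag with Python's `for ... else`: a checkpoint entry that matches an expert mapping but whose expert is not placed on the current rank must be skipped, not handed to the generic loading path below. The following is a minimal, standalone sketch of that control flow, not the vLLM implementation; `classify_weight`, `expert_mappings`, and `local_experts` are made-up names used only for illustration.

def classify_weight(name, expert_mappings, local_experts):
    """Return "expert", "skipped", or "generic" for a checkpoint weight name."""
    is_expert_weight = False
    for param_name, weight_name, expert_id in expert_mappings:
        if weight_name not in name:
            continue
        # The name matched an expert mapping; remember that so it can
        # never fall through to the generic path below.
        is_expert_weight = True
        if expert_id in local_experts:
            # The expert lives on this rank, so the weight is loaded here.
            break
        # Not local: keep scanning the remaining mappings.
    else:
        # Reached only when the loop finished without `break`,
        # i.e. nothing was loaded.
        if is_expert_weight:
            # Expert weight with no local expert: skip it instead of
            # treating it as a regular parameter (the bug fixed above).
            return "skipped"
        return "generic"
    return "expert"


# Example: expert 1 is not on this rank, so its checkpoint weight is skipped.
mappings = [("experts.w13_weight", "experts.0.gate_proj", 0),
            ("experts.w13_weight", "experts.1.gate_proj", 1)]
print(classify_weight("model.experts.1.gate_proj.weight", mappings, {0}))  # skipped
print(classify_weight("model.experts.0.gate_proj.weight", mappings, {0}))  # expert
print(classify_weight("model.embed_tokens.weight", mappings, {0}))         # generic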