From 4f5299f7174ffb10bdc640b47d3494083fc39c48 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 19 Nov 2025 14:50:30 +0100
Subject: [PATCH] Relax Transformers modeling backend MoE experts check (#28952)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/models/supported_models.md                | 4 +++-
 vllm/model_executor/models/transformers/moe.py | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index bd14bbb9ab662..80fe143269a76 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -79,7 +79,9 @@ To make your model compatible with the Transformers modeling backend, it needs:
         1. Add `is_causal = False` to `MyAttention`.
     - If your model is mixture-of-experts (MoE):
         1. Your sparse MoE block must have an attribute called `experts`.
-        2. The class of `experts` (`MyExperts`) must inherit from `nn.ModuleList`.
+        2. The class of `experts` (`MyExperts`) must either:
+            - Inherit from `nn.ModuleList` (naive).
+            - Or contain all 3D `nn.Parameters` (packed).
         3. `MyExperts.forward` must accept `hidden_states`, `top_k_index`, `top_k_weights`.
 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
 3. `MyModel` must contain `_supports_attention_backend = True`.
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py
index 4973014c3d4ed..31db9d682bd40 100644
--- a/vllm/model_executor/models/transformers/moe.py
+++ b/vllm/model_executor/models/transformers/moe.py
@@ -256,7 +256,14 @@ class MoEMixin(MixtureOfExperts):
         def _recursive_replace(module: nn.Module, prefix: str):
             for child_name, child_module in module.named_children():
                 qual_name = maybe_prefix(prefix, child_name)
-                if child_name == "experts" and isinstance(child_module, nn.ModuleList):
+                # Naive implementations will have experts as ModuleList
+                is_modulelist = isinstance(child_module, nn.ModuleList)
+                # Packed implementations will have experts as 3D tensors of shapes like:
+                # gate_up_proj = (num_experts, 2 * intermediate_size, hidden_size)
+                # down_proj = (num_experts, intermediate_size, hidden_size)
+                params = list(child_module.parameters())
+                is_3d = len(params) > 0 and all(p.ndim == 3 for p in params)
+                if child_name == "experts" and (is_modulelist or is_3d):
                     # Alias for readability
                     mlp = module
                     experts = child_module
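
For illustration only (not part of the patch): a minimal sketch of the two `experts` layouts the relaxed check now accepts. The classes `NaiveExperts` and `PackedExperts` and the helper `passes_experts_check` are hypothetical names, not vLLM or Transformers classes; only the condition inside `passes_experts_check` mirrors the patched check in `_recursive_replace`.

```python
# Illustrative sketch only; class and function names below are hypothetical.
import torch
from torch import nn


class NaiveExperts(nn.ModuleList):
    """Naive layout: one small MLP submodule per expert."""

    def __init__(self, num_experts: int, hidden_size: int, intermediate_size: int):
        super().__init__(
            nn.Sequential(
                nn.Linear(hidden_size, intermediate_size),
                nn.Linear(intermediate_size, hidden_size),
            )
            for _ in range(num_experts)
        )


class PackedExperts(nn.Module):
    """Packed layout: every parameter is a 3D tensor indexed by expert."""

    def __init__(self, num_experts: int, hidden_size: int, intermediate_size: int):
        super().__init__()
        # Shapes follow the comment added in the patch.
        self.gate_up_proj = nn.Parameter(
            torch.empty(num_experts, 2 * intermediate_size, hidden_size)
        )
        self.down_proj = nn.Parameter(
            torch.empty(num_experts, intermediate_size, hidden_size)
        )


def passes_experts_check(experts: nn.Module) -> bool:
    """The relaxed condition applied to a child module named "experts"."""
    is_modulelist = isinstance(experts, nn.ModuleList)
    params = list(experts.parameters())
    is_3d = len(params) > 0 and all(p.ndim == 3 for p in params)
    return is_modulelist or is_3d


assert passes_experts_check(NaiveExperts(8, 64, 128))   # naive: ModuleList
assert passes_experts_check(PackedExperts(8, 64, 128))  # packed: all params 3D
```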