[Feature][EPLB] Add eplb support for Qwen3 (#20815)

Signed-off-by: aladerran <aladerran@gmail.com>
2026-06-11 03:29:10 +08:00 · 2025-07-30 21:27:57 +08:00 · 2025-07-30 21:27:57 +08:00 · d979dd6beb
commit d979dd6beb
parent b876860c62
1 changed files with 142 additions and 24 deletions
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@ -22,7 +22,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen3MoE model compatible with HuggingFace weights."""
-from collections.abc import Iterable
+import typing
 from collections.abc import Callable, Iterable
 from typing import Any, Optional, Union
 import torch
@ -31,8 +32,9 @@ from transformers import PretrainedConfig
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
-from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.distributed import (get_ep_group, get_pp_group,
                              get_tensor_model_parallel_world_size)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
@ -50,8 +52,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, extract_layer_index,
+from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
                    is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)
@ -101,23 +103,47 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        enable_eplb: bool = False,
    ):
        super().__init__()
        self.tp_size = get_tensor_model_parallel_world_size()
        self.ep_group = get_ep_group().device_group
        self.ep_rank = self.ep_group.rank()
        self.ep_size = self.ep_group.size()
        self.n_routed_experts = config.num_experts
        if self.tp_size > config.num_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.num_experts}.")
-        self.experts = FusedMoE(num_experts=config.num_experts,
+        # Load balancing settings.
        vllm_config = get_current_vllm_config()
        parallel_config = vllm_config.parallel_config
        self.enable_eplb = enable_eplb
        self.n_logical_experts = self.n_routed_experts
        self.n_redundant_experts = parallel_config.num_redundant_experts
        self.n_physical_experts = (self.n_logical_experts +
                                   self.n_redundant_experts)
        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
        self.physical_expert_start = (self.ep_rank *
                                      self.n_local_physical_experts)
        self.physical_expert_end = (self.physical_expert_start +
                                    self.n_local_physical_experts)
        self.experts = FusedMoE(num_experts=self.n_routed_experts,
                                top_k=config.num_experts_per_tok,
                                hidden_size=config.hidden_size,
                                intermediate_size=config.moe_intermediate_size,
                                reduce_results=False,
                                renormalize=config.norm_topk_prob,
                                quant_config=quant_config,
-                                prefix=f"{prefix}.experts")
+                                prefix=f"{prefix}.experts",
                                enable_eplb=self.enable_eplb,
                                num_redundant_experts=self.n_redundant_experts)
        self.gate = ReplicatedLinear(config.hidden_size,
                                     config.num_experts,
@ -246,6 +272,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        enable_eplb: bool = False,
    ) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
@ -277,7 +304,8 @@ class Qwen3MoeDecoderLayer(nn.Module):
            (layer_idx + 1) % config.decoder_sparse_step == 0):
            self.mlp = Qwen3MoeSparseMoeBlock(config=config,
                                              quant_config=quant_config,
-                                              prefix=f"{prefix}.mlp")
+                                              prefix=f"{prefix}.mlp",
                                              enable_eplb=enable_eplb)
        else:
            self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                   intermediate_size=config.intermediate_size,
@ -323,6 +351,9 @@ class Qwen3MoeModel(nn.Module):
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        parallel_config = vllm_config.parallel_config
        enable_eplb = parallel_config.enable_eplb
        self.num_redundant_experts = parallel_config.num_redundant_experts
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
@ -336,7 +367,8 @@ class Qwen3MoeModel(nn.Module):
            lambda prefix: Qwen3MoeDecoderLayer(config=config,
                                                cache_config=cache_config,
                                                quant_config=quant_config,
-                                                prefix=prefix),
+                                                prefix=prefix,
                                                enable_eplb=enable_eplb),
            prefix=f"{prefix}.layers",
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@ -382,7 +414,8 @@ class Qwen3MoeModel(nn.Module):
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
-            num_experts=self.config.num_experts)
+            num_experts=self.config.num_experts,
            num_redundant_experts=self.num_redundant_experts)
    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
@ -433,27 +466,51 @@ class Qwen3MoeModel(nn.Module):
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                is_expert_weight = False
                for mapping in expert_params_mapping:
                    param_name, weight_name, expert_id, shard_id = mapping
                    if weight_name not in name:
                        continue
-                    name = name.replace(weight_name, param_name)
+
-                    # Skip layers on other devices.
+                    # Anyway, this is an expert weight and should not be
-                    if is_pp_missing_parameter(name, self):
+                    # attempted to load as other weights later
                    is_expert_weight = True
                    # Do not modify `name` since the loop may continue here
                    # Instead, create a new variable
                    name_mapped = name.replace(weight_name, param_name)
                    if is_pp_missing_parameter(name_mapped, self):
                        continue
                    # Skip loading extra parameters for GPTQ/modelopt models.
-                    if name.endswith(
+                    if name_mapped.endswith(
-                            ignore_suffixes) and name not in params_dict:
+                            ignore_suffixes
                    ) and name_mapped not in params_dict:
                        continue
-                    param = params_dict[name]
+
-                    weight_loader = param.weight_loader
+                    param = params_dict[name_mapped]
-                    weight_loader(param,
+                    # We should ask the weight loader to return success or not
-                                  loaded_weight,
+                    # here since otherwise we may skip experts with other
-                                  name,
+                    # available replicas.
-                                  shard_id=shard_id,
+                    weight_loader = typing.cast(Callable[..., bool],
-                                  expert_id=expert_id)
+                                                param.weight_loader)
-                    break
+                    success = weight_loader(param,
                                            loaded_weight,
                                            name_mapped,
                                            shard_id=shard_id,
                                            expert_id=expert_id,
                                            return_success=True)
                    if success:
                        name = name_mapped
                        break
                else:
                    if is_expert_weight:
                        # We've checked that this is an expert weight
                        # However it's not mapped locally to this rank
                        # So we simply skip it
                        continue
                    # Skip loading extra parameters for GPTQ/modelopt models.
                    if name.endswith(
                            ignore_suffixes) and name not in params_dict:
@ -482,7 +539,8 @@ class Qwen3MoeModel(nn.Module):
        return loaded_params
-class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
                          MixtureOfExperts):
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
@ -514,6 +572,66 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
        # Set MoE hyperparameters
        self.expert_weights = []
        self.moe_layers: list[FusedMoE] = []
        example_layer = None
        for layer in self.model.layers:
            if isinstance(layer, PPMissingLayer):
                continue
            assert isinstance(layer, Qwen3MoeDecoderLayer)
            if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
                example_layer = layer.mlp
                self.moe_layers.append(layer.mlp.experts)
        if example_layer is None:
            raise RuntimeError("No Qwen3MoE layer found in the model.layers.")
        self.num_moe_layers = len(self.moe_layers)
        self.num_expert_groups = 1
        self.num_shared_experts = 0
        self.num_logical_experts = example_layer.n_logical_experts
        self.num_physical_experts = example_layer.n_physical_experts
        self.num_local_physical_experts = example_layer.n_local_physical_experts
        self.num_routed_experts = example_layer.n_routed_experts
        self.num_redundant_experts = example_layer.n_redundant_experts
    def set_eplb_state(
        self,
        expert_load_view: torch.Tensor,
        logical_to_physical_map: torch.Tensor,
        logical_replica_count: torch.Tensor,
    ) -> None:
        for layer_idx, layer in enumerate(self.moe_layers):
            # Register the expert weights.
            self.expert_weights.append(layer.get_expert_weights())
            layer.set_eplb_state(
                moe_layer_idx=layer_idx,
                expert_load_view=expert_load_view,
                logical_to_physical_map=logical_to_physical_map,
                logical_replica_count=logical_replica_count,
            )
    def update_physical_experts_metadata(
        self,
        num_physical_experts: int,
        num_local_physical_experts: int,
    ) -> None:
        assert self.num_local_physical_experts == num_local_physical_experts
        self.num_physical_experts = num_physical_experts
        self.num_local_physical_experts = num_local_physical_experts
        self.num_redundant_experts = (num_physical_experts -
                                      self.num_logical_experts)
        for layer in self.model.layers:
            if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
                moe = layer.mlp
                moe.n_local_physical_experts = num_local_physical_experts
                moe.n_physical_experts = num_physical_experts
                moe.n_redundant_experts = self.num_redundant_experts
                moe.experts.update_expert_map()
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)