[BugFix] Fix Llama4 Pipeline Parallelism Assert Error (#28577)

Co-authored-by: Dezhan Tu <dztu@meta.com>

Authored by Dezhan on 2025-11-20 02:52:36 -08:00, committed by GitHub
parent fb8851f254
commit dc45efc8ef
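
With pipeline parallelism enabled, `self.model.layers` contains `PPMissingLayer` placeholders for the decoder layers hosted on other PP ranks, so the unconditional `assert isinstance(layer, Llama4DecoderLayer)` in `Llama4ForCausalLM` fired. The fix imports `PPMissingLayer` and skips those placeholders in both loops over the model's layers.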

@@ -53,6 +53,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk
 from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel
 from .utils import (
     AutoWeightsLoader,
+    PPMissingLayer,
     extract_layer_index,
     fast_topk,
     is_pp_missing_parameter,
@@ -729,6 +730,9 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts):
         self.moe_layers = []
         example_moe = None
         for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
             assert isinstance(layer, Llama4DecoderLayer)
             if isinstance(layer.feed_forward, Llama4MoE):
                 # Pick last one layer since the first ones may be dense layers.
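
For intuition, here is a minimal, self-contained sketch of the guard. The three classes below are stand-ins for illustration, not vLLM's real modules: under pipeline parallelism only a contiguous slice of the decoder layers is materialized on each rank, and the rest appear as placeholders, so the type assert must be skipped for them.

```python
# Minimal standalone sketch; PPMissingLayer, Llama4MoE, and Llama4DecoderLayer
# are stand-in classes here, not vLLM's real implementations.
class PPMissingLayer:
    """Placeholder for a layer hosted on another pipeline-parallel rank."""


class Llama4MoE:
    pass


class Llama4DecoderLayer:
    def __init__(self, feed_forward):
        self.feed_forward = feed_forward


# With pipeline parallelism, only some layers are local; the rest are
# PPMissingLayer placeholders that have no feed_forward attribute.
layers = [PPMissingLayer(), Llama4DecoderLayer(Llama4MoE()), PPMissingLayer()]

moe_layers = []
for layer in layers:
    if isinstance(layer, PPMissingLayer):
        continue  # skip layers owned by other pipeline stages
    assert isinstance(layer, Llama4DecoderLayer)  # now holds on every rank
    if isinstance(layer.feed_forward, Llama4MoE):
        moe_layers.append(layer.feed_forward)

print(len(moe_layers))  # -> 1
```

Without the `continue`, the first `PPMissingLayer` in the list would trip the assert on any rank that does not host every decoder layer.
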
@@ -765,6 +769,9 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts):
         self.num_local_physical_experts = num_local_physical_experts
         self.num_redundant_experts = num_physical_experts - self.num_logical_experts
         for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
             if isinstance(layer.feed_forward, Llama4MoE):
                 moe = layer.feed_forward
                 moe.n_local_physical_experts = num_local_physical_experts
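
The same guard protects the expert-rebalance path: placeholders have no `feed_forward` attribute, so they must be skipped before the MoE metadata is updated. A sketch reusing the stand-in classes above (`update_expert_metadata` is a hypothetical helper written for this example, not a vLLM API):

```python
# Continues the sketch above; update_expert_metadata is a hypothetical helper.
def update_expert_metadata(layers, num_local_physical_experts):
    for layer in layers:
        if isinstance(layer, PPMissingLayer):
            continue  # placeholder: no feed_forward to update
        if isinstance(layer.feed_forward, Llama4MoE):
            # Record how many physical experts this rank now hosts.
            layer.feed_forward.n_local_physical_experts = num_local_physical_experts


update_expert_metadata(layers, num_local_physical_experts=4)
```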