diff --git a/tests/models/registry.py b/tests/models/registry.py
index c9d4823d52792..1f4a106c06b4b 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -913,6 +913,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
         "Qwen/Qwen2.5-VL-7B-Instruct",
         speculative_model="Rayzl/qwen2.5-vl-7b-eagle3-sgl",
     ),
+    "Eagle3Qwen3vlForCausalLM": _HfExamplesInfo(
+        "Qwen/Qwen3-VL-8B-Instruct",
+        speculative_model="taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3",
+    ),
     "Qwen3NextMTP": _HfExamplesInfo(
         "Qwen/Qwen3-Next-80B-A3B-Instruct", min_transformers_version="4.56.3"
     ),
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 03396270a31cb..3a25f7411eecd 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -283,6 +283,19 @@ def test_speculators_model_integration(
     ["model_setup", "mm_enabled", "enable_chunked_prefill"],
     [
         (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False),
+        pytest.param(
+            (
+                "eagle3",
+                "Qwen/Qwen3-VL-8B-Instruct",
+                "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3",
+                1,
+            ),
+            False,
+            False,
+            marks=pytest.mark.skip(
+                reason="architecture of its eagle3 is LlamaForCausalLMEagle3"
+            ),
+        ),
         pytest.param(
             (
                 "eagle3",
@@ -352,6 +365,7 @@ def test_speculators_model_integration(
     ],
     ids=[
         "qwen3_eagle3",
+        "qwen3_vl_eagle3",
         "qwen2_5_vl_eagle3",
         "llama3_eagle",
         "llama3_eagle3",
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 4cd6fa14c32df..52d31e70a8f05 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -89,6 +89,7 @@ from vllm.utils.collection_utils import is_list_of
 
 from .interfaces import (
     MultiModalEmbeddings,
+    SupportsEagle3,
     SupportsLoRA,
     SupportsMRoPE,
     SupportsMultiModal,
@@ -1122,9 +1123,14 @@ class Qwen3LLMModel(Qwen3Model):
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
+
+        aux_hidden_states = []
         for layer_idx, layer in islice(
             enumerate(self.layers), self.start_layer, self.end_layer
         ):
+            if layer_idx in self.aux_hidden_state_layers:
+                aux_hidden_states.append(hidden_states + residual)
+
             hidden_states, residual = layer(
                 positions,
                 hidden_states,
@@ -1144,6 +1150,9 @@ class Qwen3LLMModel(Qwen3Model):
                 {"hidden_states": hidden_states, "residual": residual}
             )
         hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) > 0:
+            return hidden_states, aux_hidden_states
         return hidden_states
 
 
@@ -1186,7 +1195,12 @@ class Qwen3LLMForCausalLM(Qwen3ForCausalLM):
     dummy_inputs=Qwen3VLDummyInputsBuilder,
 )
 class Qwen3VLForConditionalGeneration(
-    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+    nn.Module,
+    SupportsMultiModal,
+    SupportsLoRA,
+    SupportsPP,
+    SupportsMRoPE,
+    SupportsEagle3,
 ):
     merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"}
@@ -1279,6 +1293,13 @@ class Qwen3VLForConditionalGeneration(
         self.visual_dim = config.vision_config.out_hidden_size
         self.multiscale_dim = self.visual_dim * self.deepstack_num_level
 
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.language_model.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        num_layers = len(self.language_model.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
+
     def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors:
         # get deepstack_input_embeds from buffer, and clear the buffer
         return IntermediateTensors(
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index ba9f33819c950..0d582043e8c02 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -414,6 +414,7 @@ _SPECULATIVE_DECODING_MODELS = {
     "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
+    "Eagle3Qwen3vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"),
     "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
     "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"),
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 7600df48150ac..305abdade8da6 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -1017,10 +1017,10 @@ class EagleProposer:
 
         if supports_multimodal(target_model):
             # handle multimodality
-            if (
-                self.get_model_name(target_model)
-                == "Qwen2_5_VLForConditionalGeneration"
-            ):
+            if self.get_model_name(target_model) in [
+                "Qwen2_5_VLForConditionalGeneration",
+                "Qwen3VLForConditionalGeneration",
+            ]:
                 self.model.config.image_token_index = target_model.config.image_token_id
             else:
                 self.model.config.image_token_index = (
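
For reference, below is a minimal usage sketch (not part of the patch) of the Eagle3 path this diff enables for Qwen3-VL. It assumes vLLM's offline LLM entry point with a speculative_config dict, and that the draft checkpoint resolves to one of the architectures registered above; note the e2e test case added here is still skipped because the published checkpoint reports the LlamaForCausalLMEagle3 architecture. The num_speculative_tokens value is illustrative.

# Hypothetical usage sketch: run Qwen3-VL with the Eagle3 draft model wired up
# by this patch. Assumes the offline LLM API with a speculative_config dict.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-VL-8B-Instruct",
    speculative_config={
        "method": "eagle3",
        "model": "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3",
        "num_speculative_tokens": 2,  # illustrative; tune for your workload
    },
)
outputs = llm.generate(
    ["Describe speculative decoding in one sentence."],
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)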