From 81573635da2f88e2487ae7a7646c7d8bc80645e0 Mon Sep 17 00:00:00 2001
From: prashanth058
Date: Tue, 25 Nov 2025 21:37:17 +0000
Subject: [PATCH] Qwen2.5-VL and Qwen3-VL LoRA fixes and tests

Signed-off-by: prashanth058
---
 tests/lora/conftest.py                       | 10 ++++
 .../lora/{test_qwen2vl.py => test_qwenvl.py} | 57 ++++++++++++++-----
 vllm/lora/layers/column_parallel_linear.py   |  7 ++-
 vllm/lora/models.py                          |  6 +-
 vllm/model_executor/models/qwen3_vl.py       |  2 +-
 vllm/v1/worker/gpu_model_runner.py           | 16 ++++--
 vllm/v1/worker/lora_model_runner_mixin.py    |  2 +-
 7 files changed, 75 insertions(+), 25 deletions(-)
 rename tests/lora/{test_qwen2vl.py => test_qwenvl.py} (85%)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 1b301ee75f802..74e2fe22414d2 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -240,6 +240,16 @@ def qwen2vl_vision_tower_lora_files():
     return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower")
 
 
+@pytest.fixture(scope="session")
+def qwen25vl_vision_lora_files():
+    return snapshot_download(repo_id="prashanth058/qwen2.5-3b-vl-flickr-lora-vision")
+
+
+@pytest.fixture(scope="session")
+def qwen3vl_vision_lora_files():
+    return snapshot_download(repo_id="prashanth058/qwen3-4b-vl-lora-vision-connector")
+
+
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
     return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwenvl.py
similarity index 85%
rename from tests/lora/test_qwen2vl.py
rename to tests/lora/test_qwenvl.py
index 41c06cca36b27..ec9990fee0f67 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwenvl.py
@@ -14,8 +14,9 @@ class TestConfig:
     lora_path: str
     max_num_seqs: int = 2
     max_loras: int = 2
-    max_lora_rank: int = 16
-    max_model_len: int = 4096
+    max_lora_rank: int = 32
+    max_model_len: int = 8192
+    gpu_memory_utilization: float = 0.85
     mm_processor_kwargs: dict[str, int] | None = None
 
     def __post_init__(self):
@@ -49,6 +50,7 @@ class Qwen2VLTester:
             max_loras=self.config.max_loras,
             max_lora_rank=self.config.max_lora_rank,
             trust_remote_code=True,
+            gpu_memory_utilization=self.config.gpu_memory_utilization,
             mm_processor_kwargs=self.config.mm_processor_kwargs,
             max_model_len=self.config.max_model_len,
         )
@@ -142,6 +144,16 @@ EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
     "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
 ]
 
+EXPECTED_OUTPUTS_VISION_QWEN2_5_VL = [
+    "A black car is driving past a stop sign and a large red and gold arch.",
+    "A view of the Tokyo Skytree through the branches of a cherry blossom tree.",
+]
+
+EXPECTED_OUTPUTS_VISION_QWEN3_VL = [
+    "A black SUV drives past a stop sign in front of a Chinese gate.",
+    "A white tower is seen through pink flowers.",
+]
+
 # NOTE - beam search .text contains the whole text
 EXPECTED_BEAM_SEARCH_OUTPUTS = [
     [
@@ -152,6 +164,7 @@ EXPECTED_BEAM_SEARCH_OUTPUTS = [
 
 QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
 QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
+QWEN3VL_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"
 
 
 def test_qwen2vl_lora(qwen2vl_lora_files):
@@ -192,10 +205,6 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
         tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
 
 
-@pytest.mark.xfail(
-    current_platform.is_rocm(),
-    reason="Qwen2-VL dependency xformers incompatible with ROCm",
-)
 def test_qwen2vl_language_lora(qwen2vl_language_lora_files):
     """
     Test language-only LoRA adapter.
@@ -210,10 +219,6 @@ def test_qwen2vl_language_lora(qwen2vl_language_lora_files):
     )
 
 
-@pytest.mark.xfail(
-    current_platform.is_rocm(),
-    reason="Qwen2-VL dependency xformers incompatible with ROCm",
-)
 def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files):
     """
     Test vision tower + connector LoRA adapter.
@@ -229,10 +234,6 @@ def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files):
     )
 
 
-@pytest.mark.xfail(
-    current_platform.is_rocm(),
-    reason="Qwen2-VL dependency xformers incompatible with ROCm",
-)
 def test_qwen2vl_vision_no_connector_lora(
     qwen2vl_vision_tower_lora_files,
 ):
@@ -251,3 +252,31 @@ def test_qwen2vl_vision_no_connector_lora(
         expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
         lora_id=lora_id,
     )
+
+
+def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
+    config = TestConfig(
+        model_path=QWEN25VL_MODEL_PATH,
+        lora_path=qwen25vl_vision_lora_files,
+    )
+    tester = Qwen2VLTester(config)
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_VISION_QWEN2_5_VL,
+            lora_id=lora_id,
+        )
+
+
+def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
+    config = TestConfig(
+        model_path=QWEN3VL_MODEL_PATH,
+        lora_path=qwen3vl_vision_lora_files,
+    )
+    tester = Qwen2VLTester(config)
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_VISION_QWEN3_VL,
+            lora_id=lora_id,
+        )
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index 3e21d426c304a..f6f610669056d 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -340,7 +340,12 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         packed_modules_list: list,
         model_config: PretrainedConfig | None,
     ) -> bool:
-        return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1
+        # Vision tower QKV has packed_modules_list=[] (already packed in checkpoint)
+        # Language models have packed_modules_list=[module_name]
+        # (single LoRA for qkv_proj)
+        return type(source_layer) is QKVParallelLinear and (
+            len(packed_modules_list) <= 1
+        )
 
 
 class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 27a85a3ca2297..8402cd73b9263 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -562,10 +562,12 @@ class LoRAModelManager:
         target_wrapper = self.punica_wrapper
 
         if self.supports_mm_lora:
-            if mapping.type == LoRAMappingType.TOWER:
+            if mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model:
                 target_name = self.mm_mapping.tower_model[0]
                 target_wrapper = self.mm_punica_wrapper_mapping[target_name]
-            elif mapping.type == LoRAMappingType.CONNECTOR:
+            elif (
+                mapping.type == LoRAMappingType.CONNECTOR and self.mm_mapping.connector
+            ):
                 target_name = self.mm_mapping.connector[0]
                 target_wrapper = self.mm_punica_wrapper_mapping[target_name]
             else:
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 4cd6fa14c32df..181c2957565ea 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1675,6 +1675,6 @@ class Qwen3VLForConditionalGeneration(
         """
         return MultiModelKeys.from_string_field(
             language_model="language_model",
-            connector="visual.merger",
+            connector=["visual.merger", "visual.deepstack_merger_list"],
             tower_model="visual.",
         )
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 3126687fb4baf..9c4eff8b16389 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2075,7 +2075,9 @@ class GPUModelRunner(
             req_idx = self.input_batch.req_id_to_index[req_id]
             lora_id = int(self.input_batch.request_lora_mapping[req_idx])
 
-            num_tokens = self.info.get_num_mm_encoder_tokens(pos_info.length)
+            num_tokens = self.info.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
+                pos_info.length
+            )
 
             prompt_lora_mapping.append(lora_id)
             token_lora_mapping.extend([lora_id] * num_tokens)
@@ -2095,16 +2097,18 @@ class GPUModelRunner(
         if hasattr(self.info, "get_num_mm_connector_tokens"):
             num_post_op_tokens = []
             for _, pos_info in mm_hashes_pos:
-                mm_token_count = self.info.get_num_mm_encoder_tokens(
+                mm_token_count = self.info.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
                     pos_info.length
                 )
-                post_op_count = self.info.get_num_mm_connector_tokens(
+                post_op_count = self.info.get_num_mm_connector_tokens(  # type: ignore[attr-defined]
                     mm_token_count
                 )
                 num_post_op_tokens.append(post_op_count)
 
+            last_mapping = self.lora_manager._adapter_manager._last_mapping
+            assert last_mapping is not None
             lora_ids = np.array(
-                self.lora_manager._adapter_manager._last_mapping.prompt_mapping,
+                last_mapping.prompt_mapping,
                 dtype=np.int32,
             )
             post_op_counts_np = np.array(num_post_op_tokens, dtype=np.int32)
@@ -2112,8 +2116,8 @@ class GPUModelRunner(
 
             connector_mapping = LoRAMapping(
                 index_mapping=tuple(new_token_indices.tolist()),
-                prompt_mapping=self.lora_manager._adapter_manager._last_mapping.prompt_mapping,
-                is_prefill=self.lora_manager._adapter_manager._last_mapping.is_prefill,
+                prompt_mapping=last_mapping.prompt_mapping,
+                is_prefill=last_mapping.is_prefill,
                 type=LoRAMappingType.CONNECTOR,
             )
 
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index ed6b5525fa1a7..efb184d060d43 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -33,7 +33,7 @@ class LoRAModelRunnerMixin:
         model: nn.Module,
         vllm_config: VllmConfig,
         device: torch.device,
-        model_config: ModelConfig = None,
+        model_config: ModelConfig | None = None,
     ) -> nn.Module:
         if not supports_lora(model):
             raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.")
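
Reviewer note on the `can_replace_layer` change in `vllm/lora/layers/column_parallel_linear.py`: vision-tower checkpoints ship a single fused qkv weight, so the LoRA resolver sees `packed_modules_list == []`, while language models pass exactly one packed name (`qkv_proj`); the old `== 1` check therefore rejected vision-tower adapters. The sketch below is minimal and self-contained — the `QKVParallelLinear` class here is a stand-in, not the real vLLM layer — and only demonstrates the predicate logic:

```python
# Minimal sketch of the relaxed can_replace_layer check.
# QKVParallelLinear below is a stand-in class, not vLLM's layer.


class QKVParallelLinear:
    pass


def can_replace_layer(source_layer: object, packed_modules_list: list) -> bool:
    # The old check was `len(packed_modules_list) == 1`, which rejected
    # vision-tower LoRAs whose qkv weight is already fused (empty list).
    return type(source_layer) is QKVParallelLinear and len(packed_modules_list) <= 1


layer = QKVParallelLinear()
assert can_replace_layer(layer, [])            # vision tower: qkv fused in checkpoint
assert can_replace_layer(layer, ["qkv_proj"])  # language model: one packed module
assert not can_replace_layer(layer, ["q_proj", "k_proj", "v_proj"])  # Merged variant's case
```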
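
Reviewer note on the connector mapping in `vllm/v1/worker/gpu_model_runner.py`: the second and third hunks expand the per-request LoRA ids (`last_mapping.prompt_mapping`) into one index per connector output token, using the per-item counts from `get_num_mm_connector_tokens`. The construction of `new_token_indices` sits outside the hunk's context lines, so the sketch below uses `np.repeat` to illustrate the intended shape of the mapping; it is an assumption, not the verbatim implementation:

```python
import numpy as np

# Assumed illustration: expand one LoRA id per multimodal item into one
# entry per connector (merger) output token.
lora_ids = np.array([1, 2], dtype=np.int32)        # prompt_mapping: LoRA id per item
post_op_counts = np.array([4, 6], dtype=np.int32)  # connector tokens per item
new_token_indices = np.repeat(lora_ids, post_op_counts)

assert new_token_indices.tolist() == [1, 1, 1, 1, 2, 2, 2, 2, 2, 2]
```

The new adapters can then be exercised with, e.g., `pytest tests/lora/test_qwenvl.py -k "qwen25vl_vision or qwen3vl_vision"`.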