Qwen2.5-VL & Qwen3-VL fixes and tests

Signed-off-by: prashanth058 <prashanth.dannamaneni@uipath.com>
This commit is contained in:
prashanth058 2025-11-25 21:37:17 +00:00
parent a3647878c2
commit 81573635da
7 changed files with 75 additions and 25 deletions

View File

@ -240,6 +240,16 @@ def qwen2vl_vision_tower_lora_files():
return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower")
@pytest.fixture(scope="session")
def qwen25vl_vision_lora_files():
    """Fetch the Qwen2.5-VL 3B vision LoRA adapter snapshot from the HF Hub.

    Session-scoped so the (potentially large) download happens at most once
    per test run; returns the local snapshot directory path.
    """
    repo = "prashanth058/qwen2.5-3b-vl-flickr-lora-vision"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen3vl_vision_lora_files():
    """Fetch the Qwen3-VL 4B vision+connector LoRA adapter from the HF Hub.

    Session-scoped so the download happens at most once per test run;
    returns the local snapshot directory path.
    """
    repo = "prashanth058/qwen3-4b-vl-lora-vision-connector"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def tinyllama_lora_files():
    """Fetch the TinyLlama colorist LoRA adapter snapshot from the HF Hub.

    Session-scoped so the download happens at most once per test run;
    returns the local snapshot directory path.
    """
    repo = "jashing/tinyllama-colorist-lora"
    return snapshot_download(repo_id=repo)

View File

@ -14,8 +14,9 @@ class TestConfig:
lora_path: str
max_num_seqs: int = 2
max_loras: int = 2
max_lora_rank: int = 16
max_model_len: int = 4096
max_lora_rank: int = 32
max_model_len: int = 8192
gpu_memory_utilization: float = 0.85
mm_processor_kwargs: dict[str, int] | None = None
def __post_init__(self):
@ -49,6 +50,7 @@ class Qwen2VLTester:
max_loras=self.config.max_loras,
max_lora_rank=self.config.max_lora_rank,
trust_remote_code=True,
gpu_memory_utilization=self.config.gpu_memory_utilization,
mm_processor_kwargs=self.config.mm_processor_kwargs,
max_model_len=self.config.max_model_len,
)
@ -142,6 +144,16 @@ EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
"A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
]
# Expected captions for TEST_IMAGES when the Qwen2.5-VL vision LoRA adapter
# is applied (consumed by test_qwen25vl_vision_lora below).
EXPECTED_OUTPUTS_VISION_QWEN2_5_VL = [
    "A black car is driving past a stop sign and a large red and gold arch.",
    "A view of the Tokyo Skytree through the branches of a cherry blossom tree.",
]
# Expected captions for TEST_IMAGES when the Qwen3-VL vision+connector LoRA
# adapter is applied (consumed by test_qwen3vl_vision_lora below).
EXPECTED_OUTPUTS_VISION_QWEN3_VL = [
    "A black SUV drives past a stop sign in front of a Chinese gate.",
    "A white tower is seen through pink flowers.",
]
# NOTE - beam search .text contains the whole text
EXPECTED_BEAM_SEARCH_OUTPUTS = [
[
@ -152,6 +164,7 @@ EXPECTED_BEAM_SEARCH_OUTPUTS = [
# Hugging Face Hub model IDs for the base checkpoints exercised by these tests.
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN3VL_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"
def test_qwen2vl_lora(qwen2vl_lora_files):
@ -192,10 +205,6 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm",
)
def test_qwen2vl_language_lora(qwen2vl_language_lora_files):
"""
Test language-only LoRA adapter.
@ -210,10 +219,6 @@ def test_qwen2vl_language_lora(qwen2vl_language_lora_files):
)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm",
)
def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files):
"""
Test vision tower + connector LoRA adapter.
@ -229,10 +234,6 @@ def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files):
)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm",
)
def test_qwen2vl_vision_no_connector_lora(
qwen2vl_vision_tower_lora_files,
):
@ -251,3 +252,31 @@ def test_qwen2vl_vision_no_connector_lora(
expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
lora_id=lora_id,
)
def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
    """Caption TEST_IMAGES with the Qwen2.5-VL vision LoRA adapter.

    Runs the same adapter under two different LoRA slot ids to confirm the
    expected captions are produced regardless of which slot it occupies.
    """
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN25VL_MODEL_PATH,
            lora_path=qwen25vl_vision_lora_files,
        )
    )
    for slot in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_VISION_QWEN2_5_VL,
            lora_id=slot,
        )
def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
    """Caption TEST_IMAGES with the Qwen3-VL vision+connector LoRA adapter.

    Runs the same adapter under two different LoRA slot ids to confirm the
    expected captions are produced regardless of which slot it occupies.
    """
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN3VL_MODEL_PATH,
            lora_path=qwen3vl_vision_lora_files,
        )
    )
    for slot in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_VISION_QWEN3_VL,
            lora_id=slot,
        )

View File

@ -340,7 +340,12 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
packed_modules_list: list,
model_config: PretrainedConfig | None,
) -> bool:
return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1
# Vision tower QKV has packed_modules_list=[] (already packed in checkpoint)
# Language models have packed_modules_list=[module_name]
# (single LoRA for qkv_proj)
return type(source_layer) is QKVParallelLinear and (
len(packed_modules_list) <= 1
)
class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):

View File

@ -562,10 +562,12 @@ class LoRAModelManager:
target_wrapper = self.punica_wrapper
if self.supports_mm_lora:
if mapping.type == LoRAMappingType.TOWER:
if mapping.type == LoRAMappingType.TOWER and self.mm_mapping.tower_model:
target_name = self.mm_mapping.tower_model[0]
target_wrapper = self.mm_punica_wrapper_mapping[target_name]
elif mapping.type == LoRAMappingType.CONNECTOR:
elif (
mapping.type == LoRAMappingType.CONNECTOR and self.mm_mapping.connector
):
target_name = self.mm_mapping.connector[0]
target_wrapper = self.mm_punica_wrapper_mapping[target_name]
else:

View File

@ -1675,6 +1675,6 @@ class Qwen3VLForConditionalGeneration(
"""
return MultiModelKeys.from_string_field(
language_model="language_model",
connector="visual.merger",
connector=["visual.merger", "visual.deepstack_merger_list"],
tower_model="visual.",
)

View File

@ -2075,7 +2075,9 @@ class GPUModelRunner(
req_idx = self.input_batch.req_id_to_index[req_id]
lora_id = int(self.input_batch.request_lora_mapping[req_idx])
num_tokens = self.info.get_num_mm_encoder_tokens(pos_info.length)
num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined]
pos_info.length
)
prompt_lora_mapping.append(lora_id)
token_lora_mapping.extend([lora_id] * num_tokens)
@ -2095,16 +2097,18 @@ class GPUModelRunner(
if hasattr(self.info, "get_num_mm_connector_tokens"):
num_post_op_tokens = []
for _, pos_info in mm_hashes_pos:
mm_token_count = self.info.get_num_mm_encoder_tokens(
mm_token_count = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined]
pos_info.length
)
post_op_count = self.info.get_num_mm_connector_tokens(
post_op_count = self.info.get_num_mm_connector_tokens( # type: ignore[attr-defined]
mm_token_count
)
num_post_op_tokens.append(post_op_count)
last_mapping = self.lora_manager._adapter_manager._last_mapping
assert last_mapping is not None
lora_ids = np.array(
self.lora_manager._adapter_manager._last_mapping.prompt_mapping,
last_mapping.prompt_mapping,
dtype=np.int32,
)
post_op_counts_np = np.array(num_post_op_tokens, dtype=np.int32)
@ -2112,8 +2116,8 @@ class GPUModelRunner(
connector_mapping = LoRAMapping(
index_mapping=tuple(new_token_indices.tolist()),
prompt_mapping=self.lora_manager._adapter_manager._last_mapping.prompt_mapping,
is_prefill=self.lora_manager._adapter_manager._last_mapping.is_prefill,
prompt_mapping=last_mapping.prompt_mapping,
is_prefill=last_mapping.is_prefill,
type=LoRAMappingType.CONNECTOR,
)

View File

@ -33,7 +33,7 @@ class LoRAModelRunnerMixin:
model: nn.Module,
vllm_config: VllmConfig,
device: torch.device,
model_config: ModelConfig = None,
model_config: ModelConfig | None = None,
) -> nn.Module:
if not supports_lora(model):
raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.")