mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-24 05:51:19 +08:00
add an enable option
Signed-off-by: bk-201 <joy25810@foxmail.com>
This commit is contained in:
parent
1745bb9353
commit
113eb2e0b8
@ -15,6 +15,7 @@ class TestConfig:
|
|||||||
max_num_seqs: int = 2
|
max_num_seqs: int = 2
|
||||||
max_loras: int = 2
|
max_loras: int = 2
|
||||||
max_lora_rank: int = 32
|
max_lora_rank: int = 32
|
||||||
|
enable_mm_lora: bool = True
|
||||||
max_model_len: int = 8192
|
max_model_len: int = 8192
|
||||||
gpu_memory_utilization: float = 0.85
|
gpu_memory_utilization: float = 0.85
|
||||||
mm_processor_kwargs: dict[str, int] | None = None
|
mm_processor_kwargs: dict[str, int] | None = None
|
||||||
@ -49,6 +50,7 @@ class Qwen2VLTester:
|
|||||||
enable_lora=True,
|
enable_lora=True,
|
||||||
max_loras=self.config.max_loras,
|
max_loras=self.config.max_loras,
|
||||||
max_lora_rank=self.config.max_lora_rank,
|
max_lora_rank=self.config.max_lora_rank,
|
||||||
|
enable_mm_lora=self.config.enable_mm_lora,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
gpu_memory_utilization=self.config.gpu_memory_utilization,
|
gpu_memory_utilization=self.config.gpu_memory_utilization,
|
||||||
mm_processor_kwargs=self.config.mm_processor_kwargs,
|
mm_processor_kwargs=self.config.mm_processor_kwargs,
|
||||||
|
|||||||
@ -55,6 +55,9 @@ class LoRAConfig:
|
|||||||
per prompt. When run in offline mode, the lora IDs for n modalities
|
per prompt. When run in offline mode, the lora IDs for n modalities
|
||||||
will be automatically assigned to 1-n with the names of the modalities
|
will be automatically assigned to 1-n with the names of the modalities
|
||||||
in alphabetic order."""
|
in alphabetic order."""
|
||||||
|
enable_mm_lora: bool = False
|
||||||
|
"""If `True`, LoRA support for multimodal models will be enabled. Currently,
|
||||||
|
only the Qwen-VL series models support this feature. The default is `False`."""
|
||||||
|
|
||||||
def compute_hash(self) -> str:
|
def compute_hash(self) -> str:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -484,6 +484,7 @@ class EngineArgs:
|
|||||||
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
|
fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
|
||||||
max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
|
max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
|
||||||
lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
|
lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
|
||||||
|
enable_mm_lora: bool = LoRAConfig.enable_mm_lora
|
||||||
|
|
||||||
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
|
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
|
||||||
num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
|
num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
|
||||||
@ -985,6 +986,11 @@ class EngineArgs:
|
|||||||
"--lora-dtype",
|
"--lora-dtype",
|
||||||
**lora_kwargs["lora_dtype"],
|
**lora_kwargs["lora_dtype"],
|
||||||
)
|
)
|
||||||
|
lora_group.add_argument(
|
||||||
|
"--enable-mm-lora",
|
||||||
|
action=argparse.BooleanOptionalAction,
|
||||||
|
**lora_kwargs["enable_mm_lora"],
|
||||||
|
)
|
||||||
lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"])
|
lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"])
|
||||||
lora_group.add_argument(
|
lora_group.add_argument(
|
||||||
"--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"]
|
"--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"]
|
||||||
@ -1660,6 +1666,7 @@ class EngineArgs:
|
|||||||
default_mm_loras=self.default_mm_loras,
|
default_mm_loras=self.default_mm_loras,
|
||||||
fully_sharded_loras=self.fully_sharded_loras,
|
fully_sharded_loras=self.fully_sharded_loras,
|
||||||
lora_dtype=self.lora_dtype,
|
lora_dtype=self.lora_dtype,
|
||||||
|
enable_mm_lora=self.enable_mm_lora,
|
||||||
max_cpu_loras=self.max_cpu_loras
|
max_cpu_loras=self.max_cpu_loras
|
||||||
if self.max_cpu_loras and self.max_cpu_loras > 0
|
if self.max_cpu_loras and self.max_cpu_loras > 0
|
||||||
else None,
|
else None,
|
||||||
|
|||||||
@ -367,10 +367,11 @@ class LoRAModelManager:
|
|||||||
if self.supports_mm:
|
if self.supports_mm:
|
||||||
model_config: ModelConfig = vllm_config.model_config
|
model_config: ModelConfig = vllm_config.model_config
|
||||||
self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping()
|
self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping()
|
||||||
self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info
|
if self.lora_config.enable_mm_lora:
|
||||||
self.supports_mm_lora = self.supports_mm and hasattr(
|
self.info = MULTIMODAL_REGISTRY.create_processor(model_config).info
|
||||||
self.info, "get_num_mm_encoder_tokens"
|
self.supports_mm_lora = self.supports_mm and hasattr(
|
||||||
)
|
self.info, "get_num_mm_encoder_tokens"
|
||||||
|
)
|
||||||
|
|
||||||
if not self.supports_mm_lora:
|
if not self.supports_mm_lora:
|
||||||
return
|
return
|
||||||
@ -380,7 +381,6 @@ class LoRAModelManager:
|
|||||||
vllm_config.scheduler_config,
|
vllm_config.scheduler_config,
|
||||||
MULTIMODAL_REGISTRY,
|
MULTIMODAL_REGISTRY,
|
||||||
)
|
)
|
||||||
self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping()
|
|
||||||
limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values())
|
limit_per_prompt: int = max(self.info.get_allowed_mm_limits().values())
|
||||||
|
|
||||||
# For vision tower
|
# For vision tower
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user