diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 6838fc227f355..25d939fc61d54 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -438,8 +438,8 @@ th {
 | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
 | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
 | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
-| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ |
-| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | | ✅︎ |
+| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | ✅︎ | ✅︎ |
+| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | ✅︎ | ✅︎ |
 | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
 | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ |
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 6765ee0c5779c..328e1651a1387 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -50,7 +50,12 @@ from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader,
     sharded_weight_loader,
 )
-from vllm.model_executor.models.interfaces import HasInnerState, IsHybrid, SupportsPP
+from vllm.model_executor.models.interfaces import (
+    HasInnerState,
+    IsHybrid,
+    SupportsLoRA,
+    SupportsPP,
+)
 from vllm.model_executor.models.utils import (
     is_pp_missing_parameter,
     make_empty_intermediate_tensors_factory,
@@ -105,6 +110,7 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
         self.quant_config = vllm_config.quant_config
+        self.is_lora_enabled = bool(vllm_config.lora_config)
         self.hidden_size = self.config.hidden_size
         self.ssm_state_size = self.config.mamba_d_state
         self.conv_kernel_size = self.config.mamba_d_conv
@@ -202,7 +208,11 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         self.prefix = prefix
 
     def _project_ssm_parameters(self, hidden_states):
-        ssm_parameters = self.bcdt_proj(hidden_states)
+        if self.is_lora_enabled:
+            # LoRA kernels require a contiguous input tensor.
+            ssm_parameters = self.bcdt_proj(hidden_states.contiguous())
+        else:
+            ssm_parameters = self.bcdt_proj(hidden_states)
         B, C, time_step = torch.split(
             ssm_parameters,
             [self.ssm_state_size, self.ssm_state_size, self.time_step_rank],
@@ -780,13 +790,13 @@ class Plamo2Model(torch.nn.Module):
         return hidden_states
 
 
-class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
+class Plamo2ForCausalLM(
+    torch.nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid
+):
     packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
+        "qkv_proj": ["qkv_proj"],
+        "gate_up_proj": ["gate_up_proj"],
+        "in_proj": ["in_proj"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py
index 3557104d905cb..3550c9fa7f65d 100644
--- a/vllm/model_executor/models/plamo3.py
+++ b/vllm/model_executor/models/plamo3.py
@@ -35,7 +35,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     composed_weight_loader,
     default_weight_loader,
 )
-from vllm.model_executor.models.interfaces import SupportsPP
+from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
 from vllm.model_executor.models.utils import (
     AutoWeightsLoader,
     extract_layer_index,
@@ -369,13 +369,10 @@ class Plamo3Model(nn.Module):
         return hidden_states
 
 
-class Plamo3ForCausalLM(nn.Module, SupportsPP):
+class Plamo3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
+        "qkv_proj": ["qkv_proj"],
+        "gate_up_proj": ["gate_up_proj"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
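For reference, a minimal offline-inference sketch of how the `SupportsLoRA` path added above would be exercised. This is not part of the diff: the adapter name and path are placeholders, and any adapter targeting the packed modules listed in `packed_modules_mapping` (e.g. `qkv_proj`, `gate_up_proj`, `in_proj`) is assumed to exist.

```python
# Hypothetical usage sketch; "plamo2-adapter" and /path/to/plamo2-lora are
# placeholders, not artifacts of this change.
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="pfnet/plamo-2-1b",
    trust_remote_code=True,  # PLaMo checkpoints ship custom modeling code
    enable_lora=True,        # enables the LoRA path guarded by is_lora_enabled
)

outputs = llm.generate(
    ["The capital of Japan is"],
    SamplingParams(temperature=0.0, max_tokens=32),
    # LoRARequest takes (adapter name, unique int id, adapter path or repo).
    lora_request=LoRARequest("plamo2-adapter", 1, "/path/to/plamo2-lora"),
)
print(outputs[0].outputs[0].text)
```

The same path should be reachable from the OpenAI-compatible server via `vllm serve pfnet/plamo-2-1b --trust-remote-code --enable-lora --lora-modules plamo2-adapter=/path/to/plamo2-lora`.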