[V0 deprecation] Clean up LoRA (#25686)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Jee Jee Li authored 2025-09-26 02:12:33 +08:00, committed by GitHub
parent 3468f17ebe
commit 0fa673af4c

@@ -11,7 +11,6 @@ from typing import Optional, Union, final
 import torch
 
-import vllm.envs as envs
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON
@@ -41,14 +40,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
                                                       max_num_batched_tokens,
                                                       device=device)
 
-        # When cudagraph capture size is greater than max_num_seqs (max_batches,
-        # here), V0 captures the graph as if max_num_seqs is set to
-        # the capture size.
-        # V1 doesn't have this problem and always respects max_num_seqs.
-        max_num_prompts = (max_batches
-                           if envs.VLLM_USE_V1 else max_num_batched_tokens)
         self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_num_prompts,
+                                                       max_batches,
                                                        device=device)
 
     def update_metadata(self, mapping: LoRAMapping,
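
For readers skimming the diff, here is a minimal standalone sketch of the sizing logic this commit deletes. The helper name `prompt_meta_rows` is hypothetical, not part of vLLM's API; it only restates the branch that V0 removal makes dead.

```python
# Hypothetical helper (not vLLM API) restating the deleted branch: before this
# commit, V0 could capture a CUDA graph as if max_num_seqs equaled the capture
# size, so the prompt-mapping metadata had to be oversized to
# max_num_batched_tokens. V1 always respects max_num_seqs, so max_batches
# (i.e. max_num_seqs) is enough rows.
def prompt_meta_rows(max_batches: int,
                     max_num_batched_tokens: int,
                     use_v1: bool) -> int:
    if use_v1:
        return max_batches           # V1: size metadata to max_num_seqs
    return max_num_batched_tokens    # V0: oversize for cudagraph capture


# With V0 gone, only the first branch survives, which is why the wrapper can
# now pass max_batches to LoRAKernelMeta.make directly:
assert prompt_meta_rows(max_batches=8,
                        max_num_batched_tokens=4096,
                        use_v1=True) == 8
```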