Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[V0 deprecation] Clean up LoRA (#25686)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
parent 3468f17ebe
commit 0fa673af4c
@@ -11,7 +11,6 @@ from typing import Optional, Union, final
 
 import torch
 
-import vllm.envs as envs
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON
 
@@ -41,14 +40,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
                                                       max_num_batched_tokens,
                                                       device=device)
 
-        # When cudagraph capture size is greater than max_num_seqs (max_batches,
-        # here), V0 captures the graph as if max_num_seqs is set to
-        # the capture size.
-        # V1 doesn't have this problem and always respects max_num_seqs.
-        max_num_prompts = (max_batches
-                           if envs.VLLM_USE_V1 else max_num_batched_tokens)
         self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_num_prompts,
+                                                       max_batches,
                                                        device=device)
 
     def update_metadata(self, mapping: LoRAMapping,
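To make the removed branch easier to follow, here is a small self-contained sketch of the sizing decision this hunk deletes. It is not vLLM code; only the argument names (max_batches, max_num_batched_tokens) come from the diff, and the helper function itself is hypothetical.

def prompt_meta_rows(max_batches: int,
                     max_num_batched_tokens: int,
                     use_v1: bool) -> int:
    """Rows to allocate for the prompt mapping metadata (illustrative only).

    Old behaviour: V0 could capture cudagraphs as if max_num_seqs equalled the
    capture size, so the allocation was oversized to max_num_batched_tokens
    unless V1 was in use.
    New behaviour (this commit): V1 always respects max_num_seqs, so the
    allocation is simply max_batches.
    """
    return max_batches if use_v1 else max_num_batched_tokens

With the V0 path gone, the conditional disappears entirely and the call site passes max_batches directly to LoRAKernelMeta.make, as shown in the + line of the hunk above.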