From 0fa673af4c2aeaf6d21a0163dc55218f5ee1daa6 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Fri, 26 Sep 2025 02:12:33 +0800
Subject: [PATCH] [V0 deprecation] Clean up LoRA (#25686)

Signed-off-by: Jee Jee Li
---
 vllm/lora/punica_wrapper/punica_gpu.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 2db0e9fee142..467f50050eb2 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -11,7 +11,6 @@
 from typing import Optional, Union, final
 
 import torch
 
-import vllm.envs as envs
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON
 
@@ -41,14 +40,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
                                                       max_num_batched_tokens,
                                                       device=device)
 
-        # When cudagraph capture size is greater than max_num_seqs (max_batches,
-        # here), V0 captures the graph as if max_num_seqs is set to
-        # the capture size.
-        # V1 doesn't have this problem and always respects max_num_seqs.
-        max_num_prompts = (max_batches
-                           if envs.VLLM_USE_V1 else max_num_batched_tokens)
         self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_num_prompts,
+                                                       max_batches,
                                                        device=device)
 
     def update_metadata(self, mapping: LoRAMapping,
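
Note: the sizing decision this patch removes can be summarized with a minimal sketch. The function names below (prompt_meta_size_before, prompt_meta_size_after) and the use_v1 flag are illustrative stand-ins, not vLLM APIs; only max_batches (max_num_seqs) and max_num_batched_tokens come from the diff above.

    # Sketch of the buffer-sizing decision this patch simplifies.
    # Names are illustrative stand-ins; only the two size parameters
    # correspond to the diff above.

    def prompt_meta_size_before(max_batches: int,
                                max_num_batched_tokens: int,
                                use_v1: bool) -> int:
        # Pre-patch: when the cudagraph capture size exceeded max_num_seqs,
        # V0 captured the graph as if max_num_seqs equaled the capture size,
        # so the prompt-level LoRA metadata was over-allocated to
        # max_num_batched_tokens on V0.
        return max_batches if use_v1 else max_num_batched_tokens

    def prompt_meta_size_after(max_batches: int) -> int:
        # Post-patch: with V0 gone, V1 always respects max_num_seqs, so the
        # prompt-level LoRA metadata is always sized to max_batches.
        return max_batches

    if __name__ == "__main__":
        # Example: 256 sequences max, 8192 batched tokens max.
        assert prompt_meta_size_before(256, 8192, use_v1=True) == 256
        assert prompt_meta_size_before(256, 8192, use_v1=False) == 8192
        assert prompt_meta_size_after(256) == 256

With the V0 branch deleted, the envs import and the max_num_prompts intermediate are no longer needed, which is why the patch nets out to one insertion and eight deletions.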