Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[V0 deprecation] Clean up LoRA (#25686)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
parent 3468f17ebe
commit 0fa673af4c
@@ -11,7 +11,6 @@ from typing import Optional, Union, final
 
 import torch
 
-import vllm.envs as envs
 from vllm.lora.layers import LoRAMapping
 from vllm.triton_utils import HAS_TRITON
 
@@ -41,14 +40,8 @@ class PunicaWrapperGPU(PunicaWrapperBase):
                                                       max_num_batched_tokens,
                                                       device=device)
 
-        # When cudagraph capture size is greater than max_num_seqs (max_batches,
-        # here), V0 captures the graph as if max_num_seqs is set to
-        # the capture size.
-        # V1 doesn't have this problem and always respects max_num_seqs.
-        max_num_prompts = (max_batches
-                           if envs.VLLM_USE_V1 else max_num_batched_tokens)
         self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
-                                                       max_num_prompts,
+                                                       max_batches,
                                                        device=device)
 
     def update_metadata(self, mapping: LoRAMapping,
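To make the removed branch easier to follow, here is a small self-contained sketch of the sizing decision this hunk deletes. It is not vLLM code; only the argument names (max_batches, max_num_batched_tokens) come from the diff, and the helper function itself is hypothetical.

def prompt_meta_rows(max_batches: int,
                     max_num_batched_tokens: int,
                     use_v1: bool) -> int:
    """Rows to allocate for the prompt mapping metadata (illustrative only).

    Old behaviour: V0 could capture cudagraphs as if max_num_seqs equalled the
    capture size, so the allocation was oversized to max_num_batched_tokens
    unless V1 was in use.
    New behaviour (this commit): V1 always respects max_num_seqs, so the
    allocation is simply max_batches.
    """
    return max_batches if use_v1 else max_num_batched_tokens

With the V0 path gone, the conditional disappears entirely and the call site passes max_batches directly to LoRAKernelMeta.make, as shown in the + line of the hunk above.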