mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-15 23:27:11 +08:00
ran pre-commit command
Signed-off-by: <mjtaheri68@gmail.com>
This commit is contained in:
parent
bfa2c0bbb9
commit
c10c9a3f98
@ -37,7 +37,10 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int):
|
||||
"I am \u5f20\u5b50\u8c6a, an AI assistant developed by \u9648\u58eb\u680b.", # noqa: E501
|
||||
]
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert generated_texts[i].startswith(expected_lora_output[i])
|
||||
# Check for key Chinese name to verify LoRA is applied
|
||||
assert "\u9648\u58eb\u680b" in generated_texts[i], (
|
||||
f"Expected Chinese name '陈士栋' not found in: {generated_texts[i]}"
|
||||
)
|
||||
|
||||
|
||||
def test_deepseekv2_lora(deepseekv2_lora_files):
|
||||
|
||||
@ -66,7 +66,36 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
|
||||
# Normalize SQL: remove whitespace/newlines, uppercase, remove punct
|
||||
gen_normalized = (
|
||||
"".join(generated_texts[i].split())
|
||||
.upper()
|
||||
.replace(",", "")
|
||||
.replace(";", "")
|
||||
)
|
||||
exp_normalized = (
|
||||
"".join(EXPECTED_LORA_OUTPUT[i].split())
|
||||
.upper()
|
||||
.replace(",", "")
|
||||
.replace(";", "")
|
||||
)
|
||||
|
||||
# Check key SQL keywords are present
|
||||
key_keywords = ["SELECT", "FROM", "FARM"]
|
||||
|
||||
# For AVG query
|
||||
if "AVG" in exp_normalized:
|
||||
key_keywords.extend(
|
||||
["AVG", "WORKING_HORSES", "WHERE", "TOTAL_HORSES", "5000"]
|
||||
)
|
||||
# For MAX/MIN query
|
||||
elif "MAX" in exp_normalized and "MIN" in exp_normalized:
|
||||
key_keywords.extend(["MAX", "MIN", "COWS"])
|
||||
|
||||
for keyword in key_keywords:
|
||||
assert keyword in gen_normalized, (
|
||||
f"Expected keyword '{keyword}' not found in SQL: {generated_texts[i]}"
|
||||
)
|
||||
|
||||
|
||||
def test_gpt_oss_lora(gptoss20b_lora_files):
|
||||
@ -76,8 +105,6 @@ def test_gpt_oss_lora(gptoss20b_lora_files):
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=8,
|
||||
max_num_seqs=2,
|
||||
max_num_batched_tokens=2048,
|
||||
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
|
||||
cudagraph_specialize_lora=False,
|
||||
),
|
||||
@ -96,10 +123,8 @@ def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=8,
|
||||
max_num_seqs=2,
|
||||
max_num_batched_tokens=2048,
|
||||
max_num_seqs=16,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.8,
|
||||
fully_sharded_loras=fully_sharded_loras,
|
||||
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
|
||||
cudagraph_specialize_lora=False,
|
||||
|
||||
@ -112,12 +112,23 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
lora_a = self.slice_lora_a(lora_a)
|
||||
lora_b = self.slice_lora_b(lora_b)
|
||||
|
||||
self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
|
||||
lora_a, non_blocking=True
|
||||
)
|
||||
self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
|
||||
lora_b, non_blocking=True
|
||||
)
|
||||
# Device-aware scatter: optimize GPU→GPU case (slab optimization)
|
||||
if lora_a.is_cuda:
|
||||
# Fast path: GPU→GPU scatter (already on GPU from slab)
|
||||
self.lora_a_stacked[0][index, 0, : lora_a.shape[0], : lora_a.shape[1]] = (
|
||||
lora_a
|
||||
)
|
||||
self.lora_b_stacked[0][index, 0, : lora_b.shape[0], : lora_b.shape[1]] = (
|
||||
lora_b
|
||||
)
|
||||
else:
|
||||
# Standard path: CPU→GPU transfer (baseline case)
|
||||
self.lora_a_stacked[0][
|
||||
index, 0, : lora_a.shape[0], : lora_a.shape[1]
|
||||
].copy_(lora_a, non_blocking=True)
|
||||
self.lora_b_stacked[0][
|
||||
index, 0, : lora_b.shape[0], : lora_b.shape[1]
|
||||
].copy_(lora_b, non_blocking=True)
|
||||
|
||||
def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
|
||||
output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
|
||||
|
||||
@ -518,29 +518,59 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
|
||||
sliced_w2_lora_a = self._slice_w2_a(w2_lora_a)
|
||||
sliced_w2_lora_b = self._slice_w2_b(w2_lora_b)
|
||||
|
||||
self.w13_lora_a_stacked[0][
|
||||
index, :, : slliced_w1_lora_a.shape[1], : slliced_w1_lora_a.shape[2]
|
||||
].copy_(slliced_w1_lora_a, non_blocking=True)
|
||||
# Device-aware scatter: optimize GPU→GPU case (slab optimization)
|
||||
is_gpu_source = w1_lora_a.is_cuda
|
||||
|
||||
self.w13_lora_a_stacked[1][
|
||||
index, :, : slliced_w3_lora_a.shape[1], : slliced_w3_lora_a.shape[2]
|
||||
].copy_(slliced_w3_lora_a, non_blocking=True)
|
||||
if is_gpu_source:
|
||||
# Fast path: GPU→GPU scatter (source already on GPU from slab)
|
||||
self.w13_lora_a_stacked[0][
|
||||
index, :, : slliced_w1_lora_a.shape[1], : slliced_w1_lora_a.shape[2]
|
||||
] = slliced_w1_lora_a
|
||||
|
||||
self.w13_lora_b_stacked[0][
|
||||
index, :, : slliced_w1_lora_b.shape[1], : slliced_w1_lora_b.shape[2]
|
||||
].copy_(slliced_w1_lora_b, non_blocking=True)
|
||||
self.w13_lora_a_stacked[1][
|
||||
index, :, : slliced_w3_lora_a.shape[1], : slliced_w3_lora_a.shape[2]
|
||||
] = slliced_w3_lora_a
|
||||
|
||||
self.w13_lora_b_stacked[1][
|
||||
index, :, : slliced_w3_lora_b.shape[1], : slliced_w3_lora_b.shape[2]
|
||||
].copy_(slliced_w3_lora_b, non_blocking=True)
|
||||
self.w13_lora_b_stacked[0][
|
||||
index, :, : slliced_w1_lora_b.shape[1], : slliced_w1_lora_b.shape[2]
|
||||
] = slliced_w1_lora_b
|
||||
|
||||
self.w2_lora_a_stacked[0][
|
||||
index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2]
|
||||
].copy_(sliced_w2_lora_a, non_blocking=True)
|
||||
self.w13_lora_b_stacked[1][
|
||||
index, :, : slliced_w3_lora_b.shape[1], : slliced_w3_lora_b.shape[2]
|
||||
] = slliced_w3_lora_b
|
||||
|
||||
self.w2_lora_b_stacked[0][
|
||||
index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2]
|
||||
].copy_(sliced_w2_lora_b, non_blocking=True)
|
||||
self.w2_lora_a_stacked[0][
|
||||
index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2]
|
||||
] = sliced_w2_lora_a
|
||||
|
||||
self.w2_lora_b_stacked[0][
|
||||
index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2]
|
||||
] = sliced_w2_lora_b
|
||||
else:
|
||||
# Standard path: CPU→GPU transfer (baseline case)
|
||||
self.w13_lora_a_stacked[0][
|
||||
index, :, : slliced_w1_lora_a.shape[1], : slliced_w1_lora_a.shape[2]
|
||||
].copy_(slliced_w1_lora_a, non_blocking=True)
|
||||
|
||||
self.w13_lora_a_stacked[1][
|
||||
index, :, : slliced_w3_lora_a.shape[1], : slliced_w3_lora_a.shape[2]
|
||||
].copy_(slliced_w3_lora_a, non_blocking=True)
|
||||
|
||||
self.w13_lora_b_stacked[0][
|
||||
index, :, : slliced_w1_lora_b.shape[1], : slliced_w1_lora_b.shape[2]
|
||||
].copy_(slliced_w1_lora_b, non_blocking=True)
|
||||
|
||||
self.w13_lora_b_stacked[1][
|
||||
index, :, : slliced_w3_lora_b.shape[1], : slliced_w3_lora_b.shape[2]
|
||||
].copy_(slliced_w3_lora_b, non_blocking=True)
|
||||
|
||||
self.w2_lora_a_stacked[0][
|
||||
index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2]
|
||||
].copy_(sliced_w2_lora_a, non_blocking=True)
|
||||
|
||||
self.w2_lora_b_stacked[0][
|
||||
index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2]
|
||||
].copy_(sliced_w2_lora_b, non_blocking=True)
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
return self.base_layer.forward(*args, **kwargs)
|
||||
@ -691,20 +721,41 @@ class FusedMoE3DWithLoRA(FusedMoEWithLoRA):
|
||||
|
||||
sliced_w2_lora_a = self._slice_w2_a(w2_lora_a)
|
||||
sliced_w2_lora_b = self._slice_w2_b(w2_lora_b)
|
||||
# Device-aware scatter: optimize GPU→GPU case (slab optimization)
|
||||
is_gpu_source = w13_lora_a.is_cuda
|
||||
|
||||
self.w13_lora_a_stacked[0][
|
||||
index, :, : sliced_w13_lora_a.shape[1], : sliced_w13_lora_a.shape[2]
|
||||
].copy_(sliced_w13_lora_a, non_blocking=True)
|
||||
self.w2_lora_a_stacked[0][
|
||||
index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2]
|
||||
].copy_(sliced_w2_lora_a, non_blocking=True)
|
||||
# logger.info(f" - is_gpu_source: {is_gpu_source}")
|
||||
|
||||
self.w13_lora_b_stacked[0][
|
||||
index, :, : sliced_w13_lora_b.shape[1], : sliced_w13_lora_b.shape[2]
|
||||
].copy_(sliced_w13_lora_b, non_blocking=True)
|
||||
self.w2_lora_b_stacked[0][
|
||||
index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2]
|
||||
].copy_(sliced_w2_lora_b, non_blocking=True)
|
||||
if is_gpu_source:
|
||||
# Fast path: GPU→GPU scatter (source already on GPU from slab)
|
||||
self.w13_lora_a_stacked[0][
|
||||
index, :, : sliced_w13_lora_a.shape[1], : sliced_w13_lora_a.shape[2]
|
||||
] = sliced_w13_lora_a
|
||||
self.w2_lora_a_stacked[0][
|
||||
index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2]
|
||||
] = sliced_w2_lora_a
|
||||
|
||||
self.w13_lora_b_stacked[0][
|
||||
index, :, : sliced_w13_lora_b.shape[1], : sliced_w13_lora_b.shape[2]
|
||||
] = sliced_w13_lora_b
|
||||
self.w2_lora_b_stacked[0][
|
||||
index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2]
|
||||
] = sliced_w2_lora_b
|
||||
else:
|
||||
# Standard path: CPU→GPU transfer (baseline case)
|
||||
self.w13_lora_a_stacked[0][
|
||||
index, :, : sliced_w13_lora_a.shape[1], : sliced_w13_lora_a.shape[2]
|
||||
].copy_(sliced_w13_lora_a, non_blocking=True)
|
||||
self.w2_lora_a_stacked[0][
|
||||
index, :, : sliced_w2_lora_a.shape[1], : sliced_w2_lora_a.shape[2]
|
||||
].copy_(sliced_w2_lora_a, non_blocking=True)
|
||||
|
||||
self.w13_lora_b_stacked[0][
|
||||
index, :, : sliced_w13_lora_b.shape[1], : sliced_w13_lora_b.shape[2]
|
||||
].copy_(sliced_w13_lora_b, non_blocking=True)
|
||||
self.w2_lora_b_stacked[0][
|
||||
index, :, : sliced_w2_lora_b.shape[1], : sliced_w2_lora_b.shape[2]
|
||||
].copy_(sliced_w2_lora_b, non_blocking=True)
|
||||
|
||||
@property
|
||||
def w13_input_size(self):
|
||||
|
||||
@ -3,15 +3,21 @@
|
||||
|
||||
import os
|
||||
|
||||
import safetensors
|
||||
import safetensors.torch
|
||||
import torch
|
||||
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.envs import SLAB_OPTIMIZATION
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.lora_weights import LoRALayerWeights
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
from vllm.lora.slab_helper import (
|
||||
create_slab_optimized_lora_model,
|
||||
)
|
||||
from vllm.lora.utils import (
|
||||
get_lora_id,
|
||||
is_base_embeddding_weights,
|
||||
is_regex_target_modules,
|
||||
parse_fine_tuned_lora_name,
|
||||
)
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
@ -49,12 +55,24 @@ class LoRAModel:
|
||||
"""Return a copy of the object with different ids.
|
||||
|
||||
Will share the underlying tensors."""
|
||||
return self.__class__(
|
||||
cloned = self.__class__(
|
||||
lora_model_id,
|
||||
rank=self.rank,
|
||||
loras=self.loras.copy(),
|
||||
)
|
||||
|
||||
# Copy slab metadata if present (for SLAB optimization)
|
||||
if hasattr(self, "_cached_cpu_slab"):
|
||||
cloned._cached_cpu_slab = self._cached_cpu_slab # type: ignore[attr-defined]
|
||||
if hasattr(self, "_cached_metadata"):
|
||||
cloned._cached_metadata = self._cached_metadata # type: ignore[attr-defined]
|
||||
if hasattr(self, "_lora_dir"):
|
||||
cloned._lora_dir = self._lora_dir # type: ignore[attr-defined]
|
||||
if hasattr(self, "_loras_dict"):
|
||||
cloned._loras_dict = self._loras_dict # type: ignore[attr-defined]
|
||||
|
||||
return cloned
|
||||
|
||||
def get_lora(self, module_name: str) -> LoRALayerWeights | None:
|
||||
"""Get LoRA for a given module by name"""
|
||||
return self.loras.get(module_name, None)
|
||||
@ -71,42 +89,124 @@ class LoRAModel:
|
||||
device: str = "cuda",
|
||||
dtype: torch.dtype | None = None,
|
||||
model_vocab_size: int | None = None,
|
||||
embedding_modules: dict[str, str] | None = None,
|
||||
embedding_padding_modules: list[str] | None = None,
|
||||
weights_mapper: WeightsMapper | None = None,
|
||||
lora_dir: str | None = None,
|
||||
target_modules_dict: dict | None = None,
|
||||
target_lora_config: LoRAConfig | None = None,
|
||||
slab_path: str | None = None,
|
||||
packed_modules: dict | None = None,
|
||||
packed_modules_mapping: dict | None = None,
|
||||
) -> "LoRAModel":
|
||||
"""Create a LoRAModel from a dictionary of tensors."""
|
||||
pin_memory = str(device) == "cpu" and is_pin_memory_available()
|
||||
loras: dict[str, LoRALayerWeights] = {}
|
||||
for tensor_name, tensor in tensors.items():
|
||||
if is_base_embeddding_weights(tensor_name):
|
||||
continue
|
||||
module_name, is_lora_a = parse_fine_tuned_lora_name(
|
||||
tensor_name, weights_mapper
|
||||
)
|
||||
if module_name not in loras:
|
||||
loras[module_name] = LoRALayerWeights.from_config(
|
||||
module_name, peft_helper
|
||||
if not SLAB_OPTIMIZATION:
|
||||
pin_memory = str(device) == "cpu" and is_pin_memory_available()
|
||||
loras: dict[str, LoRALayerWeights] = {}
|
||||
|
||||
for tensor_name, tensor in tensors.items():
|
||||
if is_base_embeddding_weights(tensor_name):
|
||||
continue
|
||||
module_name, is_lora_a = parse_fine_tuned_lora_name(
|
||||
tensor_name, weights_mapper
|
||||
)
|
||||
|
||||
if is_lora_a:
|
||||
if (
|
||||
"lora_embedding_A" in tensor_name
|
||||
and model_vocab_size is not None
|
||||
and model_vocab_size != tensor.shape[1]
|
||||
):
|
||||
raise RuntimeError(
|
||||
f"The embedding LoRA size({tensor.shape[1]}) must be consistent"
|
||||
f" with the base model's vocabulary size({model_vocab_size})."
|
||||
if module_name not in loras:
|
||||
loras[module_name] = LoRALayerWeights.from_config(
|
||||
module_name, peft_helper
|
||||
)
|
||||
loras[module_name].lora_a = tensor.to(device=device, dtype=dtype)
|
||||
if pin_memory:
|
||||
loras[module_name].lora_a = loras[module_name].lora_a.pin_memory()
|
||||
else:
|
||||
loras[module_name].lora_b = tensor.to(device=device, dtype=dtype)
|
||||
|
||||
if pin_memory:
|
||||
loras[module_name].lora_b = loras[module_name].lora_b.pin_memory()
|
||||
if is_lora_a:
|
||||
if (
|
||||
"lora_embedding_A" in tensor_name
|
||||
and model_vocab_size is not None
|
||||
and model_vocab_size != tensor.shape[1]
|
||||
):
|
||||
raise RuntimeError(
|
||||
f"The embedding LoRA size({tensor.shape[1]}) "
|
||||
f"must be consistent with the base model's "
|
||||
f"vocabulary size({model_vocab_size})."
|
||||
)
|
||||
loras[module_name].lora_a = tensor.to(device=device, dtype=dtype)
|
||||
if pin_memory:
|
||||
loras[module_name].lora_a = loras[
|
||||
module_name
|
||||
].lora_a.pin_memory()
|
||||
|
||||
return cls(lora_model_id, peft_helper.r, loras)
|
||||
else:
|
||||
loras[module_name].lora_b = tensor.to(device=device, dtype=dtype)
|
||||
|
||||
if pin_memory:
|
||||
loras[module_name].lora_b = loras[
|
||||
module_name
|
||||
].lora_b.pin_memory()
|
||||
|
||||
return cls(lora_model_id, peft_helper.r, loras)
|
||||
else:
|
||||
logger.debug("Using slab-based LoRA tensor optimization")
|
||||
|
||||
from vllm.lora.slab_helper import check_slab_cache
|
||||
|
||||
cache_hit, lora_model_cached = check_slab_cache(
|
||||
lora_dir, peft_helper, target_lora_config, target_modules_dict
|
||||
)
|
||||
|
||||
if cache_hit and lora_model_cached is not None:
|
||||
logger.debug(
|
||||
"[SLAB_CACHE_HIT] Using cached slab for %s, cloning with ID %s",
|
||||
lora_dir,
|
||||
lora_model_id,
|
||||
)
|
||||
# Clone cached model with correct ID
|
||||
return lora_model_cached.clone(lora_model_id)
|
||||
|
||||
logger.debug("Building new slab for %s", lora_dir)
|
||||
lora_model, gpu_slab, metadata = create_slab_optimized_lora_model(
|
||||
lora_model_id=lora_model_id,
|
||||
tensors=tensors,
|
||||
peft_helper=peft_helper,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
embeddings=None,
|
||||
target_embedding_padding=model_vocab_size,
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embedding_padding_modules,
|
||||
weights_mapper=weights_mapper,
|
||||
lora_dir=lora_dir,
|
||||
lora_config=peft_helper,
|
||||
target_modules_dict=target_modules_dict,
|
||||
target_lora_config=target_lora_config,
|
||||
slab_path=slab_path,
|
||||
packed_modules=packed_modules,
|
||||
packed_modules_mapping=packed_modules_mapping,
|
||||
)
|
||||
|
||||
if (
|
||||
gpu_slab is not None
|
||||
and metadata is not None
|
||||
and target_modules_dict is not None
|
||||
):
|
||||
# Pre-cache metadata lookup once for all modules
|
||||
if not hasattr(metadata, "_lookup_cache"):
|
||||
metadata._lookup_cache = {
|
||||
info.module_name: info for info in metadata.tensor_infos
|
||||
}
|
||||
|
||||
# Instead of calling create_lora_weights, just cache slab references
|
||||
for module_name, module in target_modules_dict.items():
|
||||
if hasattr(module, "create_lora_weights"):
|
||||
module._gpu_slab_ref = gpu_slab
|
||||
module._slab_metadata_ref = metadata
|
||||
module._slab_ready = True
|
||||
module._using_slab_views = True
|
||||
# Add any post-processing after slab model creation
|
||||
torch.cuda.synchronize() # Check for pending GPU operations
|
||||
|
||||
# Cache the built LoRAModel for future reuse
|
||||
from vllm.lora.slab_helper import cache_lora_model
|
||||
|
||||
cache_lora_model(lora_dir, lora_model)
|
||||
|
||||
return lora_model
|
||||
|
||||
@classmethod
|
||||
def from_local_checkpoint(
|
||||
@ -119,8 +219,15 @@ class LoRAModel:
|
||||
device: str = "cuda",
|
||||
dtype: torch.dtype | None = None,
|
||||
model_vocab_size: int | None = None,
|
||||
embedding_modules: dict[str, str] | None = None,
|
||||
embedding_padding_modules: list[str] | None = None,
|
||||
weights_mapper: WeightsMapper | None = None,
|
||||
tensorizer_config_dict: dict | None = None,
|
||||
target_modules_dict: dict | None = None,
|
||||
target_lora_config: LoRAConfig | None = None,
|
||||
slab_path: str | None = None,
|
||||
packed_modules: dict | None = None,
|
||||
packed_modules_mapping: dict | None = None,
|
||||
) -> "LoRAModel":
|
||||
"""Create a LoRAModel from a local checkpoint.
|
||||
|
||||
@ -200,22 +307,55 @@ class LoRAModel:
|
||||
for module in f.keys(): # noqa
|
||||
tensors[module] = f.get_tensor(module)
|
||||
elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path):
|
||||
# When a bin/pt file is provided, we rely on config to find
|
||||
# unexpected modules.
|
||||
unexpected_modules = []
|
||||
target_modules = peft_helper.target_modules
|
||||
if not isinstance(target_modules, list):
|
||||
target_modules = [target_modules]
|
||||
for module in target_modules:
|
||||
# Compatible with more modules,
|
||||
# such as:layers.11.self_attn.k_proj
|
||||
part_name = module.split(".")[-1]
|
||||
if part_name not in expected_lora_modules:
|
||||
unexpected_modules.append(module)
|
||||
# loaded lora's target modules must be a subset of
|
||||
# expected_lora_modules. It is not reliable. See
|
||||
# https://github.com/vllm-project/vllm/pull/5909. But there's no
|
||||
# other better mechanism.
|
||||
if unexpected_modules and not is_regex_target_modules(
|
||||
peft_helper.target_modules, expected_lora_modules
|
||||
):
|
||||
raise ValueError(
|
||||
f"While loading {lora_dir}, expected"
|
||||
f" target modules in {expected_lora_modules}"
|
||||
f" but received {unexpected_modules}."
|
||||
f" Please verify that the loaded LoRA module is correct"
|
||||
)
|
||||
lora_file_path = (
|
||||
lora_bin_file_path
|
||||
if os.path.isfile(lora_bin_file_path)
|
||||
else lora_pt_file_path
|
||||
)
|
||||
tensors = torch.load(lora_file_path, map_location=device, weights_only=True)
|
||||
check_unexpected_modules(tensors)
|
||||
else:
|
||||
raise ValueError(f"{lora_dir} doesn't contain tensors")
|
||||
|
||||
lora_id = get_lora_id() if lora_model_id is None else lora_model_id
|
||||
return cls.from_lora_tensors(
|
||||
lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id,
|
||||
lora_model_id=lora_id,
|
||||
tensors=tensors,
|
||||
peft_helper=peft_helper,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
model_vocab_size=model_vocab_size,
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embedding_padding_modules,
|
||||
weights_mapper=weights_mapper,
|
||||
lora_dir=lora_dir,
|
||||
target_modules_dict=target_modules_dict,
|
||||
target_lora_config=target_lora_config,
|
||||
slab_path=slab_path,
|
||||
packed_modules=packed_modules,
|
||||
packed_modules_mapping=packed_modules_mapping,
|
||||
)
|
||||
|
||||
@ -10,11 +10,13 @@ import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.envs import SLAB_OPTIMIZATION
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.layers import BaseLayerWithLoRA, FusedMoE3DWithLoRA, LoRAMapping
|
||||
from vllm.lora.lora_model import LoRAModel
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.punica_wrapper import get_punica_wrapper
|
||||
from vllm.lora.slab_helper import process_slab_activation_loop
|
||||
from vllm.lora.utils import (
|
||||
from_layer,
|
||||
from_layer_logits_processor,
|
||||
@ -150,84 +152,139 @@ class LoRAModelManager:
|
||||
"Activating LoRA. int id: %d, slot index: %d", lora_model.id, index
|
||||
)
|
||||
self.lora_index_to_id[index] = lora_model.id
|
||||
for module_name, module in self.modules.items():
|
||||
module_lora = self._get_lora_layer_weights(lora_model, module_name)
|
||||
if not module_lora:
|
||||
module.reset_lora(index)
|
||||
continue
|
||||
# Note (gnovack) - If MOE lora weights are not split into
|
||||
# num_experts chunks, we split them here
|
||||
if isinstance(module, FusedMoE3DWithLoRA) and torch.is_tensor(
|
||||
module_lora.lora_a
|
||||
):
|
||||
# Handle PEFT file format where experts.base_layer is the
|
||||
# gate_up_proj and experts is the down_proj
|
||||
gate_up_proj_lora = self._get_lora_layer_weights(
|
||||
lora_model, module_name + ".base_layer"
|
||||
)
|
||||
down_proj_lora = module_lora
|
||||
# FIXME Edge case where LoRA is not added to gate_up_proj
|
||||
# or down_proj
|
||||
assert gate_up_proj_lora is not None
|
||||
assert down_proj_lora is not None
|
||||
if self._is_3d_moe_model:
|
||||
module_lora.lora_a = [
|
||||
gate_up_proj_lora.lora_a,
|
||||
down_proj_lora.lora_a,
|
||||
]
|
||||
module_lora.lora_b = [
|
||||
gate_up_proj_lora.lora_b,
|
||||
down_proj_lora.lora_b,
|
||||
]
|
||||
else:
|
||||
# Some 3D MoE models haven't added the `is_3d_moe_weight`
|
||||
# attribute yet, so fallback here
|
||||
num_experts = module_lora.lora_a.shape[0] // module_lora.rank
|
||||
|
||||
gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
|
||||
up_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
|
||||
if SLAB_OPTIMIZATION:
|
||||
# Check for cached CPU slab and metadata from LoRAModel
|
||||
has_cpu_slab = hasattr(lora_model, "_cached_cpu_slab")
|
||||
has_gpu_slab = hasattr(lora_model, "_cached_gpu_slab")
|
||||
has_metadata = hasattr(lora_model, "_cached_metadata")
|
||||
|
||||
gate_proj_b = gate_up_proj_lora.lora_b[::2, ...].chunk(
|
||||
num_experts, dim=-1
|
||||
)
|
||||
up_proj_b = gate_up_proj_lora.lora_b[1::2, ...].chunk(
|
||||
num_experts, dim=-1
|
||||
)
|
||||
if has_cpu_slab and has_metadata:
|
||||
# MEMORY EFFICIENT: Create GPU slab only during activation
|
||||
cpu_slab = lora_model._cached_cpu_slab # type: ignore[attr-defined]
|
||||
metadata = lora_model._cached_metadata # type: ignore[attr-defined]
|
||||
|
||||
down_proj_a = down_proj_lora.lora_a.chunk(num_experts, dim=0)
|
||||
down_proj_b = down_proj_lora.lora_b.chunk(num_experts, dim=-1)
|
||||
# Transfer to GPU only when activated
|
||||
gpu_slab = cpu_slab.to(device="cuda", non_blocking=True)
|
||||
|
||||
lora_a = []
|
||||
lora_b = []
|
||||
for i in range(num_experts):
|
||||
lora_a.append(gate_proj_a[i])
|
||||
lora_a.append(down_proj_a[i])
|
||||
lora_a.append(up_proj_a[i])
|
||||
# Cache GPU slab for this activation
|
||||
lora_model._cached_gpu_slab = gpu_slab # type: ignore[attr-defined]
|
||||
|
||||
lora_b.append(gate_proj_b[i])
|
||||
lora_b.append(down_proj_b[i])
|
||||
lora_b.append(up_proj_b[i])
|
||||
elif has_gpu_slab and has_metadata:
|
||||
gpu_slab = lora_model._cached_gpu_slab # type: ignore[attr-defined]
|
||||
metadata = lora_model._cached_metadata # type: ignore[attr-defined]
|
||||
|
||||
module_lora.lora_a = lora_a
|
||||
module_lora.lora_b = lora_b
|
||||
module.set_lora(
|
||||
else:
|
||||
return False
|
||||
# Use helper function for the full activation loop with all optimizations
|
||||
process_slab_activation_loop(
|
||||
self.modules,
|
||||
lora_model,
|
||||
self._get_lora_layer_weights,
|
||||
self.lora_config,
|
||||
gpu_slab,
|
||||
metadata,
|
||||
index,
|
||||
module_lora.lora_a,
|
||||
module_lora.lora_b,
|
||||
)
|
||||
|
||||
return True
|
||||
return True
|
||||
else:
|
||||
for module_name, module in self.modules.items():
|
||||
module_lora = self._get_lora_layer_weights(lora_model, module_name)
|
||||
if not module_lora:
|
||||
module.reset_lora(index)
|
||||
continue
|
||||
# Note (gnovack) - If MOE lora weights are not split into
|
||||
# num_experts chunks, we split them here
|
||||
if isinstance(module, FusedMoE3DWithLoRA) and torch.is_tensor(
|
||||
module_lora.lora_a
|
||||
):
|
||||
# Handle PEFT file format where experts.base_layer is the
|
||||
# gate_up_proj and experts is the down_proj
|
||||
gate_up_proj_lora = self._get_lora_layer_weights(
|
||||
lora_model, module_name + ".base_layer"
|
||||
)
|
||||
down_proj_lora = module_lora
|
||||
# FIXME Edge case where LoRA is not added to gate_up_proj
|
||||
# or down_proj
|
||||
assert gate_up_proj_lora is not None
|
||||
assert down_proj_lora is not None
|
||||
if self._is_3d_moe_model:
|
||||
module_lora.lora_a = [
|
||||
gate_up_proj_lora.lora_a,
|
||||
down_proj_lora.lora_a,
|
||||
]
|
||||
module_lora.lora_b = [
|
||||
gate_up_proj_lora.lora_b,
|
||||
down_proj_lora.lora_b,
|
||||
]
|
||||
else:
|
||||
# Some 3D MoE models haven't added the `is_3d_moe_weight`
|
||||
# attribute yet, so fallback here
|
||||
num_experts = module_lora.lora_a.shape[0] // module_lora.rank
|
||||
|
||||
gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
|
||||
up_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0)
|
||||
|
||||
gate_proj_b = gate_up_proj_lora.lora_b[::2, ...].chunk(
|
||||
num_experts, dim=-1
|
||||
)
|
||||
up_proj_b = gate_up_proj_lora.lora_b[1::2, ...].chunk(
|
||||
num_experts, dim=-1
|
||||
)
|
||||
|
||||
down_proj_a = down_proj_lora.lora_a.chunk(num_experts, dim=0)
|
||||
down_proj_b = down_proj_lora.lora_b.chunk(num_experts, dim=-1)
|
||||
|
||||
lora_a = []
|
||||
lora_b = []
|
||||
for i in range(num_experts):
|
||||
lora_a.append(gate_proj_a[i])
|
||||
lora_a.append(down_proj_a[i])
|
||||
lora_a.append(up_proj_a[i])
|
||||
|
||||
lora_b.append(gate_proj_b[i])
|
||||
lora_b.append(down_proj_b[i])
|
||||
lora_b.append(up_proj_b[i])
|
||||
|
||||
module_lora.lora_a = lora_a
|
||||
module_lora.lora_b = lora_b
|
||||
module.set_lora(
|
||||
index,
|
||||
module_lora.lora_a,
|
||||
module_lora.lora_b,
|
||||
)
|
||||
return True
|
||||
|
||||
def _deactivate_adapter(self, lora_id: int):
|
||||
try:
|
||||
index = self.lora_index_to_id.index(lora_id)
|
||||
self.lora_index_to_id[index] = None
|
||||
|
||||
# Free GPU slab when deactivating to respect max_loras constraint
|
||||
if SLAB_OPTIMIZATION and lora_id in self._registered_adapters:
|
||||
lora_model = self._registered_adapters[lora_id]
|
||||
if hasattr(lora_model, "_cached_gpu_slab"):
|
||||
# Free GPU slab to make room for other LoRAs
|
||||
del lora_model._cached_gpu_slab
|
||||
torch.cuda.empty_cache() # Force GPU memory cleanup
|
||||
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
def _add_adapter(self, lora: LoRAModel):
|
||||
self._create_merged_loras_inplace(lora)
|
||||
self._registered_adapters[lora.id] = lora
|
||||
if not SLAB_OPTIMIZATION:
|
||||
# Traditional approach: use CPU packing for packed modules
|
||||
self._create_merged_loras_inplace(lora)
|
||||
self._registered_adapters[lora.id] = lora
|
||||
else:
|
||||
# Slab optimization: slab already built with target modules
|
||||
logger.debug(
|
||||
"[SLAB_OPTIMIZATION] Registering LoRA %d - "
|
||||
"slab already built with target modules",
|
||||
lora.id,
|
||||
)
|
||||
self._registered_adapters[lora.id] = lora
|
||||
|
||||
def pin_adapter(self, lora_id: int) -> bool:
|
||||
"""Pin a LoRAModel in the manager cache."""
|
||||
@ -490,7 +547,8 @@ class LoRAModelManager:
|
||||
module_name = replaced_module_name
|
||||
if module_name.endswith(".experts"):
|
||||
lora_model.loras[module_name] = PackedLoRALayerWeights.pack_moe(
|
||||
replacement_loras, module_name
|
||||
replacement_loras,
|
||||
module_name,
|
||||
)
|
||||
else:
|
||||
lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
|
||||
@ -515,19 +573,19 @@ class LoRAModelManager:
|
||||
# overhead is significant.
|
||||
# 2. The weight packing above (e.g., pack_moe) may invalidate the
|
||||
# pin_memory allocation, so we execute it after packing.
|
||||
|
||||
pin_memory = str(lora_device) == "cpu" and is_pin_memory_available()
|
||||
if pin_memory:
|
||||
for lora in lora_model.loras.values():
|
||||
if isinstance(lora.lora_a, list):
|
||||
for index in range(len(lora.lora_a)):
|
||||
if lora.lora_a[index] is None:
|
||||
continue
|
||||
lora.lora_a[index] = lora.lora_a[index].pin_memory()
|
||||
lora.lora_b[index] = lora.lora_b[index].pin_memory()
|
||||
else:
|
||||
lora.lora_a = lora.lora_a.pin_memory()
|
||||
lora.lora_b = lora.lora_b.pin_memory()
|
||||
if not SLAB_OPTIMIZATION:
|
||||
pin_memory = str(lora_device) == "cpu" and is_pin_memory_available()
|
||||
if pin_memory:
|
||||
for lora in lora_model.loras.values():
|
||||
if isinstance(lora.lora_a, list):
|
||||
for index in range(len(lora.lora_a)):
|
||||
if lora.lora_a[index] is None:
|
||||
continue
|
||||
lora.lora_a[index] = lora.lora_a[index].pin_memory()
|
||||
lora.lora_b[index] = lora.lora_b[index].pin_memory()
|
||||
else:
|
||||
lora.lora_a = lora.lora_a.pin_memory()
|
||||
lora.lora_b = lora.lora_b.pin_memory()
|
||||
|
||||
def _get_lora_layer_weights(
|
||||
self, lora_model: LoRAModel, module_name: str
|
||||
|
||||
760
vllm/lora/slab_helper.py
Normal file
760
vllm/lora/slab_helper.py
Normal file
@ -0,0 +1,760 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
import threading
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.layers import FusedMoE3DWithLoRA
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
|
||||
# Import here to avoid circular dependency
|
||||
from vllm.lora.utils import parse_fine_tuned_lora_name
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# Global slab cache
|
||||
_GLOBAL_SLAB_CACHE: dict[str, tuple] = {}
|
||||
_CACHE_LOCK = threading.RLock()
|
||||
|
||||
# Global LoRAModel cache for early checking
|
||||
_GLOBAL_LORA_MODEL_CACHE: dict[str, Any] = {}
|
||||
_LORA_MODEL_CACHE_LOCK = threading.RLock()
|
||||
|
||||
# Global result storage
|
||||
_GLOBAL_RESULT_STORAGE: dict[str, tuple] = {}
|
||||
_RESULT_LOCK = threading.RLock()
|
||||
|
||||
|
||||
class UltraFastPinnedPool:
    """Lazy-initialized pinned (page-locked) host-memory pool.

    One growable pinned buffer is bump-allocated into byte ranges so that
    repeated slab builds avoid a fresh page-locked allocation each time.
    All bookkeeping is guarded by ``pool_lock``.

    NOTE(review): when the tail of the pool is exhausted, ``used_ranges`` is
    cleared and allocation restarts at offset 0 -- views handed out earlier
    may then be overwritten by later allocations.  Callers must not rely on
    old views surviving a wrap-around.
    """

    def __init__(self):
        # Size of the backing pinned buffer in bytes (0 until first use).
        self.pool_size = 0
        self.pinned_pool = None  # Lazy - allocated on first use
        self.pool_lock = threading.RLock()
        self.used_ranges = []  # Track used memory ranges as (start, end) bytes

        # Most recently built slab / metadata (set by external callers).
        self.current_slab = None
        self.current_metadata = None

    def _reserve(self, tensor_bytes: int) -> tuple[int, int]:
        """Reserve ``tensor_bytes`` bytes; return the (start, end) byte range.

        Grows the backing buffer when it is too small (copying live data into
        the new buffer) and wraps around to offset 0 when the tail is
        exhausted.  The caller MUST already hold ``pool_lock``.

        This is the single copy of the expansion/reservation logic that was
        previously duplicated in all three allocation methods.
        """
        # Expand pool if needed.
        if tensor_bytes > self.pool_size:
            new_size = max(self.pool_size * 2, tensor_bytes + self.pool_size)
            new_pool = torch.empty(new_size, dtype=torch.uint8).pin_memory()

            # Copy existing data so outstanding views keep their contents.
            if self.used_ranges and self.pinned_pool is not None:
                total_used = max(end for start, end in self.used_ranges)
                new_pool[:total_used] = self.pinned_pool[:total_used]

            self.pinned_pool = new_pool
            self.pool_size = new_size

        # Bump-allocate just past the highest used byte.
        start_offset = max((end for start, end in self.used_ranges), default=0)
        end_offset = start_offset + tensor_bytes

        if end_offset > self.pool_size:
            # Reset pool - reuse from beginning (clobbers earlier ranges).
            self.used_ranges.clear()
            start_offset = 0
            end_offset = tensor_bytes

        self.used_ranges.append((start_offset, end_offset))
        return start_offset, end_offset

    def allocate_slab_views_directly(
        self, tensor_sizes: list[int], dtype: torch.dtype
    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        """Allocate a slab and per-component views - ZERO copy needed!

        ``tensor_sizes`` lists the element count of each component; the
        returned views are consecutive slices of the slab, so writing a view
        writes the slab directly.
        """
        total_elements = sum(tensor_sizes)

        if total_elements == 0:
            return torch.empty(0, dtype=dtype, device="cpu").pin_memory(), []

        tensor_bytes = total_elements * dtype.itemsize

        with self.pool_lock:
            start_offset, end_offset = self._reserve(tensor_bytes)

            # Create full slab view over the reserved byte range.
            assert self.pinned_pool is not None
            pool_slice = self.pinned_pool[start_offset:end_offset]
            full_slab = pool_slice.view(torch.uint8).view(dtype)[:total_elements]

            # Create individual tensor views for each component - NO copying!
            tensor_views = []
            current_offset = 0
            for size in tensor_sizes:
                if size > 0:
                    tensor_view = full_slab[current_offset : current_offset + size]
                    tensor_views.append(tensor_view)
                    current_offset += size
                else:
                    # Zero-sized components get a detached empty tensor.
                    tensor_views.append(torch.empty(0, dtype=dtype, device="cpu"))

            return full_slab, tensor_views

    def allocate_slab_directly(
        self, num_elements: int, dtype: torch.dtype
    ) -> torch.Tensor:
        """Allocate a flat ``num_elements`` slab DIRECTLY from the pinned pool."""
        if num_elements == 0:
            return torch.empty(0, dtype=dtype, device="cpu").pin_memory()

        tensor_bytes = num_elements * dtype.itemsize

        with self.pool_lock:
            start_offset, end_offset = self._reserve(tensor_bytes)

            # Return direct view of pinned pool - NO copy needed!
            assert self.pinned_pool is not None
            pool_slice = self.pinned_pool[start_offset:end_offset]
            return pool_slice.view(torch.uint8).view(dtype)[:num_elements]

    def get_pinned_tensor_fast(self, cpu_tensor: torch.Tensor) -> torch.Tensor:
        """Copy ``cpu_tensor`` into pooled pinned memory and return the copy.

        Unlike the allocate_* methods this performs one host-to-host copy;
        the returned tensor aliases pool memory, not ``cpu_tensor``.
        """
        tensor_bytes = cpu_tensor.numel() * cpu_tensor.element_size()

        with self.pool_lock:
            start_offset, end_offset = self._reserve(tensor_bytes)

            # Get slice from pre-pinned pool and reshape to match the source.
            assert self.pinned_pool is not None
            pool_slice = self.pinned_pool[start_offset:end_offset]
            pinned_tensor = (
                pool_slice.view(torch.uint8)
                .view(cpu_tensor.dtype)[: cpu_tensor.numel()]
                .view(cpu_tensor.shape)
            )
            pinned_tensor.copy_(cpu_tensor)  # Fast copy into pre-pinned memory

            return pinned_tensor
|
||||
|
||||
|
||||
# Global ultra-fast pool - initialized ONCE in envs.py via set_global_pool();
# get_ultra_fast_pool() lazily creates one as a fallback if that never ran.
_ULTRA_FAST_POOL = None
_POOL_INIT_LOCK = threading.RLock()
|
||||
|
||||
|
||||
def set_global_pool(pool: UltraFastPinnedPool) -> None:
    """Install ``pool`` as the process-wide pool.

    Only the first call has any effect; once a pool is installed, later
    calls are silently ignored.
    """
    global _ULTRA_FAST_POOL
    with _POOL_INIT_LOCK:
        if _ULTRA_FAST_POOL is not None:
            return  # a pool is already installed - keep it
        _ULTRA_FAST_POOL = pool
|
||||
|
||||
|
||||
def get_ultra_fast_pool():
    """Return the process-wide pinned-memory pool.

    Normally the pool was installed earlier via set_global_pool(); as a
    fallback one is created lazily here (double-checked under the init
    lock).  Never returns None.
    """
    global _ULTRA_FAST_POOL
    pool = _ULTRA_FAST_POOL
    if pool is not None:
        return pool
    # Fallback path - shouldn't normally be reached.
    with _POOL_INIT_LOCK:
        if _ULTRA_FAST_POOL is None:
            _ULTRA_FAST_POOL = UltraFastPinnedPool()
        return _ULTRA_FAST_POOL
|
||||
|
||||
|
||||
# Main public interface with CPU caching and disk save/load
|
||||
def build_target_matched_slab(
    lora_model,
    target_modules,
    max_loras,
    lora_config,
    slab_path: str | None = None,
):
    """
    Build a slab that exactly matches the per-layer target shapes.
    Ultra-fast cached slab building with minimal overhead.
    Ensures perfect zero-copy during set_lora() and reuses slabs for identical LoRAs.

    Args:
        lora_model: The LoRA model to build slab for
        target_modules: Target modules dictionary
        max_loras: Maximum number of LoRAs
        lora_config: LoRA configuration
        slab_path: Optional path to save/load slab to/from disk

    Returns:
        Either a ``(slab, metadata)`` tuple on a memory-cache hit, or a
        string result key into ``_GLOBAL_RESULT_STORAGE`` on a fresh build.
        Callers must handle both shapes.

    NOTE(review): ``max_loras`` and ``slab_path`` are accepted but not used
    anywhere in this body - confirm whether disk save/load is still planned.
    """

    # Get TP info for cache key when fully_sharded=True
    fully_sharded = lora_config.fully_sharded_loras if lora_config else False
    tp_rank = None
    if fully_sharded and target_modules:
        first_module = next(iter(target_modules.values()), None)
        if first_module:
            # Assumes every target module shares one tp_rank - TODO confirm.
            tp_rank = getattr(first_module, "tp_rank", 0)

    cache_key = _generate_slab_cache_key(lora_model, "cpu", tp_rank, fully_sharded)

    # Get pre-initialized pool ONCE to avoid repeated calls
    pool = get_ultra_fast_pool()

    # Check CPU cache FIRST - if already on CPU, don't load again
    if cache_key in _GLOBAL_SLAB_CACHE:
        cached_slab, cached_metadata = _GLOBAL_SLAB_CACHE[cache_key]
        return cached_slab, cached_metadata

    # Only take lock if not in memory cache
    with _CACHE_LOCK:
        # Double-check pattern for thread safety
        if cache_key in _GLOBAL_SLAB_CACHE:
            cached_slab, cached_metadata = _GLOBAL_SLAB_CACHE[cache_key]
            return cached_slab, cached_metadata

    # NOTE(review): the lock is released here, so two threads that both miss
    # may build the same slab concurrently; the last cache store below wins.

    # Phase 1: flatten every lora_a / lora_b tensor (including per-expert
    # lists) and record a TensorInfo (name, shape, size, slab offset) each.
    all_flattened_tensors = []  # Direct collection of all flattened tensors
    global_metadata = SlabMetadata()
    current_global_offset = 0

    for module_name, module_lora in lora_model.loras.items():
        if module_lora is None:
            continue
        # Process lora_a
        if hasattr(module_lora, "lora_a") and module_lora.lora_a is not None:
            if isinstance(module_lora.lora_a, list):
                # Packed module: one tensor per expert/projection, may be None.
                for expert_idx, expert_tensor in enumerate(module_lora.lora_a):
                    if expert_tensor is not None:
                        all_flattened_tensors.append(expert_tensor.flatten())
                        tensor_info = TensorInfo(
                            f"{module_name}.lora_a.expert_{expert_idx}",
                            "a",
                            expert_tensor.shape,
                            expert_tensor.numel(),
                            current_global_offset,
                        )
                        global_metadata.tensor_infos.append(tensor_info)
                        current_global_offset += expert_tensor.numel()
            else:
                # Single tensor
                all_flattened_tensors.append(module_lora.lora_a.flatten())
                tensor_info = TensorInfo(
                    f"{module_name}.lora_a",
                    "a",
                    module_lora.lora_a.shape,
                    module_lora.lora_a.numel(),
                    current_global_offset,
                )
                global_metadata.tensor_infos.append(tensor_info)
                current_global_offset += module_lora.lora_a.numel()

        # Process lora_b (scaling already applied during packing for packed modules)
        if hasattr(module_lora, "lora_b") and module_lora.lora_b is not None:
            if isinstance(module_lora.lora_b, list):
                # NOTE(review): module_lora_b_count is accumulated but never
                # read afterwards - dead bookkeeping.
                module_lora_b_count = 0
                for expert_idx, expert_tensor in enumerate(module_lora.lora_b):
                    if expert_tensor is not None:
                        all_flattened_tensors.append(expert_tensor.flatten())
                        tensor_info = TensorInfo(
                            f"{module_name}.lora_b.expert_{expert_idx}",
                            "b",
                            expert_tensor.shape,
                            expert_tensor.numel(),
                            current_global_offset,
                        )
                        global_metadata.tensor_infos.append(tensor_info)
                        module_lora_b_count += expert_tensor.numel()
                        current_global_offset += expert_tensor.numel()
            else:
                # Single tensor
                all_flattened_tensors.append(module_lora.lora_b.flatten())
                tensor_info = TensorInfo(
                    f"{module_name}.lora_b",
                    "b",
                    module_lora.lora_b.shape,
                    module_lora.lora_b.numel(),
                    current_global_offset,
                )
                global_metadata.tensor_infos.append(tensor_info)
                current_global_offset += module_lora.lora_b.numel()

    # Phase 2: precompute a per-module extraction map of slab offsets so
    # GPU-side extraction later needs no scanning.
    extraction_map = {}
    lookup = {info.module_name: info for info in global_metadata.tensor_infos}

    for module_name, module_lora in lora_model.loras.items():
        if module_lora is None:
            continue
        # Check if module has list structure (packed MoE/QKV) or single tensor
        has_list = (
            isinstance(module_lora.lora_a, list)
            if hasattr(module_lora, "lora_a") and module_lora.lora_a is not None
            else False
        )
        if has_list:
            # Packed module with list - collect all expert tensor infos
            expert_tensors_a = []
            expert_tensors_b = []

            for expert_idx in range(len(module_lora.lora_a)):
                a_key = f"{module_name}.lora_a.expert_{expert_idx}"
                b_key = f"{module_name}.lora_b.expert_{expert_idx}"
                if a_key in lookup:
                    a_info = lookup[a_key]
                    expert_tensors_a.append(
                        (a_info.offset, a_info.size, a_info.shape)
                    )
                if b_key in lookup:
                    b_info = lookup[b_key]
                    expert_tensors_b.append(
                        (b_info.offset, b_info.size, b_info.shape)
                    )

            # Determine type based on module name
            # NOTE(review): list-style modules matching neither suffix get no
            # extraction entry, so they are skipped during activation.
            if module_name.endswith(".mlp.experts"):
                extraction_map[module_name] = (
                    "moe",
                    expert_tensors_a,
                    expert_tensors_b,
                )
            elif module_name.endswith(".qkv_proj"):
                extraction_map[module_name] = (
                    "qkv",
                    expert_tensors_a,
                    expert_tensors_b,
                )
        else:
            # Single tensor module
            lora_a_key = f"{module_name}.lora_a"
            lora_b_key = f"{module_name}.lora_b"

            if lora_a_key in lookup and lora_b_key in lookup:
                a_info = lookup[lora_a_key]
                b_info = lookup[lora_b_key]
                extraction_map[module_name] = (  # type: ignore[assignment]
                    "linear",
                    a_info.offset,
                    a_info.size,
                    a_info.shape,
                    b_info.offset,
                    b_info.size,
                    b_info.shape,
                )

    # Store extraction_map in metadata for zero-overhead extraction
    global_metadata.extraction_map = extraction_map
    if all_flattened_tensors:
        # Calculate tensor sizes for view allocation
        tensor_sizes = [t.numel() for t in all_flattened_tensors]
        total_elements = sum(tensor_sizes)
        global_metadata.total_size = total_elements

        # Allocate slab + individual views DIRECTLY in pinned pool - ZERO copy!
        # NOTE(review): dtype is hard-coded to bfloat16; copy_ below will cast
        # source tensors of other dtypes - confirm upstream always loads
        # weights as bfloat16.
        full_slab, tensor_views = pool.allocate_slab_views_directly(
            tensor_sizes, torch.bfloat16
        )

        for i, (source_tensor, view_tensor) in enumerate(
            zip(all_flattened_tensors, tensor_views)
        ):
            view_tensor.copy_(source_tensor)
    else:
        # Empty slab case
        full_slab, _ = pool.allocate_slab_views_directly([], torch.bfloat16)
        global_metadata.total_size = 0

    slab_tensor = full_slab
    metadata = global_metadata

    # Cache the built slab in memory
    with _CACHE_LOCK:
        _GLOBAL_SLAB_CACHE[cache_key] = (slab_tensor, metadata)

    # Touch the objects to ensure they're ready for return
    _ = slab_tensor.shape if hasattr(slab_tensor, "shape") else None
    _ = metadata.total_size if hasattr(metadata, "total_size") else None

    # Generate unique result key for this build
    result_key = f"slab_result_{cache_key}_{int(time.time() * 1000000)}"

    # Store large objects in global storage instead of returning them
    with _RESULT_LOCK:
        _GLOBAL_RESULT_STORAGE[result_key] = (slab_tensor, metadata)

    # Clear local references to large objects to prevent cleanup overhead
    slab_tensor = None  # type: ignore[assignment]
    metadata = None  # type: ignore[assignment]
    full_slab = None  # type: ignore[assignment]
    global_metadata = None  # type: ignore[assignment]
    all_flattened_tensors = None  # type: ignore[assignment]

    return result_key
|
||||
|
||||
|
||||
def extract_tensors_from_gpu_slab(gpu_slab, metadata, module_name):
    """Extract lora_a and lora_b tensors from GPU slab for a module.

    Uses the precomputed ``metadata.extraction_map`` so the extraction is a
    handful of zero-copy slab slices.

    Args:
        gpu_slab: Flat 1-D tensor holding every LoRA weight back-to-back.
        metadata: Object exposing ``extraction_map``, mapping module name to
            a ('linear', a_off, a_size, a_shape, b_off, b_size, b_shape)
            tuple or a ('moe'/'qkv', a_triples, b_triples) tuple whose
            triples are (offset, size, shape) entries into the slab.
        module_name: Module to extract weights for.

    Returns:
        (lora_a, lora_b) tensors for 'linear' entries, (list, list) for
        'moe'/'qkv' entries, or (None, None) when the module has no entry
        or the entry type is unrecognized.
    """
    extraction_info = metadata.extraction_map.get(module_name)
    if not extraction_info:
        return None, None

    extraction_type = extraction_info[0]

    if extraction_type == "linear":
        # tensor: ('linear', a_offset, a_size, a_shape, b_offset, b_size, b_shape)
        _, a_offset, a_size, a_shape, b_offset, b_size, b_shape = extraction_info
        lora_a = gpu_slab[a_offset : a_offset + a_size].view(a_shape)
        lora_b = gpu_slab[b_offset : b_offset + b_size].view(b_shape)
        return lora_a, lora_b

    if extraction_type in ("moe", "qkv"):
        # List of tensors: ('moe'/'qkv', expert_tensors_a, expert_tensors_b)
        _, expert_tensors_a, expert_tensors_b = extraction_info

        # Zero-copy views, one per (offset, size, shape) triple.
        lora_a_list = [
            gpu_slab[offset : offset + size].view(shape)
            for offset, size, shape in expert_tensors_a
        ]
        lora_b_list = [
            gpu_slab[offset : offset + size].view(shape)
            for offset, size, shape in expert_tensors_b
        ]
        return lora_a_list, lora_b_list

    # Unknown extraction type - treat as "no weights".
    return None, None
|
||||
|
||||
|
||||
def process_slab_activation_loop(
    modules_dict,
    lora_model,
    get_lora_layer_weights_fn,
    lora_config,
    gpu_slab,
    metadata,
    index,
):
    """Extract weights from GPU slab and activate.

    For every LoRA-capable module, pulls zero-copy weight views out of
    ``gpu_slab`` via the extraction map and installs them at LoRA slot
    ``index``; modules with no slab entry get their slot reset.

    Args:
        modules_dict: Mapping of module name -> LoRA-wrapped module.
        lora_model: Loaded LoRA model (currently not read here).
        get_lora_layer_weights_fn: Weight-lookup callback (currently unused).
        lora_config: LoRA configuration (currently unused).
        gpu_slab: Flat tensor holding all LoRA weights on the target device.
        metadata: SlabMetadata carrying the extraction map.
        index: LoRA slot index to activate into.

    Returns:
        Always True.

    NOTE(review): lora_model, get_lora_layer_weights_fn and lora_config are
    accepted but never used in this body - confirm they can be dropped from
    call sites or are kept for interface compatibility.
    """

    # Loop through model modules
    for module_name, module in modules_dict.items():
        lora_a_gpu, lora_b_gpu = extract_tensors_from_gpu_slab(
            gpu_slab, metadata, module_name
        )

        if lora_a_gpu is None or lora_b_gpu is None:
            # No weights for this module
            module.reset_lora(index)
            continue

        # Special case: MoE3D needs 2-item list format
        if isinstance(module, FusedMoE3DWithLoRA) and not isinstance(lora_a_gpu, list):
            gate_up_a, gate_up_b = extract_tensors_from_gpu_slab(
                gpu_slab, metadata, module_name + ".base_layer"
            )
            down_a, down_b = lora_a_gpu, lora_b_gpu

            # NOTE(review): if the base_layer lookup fails, set_lora below is
            # still called with the down-projection tensors only - confirm
            # that is the intended fallback for FusedMoE3DWithLoRA.
            if gate_up_a is not None and down_a is not None:
                lora_a_gpu = [gate_up_a, down_a]
                lora_b_gpu = [gate_up_b, down_b]
        module.set_lora(index, lora_a_gpu, lora_b_gpu)
    return True
|
||||
|
||||
|
||||
def check_slab_cache(lora_dir, peft_helper, target_lora_config, target_modules_dict):
    """Check if LoRAModel is already cached for this LoRA directory.

    Returns ``(True, model)`` on a hit, ``(False, None)`` otherwise.  Only
    ``lora_dir`` participates in the lookup; the remaining arguments are
    accepted for interface compatibility.
    """
    if not lora_dir:
        return False, None

    # Generate simple key based on lora_dir only
    cache_key = hashlib.md5(lora_dir.encode()).hexdigest()

    # Check LoRAModel cache (EAFP: single lookup under the lock).
    try:
        with _LORA_MODEL_CACHE_LOCK:
            cached_model = _GLOBAL_LORA_MODEL_CACHE[cache_key]
    except KeyError:
        logger.info("[SLAB_CACHE_MISS] No cached LoRAModel for %s", lora_dir)
        return False, None

    logger.info("[SLAB_CACHE_HIT] Found cached LoRAModel for %s", lora_dir)
    return True, cached_model
|
||||
|
||||
|
||||
def cache_lora_model(lora_dir, lora_model):
    """Store LoRAModel in cache for reuse.

    No-op when ``lora_dir`` is empty/None; the key is md5(lora_dir), matching
    check_slab_cache.
    """
    if not lora_dir:
        return

    key = hashlib.md5(lora_dir.encode()).hexdigest()

    with _LORA_MODEL_CACHE_LOCK:
        _GLOBAL_LORA_MODEL_CACHE[key] = lora_model
        logger.info("[SLAB_CACHE] Stored LoRAModel for %s", lora_dir)
|
||||
|
||||
|
||||
def get_cached_lora_model(cache_key):
    """Return the cached LoRAModel for ``cache_key``, or None when absent."""
    with _LORA_MODEL_CACHE_LOCK:
        if cache_key in _GLOBAL_LORA_MODEL_CACHE:
            return _GLOBAL_LORA_MODEL_CACHE[cache_key]
    return None
|
||||
|
||||
|
||||
def _generate_slab_cache_key(lora_model, device, tp_rank=None, fully_sharded=False):
|
||||
"""Generate cache key for LoRA slab - includes tp_rank when fully_sharded=True."""
|
||||
lora_dir = getattr(lora_model, "_lora_dir", None)
|
||||
|
||||
if not lora_dir:
|
||||
lora_dir = f"unknown_path_{lora_model.rank}_{len(lora_model.loras)}"
|
||||
|
||||
# Base key
|
||||
key_str = f"{lora_dir}|{lora_model.rank}|{len(lora_model.loras)}|{str(device)}"
|
||||
|
||||
# Include tp_rank when fully_sharded=True (each GPU has different slab)
|
||||
if fully_sharded and tp_rank is not None:
|
||||
key_str += f"|tp_rank_{tp_rank}"
|
||||
|
||||
cache_key = hashlib.md5(key_str.encode()).hexdigest()
|
||||
|
||||
return cache_key
|
||||
|
||||
|
||||
class TensorInfo:
    """Metadata describing one flattened tensor stored inside a slab."""

    def __init__(
        self,
        module_name: str,
        tensor_type: str,
        shape: tuple,
        size: int,
        offset: int = 0,
    ):
        # Fully-qualified name, e.g. "<module>.lora_a" or
        # "<module>.lora_b.expert_<i>" for packed modules.
        self.module_name = module_name
        # "a" for lora_a tensors, "b" for lora_b tensors (see call sites in
        # build_target_matched_slab).
        self.tensor_type = tensor_type
        self.shape = shape    # original (unflattened) tensor shape
        self.size = size      # element count of the flattened tensor
        self.offset = offset  # start element index within the slab
|
||||
|
||||
|
||||
class SlabMetadata:
    """Metadata for an entire slab, including precomputed extraction data."""

    def __init__(self):
        # One TensorInfo per flattened tensor, in slab order.
        self.tensor_infos: list[TensorInfo] = []
        # Total slab size in elements.
        self.total_size = 0
        # PERFORMANCE: precomputed per-module offset tuples so activation can
        # slice the slab without scanning tensor_infos.
        self.extraction_map: dict[str, tuple] = {}
|
||||
|
||||
|
||||
def create_slab_optimized_lora_model(
    lora_model_id: int,
    tensors: dict[str, torch.Tensor],
    peft_helper,
    device: str = "cuda",
    dtype: torch.dtype | None = None,
    embeddings: dict[str, torch.Tensor] | None = None,
    target_embedding_padding: int | None = None,
    embedding_modules: dict[str, str] | None = None,
    embedding_padding_modules: list[str] | None = None,
    weights_mapper=None,
    lora_dir: str | None = None,
    lora_config=None,
    target_modules_dict=None,
    target_lora_config=None,
    slab_path: str | None = None,
    packed_modules: dict | None = None,
    packed_modules_mapping: dict | None = None,
):
    """Create a LoRAModel with target-aware slab.

    Builds per-module LoRA weights from raw checkpoint ``tensors``, packs
    merged modules (QKV / MoE experts), optionally shards lora_b for tensor
    parallelism, then builds a CPU slab via build_target_matched_slab and
    attaches it to the model instance.

    Returns:
        ``(lora_model_instance, None, metadata)`` - or ``(model, None, None)``
        when CUDA is unavailable.

    NOTE(review): device, embeddings, embedding_modules, lora_config and
    packed_modules_mapping are accepted but never used in this body.
    """
    # NOTE(review): get_ultra_fast_pool() never returns None (it lazily
    # creates a pool), so this branch is effectively dead code.
    if get_ultra_fast_pool() is None:
        pool = UltraFastPinnedPool()
        set_global_pool(pool)
    # Create LoRA weights as normal
    loras: dict[str, LoRALayerWeights] = {}

    for tensor_name, tensor in tensors.items():
        module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name, weights_mapper)

        if module_name not in loras:
            loras[module_name] = LoRALayerWeights.from_config(module_name, peft_helper)
        if is_lora_a:
            loras[module_name].lora_a = tensor.to(
                dtype=dtype
            )  # Keep on CPU for slab building
        else:
            loras[module_name].lora_b = tensor.to(
                dtype=dtype
            )  # Keep on CPU for slab building

            # Pad embedding lora_b rows up to the target vocab size so slab
            # shapes match the (padded) embedding layers.
            assert embedding_padding_modules is not None
            if (
                any(name in module_name for name in embedding_padding_modules)
                and target_embedding_padding is not None
            ):
                lora_b = loras[module_name].lora_b
                assert target_embedding_padding >= lora_b.shape[0]
                addition = target_embedding_padding - lora_b.shape[0]
                loras[module_name].lora_b = torch.nn.functional.pad(
                    lora_b, (0, 0, 0, addition)
                )

    # Create the LoRA model instance
    from vllm.lora.lora_model import LoRAModel

    lora_model_instance = LoRAModel(lora_model_id, peft_helper.r, loras)

    # Store the LoRA directory path for cache key generation
    if lora_dir:
        lora_model_instance._lora_dir = lora_dir  # type: ignore[attr-defined]

    if packed_modules and len(packed_modules) > 0:
        # Helper function to get lora weights (simplified version without model context)
        def get_lora_weights(lora_model, module_name):
            return lora_model.loras.get(module_name, None)

        # Pack modules similar to _create_merged_loras_inplace
        for module_name, new_module_names in packed_modules.items():
            replacement_loras: list[LoRALayerWeights | None] = []
            replaced_module: set[str] = set()
            has_replacement = False

            # Collect individual projections
            for r in new_module_names:
                lora = get_lora_weights(lora_model_instance, r)
                replacement_loras.append(lora)
                if lora:
                    has_replacement = True
                    replaced_module.add(r)

            if not has_replacement:
                continue

            # Ensure None values are explicit
            for i in range(len(replacement_loras)):
                if not replacement_loras[i]:
                    replacement_loras[i] = None

            # Pack based on module type
            if module_name.endswith(".experts"):
                lora_model_instance.loras[module_name] = (
                    PackedLoRALayerWeights.pack_moe(
                        replacement_loras,
                        module_name,
                    )
                )
            else:
                lora_model_instance.loras[module_name] = PackedLoRALayerWeights.pack(
                    replacement_loras
                )
            # Remove individual projections
            for module in replaced_module:
                lora_model_instance.loras.pop(module, None)

    else:
        logger.warning(
            "[SLAB_PRE_PACK] No packed_modules provided - "
            "slab will build with unpacked structure"
        )

    # TP SHARDING: Shard lora_b weights on CPU if fully_sharded_loras=True
    fully_sharded = (
        target_lora_config.fully_sharded_loras if target_lora_config else False
    )
    if fully_sharded and target_modules_dict:
        logger.info(
            "[SLAB_TP_SHARD] fully_sharded_loras=True, sharding lora_b weights on CPU"
        )

        for module_name, module_lora in lora_model_instance.loras.items():
            target_module = target_modules_dict.get(module_name)
            if not target_module:
                continue

            tp_rank = getattr(target_module, "tp_rank", 0)
            tp_size = getattr(target_module, "tp_size", 1)

            if (
                tp_size > 1
                and hasattr(module_lora, "lora_b")
                and module_lora.lora_b is not None
            ):
                if isinstance(module_lora.lora_b, list):
                    # MoE: shard each expert's lora_b
                    sharded_experts = []
                    for expert_idx, expert_b in enumerate(module_lora.lora_b):
                        if expert_b is not None:
                            shards = expert_b.chunk(tp_size, dim=0)
                            sharded_experts.append(shards[tp_rank])
                        else:
                            sharded_experts.append(None)
                    module_lora.lora_b = sharded_experts
                else:
                    # Single tensor: shard once
                    shards = module_lora.lora_b.chunk(tp_size, dim=0)
                    module_lora.lora_b = shards[tp_rank]

    result_key = build_target_matched_slab(
        lora_model_instance, target_modules_dict, 1, target_lora_config, slab_path
    )

    # Handle different return types (cache key vs. direct objects for cache hits)
    if isinstance(result_key, str) and result_key.startswith("slab_result_"):
        slab, metadata = _GLOBAL_RESULT_STORAGE[result_key]
        # Clean up the temporary storage
        del _GLOBAL_RESULT_STORAGE[result_key]

    else:
        slab, metadata = result_key

    if not torch.cuda.is_available():
        # Return tuple for consistency even without GPU
        return lora_model_instance, None, None

    lora_model_instance._cached_cpu_slab = slab  # type: ignore[attr-defined]
    lora_model_instance._cached_metadata = metadata  # type: ignore[attr-defined]
    lora_model_instance._loras_dict = loras  # type: ignore[attr-defined]

    # Return CPU slab reference for now - GPU slab created during activation
    return lora_model_instance, None, metadata
|
||||
@ -7,6 +7,7 @@ from typing import Any, Literal
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.envs import SLAB_OPTIMIZATION
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.lora_model import LoRAModel
|
||||
from vllm.lora.model_manager import (
|
||||
@ -34,10 +35,12 @@ class WorkerLoRAManager:
|
||||
vllm_config: VllmConfig,
|
||||
device: torch.device,
|
||||
embedding_modules: dict[str, str],
|
||||
embedding_padding_modules: list[str],
|
||||
lora_model_cls: type[LoRAModel] = LoRAModel,
|
||||
):
|
||||
self._lora_model_cls = lora_model_cls
|
||||
self.embedding_modules = embedding_modules
|
||||
self.embedding_padding_modules = embedding_padding_modules
|
||||
self._cached_dummy_lora: None | Literal[False] | LoRAModel = False
|
||||
self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
|
||||
self.max_num_batched_tokens = (
|
||||
@ -82,7 +85,37 @@ class WorkerLoRAManager:
|
||||
self._adapter_manager = lora_manager
|
||||
return lora_manager.model
|
||||
|
||||
def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
|
||||
def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel | None:
|
||||
if SLAB_OPTIMIZATION:
|
||||
lora_path = get_adapter_absolute_path(lora_request.lora_path)
|
||||
# Check for dummy/fake warmup paths
|
||||
if (
|
||||
"/not/a/real/path" in lora_path
|
||||
or "warmup" in lora_request.lora_name.lower()
|
||||
or not lora_path
|
||||
or lora_path == "/not/a/real/path"
|
||||
):
|
||||
logger.warning(
|
||||
"[SLAB_OPTIMIZATION] Skipping dummy warmup LoRA: %s "
|
||||
"(path: %s) - not needed with slab optimization",
|
||||
lora_request.lora_name,
|
||||
lora_path,
|
||||
)
|
||||
return None
|
||||
|
||||
# Check if adapter_config.json exists for real LoRAs
|
||||
import os
|
||||
|
||||
lora_config_path = os.path.join(lora_path, "adapter_config.json")
|
||||
if not os.path.exists(lora_config_path):
|
||||
logger.warning(
|
||||
"[SLAB_OPTIMIZATION] Skipping LoRA %s - "
|
||||
"adapter_config.json not found at %s, likely dummy warmup",
|
||||
lora_request.lora_name,
|
||||
lora_config_path,
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
supported_lora_modules = self._adapter_manager.supported_lora_modules
|
||||
packed_modules_mapping = self._adapter_manager.packed_modules_mapping
|
||||
@ -111,6 +144,41 @@ class WorkerLoRAManager:
|
||||
# to ensure correct loading of lora weights.
|
||||
model = self._adapter_manager.model
|
||||
hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None)
|
||||
# Get target modules, lora_config, AND packed_modules info
|
||||
target_modules_dict: dict | None = None
|
||||
target_lora_config: Any = None
|
||||
packed_modules_dict: dict | None = None
|
||||
packed_modules_map: dict | None = None
|
||||
|
||||
if SLAB_OPTIMIZATION and hasattr(self, "_adapter_manager"):
|
||||
target_modules_dict = getattr(self._adapter_manager, "modules", None)
|
||||
target_lora_config = getattr(self._adapter_manager, "lora_config", None)
|
||||
packed_modules_dict = getattr(
|
||||
self._adapter_manager, "packed_modules", None
|
||||
)
|
||||
packed_modules_map = getattr(
|
||||
self._adapter_manager, "packed_modules_mapping", None
|
||||
)
|
||||
|
||||
if target_modules_dict and target_lora_config:
|
||||
logger.debug(
|
||||
"[SLAB_OPTIMIZATION] Passing %d target modules and "
|
||||
"lora_config (fully_sharded_loras=%s) to LoRA creation",
|
||||
len(target_modules_dict),
|
||||
target_lora_config.fully_sharded_loras,
|
||||
)
|
||||
logger.debug(
|
||||
"[SLAB_OPTIMIZATION] Passing %d packed_modules for "
|
||||
"pre-slab packing",
|
||||
len(packed_modules_dict) if packed_modules_dict else 0,
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"[SLAB_OPTIMIZATION] Missing target info - "
|
||||
"modules: %s, lora_config: %s",
|
||||
target_modules_dict is not None,
|
||||
target_lora_config is not None,
|
||||
)
|
||||
|
||||
lora = self._lora_model_cls.from_local_checkpoint(
|
||||
lora_path,
|
||||
@ -120,8 +188,15 @@ class WorkerLoRAManager:
|
||||
device="cpu",
|
||||
dtype=self.lora_config.lora_dtype,
|
||||
model_vocab_size=self.vocab_size,
|
||||
embedding_modules=self.embedding_modules,
|
||||
embedding_padding_modules=self.embedding_padding_modules,
|
||||
tensorizer_config_dict=lora_request.tensorizer_config_dict,
|
||||
weights_mapper=hf_to_vllm_mapper,
|
||||
target_modules_dict=target_modules_dict,
|
||||
target_lora_config=target_lora_config,
|
||||
slab_path=lora_request.slab_path,
|
||||
packed_modules=packed_modules_dict,
|
||||
packed_modules_mapping=packed_modules_map,
|
||||
)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
@ -184,6 +259,13 @@ class WorkerLoRAManager:
|
||||
if adapter_request.adapter_id in self.list_adapters():
|
||||
return False
|
||||
loaded_adapter = self._load_adapter(adapter_request)
|
||||
if loaded_adapter is None:
|
||||
# Dummy warmup LoRA was skipped under SLAB_OPTIMIZATION
|
||||
logger.debug(
|
||||
"[SLAB_OPTIMIZATION] Skipped dummy LoRA: %s",
|
||||
adapter_request.lora_name,
|
||||
)
|
||||
return False
|
||||
loaded = self._adapter_manager.add_adapter(loaded_adapter)
|
||||
self._adapter_manager.activate_adapter(loaded_adapter.id)
|
||||
return loaded
|
||||
@ -250,6 +332,13 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
|
||||
# This may cause the # of loaded lora adapters to very temporarily
|
||||
# exceed `--max-cpu-loras`.
|
||||
lora = self._load_adapter(lora_request)
|
||||
if lora is None:
|
||||
# Dummy warmup LoRA was skipped under SLAB_OPTIMIZATION
|
||||
logger.debug(
|
||||
"[SLAB_OPTIMIZATION] Skipped dummy LoRA: %s",
|
||||
lora_request.lora_name,
|
||||
)
|
||||
return False
|
||||
|
||||
# Loading succeeded, now check if we will exceed cache capacity and
|
||||
# evict if the oldest adapter if so
|
||||
|
||||
@ -5,6 +5,7 @@ Define LoRA functionality mixin for model runners.
|
||||
"""
|
||||
|
||||
from contextlib import contextmanager
|
||||
from typing import TypeAlias
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -20,7 +21,7 @@ from vllm.model_executor.models import supports_lora, supports_multimodal
|
||||
from vllm.v1.worker.gpu_input_batch import InputBatch as GPUInputBatch
|
||||
from vllm.v1.worker.tpu_input_batch import InputBatch as TPUInputBatch
|
||||
|
||||
InputBatch = TPUInputBatch | GPUInputBatch
|
||||
InputBatch: TypeAlias = TPUInputBatch | GPUInputBatch
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -43,6 +44,7 @@ class LoRAModelRunnerMixin:
|
||||
vllm_config,
|
||||
device,
|
||||
model.embedding_modules,
|
||||
getattr(model, "embedding_padding_modules", []),
|
||||
)
|
||||
return self.lora_manager.create_lora_manager(model)
|
||||
|
||||
@ -80,7 +82,9 @@ class LoRAModelRunnerMixin:
|
||||
token_lora_mapping: tuple[int, ...] # of size np.sum(num_scheduled_tokens)
|
||||
lora_requests: set[LoRARequest]
|
||||
prompt_lora_mapping, token_lora_mapping, lora_requests = (
|
||||
input_batch.make_lora_inputs(num_scheduled_tokens, num_sampled_tokens)
|
||||
input_batch.make_lora_inputs( # type: ignore[attr-defined]
|
||||
num_scheduled_tokens, num_sampled_tokens
|
||||
)
|
||||
)
|
||||
return self._set_active_loras(
|
||||
prompt_lora_mapping, token_lora_mapping, lora_requests
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user