[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
This commit is contained in:
Sanger Steel 2025-05-22 21:44:18 -04:00 committed by GitHub
parent c91fe7b1b9
commit c32e249a23
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 606 additions and 197 deletions

View File

@ -128,6 +128,7 @@ steps:
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min

View File

@ -6,11 +6,12 @@ import json
import os
import uuid
from vllm import LLM
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
TensorizerConfig,
tensorize_vllm_model)
from vllm.lora.request import LoRARequest
from vllm.model_executor.model_loader.tensorizer import (
TensorizerArgs, TensorizerConfig, tensorize_lora_adapter,
tensorize_vllm_model)
from vllm.utils import FlexibleArgumentParser
# yapf conflicts with isort for this docstring
@ -27,7 +28,7 @@ https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:
python -m examples.other.tensorize_vllm_model \
python examples/other/tensorize_vllm_model.py \
--model facebook/opt-125m \
serialize \
--serialized-directory s3://my-bucket \
@ -47,7 +48,7 @@ providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root
level of this repository:
python -m examples.other.tensorize_vllm_model \
python examples/other/tensorize_vllm_model.py \
--model EleutherAI/gpt-j-6B \
--dtype float16 \
deserialize \
@ -69,7 +70,7 @@ For more information on the available arguments for serializing, run
Or for deserializing:
`python -m examples.other.tensorize_vllm_model deserialize --help`.
`python examples/other/tensorize_vllm_model.py deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:
@ -90,11 +91,27 @@ TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:
`python -m examples.other.tensorize_vllm_model deserialize --help`
`python examples/other/tensorize_vllm_model.py deserialize --help`
under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter
can be serialized directly with the path to the LoRA adapter on HF Hub and
a TensorizerConfig object. In this script, passing an HF id for a LoRA adapter
will serialize the LoRA adapter artifacts to `--serialized-directory`.
You can then use the LoRA adapter with `vllm serve` by ensuring the LoRA
artifacts are in your model artifacts directory and specifying
`--enable-lora`. For instance:
```
vllm serve <model_path> \
--load-format tensorizer \
--model-loader-extra-config '{"tensorizer_uri": "<model_path>.tensors"}' \
--enable-lora
```
"""
@ -107,6 +124,19 @@ def parse_args():
"also supported, although libsodium must be installed to "
"use it.")
parser = EngineArgs.add_cli_args(parser)
parser.add_argument(
"--lora-path",
type=str,
required=False,
help="Path to a LoRA adapter to "
"serialize along with model tensors. This can then be deserialized "
"along with the model by passing a tensorizer_config kwarg to "
"LoRARequest with type TensorizerConfig. See the docstring for this "
"for a usage example."
)
subparsers = parser.add_subparsers(dest='command')
serialize_parser = subparsers.add_parser(
@ -169,11 +199,42 @@ def parse_args():
def deserialize():
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config
)
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config,
enable_lora=True,
)
sampling_params = SamplingParams(
temperature=0,
max_tokens=256,
stop=["[/assistant]"]
)
# Truncating this as the extra text isn't necessary
prompts = [
"[user] Write a SQL query to answer the question based on ..."
]
# Test LoRA load
print(
llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest("sql-lora",
1,
args.lora_path,
tensorizer_config = tensorizer_config)
)
)
else:
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config
)
return llm
@ -197,7 +258,10 @@ if __name__ == '__main__':
model_name = model_ref.split("/")[1]
keyfile = args.keyfile if args.keyfile else None
if args.command == "serialize" or args.command == "deserialize":
keyfile = args.keyfile
else:
keyfile = None
if args.model_loader_extra_config:
config = json.loads(args.model_loader_extra_config)
@ -228,6 +292,10 @@ if __name__ == '__main__':
encryption_keyfile=keyfile,
**credentials)
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
tensorize_lora_adapter(args.lora_path, tensorizer_config)
tensorize_vllm_model(engine_args, tensorizer_config)
elif args.command == "deserialize":

View File

@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
import gc
import json
import tempfile
import openai
import pytest
import pytest_asyncio
import torch.cuda
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model)
from ...utils import RemoteOpenAIServer
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
LORA_PATH = "davzoku/finqa_adapter_1b"
def _cleanup():
    """Free GPU memory between tests.

    Runs a full garbage collection first so dangling tensor references are
    released before torch empties its CUDA caching allocator.
    """
    gc.collect()
    torch.cuda.empty_cache()
@pytest.fixture(autouse=True)
def cleanup():
    # Autouse: free GPU memory before every test in this module.
    _cleanup()
@pytest.fixture(scope='module')
def tmp_dir():
    # Module-scoped scratch directory shared by the serialization fixtures;
    # deleted automatically when the module's tests finish.
    with tempfile.TemporaryDirectory() as path:
        yield path
@pytest.fixture(scope='module')
def model_uri(tmp_dir):
    # Path where the serialized model tensors are written and later read.
    yield f"{tmp_dir}/model.tensors"
@pytest.fixture(scope="module")
def tensorize_model_and_lora(tmp_dir, model_uri):
    """Serialize the test model and its LoRA adapter once per module."""
    tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri,
                                         lora_dir=tmp_dir)
    args = EngineArgs(model=MODEL_NAME, device="cuda")

    tensorize_lora_adapter(LORA_PATH, tensorizer_config)
    tensorize_vllm_model(args, tensorizer_config)

    # Manually invoke a _cleanup() here, as the cleanup()
    # fixture won't be guaranteed to be called after this
    # when this fixture is used for a test
    _cleanup()
    yield
@pytest.fixture(scope="module")
def server(model_uri, tensorize_model_and_lora):
    # Depends on tensorize_model_and_lora so the artifacts exist before the
    # server boots with load_format=tensorizer and LoRA enabled.
    model_loader_extra_config = {
        "tensorizer_uri": model_uri,
    }

    ## Start OpenAI API server
    args = [
        "--load-format", "tensorizer", "--device", "cuda",
        "--model-loader-extra-config",
        json.dumps(model_loader_extra_config), "--enable-lora"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    # Async OpenAI client bound to the test server; closed on teardown.
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Smoke-test one completion against the tensorizer-loaded model."""
    _cleanup()
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    # Exactly one choice, cut off by the max_tokens budget.
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.model == MODEL_NAME
    assert len(completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)

View File

@ -1,12 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
import subprocess
import sys
from typing import Union
import pytest
import ray
import vllm
from vllm import LLM
from vllm.lora.request import LoRARequest
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from ..utils import create_new_process_for_each_test, multi_gpu_test
from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora):
pass
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
def do_sample(llm: vllm.LLM,
lora_path: str,
lora_id: int,
tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
prompts = [
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
@ -45,15 +53,28 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501
]
sampling_params = vllm.SamplingParams(temperature=0,
max_tokens=256,
skip_special_tokens=False,
stop=["[/assistant]"])
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
if tensorizer_config_dict is not None:
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(
str(lora_id),
lora_id,
lora_path,
tensorizer_config_dict=tensorizer_config_dict)
if lora_id else None)
else:
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts
def generate_and_test(llm, sql_lora_files):
def generate_and_test(llm,
sql_lora_files,
tensorizer_config_dict: Union[dict, None] = None):
print("lora adapter created")
assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 1")
assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT
assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT
print("no lora")
assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 2")
assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT
assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=2) == EXPECTED_LORA_OUTPUT
print("removing lora")
@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
enable_chunked_prefill=True,
)
generate_and_test(llm, sql_lora_files)
@multi_gpu_test(num_gpus=2)
@create_new_process_for_each_test()
def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
                                            sql_lora_huggingface_id):
    """Serialize a TP=2 model plus a LoRA adapter via the example script,
    then deserialize both and compare LoRA / no-LoRA generations."""
    # Run the tensorizing of the LoRA adapter and the model in a subprocess
    # to guarantee cleanup

    tp_size = 2
    # Per-rank filename template; tensorizer substitutes the shard rank.
    model_name = "model-rank-%03d.tensors"

    model_ref = MODEL_PATH
    lora_path = sql_lora_huggingface_id
    suffix = "test"
    try:
        result = subprocess.run([
            sys.executable,
            f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model",
            MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
            str(tp_size), "serialize", "--serialized-directory",
            str(tmp_path), "--suffix", suffix
        ],
                                check=True,
                                capture_output=True,
                                text=True)
    except subprocess.CalledProcessError as e:
        # Surface the subprocess logs before re-raising to fail the test.
        print("Tensorizing failed.")
        print("STDOUT:\n", e.stdout)
        print("STDERR:\n", e.stderr)
        raise

    print("STDOUT:\n", result.stdout)

    # Directory layout produced by the example script:
    # <tmp_path>/vllm/<model_ref>/<suffix>/<model_name>
    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
    # LoRA artifacts were serialized next to the model tensors.
    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir

    loaded_vllm_model = LLM(model=model_ref,
                            load_format="tensorizer",
                            enable_lora=True,
                            enforce_eager=True,
                            model_loader_extra_config=tensorizer_config,
                            max_num_seqs=13,
                            tensor_parallel_size=2,
                            max_loras=2)

    tensorizer_config_dict = tensorizer_config.to_dict()

    print("lora adapter created")
    assert do_sample(loaded_vllm_model,
                     sql_lora_files,
                     tensorizer_config_dict=tensorizer_config_dict,
                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT

    print("lora 1")
    assert do_sample(loaded_vllm_model,
                     sql_lora_files,
                     tensorizer_config_dict=tensorizer_config_dict,
                     lora_id=1) == EXPECTED_LORA_OUTPUT

View File

@ -5,14 +5,6 @@ from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Tensorizer only tested on V0 so far.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.fixture(autouse=True)
def cleanup():
cleanup_dist_env_and_memory(shutdown_ray=True)

View File

@ -1,17 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
import gc
import json
import os
import pathlib
import subprocess
from functools import partial
from unittest.mock import MagicMock, patch
import openai
import pytest
import torch
from huggingface_hub import snapshot_download
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
@ -22,12 +18,11 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
is_vllm_tensorized,
load_with_tensorizer,
open_stream,
serialize_vllm_model,
tensorize_vllm_model)
# yapf: enable
from vllm.utils import PlaceholderModule, import_from_path
from vllm.utils import PlaceholderModule
from ..utils import VLLM_PATH, RemoteOpenAIServer
from ..utils import VLLM_PATH
try:
from tensorizer import EncryptionParams
@ -103,6 +98,7 @@ def test_can_deserialize_s3(vllm_runner):
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_deserialized_encrypted_vllm_model_has_same_outputs(
vllm_runner, tmp_path):
args = EngineArgs(model=model_ref)
with vllm_runner(model_ref) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors")
key_path = tmp_path / (model_ref + ".key")
@ -110,15 +106,13 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
outputs = vllm_model.generate(prompts, sampling_params)
config_for_serializing = TensorizerConfig(tensorizer_uri=model_path,
encryption_keyfile=key_path)
config_for_serializing = TensorizerConfig(tensorizer_uri=str(model_path),
encryption_keyfile=str(key_path))
vllm_model.apply_model(
partial(serialize_vllm_model,
tensorizer_config=config_for_serializing))
tensorize_vllm_model(args, config_for_serializing)
config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
encryption_keyfile=key_path)
config_for_deserializing = TensorizerConfig(
tensorizer_uri=str(model_path), encryption_keyfile=str(key_path))
with vllm_runner(model_ref,
load_format="tensorizer",
@ -154,113 +148,46 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
assert outputs == deserialized_outputs
def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
multilora_inference = import_from_path(
"examples.offline_inference.multilora_inference",
EXAMPLES_PATH / "offline_inference/multilora_inference.py",
)
model_ref = "meta-llama/Llama-2-7b-hf"
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
test_prompts = multilora_inference.create_test_prompts(lora_path)
# Serialize model before deserializing and binding LoRA adapters
with vllm_runner(model_ref) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors")
vllm_model.apply_model(
partial(
serialize_vllm_model,
tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))
with vllm_runner(
model_ref,
load_format="tensorizer",
model_loader_extra_config=TensorizerConfig(
tensorizer_uri=model_path,
num_readers=1,
),
enable_lora=True,
max_loras=1,
max_lora_rank=8,
max_cpu_loras=2,
max_num_seqs=50,
max_model_len=1000,
) as loaded_vllm_model:
multilora_inference.process_requests(
loaded_vllm_model.model.llm_engine, test_prompts)
assert loaded_vllm_model
def test_load_without_tensorizer_load_format(vllm_runner):
def test_load_without_tensorizer_load_format(vllm_runner, capfd):
model = None
with pytest.raises(ValueError):
try:
model = vllm_runner(
model_ref,
model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
del model
gc.collect()
torch.cuda.empty_cache()
except RuntimeError:
out, err = capfd.readouterr()
combined_output = out + err
assert ("ValueError: Model loader extra config "
"is not supported for load "
"format LoadFormat.AUTO") in combined_output
finally:
del model
gc.collect()
torch.cuda.empty_cache()
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
## Serialize model
with vllm_runner(model_ref) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors")
vllm_model.apply_model(
partial(
serialize_vllm_model,
tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))
model_loader_extra_config = {
"tensorizer_uri": str(model_path),
}
## Start OpenAI API server
openai_args = [
"--dtype",
"float16",
"--load-format",
"tensorizer",
"--model-loader-extra-config",
json.dumps(model_loader_extra_config),
]
with RemoteOpenAIServer(model_ref, openai_args) as server:
print("Server ready.")
client = server.get_client()
completion = client.completions.create(model=model_ref,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
assert completion.id is not None
assert len(completion.choices) == 1
assert len(completion.choices[0].text) >= 5
assert completion.choices[0].finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11)
def test_raise_value_error_on_invalid_load_format(vllm_runner):
def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd):
model = None
with pytest.raises(ValueError):
try:
model = vllm_runner(
model_ref,
load_format="safetensors",
model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
del model
gc.collect()
torch.cuda.empty_cache()
except RuntimeError:
out, err = capfd.readouterr()
combined_output = out + err
assert ("ValueError: Model loader extra config is not supported "
"for load format LoadFormat.SAFETENSORS") in combined_output
finally:
del model
gc.collect()
torch.cuda.empty_cache()
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
def test_tensorizer_with_tp_path_without_template(vllm_runner):
with pytest.raises(ValueError):
def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd):
try:
model_ref = "EleutherAI/pythia-1.4b"
tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
@ -275,6 +202,13 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner):
tensor_parallel_size=2,
disable_custom_all_reduce=True,
)
except RuntimeError:
out, err = capfd.readouterr()
combined_output = out + err
assert ("ValueError: For a sharded model, tensorizer_uri "
"should include a string format template like '%04d' "
"to be formatted with the rank "
"of the shard") in combined_output
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
@ -288,7 +222,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
enforce_eager=True,
) as base_model:
outputs = base_model.generate(prompts, sampling_params)
base_model.model.llm_engine.model_executor.shutdown()
# load model with two shards and serialize with encryption
model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
@ -296,7 +229,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
tensorizer_config = TensorizerConfig(
tensorizer_uri=model_path,
encryption_keyfile=key_path,
encryption_keyfile=str(key_path),
)
tensorize_vllm_model(
@ -331,14 +264,13 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
model_ref = "facebook/opt-125m"
model_path = tmp_path / (model_ref + ".tensors")
config = TensorizerConfig(tensorizer_uri=str(model_path))
args = EngineArgs(model=model_ref, device="cuda")
with vllm_runner(model_ref) as vllm_model:
outputs = vllm_model.generate(prompts, sampling_params)
vllm_model.apply_model(
partial(serialize_vllm_model, tensorizer_config=config))
assert is_vllm_tensorized(config)
tensorize_vllm_model(args, config)
assert is_vllm_tensorized(config)
with vllm_runner(model_ref,
load_format="tensorizer",

View File

@ -1195,8 +1195,7 @@ class EngineArgs:
#############################################################
# Unsupported Feature Flags on V1.
if (self.load_format == LoadFormat.TENSORIZER.value
or self.load_format == LoadFormat.SHARDED_STATE.value):
if self.load_format == LoadFormat.SHARDED_STATE.value:
_raise_or_fallback(
feature_name=f"--load_format {self.load_format}",
recommend_to_remove=False)

View File

@ -29,6 +29,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor,
get_supported_lora_modules,
is_regex_target_modules,
parse_fine_tuned_lora_name, replace_submodule)
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
from vllm.model_executor.models.interfaces import is_pooling_model
from vllm.model_executor.models.module_mapping import MultiModelKeys
@ -185,19 +186,19 @@ class LoRAModel(AdapterModel):
@classmethod
def from_local_checkpoint(
cls,
lora_dir: str,
expected_lora_modules: list[str],
peft_helper: PEFTHelper,
*,
lora_model_id: Optional[int] = None,
device: str = "cuda",
dtype: Optional[torch.dtype] = None,
target_embedding_padding: Optional[int] = None,
embedding_modules: Optional[dict[str, str]] = None,
embedding_padding_modules: Optional[list[str]] = None,
weights_mapper: Optional[WeightsMapper] = None,
) -> "LoRAModel":
cls,
lora_dir: str,
expected_lora_modules: list[str],
peft_helper: PEFTHelper,
*,
lora_model_id: Optional[int] = None,
device: str = "cuda",
dtype: Optional[torch.dtype] = None,
target_embedding_padding: Optional[int] = None,
embedding_modules: Optional[dict[str, str]] = None,
embedding_padding_modules: Optional[list[str]] = None,
weights_mapper: Optional[WeightsMapper] = None,
tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel":
"""Create a LoRAModel from a local checkpoint.
Args:
@ -219,10 +220,36 @@ class LoRAModel(AdapterModel):
lora_dir, "new_embeddings.safetensors")
new_embeddings_bin_file_path = os.path.join(lora_dir,
"new_embeddings.bin")
tensors: dict[str, torch.Tensor] = {}
unexpected_modules: list[Union[list[str], str]] = []
unexpected_modules: list[Union[list[str], str]]
if os.path.isfile(lora_tensor_path):
tensors: dict[str, torch.Tensor] = {}
def check_unexpected_modules(modules: dict):
for lora_module in modules.keys(): # noqa
module_name, _, _ = parse_fine_tuned_lora_name(
lora_module, weights_mapper)
part_name = module_name.split(".")[-1]
if part_name not in expected_lora_modules:
unexpected_modules.append(module_name)
if unexpected_modules:
raise ValueError(
f"While loading {lora_dir}, expected"
f" target modules in {expected_lora_modules}"
f" but received {unexpected_modules}."
f" Please verify that the loaded LoRA module is correct")
if tensorizer_config_dict:
from tensorizer import TensorDeserializer
tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
lora_tensor_path = os.path.join(tensorizer_config.tensorizer_dir,
"adapter_model.tensors")
tensorizer_args = tensorizer_config._construct_tensorizer_args()
tensors = TensorDeserializer(lora_tensor_path,
dtype=tensorizer_config.dtype,
**tensorizer_args.deserializer_params)
check_unexpected_modules(tensors)
elif os.path.isfile(lora_tensor_path):
# Find unexpected modules.
# Use safetensor key as a source of truth to find expected modules.
# in peft if you have target_modules A, B, C and C does not exist
@ -232,20 +259,8 @@ class LoRAModel(AdapterModel):
unexpected_modules = []
with safetensors.safe_open(lora_tensor_path,
framework="pt") as f: # type: ignore
for lora_module in f.keys(): # noqa
module_name, _, _ = parse_fine_tuned_lora_name(
lora_module, weights_mapper)
part_name = module_name.split(".")[-1]
if part_name not in expected_lora_modules:
unexpected_modules.append(module_name)
if unexpected_modules:
raise ValueError(
f"While loading {lora_dir}, expected"
f" target modules in {expected_lora_modules}"
f" but received {unexpected_modules}."
f" Please verify that the loaded LoRA module is correct"
)
# Load tensors if there are only expected modules.
check_unexpected_modules(f)
for module in f.keys(): # noqa
tensors[module] = f.get_tensor(module)
elif os.path.isfile(lora_bin_file_path):

View File

@ -10,6 +10,7 @@ from typing import Literal, Optional, Union
from vllm.config import LoRAConfig
from vllm.logger import init_logger
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
logger = init_logger(__name__)
@ -89,12 +90,31 @@ class PEFTHelper:
return cls(**filtered_dict)
@classmethod
def from_local_dir(cls, lora_path: str,
max_position_embeddings: Optional[int]) -> "PEFTHelper":
def from_local_dir(
cls,
lora_path: str,
max_position_embeddings: Optional[int],
tensorizer_config_dict: Optional[dict] = None) -> "PEFTHelper":
lora_config_path = os.path.join(lora_path, "adapter_config.json")
with open(lora_config_path) as f:
config = json.load(f)
if tensorizer_config_dict:
tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
tensorizer_args = tensorizer_config._construct_tensorizer_args()
from tensorizer.stream_io import open_stream
lora_config_path = os.path.join(tensorizer_config.lora_dir,
"adapter_config.json")
with open_stream(lora_config_path,
mode="rb",
**tensorizer_args.stream_params) as f:
config = json.load(f)
logger.info("Successfully deserialized LoRA config from %s",
tensorizer_config.lora_dir)
else:
with open(lora_config_path) as f:
config = json.load(f)
config["vllm_max_position_embeddings"] = max_position_embeddings
return cls.from_dict(config)

View File

@ -31,6 +31,7 @@ class LoRARequest(
lora_local_path: Optional[str] = msgspec.field(default=None)
long_lora_max_len: Optional[int] = None
base_model_name: Optional[str] = msgspec.field(default=None)
tensorizer_config_dict: Optional[dict] = None
def __post_init__(self):
if self.lora_local_path:

View File

@ -100,7 +100,8 @@ class WorkerLoRAManager(AbstractWorkerManager):
lora_path = get_adapter_absolute_path(lora_request.lora_path)
peft_helper = PEFTHelper.from_local_dir(
lora_path, self.max_position_embeddings)
lora_path, self.max_position_embeddings,
lora_request.tensorizer_config_dict)
# Validates the LoRA configuration against requirements before
# loading weights, throwing an exception if validation fails.
@ -125,6 +126,7 @@ class WorkerLoRAManager(AbstractWorkerManager):
self.lora_config.lora_extra_vocab_size,
embedding_modules=self.embedding_modules,
embedding_padding_modules=self.embedding_padding_modules,
tensorizer_config_dict=lora_request.tensorizer_config_dict,
weights_mapper=hf_to_vllm_mapper)
except FileNotFoundError as e:

View File

@ -1,24 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import contextlib
import contextvars
import dataclasses
import io
import json
import os
import re
import threading
import time
from collections.abc import Generator
from dataclasses import dataclass
from functools import partial
from typing import BinaryIO, Optional, Union
from typing import Any, BinaryIO, Optional, Union
import torch
from torch import nn
from torch.utils._python_dispatch import TorchDispatchMode
from transformers import PretrainedConfig
import vllm.envs as envs
from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.logger import init_logger
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
@ -58,9 +62,79 @@ __all__ = [
logger = init_logger(__name__)
class MetaTensorMode(TorchDispatchMode):
    """Dispatch mode that redirects fresh ``aten::empty`` allocations to the
    meta device, so model skeletons are built without real memory."""

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        # Only override allocations that did not explicitly choose a device.
        if func._schema.name == "aten::empty" and "device" not in kwargs:
            kwargs["device"] = "meta"

        return func(*args, **kwargs)
def meta_tensor_mode(loading_code=None, ):
    """Skip parameter initialization and allocate on the meta device.

    With no argument, returns a context manager to wrap model-construction
    code in. Given a callable, evaluates it under that context manager and
    returns its result. Any other argument raises ``TypeError``.
    """
    if loading_code is None:
        # Context-manager form.
        return _NoInitOrTensorImpl.context_manager()
    if callable(loading_code):
        # Callable form: evaluate under the context manager.
        with _NoInitOrTensorImpl.context_manager():
            return loading_code()
    raise TypeError(
        "expected a callable to evaluate,"
        " or None if being used as a context manager;"
        f' got an object of type "{type(loading_code).__name__}" instead.')
class _NoInitOrTensorImpl:
    """Implementation backing ``meta_tensor_mode``.

    While active, disables ``reset_parameters`` on common layer types and
    enables ``MetaTensorMode`` so layers are created without initializing
    (or materializing) their weights.
    """

    # Layer types whose reset_parameters() is patched out while active.
    _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm)
    # Saved originals so the patch can be reverted exactly.
    _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES)

    # Per-context flag: True inside an active context_manager() scope,
    # making nested use in the same context a no-op.
    is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active",
                                       default=False)
    # Count of active scopes across threads; the class-level patching is
    # applied on the 0 -> 1 transition and reverted on 1 -> 0.
    _count_active: int = 0
    _count_active_lock = threading.Lock()

    @classmethod
    @contextlib.contextmanager
    def context_manager(cls):
        if cls.is_active.get():
            # Re-entrant use within the same context: nothing to (un)patch.
            yield
            return

        with cls._count_active_lock:
            cls._count_active += 1
            if cls._count_active == 1:
                # First active scope anywhere: disable reset_parameters.
                for mod in cls._MODULES:
                    mod.reset_parameters = cls._disable(mod.reset_parameters)
        reset_token = cls.is_active.set(True)

        try:
            with MetaTensorMode():
                yield
        finally:
            cls.is_active.reset(reset_token)
            with cls._count_active_lock:
                cls._count_active -= 1
                if cls._count_active == 0:
                    # Last scope exited: restore the original methods.
                    for mod, original in cls._MODULE_ORIGINALS:
                        mod.reset_parameters = original

    @staticmethod
    def _disable(func):
        def wrapper(*args, **kwargs):
            # Fall through to the real method when no scope is active in
            # this context; otherwise silently skip initialization.
            if not _NoInitOrTensorImpl.is_active.get():
                return func(*args, **kwargs)

        return wrapper
@dataclass
class TensorizerConfig:
tensorizer_uri: str
tensorizer_uri: Union[str, None] = None
vllm_tensorized: Optional[bool] = False
verify_hash: Optional[bool] = False
num_readers: Optional[int] = None
@ -71,12 +145,29 @@ class TensorizerConfig:
model_class: Optional[type[torch.nn.Module]] = None
hf_config: Optional[PretrainedConfig] = None
dtype: Optional[Union[str, torch.dtype]] = None
lora_dir: Optional[str] = None
_is_sharded: bool = False
def __post_init__(self):
# check if the configuration is for a sharded vLLM model
self._is_sharded = isinstance(self.tensorizer_uri, str) \
and re.search(r'%0\dd', self.tensorizer_uri) is not None
if not self.tensorizer_uri and not self.lora_dir:
raise ValueError("tensorizer_uri must be provided.")
if not self.tensorizer_uri and self.lora_dir:
self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors"
assert self.tensorizer_uri is not None, ("tensorizer_uri must be "
"provided.")
self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
self.lora_dir = self.tensorizer_dir
@classmethod
def as_dict(cls, *args, **kwargs) -> dict[str, Any]:
    """Build a config from the given arguments and return it as a dict.

    Uses ``cls`` rather than the hard-coded class name so subclasses
    construct instances of their own type (the original ignored ``cls``).
    """
    cfg = cls(*args, **kwargs)
    return dataclasses.asdict(cfg)
def to_dict(self) -> dict[str, Any]:
    """Return this config as a plain dictionary (recursively converted)."""
    return dataclasses.asdict(self)
def _construct_tensorizer_args(self) -> "TensorizerArgs":
tensorizer_args = {
@ -140,7 +231,9 @@ class TensorizerArgs:
Args:
tensorizer_uri: Path to serialized model tensors. Can be a local file
        path or a S3 URI. This is a required field unless lora_dir is
        provided and the config is meant to be used for the
        `tensorize_lora_adapter` function.
vllm_tensorized: If True, indicates that the serialized model is a
vLLM model. This is used to determine the behavior of the
TensorDeserializer when loading tensors from a serialized model.
@ -296,10 +389,10 @@ class TensorizerAgent:
model_args.torch_dtype = self.tensorizer_config.dtype
assert self.tensorizer_config.model_class is not None
# TODO: Do we need to consider old-style model class?
with no_init_or_tensor(), set_current_vllm_config(self.vllm_config,
check_compile=True):
with meta_tensor_mode(), set_current_vllm_config(self.vllm_config,
check_compile=True):
return self.tensorizer_config.model_class(
vllm_config=self.vllm_config, )
vllm_config=self.vllm_config)
def _resize_lora_embeddings(self):
"""Modify LoRA embedding layers to use bigger tensors
@ -467,8 +560,73 @@ def tensorize_vllm_model(engine_args: EngineArgs,
) as stream:
stream.write(encryption_params.key)
engine = LLMEngine.from_engine_args(engine_args)
engine.model_executor.collective_rpc(
"save_tensorized_model",
kwargs=dict(tensorizer_config=tensorizer_config),
)
from vllm import LLMEngine
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
if not envs.VLLM_USE_V1:
engine = LLMEngine.from_engine_args(engine_args)
engine.model_executor.collective_rpc(
"save_tensorized_model",
kwargs=dict(tensorizer_config=tensorizer_config),
)
else:
engine = V1LLMEngine.from_vllm_config(engine_config)
engine.collective_rpc(
"save_tensorized_model",
kwargs=dict(tensorizer_config=tensorizer_config),
)
def tensorize_lora_adapter(lora_path: str,
                           tensorizer_config: TensorizerConfig):
    """
    Uses tensorizer to serialize a LoRA adapter. Assumes that the files
    needed to load a LoRA adapter are a safetensors-format file called
    adapter_model.safetensors (or a .bin file) and a json config file
    called adapter_config.json.

    Serializes the files in the tensorizer_config.lora_dir

    Raises:
        ValueError: If the adapter files cannot be found or the tensor
            file has an unsupported extension.
    """
    import safetensors

    from vllm.lora.utils import get_adapter_absolute_path
    lora_dir = get_adapter_absolute_path(lora_path)
    tensor_path = config_path = ""
    for file in os.listdir(lora_dir):
        if file.startswith("adapter_model"):
            tensor_path = os.path.join(lora_dir, file)
        if file.startswith("adapter_config"):
            config_path = os.path.join(lora_dir, file)
        if tensor_path and config_path:
            break

    # Fail with a clear message instead of falling through to the
    # extension check with an empty path.
    if not tensor_path or not config_path:
        raise ValueError(
            f"Could not find adapter_model and adapter_config files "
            f"in {lora_dir}")

    if tensor_path.endswith(".safetensors"):
        tensors = safetensors.torch.load_file(tensor_path)
    elif tensor_path.endswith(".bin"):
        # NOTE(review): torch.load on an untrusted checkpoint can execute
        # arbitrary code; consider weights_only=True where supported.
        tensors = torch.load(tensor_path)
    else:
        # Original passed %-style args to ValueError, which never formats
        # the message; use an f-string instead.
        raise ValueError(f"Unsupported file: {tensor_path}")

    with open(config_path) as f:
        config = json.load(f)

    tensorizer_args = tensorizer_config._construct_tensorizer_args()

    # Write the adapter config alongside the serialized tensors so the
    # pair can be deserialized from tensorizer_config.lora_dir later.
    with open_stream(f"{tensorizer_config.lora_dir}/adapter_config.json",
                     mode="wb+",
                     **tensorizer_args.stream_params) as f:
        f.write(json.dumps(config).encode("utf-8"))

    lora_uri = (f"{tensorizer_config.lora_dir}"
                f"/adapter_model.tensors")
    with open_stream(lora_uri, mode="wb+",
                     **tensorizer_args.stream_params) as f:
        serializer = TensorSerializer(f)
        serializer.write_state_dict(tensors)
        serializer.close()

    logger.info("Successfully serialized LoRA files to %s",
                str(tensorizer_config.lora_dir))

View File

@ -2,6 +2,7 @@
# ruff: noqa: SIM117
import copy
from collections.abc import Generator
from typing import Union
import torch
from torch import nn
@ -111,8 +112,10 @@ class TensorizerLoader(BaseModelLoader):
@staticmethod
def save_model(
    model: torch.nn.Module,
    # NOTE(review): the two parameter lines below are a fused old/new pair
    # from a diff artifact; only the Union[...] form should remain.
    tensorizer_config: TensorizerConfig,
    tensorizer_config: Union[TensorizerConfig, dict],
) -> None:
    # Accept a plain dict (e.g. a config round-tripped through RPC
    # serialization) and coerce it into a TensorizerConfig first.
    if isinstance(tensorizer_config, dict):
        tensorizer_config = TensorizerConfig(**tensorizer_config)
    serialize_vllm_model(
        model=model,
        tensorizer_config=tensorizer_config,

View File

@ -340,6 +340,13 @@ class EngineCore:
return self.model_executor.collective_rpc(method, timeout, args,
kwargs)
def save_tensorized_model(
    self,
    tensorizer_config,
) -> None:
    """Forward a tensorizer save request to the model executor."""
    self.model_executor.save_tensorized_model(
        tensorizer_config=tensorizer_config)
class EngineCoreProc(EngineCore):
"""ZMQ-wrapper for running EngineCore in background process."""

View File

@ -25,7 +25,7 @@ from vllm.distributed.parallel_state import (
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.model_loader import TensorizerLoader, get_model
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.multimodal.utils import group_mm_inputs_by_modality
@ -60,6 +60,7 @@ from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs,
if TYPE_CHECKING:
import xgrammar as xgr
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.v1.core.sched.output import SchedulerOutput
else:
xgr = LazyLoader("xgr", globals(), "xgrammar")
@ -1534,6 +1535,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
time_after_load - time_before_load)
prepare_communication_buffer_for_model(self.model)
def save_tensorized_model(
    self,
    tensorizer_config: "TensorizerConfig",
) -> None:
    """Serialize the already-loaded model via the TensorizerLoader."""
    TensorizerLoader.save_model(self.model,
                                tensorizer_config=tensorizer_config)
def _get_prompt_logprobs_dict(
self,
hidden_states: torch.Tensor,

View File

@ -31,6 +31,7 @@ from vllm.v1.worker.worker_base import WorkerBase
logger = init_logger(__name__)
if TYPE_CHECKING:
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.v1.core.sched.output import SchedulerOutput
@ -326,6 +327,13 @@ class Worker(WorkerBase):
max_size=max_size,
)
def save_tensorized_model(
    self,
    tensorizer_config: "TensorizerConfig",
) -> None:
    """Delegate tensorizer serialization to the model runner."""
    self.model_runner.save_tensorized_model(
        tensorizer_config=tensorizer_config)
def init_worker_distributed_environment(
vllm_config: VllmConfig,