mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-02 12:37:13 +08:00
[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)
Signed-off-by: Sanger Steel <sangersteel@gmail.com>
This commit is contained in:
parent
c91fe7b1b9
commit
c32e249a23
@ -128,6 +128,7 @@ steps:
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 10min
|
||||
|
||||
@ -6,11 +6,12 @@ import json
|
||||
import os
|
||||
import uuid
|
||||
|
||||
from vllm import LLM
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
|
||||
TensorizerConfig,
|
||||
tensorize_vllm_model)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.model_loader.tensorizer import (
|
||||
TensorizerArgs, TensorizerConfig, tensorize_lora_adapter,
|
||||
tensorize_vllm_model)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
# yapf conflicts with isort for this docstring
|
||||
@ -27,7 +28,7 @@ https://github.com/coreweave/tensorizer
|
||||
To serialize a model, install vLLM from source, then run something
|
||||
like this from the root level of this repository:
|
||||
|
||||
python -m examples.other.tensorize_vllm_model \
|
||||
python examples/other/tensorize_vllm_model.py \
|
||||
--model facebook/opt-125m \
|
||||
serialize \
|
||||
--serialized-directory s3://my-bucket \
|
||||
@ -47,7 +48,7 @@ providing a `--keyfile` argument.
|
||||
To deserialize a model, you can run something like this from the root
|
||||
level of this repository:
|
||||
|
||||
python -m examples.other.tensorize_vllm_model \
|
||||
python examples/other/tensorize_vllm_model.py \
|
||||
--model EleutherAI/gpt-j-6B \
|
||||
--dtype float16 \
|
||||
deserialize \
|
||||
@ -69,7 +70,7 @@ For more information on the available arguments for serializing, run
|
||||
|
||||
Or for deserializing:
|
||||
|
||||
`python -m examples.other.tensorize_vllm_model deserialize --help`.
|
||||
`python examples/other/tensorize_vllm_model.py deserialize --help`.
|
||||
|
||||
Once a model is serialized, tensorizer can be invoked with the `LLM` class
|
||||
directly to load models:
|
||||
@ -90,11 +91,27 @@ TensorizerConfig arguments desired.
|
||||
In order to see all of the available arguments usable to configure
|
||||
loading with tensorizer that are given to `TensorizerConfig`, run:
|
||||
|
||||
`python -m examples.other.tensorize_vllm_model deserialize --help`
|
||||
`python examples/other/tensorize_vllm_model.py deserialize --help`
|
||||
|
||||
under the `tensorizer options` section. These can also be used for
|
||||
deserialization in this example script, although `--tensorizer-uri` and
|
||||
`--path-to-tensors` are functionally the same in this case.
|
||||
|
||||
Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter
|
||||
can be serialized directly with the path to the LoRA adapter on HF Hub and
|
||||
a TensorizerConfig object. In this script, passing a LoRA adapter's HF id via `--lora-path`
|
||||
will serialize the LoRA adapter artifacts to `--serialized-directory`.
|
||||
|
||||
You can then use the LoRA adapter with `vllm serve` by ensuring
|
||||
the LoRA artifacts are in your model artifacts directory and specifying
|
||||
`--enable-lora`. For instance:
|
||||
|
||||
```
|
||||
vllm serve <model_path> \
|
||||
--load-format tensorizer \
|
||||
--model-loader-extra-config '{"tensorizer_uri": "<model_path>.tensors"}' \
|
||||
--enable-lora
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
@ -107,6 +124,19 @@ def parse_args():
|
||||
"also supported, although libsodium must be installed to "
|
||||
"use it.")
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
|
||||
parser.add_argument(
|
||||
"--lora-path",
|
||||
type=str,
|
||||
required=False,
|
||||
help="Path to a LoRA adapter to "
|
||||
"serialize along with model tensors. This can then be deserialized "
|
||||
"along with the model by passing a tensorizer_config kwarg to "
|
||||
"LoRARequest with type TensorizerConfig. See the docstring for this "
|
||||
"for a usage example."
|
||||
|
||||
)
|
||||
|
||||
subparsers = parser.add_subparsers(dest='command')
|
||||
|
||||
serialize_parser = subparsers.add_parser(
|
||||
@ -169,11 +199,42 @@ def parse_args():
|
||||
|
||||
|
||||
def deserialize():
    """Load the tensorized model and return the resulting ``LLM``.

    Reads the module-level ``args`` and ``tensorizer_config``. When
    ``args.lora_path`` is set, LoRA support is enabled and a single sample
    generation is printed to confirm that the tensorized adapter loads.

    Returns:
        The ``LLM`` instance backed by the deserialized model.
    """
    if not args.lora_path:
        return LLM(model=args.model,
                   load_format="tensorizer",
                   tensor_parallel_size=args.tensor_parallel_size,
                   model_loader_extra_config=tensorizer_config)

    # The serialize command stores the adapter in the model's tensorizer
    # directory, so look for it there.
    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
    llm = LLM(model=args.model,
              load_format="tensorizer",
              tensor_parallel_size=args.tensor_parallel_size,
              model_loader_extra_config=tensorizer_config,
              enable_lora=True)

    sampling_params = SamplingParams(temperature=0,
                                     max_tokens=256,
                                     stop=["[/assistant]"])

    # Truncating this as the extra text isn't necessary
    prompts = ["[user] Write a SQL query to answer the question based on ..."]

    # Test LoRA load.
    # LoRARequest exposes `tensorizer_config_dict` (a plain dict), not a
    # `tensorizer_config` field, so the config must be converted first.
    lora_request = LoRARequest(
        "sql-lora",
        1,
        args.lora_path,
        tensorizer_config_dict=tensorizer_config.to_dict())
    print(llm.generate(prompts, sampling_params, lora_request=lora_request))
    return llm
|
||||
|
||||
|
||||
@ -197,7 +258,10 @@ if __name__ == '__main__':
|
||||
|
||||
model_name = model_ref.split("/")[1]
|
||||
|
||||
keyfile = args.keyfile if args.keyfile else None
|
||||
if args.command == "serialize" or args.command == "deserialize":
|
||||
keyfile = args.keyfile
|
||||
else:
|
||||
keyfile = None
|
||||
|
||||
if args.model_loader_extra_config:
|
||||
config = json.loads(args.model_loader_extra_config)
|
||||
@ -228,6 +292,10 @@ if __name__ == '__main__':
|
||||
encryption_keyfile=keyfile,
|
||||
**credentials)
|
||||
|
||||
if args.lora_path:
|
||||
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
|
||||
tensorize_lora_adapter(args.lora_path, tensorizer_config)
|
||||
|
||||
tensorize_vllm_model(engine_args, tensorizer_config)
|
||||
|
||||
elif args.command == "deserialize":
|
||||
|
||||
97
tests/entrypoints/openai/test_tensorizer_entrypoint.py
Normal file
97
tests/entrypoints/openai/test_tensorizer_entrypoint.py
Normal file
@ -0,0 +1,97 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import gc
|
||||
import json
|
||||
import tempfile
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import torch.cuda
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.model_executor.model_loader.tensorizer import (
|
||||
TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model)
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
|
||||
LORA_PATH = "davzoku/finqa_adapter_1b"
|
||||
|
||||
|
||||
def _cleanup():
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def cleanup():
    """Autouse fixture: invoke ``_cleanup()`` for every test in this module."""
    _cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def tmp_dir():
    """Yield the path of a module-scoped temporary directory.

    The directory is removed once the fixture is torn down.
    """
    scratch = tempfile.TemporaryDirectory()
    try:
        yield scratch.name
    finally:
        scratch.cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def model_uri(tmp_dir):
    """Yield the URI of the tensorized model file inside ``tmp_dir``."""
    uri = f"{tmp_dir}/model.tensors"
    yield uri
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def tensorize_model_and_lora(tmp_dir, model_uri):
    """Serialize the LoRA adapter and the base model once per module.

    The adapter is written to ``tmp_dir`` and the model tensors to
    ``model_uri``; nothing is yielded.
    """
    config = TensorizerConfig(tensorizer_uri=model_uri, lora_dir=tmp_dir)
    engine_args = EngineArgs(model=MODEL_NAME, device="cuda")

    tensorize_lora_adapter(LORA_PATH, config)
    tensorize_vllm_model(engine_args, config)

    # Manually invoke a _cleanup() here, as the cleanup()
    # fixture won't be guaranteed to be called after this
    # when this fixture is used for a test
    _cleanup()
    yield
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server(model_uri, tensorize_model_and_lora):
    """Yield an OpenAI-compatible server that loads the tensorized model.

    Depends on ``tensorize_model_and_lora`` so the artifacts exist before
    the server process starts.
    """
    extra_config_json = json.dumps({"tensorizer_uri": model_uri})

    ## Start OpenAI API server
    cli_args = [
        "--load-format",
        "tensorizer",
        "--device",
        "cuda",
        "--model-loader-extra-config",
        extra_config_json,
        "--enable-lora",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the running ``server`` fixture."""
    async with server.get_async_client() as api_client:
        yield api_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Request a short greedy completion from the tensorizer-loaded server.

    Checks the response id, the echoed model name, that exactly one choice
    is returned, that it stopped on the token limit, and the token usage.
    """
    _cleanup()
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    # Compare against the parametrized name so that adding models to the
    # parametrization keeps this check meaningful.
    assert completion.model == model_name
    assert completion.choices is not None and len(completion.choices) == 1

    choice = completion.choices[0]
    assert len(choice.text) >= 5
    assert choice.finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)
|
||||
@ -1,12 +1,17 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import Union
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
|
||||
import vllm
|
||||
from vllm import LLM
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora):
|
||||
pass
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
def do_sample(llm: vllm.LLM,
|
||||
lora_path: str,
|
||||
lora_id: int,
|
||||
tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
|
||||
prompts = [
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
|
||||
@ -45,15 +53,28 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501
|
||||
]
|
||||
|
||||
sampling_params = vllm.SamplingParams(temperature=0,
|
||||
max_tokens=256,
|
||||
skip_special_tokens=False,
|
||||
stop=["[/assistant]"])
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
|
||||
if tensorizer_config_dict is not None:
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(
|
||||
str(lora_id),
|
||||
lora_id,
|
||||
lora_path,
|
||||
tensorizer_config_dict=tensorizer_config_dict)
|
||||
if lora_id else None)
|
||||
else:
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
return generated_texts
|
||||
|
||||
|
||||
def generate_and_test(llm, sql_lora_files):
|
||||
def generate_and_test(llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict: Union[dict, None] = None):
|
||||
print("lora adapter created")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
|
||||
assert do_sample(llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
|
||||
|
||||
print("lora 1")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT
|
||||
assert do_sample(llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=1) == EXPECTED_LORA_OUTPUT
|
||||
|
||||
print("no lora")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
|
||||
assert do_sample(llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
|
||||
|
||||
print("lora 2")
|
||||
assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT
|
||||
assert do_sample(llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=2) == EXPECTED_LORA_OUTPUT
|
||||
|
||||
print("removing lora")
|
||||
|
||||
@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@create_new_process_for_each_test()
def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
                                            sql_lora_huggingface_id):
    """Tensorize a model and LoRA adapter with TP=2, then reload and sample.

    Serialization runs through the example script in a subprocess; the
    resulting artifacts are then loaded with ``load_format="tensorizer"``
    and sampled with and without the adapter.
    """
    tp_size = 2
    model_name = "model-rank-%03d.tensors"

    model_ref = MODEL_PATH
    lora_path = sql_lora_huggingface_id
    suffix = "test"

    # Run the tensorizing of the LoRA adapter and the model in a subprocess
    # to guarantee cleanup
    serialize_cmd = [
        sys.executable,
        f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model",
        model_ref, "--lora-path", lora_path, "--tensor-parallel-size",
        str(tp_size), "serialize", "--serialized-directory",
        str(tmp_path), "--suffix", suffix
    ]
    try:
        result = subprocess.run(serialize_cmd,
                                check=True,
                                capture_output=True,
                                text=True)
    except subprocess.CalledProcessError as e:
        print("Tensorizing failed.")
        print("STDOUT:\n", e.stdout)
        print("STDERR:\n", e.stderr)
        raise

    print("STDOUT:\n", result.stdout)

    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir

    # Load with the same tensor-parallel size that was used to serialize,
    # so the two cannot drift apart if tp_size is ever changed.
    loaded_vllm_model = LLM(model=model_ref,
                            load_format="tensorizer",
                            enable_lora=True,
                            enforce_eager=True,
                            model_loader_extra_config=tensorizer_config,
                            max_num_seqs=13,
                            tensor_parallel_size=tp_size,
                            max_loras=2)

    tensorizer_config_dict = tensorizer_config.to_dict()

    print("lora adapter created")
    assert do_sample(loaded_vllm_model,
                     sql_lora_files,
                     tensorizer_config_dict=tensorizer_config_dict,
                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT

    print("lora 1")
    assert do_sample(loaded_vllm_model,
                     sql_lora_files,
                     tensorizer_config_dict=tensorizer_config_dict,
                     lora_id=1) == EXPECTED_LORA_OUTPUT
|
||||
|
||||
@ -5,14 +5,6 @@ from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
Tensorizer only tested on V0 so far.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def cleanup():
|
||||
cleanup_dist_env_and_memory(shutdown_ray=True)
|
||||
|
||||
@ -1,17 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import gc
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
from functools import partial
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import torch
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
@ -22,12 +18,11 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
|
||||
is_vllm_tensorized,
|
||||
load_with_tensorizer,
|
||||
open_stream,
|
||||
serialize_vllm_model,
|
||||
tensorize_vllm_model)
|
||||
# yapf: enable
|
||||
from vllm.utils import PlaceholderModule, import_from_path
|
||||
from vllm.utils import PlaceholderModule
|
||||
|
||||
from ..utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from ..utils import VLLM_PATH
|
||||
|
||||
try:
|
||||
from tensorizer import EncryptionParams
|
||||
@ -103,6 +98,7 @@ def test_can_deserialize_s3(vllm_runner):
|
||||
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
|
||||
def test_deserialized_encrypted_vllm_model_has_same_outputs(
|
||||
vllm_runner, tmp_path):
|
||||
args = EngineArgs(model=model_ref)
|
||||
with vllm_runner(model_ref) as vllm_model:
|
||||
model_path = tmp_path / (model_ref + ".tensors")
|
||||
key_path = tmp_path / (model_ref + ".key")
|
||||
@ -110,15 +106,13 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
|
||||
|
||||
outputs = vllm_model.generate(prompts, sampling_params)
|
||||
|
||||
config_for_serializing = TensorizerConfig(tensorizer_uri=model_path,
|
||||
encryption_keyfile=key_path)
|
||||
config_for_serializing = TensorizerConfig(tensorizer_uri=str(model_path),
|
||||
encryption_keyfile=str(key_path))
|
||||
|
||||
vllm_model.apply_model(
|
||||
partial(serialize_vllm_model,
|
||||
tensorizer_config=config_for_serializing))
|
||||
tensorize_vllm_model(args, config_for_serializing)
|
||||
|
||||
config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
|
||||
encryption_keyfile=key_path)
|
||||
config_for_deserializing = TensorizerConfig(
|
||||
tensorizer_uri=str(model_path), encryption_keyfile=str(key_path))
|
||||
|
||||
with vllm_runner(model_ref,
|
||||
load_format="tensorizer",
|
||||
@ -154,113 +148,46 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
|
||||
assert outputs == deserialized_outputs
|
||||
|
||||
|
||||
def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
|
||||
multilora_inference = import_from_path(
|
||||
"examples.offline_inference.multilora_inference",
|
||||
EXAMPLES_PATH / "offline_inference/multilora_inference.py",
|
||||
)
|
||||
|
||||
model_ref = "meta-llama/Llama-2-7b-hf"
|
||||
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
|
||||
test_prompts = multilora_inference.create_test_prompts(lora_path)
|
||||
|
||||
# Serialize model before deserializing and binding LoRA adapters
|
||||
with vllm_runner(model_ref) as vllm_model:
|
||||
model_path = tmp_path / (model_ref + ".tensors")
|
||||
|
||||
vllm_model.apply_model(
|
||||
partial(
|
||||
serialize_vllm_model,
|
||||
tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))
|
||||
|
||||
with vllm_runner(
|
||||
model_ref,
|
||||
load_format="tensorizer",
|
||||
model_loader_extra_config=TensorizerConfig(
|
||||
tensorizer_uri=model_path,
|
||||
num_readers=1,
|
||||
),
|
||||
enable_lora=True,
|
||||
max_loras=1,
|
||||
max_lora_rank=8,
|
||||
max_cpu_loras=2,
|
||||
max_num_seqs=50,
|
||||
max_model_len=1000,
|
||||
) as loaded_vllm_model:
|
||||
multilora_inference.process_requests(
|
||||
loaded_vllm_model.model.llm_engine, test_prompts)
|
||||
|
||||
assert loaded_vllm_model
|
||||
|
||||
|
||||
def test_load_without_tensorizer_load_format(vllm_runner, capfd):
    """Passing a TensorizerConfig without ``load_format`` must be rejected.

    The failure is expected to surface as a ``RuntimeError`` whose cause
    is found in the captured stdout/stderr.
    """
    model = None
    try:
        model = vllm_runner(
            model_ref,
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
        # Without this, the test would pass vacuously if the runner
        # accepted the config and raised nothing.
        pytest.fail("Expected RuntimeError for unsupported "
                    "model loader extra config")
    except RuntimeError:
        out, err = capfd.readouterr()
        combined_output = out + err
        assert ("ValueError: Model loader extra config "
                "is not supported for load "
                "format LoadFormat.AUTO") in combined_output
    finally:
        del model
        gc.collect()
        torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
|
||||
def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
|
||||
## Serialize model
|
||||
with vllm_runner(model_ref) as vllm_model:
|
||||
model_path = tmp_path / (model_ref + ".tensors")
|
||||
|
||||
vllm_model.apply_model(
|
||||
partial(
|
||||
serialize_vllm_model,
|
||||
tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))
|
||||
|
||||
model_loader_extra_config = {
|
||||
"tensorizer_uri": str(model_path),
|
||||
}
|
||||
|
||||
## Start OpenAI API server
|
||||
openai_args = [
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--load-format",
|
||||
"tensorizer",
|
||||
"--model-loader-extra-config",
|
||||
json.dumps(model_loader_extra_config),
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(model_ref, openai_args) as server:
|
||||
print("Server ready.")
|
||||
|
||||
client = server.get_client()
|
||||
completion = client.completions.create(model=model_ref,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=5,
|
||||
temperature=0.0)
|
||||
|
||||
assert completion.id is not None
|
||||
assert len(completion.choices) == 1
|
||||
assert len(completion.choices[0].text) >= 5
|
||||
assert completion.choices[0].finish_reason == "length"
|
||||
assert completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=5, prompt_tokens=6, total_tokens=11)
|
||||
|
||||
|
||||
def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd):
    """A TensorizerConfig with ``load_format="safetensors"`` must be rejected.

    The failure is expected to surface as a ``RuntimeError`` whose cause
    is found in the captured stdout/stderr.
    """
    model = None
    try:
        model = vllm_runner(
            model_ref,
            load_format="safetensors",
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
        # Without this, the test would pass vacuously if the runner
        # accepted the config and raised nothing.
        pytest.fail("Expected RuntimeError for unsupported "
                    "model loader extra config")
    except RuntimeError:
        out, err = capfd.readouterr()

        combined_output = out + err
        assert ("ValueError: Model loader extra config is not supported "
                "for load format LoadFormat.SAFETENSORS") in combined_output
    finally:
        del model
        gc.collect()
        torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
|
||||
def test_tensorizer_with_tp_path_without_template(vllm_runner):
|
||||
with pytest.raises(ValueError):
|
||||
def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd):
|
||||
try:
|
||||
model_ref = "EleutherAI/pythia-1.4b"
|
||||
tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
|
||||
|
||||
@ -275,6 +202,13 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner):
|
||||
tensor_parallel_size=2,
|
||||
disable_custom_all_reduce=True,
|
||||
)
|
||||
except RuntimeError:
|
||||
out, err = capfd.readouterr()
|
||||
combined_output = out + err
|
||||
assert ("ValueError: For a sharded model, tensorizer_uri "
|
||||
"should include a string format template like '%04d' "
|
||||
"to be formatted with the rank "
|
||||
"of the shard") in combined_output
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
|
||||
@ -288,7 +222,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
|
||||
enforce_eager=True,
|
||||
) as base_model:
|
||||
outputs = base_model.generate(prompts, sampling_params)
|
||||
base_model.model.llm_engine.model_executor.shutdown()
|
||||
|
||||
# load model with two shards and serialize with encryption
|
||||
model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
|
||||
@ -296,7 +229,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
|
||||
|
||||
tensorizer_config = TensorizerConfig(
|
||||
tensorizer_uri=model_path,
|
||||
encryption_keyfile=key_path,
|
||||
encryption_keyfile=str(key_path),
|
||||
)
|
||||
|
||||
tensorize_vllm_model(
|
||||
@ -331,14 +264,13 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
|
||||
model_ref = "facebook/opt-125m"
|
||||
model_path = tmp_path / (model_ref + ".tensors")
|
||||
config = TensorizerConfig(tensorizer_uri=str(model_path))
|
||||
args = EngineArgs(model=model_ref, device="cuda")
|
||||
|
||||
with vllm_runner(model_ref) as vllm_model:
|
||||
outputs = vllm_model.generate(prompts, sampling_params)
|
||||
|
||||
vllm_model.apply_model(
|
||||
partial(serialize_vllm_model, tensorizer_config=config))
|
||||
|
||||
assert is_vllm_tensorized(config)
|
||||
tensorize_vllm_model(args, config)
|
||||
assert is_vllm_tensorized(config)
|
||||
|
||||
with vllm_runner(model_ref,
|
||||
load_format="tensorizer",
|
||||
|
||||
@ -1195,8 +1195,7 @@ class EngineArgs:
|
||||
#############################################################
|
||||
# Unsupported Feature Flags on V1.
|
||||
|
||||
if (self.load_format == LoadFormat.TENSORIZER.value
|
||||
or self.load_format == LoadFormat.SHARDED_STATE.value):
|
||||
if self.load_format == LoadFormat.SHARDED_STATE.value:
|
||||
_raise_or_fallback(
|
||||
feature_name=f"--load_format {self.load_format}",
|
||||
recommend_to_remove=False)
|
||||
|
||||
@ -29,6 +29,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor,
|
||||
get_supported_lora_modules,
|
||||
is_regex_target_modules,
|
||||
parse_fine_tuned_lora_name, replace_submodule)
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
|
||||
from vllm.model_executor.models.interfaces import is_pooling_model
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
@ -185,19 +186,19 @@ class LoRAModel(AdapterModel):
|
||||
|
||||
@classmethod
|
||||
def from_local_checkpoint(
|
||||
cls,
|
||||
lora_dir: str,
|
||||
expected_lora_modules: list[str],
|
||||
peft_helper: PEFTHelper,
|
||||
*,
|
||||
lora_model_id: Optional[int] = None,
|
||||
device: str = "cuda",
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
target_embedding_padding: Optional[int] = None,
|
||||
embedding_modules: Optional[dict[str, str]] = None,
|
||||
embedding_padding_modules: Optional[list[str]] = None,
|
||||
weights_mapper: Optional[WeightsMapper] = None,
|
||||
) -> "LoRAModel":
|
||||
cls,
|
||||
lora_dir: str,
|
||||
expected_lora_modules: list[str],
|
||||
peft_helper: PEFTHelper,
|
||||
*,
|
||||
lora_model_id: Optional[int] = None,
|
||||
device: str = "cuda",
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
target_embedding_padding: Optional[int] = None,
|
||||
embedding_modules: Optional[dict[str, str]] = None,
|
||||
embedding_padding_modules: Optional[list[str]] = None,
|
||||
weights_mapper: Optional[WeightsMapper] = None,
|
||||
tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel":
|
||||
"""Create a LoRAModel from a local checkpoint.
|
||||
|
||||
Args:
|
||||
@ -219,10 +220,36 @@ class LoRAModel(AdapterModel):
|
||||
lora_dir, "new_embeddings.safetensors")
|
||||
new_embeddings_bin_file_path = os.path.join(lora_dir,
|
||||
"new_embeddings.bin")
|
||||
tensors: dict[str, torch.Tensor] = {}
|
||||
unexpected_modules: list[Union[list[str], str]] = []
|
||||
|
||||
unexpected_modules: list[Union[list[str], str]]
|
||||
if os.path.isfile(lora_tensor_path):
|
||||
tensors: dict[str, torch.Tensor] = {}
|
||||
def check_unexpected_modules(modules: dict):
|
||||
for lora_module in modules.keys(): # noqa
|
||||
module_name, _, _ = parse_fine_tuned_lora_name(
|
||||
lora_module, weights_mapper)
|
||||
part_name = module_name.split(".")[-1]
|
||||
if part_name not in expected_lora_modules:
|
||||
unexpected_modules.append(module_name)
|
||||
if unexpected_modules:
|
||||
raise ValueError(
|
||||
f"While loading {lora_dir}, expected"
|
||||
f" target modules in {expected_lora_modules}"
|
||||
f" but received {unexpected_modules}."
|
||||
f" Please verify that the loaded LoRA module is correct")
|
||||
|
||||
if tensorizer_config_dict:
|
||||
from tensorizer import TensorDeserializer
|
||||
|
||||
tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
|
||||
lora_tensor_path = os.path.join(tensorizer_config.tensorizer_dir,
|
||||
"adapter_model.tensors")
|
||||
tensorizer_args = tensorizer_config._construct_tensorizer_args()
|
||||
tensors = TensorDeserializer(lora_tensor_path,
|
||||
dtype=tensorizer_config.dtype,
|
||||
**tensorizer_args.deserializer_params)
|
||||
check_unexpected_modules(tensors)
|
||||
|
||||
elif os.path.isfile(lora_tensor_path):
|
||||
# Find unexpected modules.
|
||||
# Use safetensor key as a source of truth to find expected modules.
|
||||
# in peft if you have target_modules A, B, C and C does not exist
|
||||
@ -232,20 +259,8 @@ class LoRAModel(AdapterModel):
|
||||
unexpected_modules = []
|
||||
with safetensors.safe_open(lora_tensor_path,
|
||||
framework="pt") as f: # type: ignore
|
||||
for lora_module in f.keys(): # noqa
|
||||
module_name, _, _ = parse_fine_tuned_lora_name(
|
||||
lora_module, weights_mapper)
|
||||
part_name = module_name.split(".")[-1]
|
||||
if part_name not in expected_lora_modules:
|
||||
unexpected_modules.append(module_name)
|
||||
if unexpected_modules:
|
||||
raise ValueError(
|
||||
f"While loading {lora_dir}, expected"
|
||||
f" target modules in {expected_lora_modules}"
|
||||
f" but received {unexpected_modules}."
|
||||
f" Please verify that the loaded LoRA module is correct"
|
||||
)
|
||||
# Load tensors if there are only expected modules.
|
||||
check_unexpected_modules(f)
|
||||
for module in f.keys(): # noqa
|
||||
tensors[module] = f.get_tensor(module)
|
||||
elif os.path.isfile(lora_bin_file_path):
|
||||
|
||||
@ -10,6 +10,7 @@ from typing import Literal, Optional, Union
|
||||
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -89,12 +90,31 @@ class PEFTHelper:
|
||||
return cls(**filtered_dict)
|
||||
|
||||
@classmethod
def from_local_dir(
        cls,
        lora_path: str,
        max_position_embeddings: Optional[int],
        tensorizer_config_dict: Optional[dict] = None) -> "PEFTHelper":
    """Build a helper from an adapter's ``adapter_config.json``.

    Args:
        lora_path: Local adapter directory; only consulted when no
            tensorizer configuration is supplied.
        max_position_embeddings: Stored in the loaded config under the
            ``vllm_max_position_embeddings`` key.
        tensorizer_config_dict: Keyword arguments for ``TensorizerConfig``.
            When truthy, the config file is read from that config's
            ``lora_dir`` through tensorizer's ``open_stream`` instead of
            from ``lora_path``.

    Returns:
        The result of ``cls.from_dict`` applied to the loaded config.
    """
    if tensorizer_config_dict:
        # Imported lazily so tensorizer is only needed on this code path.
        from tensorizer.stream_io import open_stream

        tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
        tensorizer_args = tensorizer_config._construct_tensorizer_args()
        lora_config_path = os.path.join(tensorizer_config.lora_dir,
                                        "adapter_config.json")
        with open_stream(lora_config_path,
                         mode="rb",
                         **tensorizer_args.stream_params) as f:
            config = json.load(f)

        logger.info("Successfully deserialized LoRA config from %s",
                    tensorizer_config.lora_dir)
    else:
        # Only touch the local filesystem when no tensorizer config is
        # given; the adapter directory may not hold the config otherwise.
        lora_config_path = os.path.join(lora_path, "adapter_config.json")
        with open(lora_config_path) as f:
            config = json.load(f)

    config["vllm_max_position_embeddings"] = max_position_embeddings
    return cls.from_dict(config)
|
||||
|
||||
|
||||
@ -31,6 +31,7 @@ class LoRARequest(
|
||||
lora_local_path: Optional[str] = msgspec.field(default=None)
|
||||
long_lora_max_len: Optional[int] = None
|
||||
base_model_name: Optional[str] = msgspec.field(default=None)
|
||||
tensorizer_config_dict: Optional[dict] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.lora_local_path:
|
||||
|
||||
@ -100,7 +100,8 @@ class WorkerLoRAManager(AbstractWorkerManager):
|
||||
lora_path = get_adapter_absolute_path(lora_request.lora_path)
|
||||
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
lora_path, self.max_position_embeddings)
|
||||
lora_path, self.max_position_embeddings,
|
||||
lora_request.tensorizer_config_dict)
|
||||
|
||||
# Validates the LoRA configuration against requirements before
|
||||
# loading weights, throwing an exception if validation fails.
|
||||
@ -125,6 +126,7 @@ class WorkerLoRAManager(AbstractWorkerManager):
|
||||
self.lora_config.lora_extra_vocab_size,
|
||||
embedding_modules=self.embedding_modules,
|
||||
embedding_padding_modules=self.embedding_padding_modules,
|
||||
tensorizer_config_dict=lora_request.tensorizer_config_dict,
|
||||
weights_mapper=hf_to_vllm_mapper)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
|
||||
@ -1,24 +1,28 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import contextvars
|
||||
import dataclasses
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from collections.abc import Generator
|
||||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
from typing import BinaryIO, Optional, Union
|
||||
from typing import Any, BinaryIO, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.utils._python_dispatch import TorchDispatchMode
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding)
|
||||
@ -58,9 +62,79 @@ __all__ = [
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MetaTensorMode(TorchDispatchMode):
    """Dispatch mode that defaults ``aten::empty`` calls to the meta device.

    Only ``aten::empty`` calls that arrive without a ``device`` keyword are
    altered; every other call is forwarded unchanged.
    """

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        call_kwargs = kwargs if kwargs else {}

        is_empty_op = func._schema.name == "aten::empty"
        if is_empty_op and "device" not in call_kwargs:
            # No explicit device requested: allocate on "meta" instead.
            call_kwargs["device"] = "meta"

        return func(*args, **call_kwargs)
|
||||
|
||||
|
||||
def meta_tensor_mode(loading_code=None, ):
    """Return, or run code under, ``_NoInitOrTensorImpl.context_manager()``.

    With no argument the context manager itself is returned for use in a
    ``with`` statement. With a callable, the callable is invoked inside the
    context manager and its result is returned.

    Raises:
        TypeError: If ``loading_code`` is neither ``None`` nor callable.
    """
    # Reject unusable arguments up front.
    if loading_code is not None and not callable(loading_code):
        raise TypeError(
            "expected a callable to evaluate,"
            " or None if being used as a context manager;"
            f' got an object of type "{type(loading_code).__name__}" instead.')

    if loading_code is None:
        return _NoInitOrTensorImpl.context_manager()

    with _NoInitOrTensorImpl.context_manager():
        return loading_code()
|
||||
|
||||
|
||||
class _NoInitOrTensorImpl:
    """Backs ``meta_tensor_mode``: mutes ``reset_parameters`` and enters
    ``MetaTensorMode`` for the duration of a ``with`` block.

    While at least one context is open, ``reset_parameters`` on the classes
    in ``_MODULES`` is replaced by a wrapper that does nothing when the
    ``is_active`` context variable is set. The originals, captured when this
    class is defined, are put back once the last open context exits.
    """
    _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm)
    _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES)

    is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active",
                                       default=False)
    # Number of currently open contexts; guarded by the lock below.
    _count_active: int = 0
    _count_active_lock = threading.Lock()

    @classmethod
    @contextlib.contextmanager
    def context_manager(cls):
        """Context manager that patches the modules and enters
        ``MetaTensorMode``; re-entry while already active is a no-op."""
        already_active = cls.is_active.get()
        if already_active:
            yield
            return

        with cls._count_active_lock:
            cls._count_active += 1
            first_entry = cls._count_active == 1
            if first_entry:
                # Patch only on the 0 -> 1 transition.
                for module_cls in cls._MODULES:
                    module_cls.reset_parameters = cls._disable(
                        module_cls.reset_parameters)

        reset_token = cls.is_active.set(True)

        try:
            with MetaTensorMode():
                yield
        finally:
            cls.is_active.reset(reset_token)
            with cls._count_active_lock:
                cls._count_active -= 1
                last_exit = cls._count_active == 0
                if last_exit:
                    # Restore only on the 1 -> 0 transition.
                    for module_cls, original in cls._MODULE_ORIGINALS:
                        module_cls.reset_parameters = original

    @staticmethod
    def _disable(func):
        """Wrap ``func`` so it is skipped (returning ``None``) while
        ``is_active`` is set, and called normally otherwise."""

        def wrapper(*args, **kwargs):
            if _NoInitOrTensorImpl.is_active.get():
                return None
            return func(*args, **kwargs)

        return wrapper
|
||||
|
||||
|
||||
@dataclass
|
||||
class TensorizerConfig:
|
||||
tensorizer_uri: str
|
||||
tensorizer_uri: Union[str, None] = None
|
||||
vllm_tensorized: Optional[bool] = False
|
||||
verify_hash: Optional[bool] = False
|
||||
num_readers: Optional[int] = None
|
||||
@ -71,12 +145,29 @@ class TensorizerConfig:
|
||||
model_class: Optional[type[torch.nn.Module]] = None
|
||||
hf_config: Optional[PretrainedConfig] = None
|
||||
dtype: Optional[Union[str, torch.dtype]] = None
|
||||
lora_dir: Optional[str] = None
|
||||
_is_sharded: bool = False
|
||||
|
||||
def __post_init__(self):
|
||||
# check if the configuration is for a sharded vLLM model
|
||||
self._is_sharded = isinstance(self.tensorizer_uri, str) \
|
||||
and re.search(r'%0\dd', self.tensorizer_uri) is not None
|
||||
if not self.tensorizer_uri and not self.lora_dir:
|
||||
raise ValueError("tensorizer_uri must be provided.")
|
||||
if not self.tensorizer_uri and self.lora_dir:
|
||||
self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors"
|
||||
assert self.tensorizer_uri is not None, ("tensorizer_uri must be "
|
||||
"provided.")
|
||||
self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
|
||||
self.lora_dir = self.tensorizer_dir
|
||||
|
||||
@classmethod
|
||||
def as_dict(cls, *args, **kwargs) -> dict[str, Any]:
|
||||
cfg = TensorizerConfig(*args, **kwargs)
|
||||
return dataclasses.asdict(cfg)
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
    """Return this instance's dataclass fields as a dictionary."""
    field_values = dataclasses.asdict(self)
    return field_values
|
||||
|
||||
def _construct_tensorizer_args(self) -> "TensorizerArgs":
|
||||
tensorizer_args = {
|
||||
@ -140,7 +231,9 @@ class TensorizerArgs:
|
||||
|
||||
Args:
|
||||
tensorizer_uri: Path to serialized model tensors. Can be a local file
|
||||
path or a S3 URI.
|
||||
path or a S3 URI. This is a required field unless lora_dir is
|
||||
provided and the config is meant to be used for the
|
||||
`tensorize_lora_adapter` function.
|
||||
vllm_tensorized: If True, indicates that the serialized model is a
|
||||
vLLM model. This is used to determine the behavior of the
|
||||
TensorDeserializer when loading tensors from a serialized model.
|
||||
@ -296,10 +389,10 @@ class TensorizerAgent:
|
||||
model_args.torch_dtype = self.tensorizer_config.dtype
|
||||
assert self.tensorizer_config.model_class is not None
|
||||
# TODO: Do we need to consider old-style model class?
|
||||
with no_init_or_tensor(), set_current_vllm_config(self.vllm_config,
|
||||
check_compile=True):
|
||||
with meta_tensor_mode(), set_current_vllm_config(self.vllm_config,
|
||||
check_compile=True):
|
||||
return self.tensorizer_config.model_class(
|
||||
vllm_config=self.vllm_config, )
|
||||
vllm_config=self.vllm_config)
|
||||
|
||||
def _resize_lora_embeddings(self):
|
||||
"""Modify LoRA embedding layers to use bigger tensors
|
||||
@ -467,8 +560,73 @@ def tensorize_vllm_model(engine_args: EngineArgs,
|
||||
) as stream:
|
||||
stream.write(encryption_params.key)
|
||||
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
engine.model_executor.collective_rpc(
|
||||
"save_tensorized_model",
|
||||
kwargs=dict(tensorizer_config=tensorizer_config),
|
||||
)
|
||||
from vllm import LLMEngine
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
||||
|
||||
if not envs.VLLM_USE_V1:
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
engine.model_executor.collective_rpc(
|
||||
"save_tensorized_model",
|
||||
kwargs=dict(tensorizer_config=tensorizer_config),
|
||||
)
|
||||
else:
|
||||
engine = V1LLMEngine.from_vllm_config(engine_config)
|
||||
engine.collective_rpc(
|
||||
"save_tensorized_model",
|
||||
kwargs=dict(tensorizer_config=tensorizer_config),
|
||||
)
|
||||
|
||||
|
||||
def tensorize_lora_adapter(lora_path: str,
                           tensorizer_config: TensorizerConfig):
    """
    Uses tensorizer to serialize a LoRA adapter.

    Looks in the resolved adapter directory for a weights file whose name
    starts with ``adapter_model`` and ends in ``.safetensors`` or ``.bin``,
    and for a JSON config whose name starts with ``adapter_config``. Writes
    ``adapter_config.json`` and ``adapter_model.tensors`` under
    ``tensorizer_config.lora_dir``.

    Args:
        lora_path: Adapter location, resolved with
            ``get_adapter_absolute_path``.
        tensorizer_config: Supplies the destination ``lora_dir`` and the
            stream parameters used when writing.

    Raises:
        ValueError: If no supported ``adapter_model`` weights file is found.
        FileNotFoundError: If no ``adapter_config`` file is found.
    """
    # Import the submodule explicitly; a bare ``import safetensors`` does
    # not guarantee that ``safetensors.torch`` has been loaded.
    import safetensors.torch

    from vllm.lora.utils import get_adapter_absolute_path

    lora_dir = get_adapter_absolute_path(lora_path)

    supported_suffixes = (".safetensors", ".bin")
    tensor_path = config_path = ""

    for file in os.listdir(lora_dir):
        if (file.startswith("adapter_model")
                and file.endswith(supported_suffixes)):
            # Ignore other ``adapter_model*`` files (e.g. an earlier
            # ``adapter_model.tensors``) and prefer safetensors over .bin
            # so the choice does not depend on directory listing order.
            if not tensor_path.endswith(".safetensors"):
                tensor_path = os.path.join(lora_dir, file)
        elif file.startswith("adapter_config") and not config_path:
            config_path = os.path.join(lora_dir, file)

    if not tensor_path:
        raise ValueError(
            f"Unsupported file: no adapter_model file ending in "
            f"{' or '.join(supported_suffixes)} found in {lora_dir}")
    if not config_path:
        raise FileNotFoundError(
            f"No adapter_config file found in {lora_dir}")

    if tensor_path.endswith(".safetensors"):
        tensors = safetensors.torch.load_file(tensor_path)
    else:
        # NOTE(review): torch.load unpickles the file; only use this on
        # adapters from a trusted source (consider weights_only=True).
        tensors = torch.load(tensor_path)

    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    tensorizer_args = tensorizer_config._construct_tensorizer_args()

    # Destination may be a URI, so it is built with "/" rather than
    # os.path.join.
    with open_stream(f"{tensorizer_config.lora_dir}/adapter_config.json",
                     mode="wb+",
                     **tensorizer_args.stream_params) as f:

        f.write(json.dumps(config).encode("utf-8"))

    lora_uri = (f"{tensorizer_config.lora_dir}"
                f"/adapter_model.tensors")
    with open_stream(lora_uri, mode="wb+",
                     **tensorizer_args.stream_params) as f:
        serializer = TensorSerializer(f)
        serializer.write_state_dict(tensors)
        serializer.close()

    logger.info("Successfully serialized LoRA files to %s",
                str(tensorizer_config.lora_dir))
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# ruff: noqa: SIM117
|
||||
import copy
|
||||
from collections.abc import Generator
|
||||
from typing import Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -111,8 +112,10 @@ class TensorizerLoader(BaseModelLoader):
|
||||
@staticmethod
|
||||
def save_model(
|
||||
model: torch.nn.Module,
|
||||
tensorizer_config: TensorizerConfig,
|
||||
tensorizer_config: Union[TensorizerConfig, dict],
|
||||
) -> None:
|
||||
if isinstance(tensorizer_config, dict):
|
||||
tensorizer_config = TensorizerConfig(**tensorizer_config)
|
||||
serialize_vllm_model(
|
||||
model=model,
|
||||
tensorizer_config=tensorizer_config,
|
||||
|
||||
@ -340,6 +340,13 @@ class EngineCore:
|
||||
return self.model_executor.collective_rpc(method, timeout, args,
|
||||
kwargs)
|
||||
|
||||
def save_tensorized_model(
    self,
    tensorizer_config,
) -> None:
    """Forward ``tensorizer_config`` to the model executor's
    ``save_tensorized_model``."""
    executor = self.model_executor
    executor.save_tensorized_model(tensorizer_config=tensorizer_config)
|
||||
|
||||
|
||||
class EngineCoreProc(EngineCore):
|
||||
"""ZMQ-wrapper for running EngineCore in background process."""
|
||||
|
||||
@ -25,7 +25,7 @@ from vllm.distributed.parallel_state import (
|
||||
from vllm.forward_context import get_forward_context, set_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.model_loader import TensorizerLoader, get_model
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.multimodal.utils import group_mm_inputs_by_modality
|
||||
@ -60,6 +60,7 @@ from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs,
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr
|
||||
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
else:
|
||||
xgr = LazyLoader("xgr", globals(), "xgrammar")
|
||||
@ -1534,6 +1535,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
time_after_load - time_before_load)
|
||||
prepare_communication_buffer_for_model(self.model)
|
||||
|
||||
def save_tensorized_model(
    self,
    tensorizer_config: "TensorizerConfig",
) -> None:
    """Pass this runner's ``model`` and the given config to
    ``TensorizerLoader.save_model``."""
    model = self.model
    TensorizerLoader.save_model(model, tensorizer_config=tensorizer_config)
|
||||
|
||||
def _get_prompt_logprobs_dict(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
|
||||
@ -31,6 +31,7 @@ from vllm.v1.worker.worker_base import WorkerBase
|
||||
logger = init_logger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
|
||||
@ -326,6 +327,13 @@ class Worker(WorkerBase):
|
||||
max_size=max_size,
|
||||
)
|
||||
|
||||
def save_tensorized_model(
    self,
    tensorizer_config: "TensorizerConfig",
) -> None:
    """Forward ``tensorizer_config`` to the model runner's
    ``save_tensorized_model``."""
    runner = self.model_runner
    runner.save_tensorized_model(tensorizer_config=tensorizer_config)
|
||||
|
||||
|
||||
def init_worker_distributed_environment(
|
||||
vllm_config: VllmConfig,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user