From c32e249a23169353ccc02d7c6099a8c90ca4bbf6 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Thu, 22 May 2025 21:44:18 -0400 Subject: [PATCH] [Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926) Signed-off-by: Sanger Steel --- .buildkite/test-pipeline.yaml | 1 + examples/other/tensorize_vllm_model.py | 96 +++++++-- .../openai/test_tensorizer_entrypoint.py | 97 ++++++++++ tests/lora/test_llama_tp.py | 120 ++++++++++-- tests/tensorizer_loader/conftest.py | 8 - tests/tensorizer_loader/test_tensorizer.py | 158 +++++---------- vllm/engine/arg_utils.py | 3 +- vllm/lora/models.py | 73 ++++--- vllm/lora/peft_helper.py | 28 ++- vllm/lora/request.py | 1 + vllm/lora/worker_manager.py | 4 +- .../model_executor/model_loader/tensorizer.py | 182 ++++++++++++++++-- .../model_loader/tensorizer_loader.py | 5 +- vllm/v1/engine/core.py | 7 + vllm/v1/worker/gpu_model_runner.py | 12 +- vllm/v1/worker/gpu_worker.py | 8 + 16 files changed, 606 insertions(+), 197 deletions(-) create mode 100644 tests/entrypoints/openai/test_tensorizer_entrypoint.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0e4a0e2a531b7..017dba3d2d559 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -128,6 +128,7 @@ steps: - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Distributed Tests (4 GPUs) # 10min diff --git a/examples/other/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py index 7d11ba51a0943..b1f2ce871bb4a 100644 --- a/examples/other/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -6,11 +6,12 @@ import json import os import uuid -from vllm import LLM +from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, - TensorizerConfig, - tensorize_vllm_model) +from vllm.lora.request import LoRARequest +from vllm.model_executor.model_loader.tensorizer import ( + TensorizerArgs, TensorizerConfig, tensorize_lora_adapter, + tensorize_vllm_model) from vllm.utils import FlexibleArgumentParser # yapf conflicts with isort for this docstring @@ -27,7 +28,7 @@ https://github.com/coreweave/tensorizer To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python -m examples.other.tensorize_vllm_model \ +python examples/other/tensorize_vllm_model.py \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -47,7 +48,7 @@ providing a `--keyfile` argument. To deserialize a model, you can run something like this from the root level of this repository: -python -m examples.other.tensorize_vllm_model \ +python examples/other/tensorize_vllm_model.py \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -69,7 +70,7 @@ For more information on the available arguments for serializing, run Or for deserializing: -`python -m examples.other.tensorize_vllm_model deserialize --help`. +`python examples/other/tensorize_vllm_model.py deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -90,11 +91,27 @@ TensorizerConfig arguments desired. In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python -m examples.other.tensorize_vllm_model deserialize --help` +`python examples/other/tensorize_vllm_model.py deserialize --help` under the `tensorizer options` section. These can also be used for deserialization in this example script, although `--tensorizer-uri` and `--path-to-tensors` are functionally the same in this case. + +Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter +can be serialized directly with the path to the LoRA adapter on HF Hub and +a TensorizerConfig object. In this script, passing a HF id to a LoRA adapter +will serialize the LoRA adapter artifacts to `--serialized-directory`. + +You can then use the LoRA adapter with `vllm serve`, for instance, by ensuring +the LoRA artifacts are in your model artifacts directory and specifying +`--enable-lora`. For instance: + +``` +vllm serve \ + --load-format tensorizer \ + --model-loader-extra-config '{"tensorizer_uri": ".tensors"}' \ + --enable-lora +``` """ @@ -107,6 +124,19 @@ def parse_args(): "also supported, although libsodium must be installed to " "use it.") parser = EngineArgs.add_cli_args(parser) + + parser.add_argument( + "--lora-path", + type=str, + required=False, + help="Path to a LoRA adapter to " + "serialize along with model tensors. This can then be deserialized " + "along with the model by passing a tensorizer_config kwarg to " + "LoRARequest with type TensorizerConfig. See the docstring for this " + "for a usage example." + + ) + subparsers = parser.add_subparsers(dest='command') serialize_parser = subparsers.add_parser( @@ -169,11 +199,42 @@ def parse_args(): def deserialize(): - llm = LLM(model=args.model, - load_format="tensorizer", - tensor_parallel_size=args.tensor_parallel_size, - model_loader_extra_config=tensorizer_config - ) + if args.lora_path: + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + llm = LLM(model=args.model, + load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, + model_loader_extra_config=tensorizer_config, + enable_lora=True, + ) + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] + ) + + # Truncating this as the extra text isn't necessary + prompts = [ + "[user] Write a SQL query to answer the question based on ..." + ] + + # Test LoRA load + print( + llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql-lora", + 1, + args.lora_path, + tensorizer_config = tensorizer_config) + ) + ) + else: + llm = LLM(model=args.model, + load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, + model_loader_extra_config=tensorizer_config + ) return llm @@ -197,7 +258,10 @@ if __name__ == '__main__': model_name = model_ref.split("/")[1] - keyfile = args.keyfile if args.keyfile else None + if args.command == "serialize" or args.command == "deserialize": + keyfile = args.keyfile + else: + keyfile = None if args.model_loader_extra_config: config = json.loads(args.model_loader_extra_config) @@ -228,6 +292,10 @@ if __name__ == '__main__': encryption_keyfile=keyfile, **credentials) + if args.lora_path: + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + tensorize_lora_adapter(args.lora_path, tensorizer_config) + tensorize_vllm_model(engine_args, tensorizer_config) elif args.command == "deserialize": diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py new file mode 100644 index 0000000000000..f1ab7223048db --- /dev/null +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +import gc +import json +import tempfile + +import openai +import pytest +import pytest_asyncio +import torch.cuda + +from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.model_loader.tensorizer import ( + TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model) + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "unsloth/llama-3.2-1b-Instruct" +LORA_PATH = "davzoku/finqa_adapter_1b" + + +def _cleanup(): + gc.collect() + torch.cuda.empty_cache() + + +@pytest.fixture(autouse=True) +def cleanup(): + _cleanup() + + +@pytest.fixture(scope='module') +def tmp_dir(): + with tempfile.TemporaryDirectory() as path: + yield path + + +@pytest.fixture(scope='module') +def model_uri(tmp_dir): + yield f"{tmp_dir}/model.tensors" + + +@pytest.fixture(scope="module") +def tensorize_model_and_lora(tmp_dir, model_uri): + tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, + lora_dir=tmp_dir) + args = EngineArgs(model=MODEL_NAME, device="cuda") + + tensorize_lora_adapter(LORA_PATH, tensorizer_config) + tensorize_vllm_model(args, tensorizer_config) + + # Manually invoke a _cleanup() here, as the cleanup() + # fixture won't be guaranteed to be called after this + # when this fixture is used for a test + _cleanup() + yield + + +@pytest.fixture(scope="module") +def server(model_uri, tensorize_model_and_lora): + model_loader_extra_config = { + "tensorizer_uri": model_uri, + } + + ## Start OpenAI API server + args = [ + "--load-format", "tensorizer", "--device", "cuda", + "--model-loader-extra-config", + json.dumps(model_loader_extra_config), "--enable-lora" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): + _cleanup() + completion = await client.completions.create(model=model_name, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + assert completion.model == MODEL_NAME + assert len(completion.choices) == 1 + assert len(completion.choices[0].text) >= 5 + assert completion.choices[0].finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index e3a054bd62064..37bbc3cfa7d06 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,12 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 +import subprocess +import sys +from typing import Union import pytest import ray import vllm +from vllm import LLM from vllm.lora.request import LoRARequest +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from ..utils import create_new_process_for_each_test, multi_gpu_test +from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora): pass -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: +def do_sample(llm: vllm.LLM, + lora_path: str, + lora_id: int, + tensorizer_config_dict: Union[dict, None] = None) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 @@ -45,15 +53,28 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + + if tensorizer_config_dict is not None: + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest( + str(lora_id), + lora_id, + lora_path, + tensorizer_config_dict=tensorizer_config_dict) + if lora_id else None) + else: + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) # Print the outputs. generated_texts: list[str] = [] for output in outputs: @@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -def generate_and_test(llm, sql_lora_files): +def generate_and_test(llm, + sql_lora_files, + tensorizer_config_dict: Union[dict, None] = None): print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1) == EXPECTED_LORA_OUTPUT print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=2) == EXPECTED_LORA_OUTPUT print("removing lora") @@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) + + +@multi_gpu_test(num_gpus=2) +@create_new_process_for_each_test() +def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, + sql_lora_huggingface_id): + + # Run the tensorizing of the LoRA adapter and the model in a subprocess + # to guarantee cleanup + + tp_size = 2 + model_name = "model-rank-%03d.tensors" + + model_ref = MODEL_PATH + lora_path = sql_lora_huggingface_id + suffix = "test" + try: + result = subprocess.run([ + sys.executable, + f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model", + MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size", + str(tp_size), "serialize", "--serialized-directory", + str(tmp_path), "--suffix", suffix + ], + check=True, + capture_output=True, + text=True) + except subprocess.CalledProcessError as e: + print("Tensorizing failed.") + print("STDOUT:\n", e.stdout) + print("STDERR:\n", e.stderr) + raise + + print("STDOUT:\n", result.stdout) + + model_uri = tmp_path / "vllm" / model_ref / suffix / model_name + tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + + loaded_vllm_model = LLM(model=model_ref, + load_format="tensorizer", + enable_lora=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config, + max_num_seqs=13, + tensor_parallel_size=2, + max_loras=2) + + tensorizer_config_dict = tensorizer_config.to_dict() + + print("lora adapter created") + assert do_sample(loaded_vllm_model, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 1") + assert do_sample(loaded_vllm_model, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1) == EXPECTED_LORA_OUTPUT diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index 7efef163d2b92..ce8689f5b89c1 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -5,14 +5,6 @@ from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Tensorizer only tested on V0 so far. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - @pytest.fixture(autouse=True) def cleanup(): cleanup_dist_env_and_memory(shutdown_ray=True) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 7136dd44de03d..b6286e1483976 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 import gc -import json import os import pathlib import subprocess -from functools import partial from unittest.mock import MagicMock, patch -import openai import pytest import torch -from huggingface_hub import snapshot_download from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -22,12 +18,11 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, open_stream, - serialize_vllm_model, tensorize_vllm_model) # yapf: enable -from vllm.utils import PlaceholderModule, import_from_path +from vllm.utils import PlaceholderModule -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import VLLM_PATH try: from tensorizer import EncryptionParams @@ -103,6 +98,7 @@ def test_can_deserialize_s3(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): + args = EngineArgs(model=model_ref) with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") @@ -110,15 +106,13 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( outputs = vllm_model.generate(prompts, sampling_params) - config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, - encryption_keyfile=key_path) + config_for_serializing = TensorizerConfig(tensorizer_uri=str(model_path), + encryption_keyfile=str(key_path)) - vllm_model.apply_model( - partial(serialize_vllm_model, - tensorizer_config=config_for_serializing)) + tensorize_vllm_model(args, config_for_serializing) - config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, - encryption_keyfile=key_path) + config_for_deserializing = TensorizerConfig( + tensorizer_uri=str(model_path), encryption_keyfile=str(key_path)) with vllm_runner(model_ref, load_format="tensorizer", @@ -154,113 +148,46 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, assert outputs == deserialized_outputs -def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): - multilora_inference = import_from_path( - "examples.offline_inference.multilora_inference", - EXAMPLES_PATH / "offline_inference/multilora_inference.py", - ) - - model_ref = "meta-llama/Llama-2-7b-hf" - lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - test_prompts = multilora_inference.create_test_prompts(lora_path) - - # Serialize model before deserializing and binding LoRA adapters - with vllm_runner(model_ref) as vllm_model: - model_path = tmp_path / (model_ref + ".tensors") - - vllm_model.apply_model( - partial( - serialize_vllm_model, - tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) - - with vllm_runner( - model_ref, - load_format="tensorizer", - model_loader_extra_config=TensorizerConfig( - tensorizer_uri=model_path, - num_readers=1, - ), - enable_lora=True, - max_loras=1, - max_lora_rank=8, - max_cpu_loras=2, - max_num_seqs=50, - max_model_len=1000, - ) as loaded_vllm_model: - multilora_inference.process_requests( - loaded_vllm_model.model.llm_engine, test_prompts) - - assert loaded_vllm_model - - -def test_load_without_tensorizer_load_format(vllm_runner): +def test_load_without_tensorizer_load_format(vllm_runner, capfd): model = None - with pytest.raises(ValueError): + try: model = vllm_runner( model_ref, model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) - del model - gc.collect() - torch.cuda.empty_cache() + except RuntimeError: + out, err = capfd.readouterr() + combined_output = out + err + assert ("ValueError: Model loader extra config " + "is not supported for load " + "format LoadFormat.AUTO") in combined_output + finally: + del model + gc.collect() + torch.cuda.empty_cache() -@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") -def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): - ## Serialize model - with vllm_runner(model_ref) as vllm_model: - model_path = tmp_path / (model_ref + ".tensors") - - vllm_model.apply_model( - partial( - serialize_vllm_model, - tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) - - model_loader_extra_config = { - "tensorizer_uri": str(model_path), - } - - ## Start OpenAI API server - openai_args = [ - "--dtype", - "float16", - "--load-format", - "tensorizer", - "--model-loader-extra-config", - json.dumps(model_loader_extra_config), - ] - - with RemoteOpenAIServer(model_ref, openai_args) as server: - print("Server ready.") - - client = server.get_client() - completion = client.completions.create(model=model_ref, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert len(completion.choices) == 1 - assert len(completion.choices[0].text) >= 5 - assert completion.choices[0].finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - -def test_raise_value_error_on_invalid_load_format(vllm_runner): +def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd): model = None - with pytest.raises(ValueError): + try: model = vllm_runner( model_ref, load_format="safetensors", model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) - del model - gc.collect() - torch.cuda.empty_cache() + except RuntimeError: + out, err = capfd.readouterr() + + combined_output = out + err + assert ("ValueError: Model loader extra config is not supported " + "for load format LoadFormat.SAFETENSORS") in combined_output + finally: + del model + gc.collect() + torch.cuda.empty_cache() @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") -def test_tensorizer_with_tp_path_without_template(vllm_runner): - with pytest.raises(ValueError): +def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd): + try: model_ref = "EleutherAI/pythia-1.4b" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" @@ -275,6 +202,13 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner): tensor_parallel_size=2, disable_custom_all_reduce=True, ) + except RuntimeError: + out, err = capfd.readouterr() + combined_output = out + err + assert ("ValueError: For a sharded model, tensorizer_uri " + "should include a string format template like '%04d' " + "to be formatted with the rank " + "of the shard") in combined_output @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") @@ -288,7 +222,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( enforce_eager=True, ) as base_model: outputs = base_model.generate(prompts, sampling_params) - base_model.model.llm_engine.model_executor.shutdown() # load model with two shards and serialize with encryption model_path = str(tmp_path / (model_ref + "-%02d.tensors")) @@ -296,7 +229,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( tensorizer_config = TensorizerConfig( tensorizer_uri=model_path, - encryption_keyfile=key_path, + encryption_keyfile=str(key_path), ) tensorize_vllm_model( @@ -331,14 +264,13 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): model_ref = "facebook/opt-125m" model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) + args = EngineArgs(model=model_ref, device="cuda") with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - vllm_model.apply_model( - partial(serialize_vllm_model, tensorizer_config=config)) - - assert is_vllm_tensorized(config) + tensorize_vllm_model(args, config) + assert is_vllm_tensorized(config) with vllm_runner(model_ref, load_format="tensorizer", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5650742ff972f..12c306e98048b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1195,8 +1195,7 @@ class EngineArgs: ############################################################# # Unsupported Feature Flags on V1. - if (self.load_format == LoadFormat.TENSORIZER.value - or self.load_format == LoadFormat.SHARDED_STATE.value): + if self.load_format == LoadFormat.SHARDED_STATE.value: _raise_or_fallback( feature_name=f"--load_format {self.load_format}", recommend_to_remove=False) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 959fe4a672a6d..83aef62451a17 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -29,6 +29,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, get_supported_lora_modules, is_regex_target_modules, parse_fine_tuned_lora_name, replace_submodule) +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -185,19 +186,19 @@ class LoRAModel(AdapterModel): @classmethod def from_local_checkpoint( - cls, - lora_dir: str, - expected_lora_modules: list[str], - peft_helper: PEFTHelper, - *, - lora_model_id: Optional[int] = None, - device: str = "cuda", - dtype: Optional[torch.dtype] = None, - target_embedding_padding: Optional[int] = None, - embedding_modules: Optional[dict[str, str]] = None, - embedding_padding_modules: Optional[list[str]] = None, - weights_mapper: Optional[WeightsMapper] = None, - ) -> "LoRAModel": + cls, + lora_dir: str, + expected_lora_modules: list[str], + peft_helper: PEFTHelper, + *, + lora_model_id: Optional[int] = None, + device: str = "cuda", + dtype: Optional[torch.dtype] = None, + target_embedding_padding: Optional[int] = None, + embedding_modules: Optional[dict[str, str]] = None, + embedding_padding_modules: Optional[list[str]] = None, + weights_mapper: Optional[WeightsMapper] = None, + tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. Args: @@ -219,10 +220,36 @@ class LoRAModel(AdapterModel): lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + tensors: dict[str, torch.Tensor] = {} + unexpected_modules: list[Union[list[str], str]] = [] - unexpected_modules: list[Union[list[str], str]] - if os.path.isfile(lora_tensor_path): - tensors: dict[str, torch.Tensor] = {} + def check_unexpected_modules(modules: dict): + for lora_module in modules.keys(): # noqa + module_name, _, _ = parse_fine_tuned_lora_name( + lora_module, weights_mapper) + part_name = module_name.split(".")[-1] + if part_name not in expected_lora_modules: + unexpected_modules.append(module_name) + if unexpected_modules: + raise ValueError( + f"While loading {lora_dir}, expected" + f" target modules in {expected_lora_modules}" + f" but received {unexpected_modules}." + f" Please verify that the loaded LoRA module is correct") + + if tensorizer_config_dict: + from tensorizer import TensorDeserializer + + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + lora_tensor_path = os.path.join(tensorizer_config.tensorizer_dir, + "adapter_model.tensors") + tensorizer_args = tensorizer_config._construct_tensorizer_args() + tensors = TensorDeserializer(lora_tensor_path, + dtype=tensorizer_config.dtype, + **tensorizer_args.deserializer_params) + check_unexpected_modules(tensors) + + elif os.path.isfile(lora_tensor_path): # Find unexpected modules. # Use safetensor key as a source of truth to find expected modules. # in peft if you have target_modules A, B, C and C does not exist @@ -232,20 +259,8 @@ class LoRAModel(AdapterModel): unexpected_modules = [] with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore - for lora_module in f.keys(): # noqa - module_name, _, _ = parse_fine_tuned_lora_name( - lora_module, weights_mapper) - part_name = module_name.split(".")[-1] - if part_name not in expected_lora_modules: - unexpected_modules.append(module_name) - if unexpected_modules: - raise ValueError( - f"While loading {lora_dir}, expected" - f" target modules in {expected_lora_modules}" - f" but received {unexpected_modules}." - f" Please verify that the loaded LoRA module is correct" - ) # Load tensors if there are only expected modules. + check_unexpected_modules(f) for module in f.keys(): # noqa tensors[module] = f.get_tensor(module) elif os.path.isfile(lora_bin_file_path): diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index d5de63f5baade..7d335e5f7fab1 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -10,6 +10,7 @@ from typing import Literal, Optional, Union from vllm.config import LoRAConfig from vllm.logger import init_logger +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig logger = init_logger(__name__) @@ -89,12 +90,31 @@ class PEFTHelper: return cls(**filtered_dict) @classmethod - def from_local_dir(cls, lora_path: str, - max_position_embeddings: Optional[int]) -> "PEFTHelper": + def from_local_dir( + cls, + lora_path: str, + max_position_embeddings: Optional[int], + tensorizer_config_dict: Optional[dict] = None) -> "PEFTHelper": lora_config_path = os.path.join(lora_path, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) + if tensorizer_config_dict: + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + tensorizer_args = tensorizer_config._construct_tensorizer_args() + from tensorizer.stream_io import open_stream + lora_config_path = os.path.join(tensorizer_config.lora_dir, + "adapter_config.json") + with open_stream(lora_config_path, + mode="rb", + **tensorizer_args.stream_params) as f: + config = json.load(f) + + logger.info("Successfully deserialized LoRA config from %s", + tensorizer_config.lora_dir) + + else: + with open(lora_config_path) as f: + config = json.load(f) + config["vllm_max_position_embeddings"] = max_position_embeddings return cls.from_dict(config) diff --git a/vllm/lora/request.py b/vllm/lora/request.py index badfaa4193774..616e94f8d678f 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -31,6 +31,7 @@ class LoRARequest( lora_local_path: Optional[str] = msgspec.field(default=None) long_lora_max_len: Optional[int] = None base_model_name: Optional[str] = msgspec.field(default=None) + tensorizer_config_dict: Optional[dict] = None def __post_init__(self): if self.lora_local_path: diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 8e5bc61066593..afc8a8dc3b260 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -100,7 +100,8 @@ class WorkerLoRAManager(AbstractWorkerManager): lora_path = get_adapter_absolute_path(lora_request.lora_path) peft_helper = PEFTHelper.from_local_dir( - lora_path, self.max_position_embeddings) + lora_path, self.max_position_embeddings, + lora_request.tensorizer_config_dict) # Validates the LoRA configuration against requirements before # loading weights, throwing an exception if validation fails. @@ -125,6 +126,7 @@ class WorkerLoRAManager(AbstractWorkerManager): self.lora_config.lora_extra_vocab_size, embedding_modules=self.embedding_modules, embedding_padding_modules=self.embedding_padding_modules, + tensorizer_config_dict=lora_request.tensorizer_config_dict, weights_mapper=hf_to_vllm_mapper) except FileNotFoundError as e: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 459c4b4392e3f..900f12ebe6cab 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -1,24 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import contextlib +import contextvars import dataclasses import io +import json import os import re +import threading import time from collections.abc import Generator from dataclasses import dataclass from functools import partial -from typing import BinaryIO, Optional, Union +from typing import Any, BinaryIO, Optional, Union import torch from torch import nn +from torch.utils._python_dispatch import TorchDispatchMode from transformers import PretrainedConfig import vllm.envs as envs from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -58,9 +62,79 @@ __all__ = [ logger = init_logger(__name__) +class MetaTensorMode(TorchDispatchMode): + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + if func._schema.name == "aten::empty" and "device" not in kwargs: + kwargs["device"] = "meta" + + return func(*args, **kwargs) + + +def meta_tensor_mode(loading_code=None, ): + + if loading_code is None: + return _NoInitOrTensorImpl.context_manager() + elif callable(loading_code): + with _NoInitOrTensorImpl.context_manager(): + return loading_code() + else: + raise TypeError( + "expected a callable to evaluate," + " or None if being used as a context manager;" + f' got an object of type "{type(loading_code).__name__}" instead.') + + +class _NoInitOrTensorImpl: + _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm) + _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES) + + is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active", + default=False) + _count_active: int = 0 + _count_active_lock = threading.Lock() + + @classmethod + @contextlib.contextmanager + def context_manager(cls): + if cls.is_active.get(): + yield + return + + with cls._count_active_lock: + cls._count_active += 1 + if cls._count_active == 1: + for mod in cls._MODULES: + mod.reset_parameters = cls._disable(mod.reset_parameters) + + reset_token = cls.is_active.set(True) + + try: + with MetaTensorMode(): + yield + finally: + cls.is_active.reset(reset_token) + with cls._count_active_lock: + cls._count_active -= 1 + if cls._count_active == 0: + for mod, original in cls._MODULE_ORIGINALS: + mod.reset_parameters = original + + @staticmethod + def _disable(func): + + def wrapper(*args, **kwargs): + if not _NoInitOrTensorImpl.is_active.get(): + return func(*args, **kwargs) + + return wrapper + + @dataclass class TensorizerConfig: - tensorizer_uri: str + tensorizer_uri: Union[str, None] = None vllm_tensorized: Optional[bool] = False verify_hash: Optional[bool] = False num_readers: Optional[int] = None @@ -71,12 +145,29 @@ class TensorizerConfig: model_class: Optional[type[torch.nn.Module]] = None hf_config: Optional[PretrainedConfig] = None dtype: Optional[Union[str, torch.dtype]] = None + lora_dir: Optional[str] = None _is_sharded: bool = False def __post_init__(self): # check if the configuration is for a sharded vLLM model self._is_sharded = isinstance(self.tensorizer_uri, str) \ and re.search(r'%0\dd', self.tensorizer_uri) is not None + if not self.tensorizer_uri and not self.lora_dir: + raise ValueError("tensorizer_uri must be provided.") + if not self.tensorizer_uri and self.lora_dir: + self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors" + assert self.tensorizer_uri is not None, ("tensorizer_uri must be " + "provided.") + self.tensorizer_dir = os.path.dirname(self.tensorizer_uri) + self.lora_dir = self.tensorizer_dir + + @classmethod + def as_dict(cls, *args, **kwargs) -> dict[str, Any]: + cfg = TensorizerConfig(*args, **kwargs) + return dataclasses.asdict(cfg) + + def to_dict(self) -> dict[str, Any]: + return dataclasses.asdict(self) def _construct_tensorizer_args(self) -> "TensorizerArgs": tensorizer_args = { @@ -140,7 +231,9 @@ class TensorizerArgs: Args: tensorizer_uri: Path to serialized model tensors. Can be a local file - path or a S3 URI. + path or a S3 URI. This is a required field unless lora_dir is + provided and the config is meant to be used for the + `tensorize_lora_adapter` function. vllm_tensorized: If True, indicates that the serialized model is a vLLM model. This is used to determine the behavior of the TensorDeserializer when loading tensors from a serialized model. @@ -296,10 +389,10 @@ class TensorizerAgent: model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None # TODO: Do we need to consider old-style model class? - with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, - check_compile=True): + with meta_tensor_mode(), set_current_vllm_config(self.vllm_config, + check_compile=True): return self.tensorizer_config.model_class( - vllm_config=self.vllm_config, ) + vllm_config=self.vllm_config) def _resize_lora_embeddings(self): """Modify LoRA embedding layers to use bigger tensors @@ -467,8 +560,73 @@ def tensorize_vllm_model(engine_args: EngineArgs, ) as stream: stream.write(encryption_params.key) - engine = LLMEngine.from_engine_args(engine_args) - engine.model_executor.collective_rpc( - "save_tensorized_model", - kwargs=dict(tensorizer_config=tensorizer_config), - ) + from vllm import LLMEngine + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + + if not envs.VLLM_USE_V1: + engine = LLMEngine.from_engine_args(engine_args) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) + else: + engine = V1LLMEngine.from_vllm_config(engine_config) + engine.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) + + +def tensorize_lora_adapter(lora_path: str, + tensorizer_config: TensorizerConfig): + """ + Uses tensorizer to serialize a LoRA adapter. Assumes that the files + needed to load a LoRA adapter are a safetensors-format file called + adapter_model.safetensors and a json config file called adapter_config.json. + + Serializes the files in the tensorizer_config.lora_dir + """ + import safetensors + + from vllm.lora.utils import get_adapter_absolute_path + + lora_dir = get_adapter_absolute_path(lora_path) + + tensor_path = config_path = "" + + for file in os.listdir(lora_dir): + if file.startswith("adapter_model"): + tensor_path = lora_dir + "/" + file + if file.startswith("adapter_config"): + config_path = lora_dir + "/" + file + if tensor_path and config_path: + break + + if tensor_path.endswith(".safetensors"): + tensors = safetensors.torch.load_file(tensor_path) + elif tensor_path.endswith(".bin"): + tensors = torch.load(tensor_path) + else: + raise ValueError("Unsupported file: %s", tensor_path) + + with open(config_path) as f: + config = json.load(f) + + tensorizer_args = tensorizer_config._construct_tensorizer_args() + + with open_stream(f"{tensorizer_config.lora_dir}/adapter_config.json", + mode="wb+", + **tensorizer_args.stream_params) as f: + + f.write(json.dumps(config).encode("utf-8")) + + lora_uri = (f"{tensorizer_config.lora_dir}" + f"/adapter_model.tensors") + with open_stream(lora_uri, mode="wb+", + **tensorizer_args.stream_params) as f: + serializer = TensorSerializer(f) + serializer.write_state_dict(tensors) + serializer.close() + + logger.info("Successfully serialized LoRA files to %s", + str(tensorizer_config.lora_dir)) diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 4107e741fd8fe..ac9ef61643880 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -2,6 +2,7 @@ # ruff: noqa: SIM117 import copy from collections.abc import Generator +from typing import Union import torch from torch import nn @@ -111,8 +112,10 @@ class TensorizerLoader(BaseModelLoader): @staticmethod def save_model( model: torch.nn.Module, - tensorizer_config: TensorizerConfig, + tensorizer_config: Union[TensorizerConfig, dict], ) -> None: + if isinstance(tensorizer_config, dict): + tensorizer_config = TensorizerConfig(**tensorizer_config) serialize_vllm_model( model=model, tensorizer_config=tensorizer_config, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 64e472457ee3c..740ba60fe231b 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -340,6 +340,13 @@ class EngineCore: return self.model_executor.collective_rpc(method, timeout, args, kwargs) + def save_tensorized_model( + self, + tensorizer_config, + ) -> None: + self.model_executor.save_tensorized_model( + tensorizer_config=tensorizer_config, ) + class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 759d69293a322..6d4888363d50a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -25,7 +25,7 @@ from vllm.distributed.parallel_state import ( from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader import TensorizerLoader, get_model from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -60,6 +60,7 @@ from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, if TYPE_CHECKING: import xgrammar as xgr + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput else: xgr = LazyLoader("xgr", globals(), "xgrammar") @@ -1534,6 +1535,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): time_after_load - time_before_load) prepare_communication_buffer_for_model(self.model) + def save_tensorized_model( + self, + tensorizer_config: "TensorizerConfig", + ) -> None: + TensorizerLoader.save_model( + self.model, + tensorizer_config=tensorizer_config, + ) + def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 93129d9879401..2b945cc4111a4 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -31,6 +31,7 @@ from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) if TYPE_CHECKING: + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput @@ -326,6 +327,13 @@ class Worker(WorkerBase): max_size=max_size, ) + def save_tensorized_model( + self, + tensorizer_config: "TensorizerConfig", + ) -> None: + self.model_runner.save_tensorized_model( + tensorizer_config=tensorizer_config, ) + def init_worker_distributed_environment( vllm_config: VllmConfig,