diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 32591ecfe677..edd079bc7a38 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -8,10 +8,12 @@ from unittest.mock import MagicMock, patch
 import openai
 import pytest
 import torch
+from huggingface_hub import snapshot_download
 from tensorizer import EncryptionParams

 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
+# yapf conflicts with isort for this docstring
 # yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          TensorSerializer,
@@ -20,13 +22,14 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          open_stream,
                                                          serialize_vllm_model,
                                                          tensorize_vllm_model)
+# yapf: enable
+from vllm.utils import import_from_path

 from ..conftest import VllmRunner
-from ..utils import RemoteOpenAIServer
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 from .conftest import retry_until_skip

-# yapf conflicts with isort for this docstring
-
+EXAMPLES_PATH = VLLM_PATH / "examples"

 prompts = [
     "Hello, my name is",
@@ -94,8 +97,8 @@ def test_can_deserialize_s3(vllm_runner):
                          num_readers=1,
                          s3_endpoint="object.ord1.coreweave.com",
                      )) as loaded_hf_model:
-        deserialized_outputs = loaded_hf_model.generate(prompts,
-                                                        sampling_params)
+        deserialized_outputs = loaded_hf_model.generate(
+            prompts, sampling_params)  # noqa: E501

         assert deserialized_outputs

@@ -111,23 +114,21 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
         outputs = vllm_model.generate(prompts, sampling_params)

-    config_for_serializing = TensorizerConfig(
-        tensorizer_uri=model_path,
-        encryption_keyfile=key_path
-    )
+    config_for_serializing = TensorizerConfig(tensorizer_uri=model_path,
+                                              encryption_keyfile=key_path)

     serialize_vllm_model(get_torch_model(vllm_model), config_for_serializing)

     config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                 encryption_keyfile=key_path)

-    with vllm_runner(
-        model_ref,
-        load_format="tensorizer",
-        model_loader_extra_config=config_for_deserializing) as loaded_vllm_model:  # noqa: E501
+    with vllm_runner(model_ref,
+                     load_format="tensorizer",
+                     model_loader_extra_config=config_for_deserializing
+                     ) as loaded_vllm_model:  # noqa: E501

-        deserialized_outputs = loaded_vllm_model.generate(prompts,
-                                                          sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(
+            prompts, sampling_params)  # noqa: E501

         assert outputs == deserialized_outputs


@@ -156,14 +157,14 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,


 def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
-    from huggingface_hub import snapshot_download
-
-    from examples.multilora_inference import (create_test_prompts,
-                                              process_requests)
+    multilora_inference = import_from_path(
+        "examples.multilora_inference",
+        EXAMPLES_PATH / "multilora_inference.py",
+    )

     model_ref = "meta-llama/Llama-2-7b-hf"
     lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
-    test_prompts = create_test_prompts(lora_path)
+    test_prompts = multilora_inference.create_test_prompts(lora_path)

     # Serialize model before deserializing and binding LoRA adapters
     with vllm_runner(model_ref, ) as vllm_model:
@@ -186,7 +187,8 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
                      max_num_seqs=50,
                      max_model_len=1000,
                      ) as loaded_vllm_model:
-        process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
+        multilora_inference.process_requests(
+            loaded_vllm_model.model.llm_engine, test_prompts)

         assert loaded_vllm_model

@@ -217,8 +219,11 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):

     ## Start OpenAI API server
     openai_args = [
-        "--dtype", "float16", "--load-format",
-        "tensorizer", "--model-loader-extra-config",
+        "--dtype",
+        "float16",
+        "--load-format",
+        "tensorizer",
+        "--model-loader-extra-config",
         json.dumps(model_loader_extra_config),
     ]

@@ -251,8 +256,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner):
     torch.cuda.empty_cache()


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Requires 2 GPUs")
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
 def test_tensorizer_with_tp_path_without_template(vllm_runner):
     with pytest.raises(ValueError):
         model_ref = "EleutherAI/pythia-1.4b"
@@ -271,10 +275,9 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner):
         )


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Requires 2 GPUs")
-def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
-                                                                    tmp_path):
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
+def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
+        vllm_runner, tmp_path):
     model_ref = "EleutherAI/pythia-1.4b"
     # record outputs from un-sharded un-tensorized model
     with vllm_runner(
@@ -313,13 +316,12 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
             disable_custom_all_reduce=True,
             enforce_eager=True,
             model_loader_extra_config=tensorizer_config) as loaded_vllm_model:
-        deserialized_outputs = loaded_vllm_model.generate(prompts,
-                                                          sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(
+            prompts, sampling_params)

         assert outputs == deserialized_outputs


-
 @retry_until_skip(3)
 def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     gc.collect()
@@ -337,8 +339,8 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     with vllm_runner(model_ref,
                      load_format="tensorizer",
                      model_loader_extra_config=config) as loaded_vllm_model:
-        deserialized_outputs = loaded_vllm_model.generate(prompts,
-                                                          sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(
+            prompts, sampling_params)  # noqa: E501

         assert outputs == deserialized_outputs

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index f5299746d845..aa9c7893c4cf 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -2002,9 +2002,6 @@ class LLMEngine:
                     SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE,
                     metrics.model_execute_time)

-    def is_encoder_decoder_model(self):
-        return self.input_preprocessor.is_encoder_decoder_model()
-
     def _validate_model_inputs(self, inputs: ProcessorInputs,
                                lora_request: Optional[LoRARequest]):
         if is_encoder_decoder_inputs(inputs):
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 3ab467e649b5..4b33fc1458ee 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -964,6 +964,3 @@ class LLM:
         # This is necessary because some requests may be finished earlier than
         # its previous requests.
         return sorted(outputs, key=lambda x: int(x.request_id))
-
-    def _is_encoder_decoder_model(self):
-        return self.llm_engine.is_encoder_decoder_model()
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index 5ce31bd4d941..aa7c20109893 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -1,5 +1,3 @@
-import importlib
-import importlib.util
 import os
 from functools import cached_property
 from typing import Callable, Dict, List, Optional, Sequence, Type, Union
@@ -9,7 +7,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               ExtractedToolCallInformation)
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import is_list_of
+from vllm.utils import import_from_path, is_list_of

 logger = init_logger(__name__)

@@ -149,13 +147,14 @@ class ToolParserManager:
     @classmethod
     def import_tool_parser(cls, plugin_path: str) -> None:
         """
-        Import a user defined tool parser by the path of the tool parser define
+        Import a user-defined tool parser by the path of the tool parser define
         file.
         """
         module_name = os.path.splitext(os.path.basename(plugin_path))[0]
-        spec = importlib.util.spec_from_file_location(module_name, plugin_path)
-        if spec is None or spec.loader is None:
-            logger.error("load %s from %s failed.", module_name, plugin_path)
+
+        try:
+            import_from_path(module_name, plugin_path)
+        except Exception:
+            logger.exception("Failed to load module '%s' from %s.",
+                             module_name, plugin_path)
             return
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index fdf28615fda1..aacff87df6d7 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -67,7 +67,7 @@ class InputPreprocessor:
         model config is unavailable.
         '''

-        if not self.is_encoder_decoder_model():
+        if not self.model_config.is_encoder_decoder:
             print_warning_once("Using None for decoder start token id because "
                                "this is not an encoder/decoder model.")
             return None
@@ -632,7 +632,7 @@ class InputPreprocessor:
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
-        if self.is_encoder_decoder_model():
+        if self.model_config.is_encoder_decoder:
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return self._process_encoder_decoder_prompt(
@@ -660,7 +660,7 @@ class InputPreprocessor:
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> ProcessorInputs:
         """Async version of :meth:`preprocess`."""
-        if self.is_encoder_decoder_model():
+        if self.model_config.is_encoder_decoder:
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return await self._process_encoder_decoder_prompt_async(
@@ -679,6 +679,3 @@ class InputPreprocessor:
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
         )
-
-    def is_encoder_decoder_model(self):
-        return self.model_config.is_encoder_decoder
diff --git a/vllm/utils.py b/vllm/utils.py
index 1b02cbff79f7..111460a29de4 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -5,6 +5,7 @@ import datetime
 import enum
 import gc
 import getpass
+import importlib.util
 import inspect
 import ipaddress
 import os
@@ -1539,6 +1540,25 @@ def is_in_doc_build() -> bool:
         return False


+def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
+    """
+    Import a Python file according to its file path.
+
+    Based on the official recipe:
+    https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
+    """
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ModuleNotFoundError(f"No module named '{module_name}'")
+
+    assert spec.loader is not None
+
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
 # create a library to hold the custom op
 vllm_lib = Library("vllm", "FRAGMENT")  # noqa

diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 4ebfff958426..75a77be750ac 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -163,9 +163,6 @@ class LLMEngine:
     def get_model_config(self):
         pass

-    def is_encoder_decoder_model(self):
-        pass
-
     def start_profile(self):
         pass
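
For reference, a minimal usage sketch of the import_from_path helper added to vllm/utils.py in this patch, mirroring the call sites in the updated test and tool-parser loader above. The sketch is illustrative only and is not part of the diff; the concrete file path and the LoRA path passed to create_test_prompts are placeholder assumptions.

# Illustrative sketch of vllm.utils.import_from_path usage; the paths below
# are placeholder assumptions, not values taken from the patch.
from vllm.utils import import_from_path

# Load a script that is not importable as an installed package, e.g. an
# example file under the repository's examples/ directory.
multilora_inference = import_from_path(
    "examples.multilora_inference",
    "examples/multilora_inference.py",
)

# The returned object is a regular module (also registered in sys.modules),
# so its top-level names are available as attributes.
test_prompts = multilora_inference.create_test_prompts("path/to/lora/adapter")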