diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py
index c8b7a5cbf7470..17347300b40c8 100644
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -6,6 +6,8 @@ import pytest
 
 import vllm.envs as env
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
@@ -16,14 +18,6 @@ LORA_RANK = 64
 DEFAULT_MAX_LORAS = 4 * 3
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def get_lora_requests(lora_path) -> list[LoRARequest]:
     lora_requests: list[LoRARequest] = [
         LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
@@ -88,17 +82,6 @@ async def test_add_lora(chatglm3_lora_files):
                                  trust_remote_code=True,
                                  enforce_eager=True)
 
-    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
-    # environment variable. reload vllm.enging.async_llm_engine as
-    # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
-    # env var.
-    import importlib
-
-    import vllm.engine.async_llm_engine
-    importlib.reload(vllm.engine.async_llm_engine)
-    from vllm.entrypoints.openai.api_server import (
-        build_async_engine_client_from_engine_args)
-
     # split lora_requests into 3 parts
     part_size = len(lora_requests) // 3
     dummy_run_requests = lora_requests[:part_size]
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index 2c18a115be487..cd9526c8b1012 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import pytest
-
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -18,14 +16,6 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 580992dea53da..54daea5b9dbf0 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -33,14 +33,6 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM,
               lora_path: str,
               lora_id: int,
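With the per-test module reload gone, `build_async_engine_client_from_engine_args` is imported once at module scope in `test_add_lora.py`. For readers unfamiliar with that entry point, here is a minimal sketch of obtaining the async engine client and registering a LoRA adapter; the engine-args values and the adapter path are illustrative, not the test's exact configuration:

```python
import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
from vllm.lora.request import LoRARequest


async def main():
    engine_args = AsyncEngineArgs(model="THUDM/chatglm3-6b",
                                  enable_lora=True,
                                  max_loras=4,
                                  max_lora_rank=64,
                                  trust_remote_code=True,
                                  enforce_eager=True)
    # build_async_engine_client_from_engine_args is an async context
    # manager that yields an engine client for the selected engine version.
    async with build_async_engine_client_from_engine_args(engine_args) as llm:
        # Register an adapter with the running engine (path is illustrative).
        await llm.add_lora(
            LoRARequest(lora_name="1",
                        lora_int_id=1,
                        lora_path="/path/to/chatglm3-lora"))


asyncio.run(main())
```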
""" - -import os - import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.llm_engine import LLMEngine +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) from vllm.lora.request import LoRARequest MODEL_PATH = "meta-llama/Llama-2-7b-hf" LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" LORA_RANK = 8 - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass +# @pytest.fixture(autouse=True) +# def v1(run_with_both_engines_lora): +# # Simple autouse wrapper to run both engines for each test +# # This can be promoted up to conftest.py to run for every +# # test in a package +# pass def make_lora_request(lora_id: int): @@ -79,22 +77,6 @@ def test_lora_functions_sync(): @pytest.mark.asyncio async def test_lora_functions_async(): - if os.getenv("VLLM_USE_V1") == "0": - pytest.skip( - reason= - "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") - - # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` - # environment variable. reload vllm.enging.async_llm_engine as - # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the - # env var. - import importlib - - import vllm.engine.async_llm_engine - importlib.reload(vllm.engine.async_llm_engine) - from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) - max_loras = 4 engine_args = AsyncEngineArgs(model=MODEL_PATH, enable_lora=True, diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index aea7691935dfe..4e77c5559e164 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -10,14 +10,6 @@ from vllm.platforms import current_platform MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]) -> list[str]: diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 7a76ffb740ef2..43e2975cd87c0 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -37,14 +37,6 @@ else: ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 162714df2f130..20a1ae67db2dc 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -13,14 +13,6 @@ from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams -@pytest.fixture(autouse=not current_platform.is_cpu()) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @dataclass class TestConfig: model_path: str diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index e5ae660af1400..1a5d527164d0b 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -6,8 +6,6 @@ import tempfile from typing import Union from 
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index e5ae660af1400..1a5d527164d0b 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -6,8 +6,6 @@ import tempfile
 from typing import Union
 from unittest.mock import patch
 
-import pytest
-
 import vllm.envs as envs
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
@@ -18,14 +16,6 @@
 from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
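For context, every deleted `v1` wrapper delegated to a shared `run_with_both_engines_lora` fixture so that each dependent test ran once per engine version. A sketch of that pattern as it could live in a `conftest.py`; the monkeypatch-based `VLLM_USE_V1` toggle is an assumption about the fixture body, not a copy of it:

```python
import pytest


@pytest.fixture(params=["0", "1"])
def run_with_both_engines_lora(request, monkeypatch):
    # Parametrize the dependent test over both engines by toggling the
    # VLLM_USE_V1 environment variable (assumed mechanism).
    monkeypatch.setenv("VLLM_USE_V1", request.param)
    yield
```

Removing these wrappers means the suite now runs only against the default engine, which is why the `VLLM_USE_V1` skip in `test_lora_functions_async` and the reload workarounds are deleted as well.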