From 79455cf42113efb026e39342997f7a878af8bc38 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath
Date: Tue, 1 Apr 2025 04:53:56 -0400
Subject: [PATCH] [Misc] Enable V1 LoRA by default (#15320)

Signed-off-by: Varun Sundar Rabindranath
Co-authored-by: Varun Sundar Rabindranath
---
 tests/entrypoints/openai/test_chat.py | 62 +++++++++++++++++++++++++--
 tests/lora/test_baichuan.py           | 16 +++----
 tests/lora/test_chatglm3_tp.py        | 16 +++----
 tests/lora/test_gemma.py              | 16 +++----
 tests/lora/test_layers.py             |  5 ---
 tests/lora/test_llama_tp.py           | 20 ++++-----
 tests/lora/test_lora_manager.py       | 14 +++++--
 tests/lora/test_phi.py                | 16 +++----
 tests/lora/test_quant_model.py        | 16 +++----
 tests/lora/test_transfomers_model.py  | 19 ++++-----
 tests/v1/test_oracle.py               | 10 +----
 vllm/engine/arg_utils.py              |  4 --
 12 files changed, 127 insertions(+), 87 deletions(-)

diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 25e4595cef6f6..4d13421adee0b 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -24,7 +24,23 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
+def monkeypatch_module():
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module", params=[False, True])
+def server(
+        request,
+        monkeypatch_module,
+        zephyr_lora_files,  #noqa: F811
+        zephyr_lora_added_tokens_files):  # noqa: F811
+
+    use_v1 = request.param
+    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -49,6 +65,13 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
         yield remote_server
 
 
+@pytest.fixture
+def is_v1_server(server):
+    import os
+    assert os.environ['VLLM_USE_V1'] in ['0', '1']
+    return os.environ['VLLM_USE_V1'] == '1'
+
+
 @pytest_asyncio.fixture
 async def client(server):
     async with server.get_async_client() as async_client:
@@ -471,8 +494,13 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
+                                  is_v1_server: bool,
                                   guided_decoding_backend: str,
                                   sample_guided_choice):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
     }, {
@@ -511,9 +539,13 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI,
+async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
                                 guided_decoding_backend: str,
                                 sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported in V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -559,7 +591,12 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_regex_chat(client: openai.AsyncOpenAI,
+                                 is_v1_server: bool,
                                  guided_decoding_backend: str, sample_regex):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -617,8 +654,13 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
+                                           is_v1_server: bool,
                                            guided_decoding_backend: str,
                                            sample_guided_choice):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -648,9 +690,13 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI,
+async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                               guided_decoding_backend: str,
                               sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported on V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -742,6 +788,11 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
+                                                   is_v1_server: bool,
                                                    sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported on V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -787,6 +838,11 @@ async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
+                                                  is_v1_server: bool,
                                                   sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported on V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 9103ba425af18..3aa30b7b3c723 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -11,6 +11,14 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -40,14 +48,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def test_baichuan_lora(baichuan_lora_files):
     llm = vllm.LLM(MODEL_PATH,
                    max_model_len=1024,
diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py
index fa8c66d10309d..28a6f163d115a 100644
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -18,6 +18,14 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @create_new_process_for_each_test()
 def test_chatglm3_lora(chatglm3_lora_files):
     llm = vllm.LLM(MODEL_PATH,
diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py
index 8f07e39d20d3b..610bc405ede5c 100644
--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
@@ -9,6 +9,14 @@ from vllm.platforms import current_platform
 MODEL_PATH = "google/gemma-7b"
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         "Quote: Imagination is",
@@ -31,14 +39,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 # The V1 lora test for this model requires more than 24GB.
 @pytest.mark.skip_v1
 @pytest.mark.xfail(current_platform.is_rocm(),
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 8c8e55edae67b..56da97b6a06d8 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import importlib
 import random
 from copy import deepcopy
 from dataclasses import dataclass
@@ -82,10 +81,6 @@ def v1(run_with_both_engines_lora):
     # This can be promoted up to conftest.py to run for every
     # test in a package
 
-    # Reload punica_gpu as the kernels used are tied to engine type.
-    from vllm.lora.punica_wrapper import punica_gpu
-    importlib.reload(punica_gpu)
-
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 7026f705026fb..9f20e47c2f948 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -28,6 +28,14 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
@@ -71,16 +79,6 @@ def generate_and_test(llm, sql_lora_files):
     print("removing lora")
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
-# V1 Test: Failing due to numerics on V1.
-@pytest.mark.skip_v1
 @create_new_process_for_each_test()
 def test_llama_lora(sql_lora_files):
 
@@ -126,8 +124,6 @@ def test_llama_lora_warmup(sql_lora_files):
                      "less when using lora than when not using lora")
 
 
-# V1 Test: Failing due to numerics on V1.
-@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index db6a6ec78fa2f..576d95a471547 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -7,7 +7,6 @@ import torch
 from safetensors.torch import load_file
 from torch import nn
 
-from vllm import envs
 from vllm.config import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
@@ -33,6 +32,17 @@ DEVICES = ([
 ] if current_platform.is_cuda_alike() else ["cpu"])
 
 
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
+    """
+    Some tests depend on V0 internals. Since both V0 and V1 use the same
+    LoRAModelManager it is okay to just test V0.
+    """
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
+
+
 @pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
     tensors = load_file(
@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
     assert manager.device == device
 
 
-@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
 @pytest.mark.parametrize("device", DEVICES)
 def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                           sql_lora_files, device):
@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                               device)
 
 
-@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
 @pytest.mark.parametrize("device", DEVICES)
 def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                 sql_lora_files, device):
diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py
index 8596d3999799c..7375cabbc36d9 100644
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -10,6 +10,14 @@ MODEL_PATH = "microsoft/phi-2"
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
 @pytest.mark.skip_v1
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py
index d607bf66ebd45..a4a47a9c2acdf 100644
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -37,6 +37,14 @@ else:
     ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM,
               lora_path: str,
               lora_id: int,
@@ -69,14 +77,6 @@ def do_sample(llm: vllm.LLM,
     return generated_texts
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", [1])
 def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py
index f65fb1cdbbd56..0f18de42cd9cb 100644
--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -18,6 +18,14 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -46,15 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
-@pytest.mark.skip_v1
 @create_new_process_for_each_test()
 def test_ilama_lora(ilama_lora_files):
     llm = vllm.LLM(MODEL_PATH,
@@ -74,7 +73,6 @@ def test_ilama_lora(ilama_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
-@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4(ilama_lora_files):
@@ -96,7 +94,6 @@ def test_ilama_lora_tp4(ilama_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
-@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index 762c7bada324c..1448641f6a570 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -104,14 +104,6 @@ def test_enable_by_default_fallback(monkeypatch):
         assert envs.VLLM_USE_V1
         m.delenv("VLLM_USE_V1")
 
-        # Should fall back to V0 for experimental config.
-        _ = AsyncEngineArgs(
-            model=MODEL,
-            enable_lora=True,
-        ).create_engine_config()
-        assert not envs.VLLM_USE_V1
-        m.delenv("VLLM_USE_V1")
-
         # Should fall back to V0 for supported model.
         _ = AsyncEngineArgs(
             model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
@@ -125,7 +117,7 @@ def test_v1_llm_by_default(monkeypatch):
         m.delenv("VLLM_USE_V1")
 
         # Should default to V1 for supported config.
-        model = LLM(MODEL, enforce_eager=True)
+        model = LLM(MODEL, enforce_eager=True, enable_lora=True)
         print(model.generate("Hello my name is"))
         assert hasattr(model.llm_engine, "engine_core")
         m.delenv("VLLM_USE_V1")
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 019cbe18397e6..ecdcab50e4524 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1512,10 +1512,6 @@ class EngineArgs:
                 and _warn_or_fallback("Engine in background thread")):
             return False
 
-        # LoRA is supported on V1, but off by default for now.
-        if self.enable_lora and _warn_or_fallback("LORA"):
-            return False
-
         # PP is supported on V1 with Ray distributed executor,
         # but off for MP distributed executor for now.
         if (self.pipeline_parallel_size > 1
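
With this change, enable_lora no longer triggers the V0 fallback in EngineArgs: unless VLLM_USE_V1=0 is set explicitly, a LoRA-enabled engine comes up on the V1 engine core, which is what the updated test_v1_llm_by_default above asserts. A minimal sketch of that default path follows; the base model name and adapter path are illustrative placeholders, not values taken from this patch:

    from vllm import LLM
    from vllm.lora.request import LoRARequest

    # LoRA-enabled engine; after this patch it selects the V1 engine core by default.
    llm = LLM("meta-llama/Llama-2-7b-hf", enable_lora=True, enforce_eager=True)

    # Route one request through a (placeholder) adapter: (name, integer id, local path).
    outputs = llm.generate(
        "Write a SQL query that counts all singers.",
        lora_request=LoRARequest("sql_adapter", 1, "/path/to/sql_lora_adapter"),
    )
    print(outputs[0].outputs[0].text)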