Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-24 18:25:40 +08:00)
[Misc] Enable V1 LoRA by default (#15320)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>

This commit is contained in:
  parent 30d6a015e0
  commit 79455cf421
@@ -24,7 +24,23 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@pytest.fixture(scope="module")
def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
def monkeypatch_module():
    from _pytest.monkeypatch import MonkeyPatch
    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()


@pytest.fixture(scope="module", params=[False, True])
def server(
        request,
        monkeypatch_module,
        zephyr_lora_files,  #noqa: F811
        zephyr_lora_added_tokens_files):  # noqa: F811

    use_v1 = request.param
    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
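The pattern in the hunk above is needed because pytest's built-in monkeypatch fixture is function-scoped and cannot back a module-scoped server fixture. A minimal standalone sketch of the same idea, with illustrative fixture and test names that are not part of this commit:

import os

import pytest
from _pytest.monkeypatch import MonkeyPatch


@pytest.fixture(scope="module")
def monkeypatch_module():
    # Module-scoped stand-in for the function-scoped `monkeypatch` fixture.
    mpatch = MonkeyPatch()
    yield mpatch
    mpatch.undo()  # restore every patched env var when the module finishes


@pytest.fixture(scope="module", params=[False, True])
def engine_env(request, monkeypatch_module):
    # Each test in the module runs twice: once against V0, once against V1.
    monkeypatch_module.setenv("VLLM_USE_V1", "1" if request.param else "0")
    return request.param


def test_env_matches_param(engine_env):
    assert os.environ["VLLM_USE_V1"] == ("1" if engine_env else "0")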
@@ -49,6 +65,13 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
        yield remote_server


@pytest.fixture
def is_v1_server(server):
    import os
    assert os.environ['VLLM_USE_V1'] in ['0', '1']
    return os.environ['VLLM_USE_V1'] == '1'


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:

@@ -471,8 +494,13 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
                                  is_v1_server: bool,
                                  guided_decoding_backend: str,
                                  sample_guided_choice):

    if is_v1_server and guided_decoding_backend != 'xgrammar':
        pytest.skip("Only xgrammar backend is supported with V1")

    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"

@@ -511,9 +539,13 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,

@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_json_chat(client: openai.AsyncOpenAI,
async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
                                guided_decoding_backend: str,
                                sample_json_schema):

    if is_v1_server:
        pytest.skip("sample_json_schema has features unsupported in V1")

    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"

@@ -559,7 +591,12 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
                                 is_v1_server: bool,
                                 guided_decoding_backend: str, sample_regex):

    if is_v1_server and guided_decoding_backend != 'xgrammar':
        pytest.skip("Only xgrammar backend is supported with V1")

    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"

@@ -617,8 +654,13 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
                                           is_v1_server: bool,
                                           guided_decoding_backend: str,
                                           sample_guided_choice):

    if is_v1_server and guided_decoding_backend != 'xgrammar':
        pytest.skip("Only xgrammar backend is supported with V1")

    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"

@@ -648,9 +690,13 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,

@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
async def test_named_tool_use(client: openai.AsyncOpenAI,
async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                              guided_decoding_backend: str,
                              sample_json_schema):

    if is_v1_server:
        pytest.skip("sample_json_schema has features unsupported on V1")

    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"

@@ -742,6 +788,10 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
                                                   sample_json_schema):

    if is_v1_server:
        pytest.skip("sample_json_schema has features unsupported on V1")

    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"

@@ -787,6 +837,10 @@ async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
                                                  sample_json_schema):

    if is_v1_server:
        pytest.skip("sample_json_schema has features unsupported on V1")

    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
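All of the guided-decoding tests above repeat the same gate: on V1, only the xgrammar backend is exercised, and tests whose schemas rely on features unsupported on V1 are skipped outright. A hypothetical helper (not part of the commit) that would express the backend skip once:

import pytest


def skip_if_unsupported_on_v1(is_v1_server: bool,
                              guided_decoding_backend: str) -> None:
    # Mirrors the inline checks used in the tests above.
    if is_v1_server and guided_decoding_backend != "xgrammar":
        pytest.skip("Only xgrammar backend is supported with V1")

Each test body could then start with skip_if_unsupported_on_v1(is_v1_server, guided_decoding_backend) instead of repeating the condition.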
@@ -11,6 +11,14 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),

@@ -40,14 +48,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def test_baichuan_lora(baichuan_lora_files):
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,
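The run_with_both_engines_lora fixture referenced by these autouse v1 wrappers is defined in the LoRA test conftest, which is not part of this diff. A hedged sketch of what such a fixture could look like; the parametrization details here are assumptions, not the repository's actual implementation:

import pytest


@pytest.fixture(params=[False, True], ids=["v0-engine", "v1-engine"])
def run_with_both_engines_lora(request, monkeypatch):
    # Any test that depends on this fixture, directly or through an autouse
    # wrapper such as `v1` above, is collected twice and runs once per engine.
    monkeypatch.setenv("VLLM_USE_V1", "1" if request.param else "0")
    yield request.param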
@@ -18,6 +18,14 @@ EXPECTED_LORA_OUTPUT = [
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),

@@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@create_new_process_for_each_test()
def test_chatglm3_lora(chatglm3_lora_files):
    llm = vllm.LLM(MODEL_PATH,
@@ -9,6 +9,14 @@ from vllm.platforms import current_platform
MODEL_PATH = "google/gemma-7b"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        "Quote: Imagination is",

@@ -31,14 +39,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


# The V1 lora test for this model requires more than 24GB.
@pytest.mark.skip_v1
@pytest.mark.xfail(current_platform.is_rocm(),
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import importlib
import random
from copy import deepcopy
from dataclasses import dataclass

@@ -82,10 +81,6 @@ def v1(run_with_both_engines_lora):
    # This can be promoted up to conftest.py to run for every
    # test in a package

    # Reload punica_gpu as the kernels used are tied to engine type.
    from vllm.lora.punica_wrapper import punica_gpu
    importlib.reload(punica_gpu)

    # Release any memory we might be holding on to. CI runs OOMs otherwise.
    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                _LORA_B_PTR_DICT)
@@ -28,6 +28,14 @@ EXPECTED_LORA_OUTPUT = [
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501

@@ -71,16 +79,6 @@ def generate_and_test(llm, sql_lora_files):
    print("removing lora")


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


# V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1
@create_new_process_for_each_test()
def test_llama_lora(sql_lora_files):

@@ -126,8 +124,6 @@ def test_llama_lora_warmup(sql_lora_files):
        "less when using lora than when not using lora")


# V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files):
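Several of these files add or drop a custom @pytest.mark.skip_v1 marker. How that marker is honored is defined in conftest code outside this diff; a plausible sketch, assuming the marker is resolved during collection (the hooks below are illustrative, not the repository's actual conftest):

# conftest.py (illustrative sketch)
import os

import pytest


def pytest_configure(config):
    config.addinivalue_line(
        "markers", "skip_v1: skip the test when running with VLLM_USE_V1=1")


def pytest_collection_modifyitems(config, items):
    # Turn the marker into a real skip only when the V1 engine is selected.
    if os.environ.get("VLLM_USE_V1") != "1":
        return
    skip_v1 = pytest.mark.skip(reason="not supported on the V1 engine yet")
    for item in items:
        if "skip_v1" in item.keywords:
            item.add_marker(skip_v1)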
@@ -7,7 +7,6 @@ import torch
from safetensors.torch import load_file
from torch import nn

from vllm import envs
from vllm.config import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                              MergedColumnParallelLinearWithLoRA,

@@ -33,6 +32,17 @@ DEVICES = ([
] if current_platform.is_cuda_alike() else ["cpu"])


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """
    Some tests depend on V0 internals. Since both V0 and V1 use the same
    LoRAModelManager it is okay to just test V0.
    """
    with monkeypatch.context() as m:
        m.setenv('VLLM_USE_V1', '0')
        yield


@pytest.mark.parametrize("device", DEVICES)
def test_from_lora_tensors(sql_lora_files, device):
    tensors = load_file(

@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
    assert manager.device == device


@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                          sql_lora_files, device):

@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                 device)


@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                sql_lora_files, device):
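This file switches from per-test @pytest.mark.skipif(envs.VLLM_USE_V1, ...) guards, which this change removes along with the envs import, to a single autouse fixture that pins the whole module to V0. The two mechanisms behave differently: the skipif condition is evaluated once when the module is imported, while the fixture repatches the environment around every test. A condensed side-by-side sketch of both patterns as they appear above:

import pytest

from vllm import envs


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    # Force V0 for every test in the module; undone automatically after each test.
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
        yield


# Alternative: evaluated at collection/import time rather than per test.
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
def test_only_meaningful_on_v0():
    pass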
@@ -10,6 +10,14 @@ MODEL_PATH = "microsoft/phi-2"
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(

@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip_v1
@@ -37,6 +37,14 @@ else:
    ]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM,
              lora_path: str,
              lora_id: int,

@@ -69,14 +77,6 @@ def do_sample(llm: vllm.LLM,
    return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
@@ -18,6 +18,14 @@ EXPECTED_LORA_OUTPUT = [
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),

@@ -46,15 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    return generated_texts


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.mark.skip_v1
@create_new_process_for_each_test()
def test_ilama_lora(ilama_lora_files):
    llm = vllm.LLM(MODEL_PATH,

@@ -74,7 +73,6 @@ def test_ilama_lora(ilama_lora_files):
        assert output2[i] == EXPECTED_LORA_OUTPUT[i]


@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4(ilama_lora_files):

@@ -96,7 +94,6 @@ def test_ilama_lora_tp4(ilama_lora_files):
        assert output2[i] == EXPECTED_LORA_OUTPUT[i]


@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
@@ -104,14 +104,6 @@ def test_enable_by_default_fallback(monkeypatch):
        assert envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")

        # Should fall back to V0 for experimental config.
        _ = AsyncEngineArgs(
            model=MODEL,
            enable_lora=True,
        ).create_engine_config()
        assert not envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")

        # Should fall back to V0 for supported model.
        _ = AsyncEngineArgs(
            model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()

@@ -125,7 +117,7 @@ def test_v1_llm_by_default(monkeypatch):
        m.delenv("VLLM_USE_V1")

        # Should default to V1 for supported config.
        model = LLM(MODEL, enforce_eager=True)
        model = LLM(MODEL, enforce_eager=True, enable_lora=True)
        print(model.generate("Hello my name is"))
        assert hasattr(model.llm_engine, "engine_core")
        m.delenv("VLLM_USE_V1")
@@ -1512,10 +1512,6 @@ class EngineArgs:
                and _warn_or_fallback("Engine in background thread")):
            return False

        # LoRA is supported on V1, but off by default for now.
        if self.enable_lora and _warn_or_fallback("LORA"):
            return False

        # PP is supported on V1 with Ray distributed executor,
        # but off for MP distributed executor for now.
        if (self.pipeline_parallel_size > 1
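The net effect of the EngineArgs change is that enable_lora no longer triggers the V0 fallback, so a LoRA-enabled engine now defaults to V1, as the updated test_v1_llm_by_default above asserts. A minimal usage sketch; the model name and adapter path are placeholders:

from vllm import LLM
from vllm.lora.request import LoRARequest

# With this commit, enable_lora=True no longer forces VLLM_USE_V1=0.
llm = LLM(model="meta-llama/Llama-2-7b-hf",
          enable_lora=True,
          enforce_eager=True)

outputs = llm.generate(
    "Hello my name is",
    lora_request=LoRARequest("sql-adapter", 1, "/path/to/lora/adapter"),
)
print(outputs[0].outputs[0].text)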