Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[CI/Build] Fix LoRA test (#19350)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
parent 0eca5eacd0
commit 95a6568b5c
@@ -164,11 +164,6 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")
 
 
-@pytest.fixture(scope="session")
-def gemma_lora_files():
-    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
-
-
 @pytest.fixture(scope="session")
 def chatglm3_lora_files():
     return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
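
The fixtures kept by this hunk all follow the same session-scoped download pattern. A minimal, self-contained sketch of that pattern, assuming huggingface_hub is available and reusing the chatglm3 repo id shown above:

import pytest
from huggingface_hub import snapshot_download


@pytest.fixture(scope="session")
def chatglm3_lora_files():
    # Downloaded once per test session; later tests reuse the cached local path.
    return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")

Session scope matters here: the adapter weights are fetched a single time and every LoRA test in the session receives the same local directory.
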
@@ -4,9 +4,6 @@ import subprocess
 import sys
 from typing import Union
 
-import pytest
-import ray
-
 import vllm
 from vllm import LLM
 from vllm.lora.request import LoRARequest
@@ -121,37 +118,6 @@ def test_llama_lora(sql_lora_files):
     generate_and_test(llm, sql_lora_files)
 
 
-# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
-# used by the engine yet.
-@pytest.mark.skip_v1
-@create_new_process_for_each_test()
-def test_llama_lora_warmup(sql_lora_files):
-    """Test that the LLM initialization works with a warmup LORA path and
-    is more conservative"""
-
-    @ray.remote(num_gpus=1)
-    def get_num_gpu_blocks_lora():
-        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
-        num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
-        return num_gpu_blocks_lora_warmup
-
-    @ray.remote(num_gpus=1)
-    def get_num_gpu_blocks_no_lora():
-        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
-        num_gpu_blocks_no_lora_warmup = (
-            llm.llm_engine.cache_config.num_gpu_blocks)
-        return num_gpu_blocks_no_lora_warmup
-
-    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
-    num_gpu_blocks_no_lora_warmup = ray.get(
-        get_num_gpu_blocks_no_lora.remote())
-    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
-        "The warmup with lora should be more "
-        "conservative than without lora, therefore the number of "
-        "memory blocks for the KV cache should be "
-        "less when using lora than when not using lora")
-
-
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
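
The deleted test_llama_lora_warmup measured KV-cache block counts with and without LoRA by running each engine inside its own Ray actor that owns one GPU, so the two measurements never share CUDA memory. A condensed, hedged sketch of that isolation pattern (not part of this change; the helper name is illustrative):

import ray
import vllm

MODEL_PATH = "meta-llama/Llama-2-7b-hf"


@ray.remote(num_gpus=1)
def measure_num_gpu_blocks(enable_lora: bool) -> int:
    # Each call runs in a fresh Ray worker process holding a whole GPU.
    llm = vllm.LLM(MODEL_PATH, enable_lora=enable_lora, max_num_seqs=16)
    return llm.llm_engine.cache_config.num_gpu_blocks


with_lora = ray.get(measure_num_gpu_blocks.remote(True))
without_lora = ray.get(measure_num_gpu_blocks.remote(False))
assert with_lora < without_lora  # LoRA warmup is more conservative, so fewer KV blocks
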
@@ -15,13 +15,6 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
 LORA_RANK = 8
 
-# @pytest.fixture(autouse=True)
-# def v1(run_with_both_engines_lora):
-#     # Simple autouse wrapper to run both engines for each test
-#     # This can be promoted up to conftest.py to run for every
-#     # test in a package
-#     pass
-
 
 def make_lora_request(lora_id: int):
     return LoRARequest(lora_name=f"{lora_id}",
@@ -11,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2"
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
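
The autouse fixture removed here wrapped a run_with_both_engines_lora fixture from the shared conftest so every test in the module ran under both engines. Roughly, the mechanism looks like the following sketch, where the parametrized stand-in fixture is hypothetical (the real one lives in the project's conftest and switches engines via environment configuration):

import pytest


@pytest.fixture(params=["v0", "v1"])
def run_with_both_engines_lora(request):
    # Hypothetical stand-in for the real conftest fixture.
    return request.param


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Autouse wrapper: every test in the module now depends on the
    # parametrized fixture and therefore runs once per engine.
    pass


def test_something():
    assert True  # collected twice: test_something[v0] and test_something[v1]
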
@@ -59,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 
 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
-@pytest.mark.skip_v1
+@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
 def test_phi2_lora(phi2_lora_files):
     # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
     # Otherwise, the lora-test will fail due to CUDA OOM.
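
Swapping the project-specific skip_v1 marker for pytest's built-in skip makes the reason show up directly in test reports and no longer depends on custom marker handling. A minimal sketch of the built-in marker:

import pytest


@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
def test_phi2_lora():
    # Never executed; pytest reports the test as skipped with the reason above.
    ...

pytest.mark.skipif(condition, reason=...) would express a conditional skip instead, but the unconditional form matches what this diff does.
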
@@ -16,6 +16,8 @@ from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker
 
+NUM_LORAS = 16
+
 
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
@@ -58,12 +60,12 @@ def test_worker_apply_lora(sql_lora_files):
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
-            gpu_memory_utilization=1.0,
             swap_space=0,
             cache_dtype="auto",
         ),
-        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
-                               max_loras=32),
+        lora_config=LoRAConfig(max_lora_rank=8,
+                               max_cpu_loras=NUM_LORAS,
+                               max_loras=NUM_LORAS),
     )
     worker = worker_cls(
         vllm_config=vllm_config,
@@ -78,9 +80,9 @@ def test_worker_apply_lora(sql_lora_files):
     set_active_loras(worker, [])
     assert worker.list_loras() == set()
 
-    n_loras = 32
     lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files)
+        for i in range(NUM_LORAS)
     ]
 
     set_active_loras(worker, lora_requests)
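
The two hunks above tie the worker's adapter capacity and the number of LoRARequests to one constant. A hedged sketch of why that matters, assuming the usual vllm.config import for LoRAConfig (the adapter path below is a placeholder, not a real checkpoint):

from vllm.config import LoRAConfig
from vllm.lora.request import LoRARequest

NUM_LORAS = 16

# Capacity is sized from the same constant that drives request creation,
# so the test cannot silently ask for more adapters than the worker holds.
lora_config = LoRAConfig(max_lora_rank=8,
                         max_cpu_loras=NUM_LORAS,
                         max_loras=NUM_LORAS)

lora_requests = [
    LoRARequest(str(i + 1), i + 1, "/path/to/adapter")  # placeholder path
    for i in range(NUM_LORAS)
]
assert len(lora_requests) <= lora_config.max_loras
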
@@ -89,12 +91,12 @@ def test_worker_apply_lora(sql_lora_files):
         for lora_request in lora_requests
     }
 
-    for i in range(32):
+    for i in range(NUM_LORAS):
         random.seed(i)
         iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, n_loras))
+                                            k=random.randint(1, NUM_LORAS))
         random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
         set_active_loras(worker, lora_requests)
         assert worker.list_loras().issuperset(
             {lora_request.lora_int_id
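
The loop in the final hunk re-seeds random with the iteration index, which makes each "random" working set of adapters reproducible: a failing iteration can be replayed with the same seed. A small standalone sketch of that property (identifiers are illustrative):

import random

NUM_LORAS = 16
lora_ids = list(range(1, NUM_LORAS + 1))


def pick_working_set(i: int) -> list[int]:
    # Re-seeding with the loop index pins down every random call below,
    # so the "random" selection is identical on every run.
    random.seed(i)
    subset = random.choices(lora_ids, k=random.randint(1, NUM_LORAS))
    random.shuffle(subset)
    return subset


assert pick_working_set(3) == pick_working_set(3)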