From 95a6568b5c366359fabf519f7863021377cc0dd6 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Mon, 9 Jun 2025 17:52:10 +0800
Subject: [PATCH] [CI/Build] Fix LoRA test (#19350)

Signed-off-by: Jee Jee Li
---
 tests/lora/conftest.py            |  5 -----
 tests/lora/test_llama_tp.py       | 34 --------------------------------
 tests/lora/test_lora_functions.py |  7 -------
 tests/lora/test_phi.py            | 10 +--------
 tests/lora/test_worker.py         | 18 ++++++++--------
 5 files changed, 11 insertions(+), 63 deletions(-)

diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 0737bb886e43e..4908f9a060f7f 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -164,11 +164,6 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")
 
 
-@pytest.fixture(scope="session")
-def gemma_lora_files():
-    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
-
-
 @pytest.fixture(scope="session")
 def chatglm3_lora_files():
     return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 23819f03dc51f..3ac3b80ec827c 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -4,9 +4,6 @@
 import subprocess
 import sys
 from typing import Union
-import pytest
-import ray
-
 import vllm
 from vllm import LLM
 from vllm.lora.request import LoRARequest
@@ -121,37 +118,6 @@ def test_llama_lora(sql_lora_files):
     generate_and_test(llm, sql_lora_files)
 
 
-# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
-# used by the engine yet.
-@pytest.mark.skip_v1
-@create_new_process_for_each_test()
-def test_llama_lora_warmup(sql_lora_files):
-    """Test that the LLM initialization works with a warmup LORA path and
-    is more conservative"""
-
-    @ray.remote(num_gpus=1)
-    def get_num_gpu_blocks_lora():
-        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
-        num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
-        return num_gpu_blocks_lora_warmup
-
-    @ray.remote(num_gpus=1)
-    def get_num_gpu_blocks_no_lora():
-        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
-        num_gpu_blocks_no_lora_warmup = (
-            llm.llm_engine.cache_config.num_gpu_blocks)
-        return num_gpu_blocks_no_lora_warmup
-
-    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
-    num_gpu_blocks_no_lora_warmup = ray.get(
-        get_num_gpu_blocks_no_lora.remote())
-    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
-        "The warmup with lora should be more "
-        "conservative than without lora, therefore the number of "
-        "memory blocks for the KV cache should be "
-        "less when using lora than when not using lora")
-
-
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py
index e9a52e1b63573..50c60341f0d88 100644
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -15,13 +15,6 @@
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
 LORA_RANK = 8
-# @pytest.fixture(autouse=True)
-# def v1(run_with_both_engines_lora):
-#     # Simple autouse wrapper to run both engines for each test
-#     # This can be promoted up to conftest.py to run for every
-#     # test in a package
-#     pass
-
 
 def make_lora_request(lora_id: int):
     return LoRARequest(lora_name=f"{lora_id}",
diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py
index a21de070517b1..9d75512a248be 100644
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -11,14 +11,6 @@
 MODEL_PATH = "microsoft/phi-2"
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
@@ -59,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 
 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
-@pytest.mark.skip_v1
+@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
 def test_phi2_lora(phi2_lora_files):
     # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
     # Otherwise, the lora-test will fail due to CUDA OOM.
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 6f13e663a78bb..9999c1be54ea5 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -16,6 +16,8 @@
 from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker
 
+NUM_LORAS = 16
+
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
@@ -58,12 +60,12 @@
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
-            gpu_memory_utilization=1.0,
             swap_space=0,
             cache_dtype="auto",
         ),
-        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
-                               max_loras=32),
+        lora_config=LoRAConfig(max_lora_rank=8,
+                               max_cpu_loras=NUM_LORAS,
+                               max_loras=NUM_LORAS),
     )
     worker = worker_cls(
         vllm_config=vllm_config,
@@ -78,9 +80,9 @@
     set_active_loras(worker, [])
     assert worker.list_loras() == set()
 
-    n_loras = 32
     lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files)
+        for i in range(NUM_LORAS)
     ]
 
     set_active_loras(worker, lora_requests)
@@ -89,12 +91,12 @@
         for lora_request in lora_requests
     }
 
-    for i in range(32):
+    for i in range(NUM_LORAS):
         random.seed(i)
         iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, n_loras))
+                                            k=random.randint(1, NUM_LORAS))
         random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
         set_active_loras(worker, lora_requests)
         assert worker.list_loras().issuperset(
             {lora_request.lora_int_id
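
Note (not part of the patch): the reworked loop in tests/lora/test_worker.py repeatedly activates a random selection of the NUM_LORAS registered adapters and checks what the worker reports as loaded. Below is a minimal, GPU-free sketch of that activation pattern; ToyLoRARequest and ToyWorker are hypothetical stand-ins for illustration only, not vLLM classes.

# Illustrative sketch only: ToyLoRARequest/ToyWorker are stand-ins, not vLLM APIs.
import random
from dataclasses import dataclass

NUM_LORAS = 16  # mirrors the constant introduced in tests/lora/test_worker.py


@dataclass(frozen=True)
class ToyLoRARequest:
    lora_name: str
    lora_int_id: int
    lora_path: str


class ToyWorker:
    """Tracks which LoRA adapter ids are currently active."""

    def __init__(self) -> None:
        self._active: set[int] = set()

    def set_active_loras(self, requests: list) -> None:
        self._active = {r.lora_int_id for r in requests}

    def list_loras(self) -> set[int]:
        return set(self._active)


requests = [
    ToyLoRARequest(str(i + 1), i + 1, "/path/to/lora") for i in range(NUM_LORAS)
]
worker = ToyWorker()

for i in range(NUM_LORAS):
    random.seed(i)
    # Activate a random subset each iteration, as the reworked test does,
    # then confirm the worker reports exactly those adapter ids as loaded.
    subset = random.choices(requests, k=random.randint(1, NUM_LORAS))
    worker.set_active_loras(subset)
    assert worker.list_loras() == {r.lora_int_id for r in subset}

The real test drives an actual GPU worker and uses issuperset() because it activates the full request list each iteration; the sketch only mirrors the random-subset bookkeeping.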