[CI/Build] Further clean up LoRA tests (#15920)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

parent cdb57015a7
commit 4203926f10
.buildkite/test-pipeline.yaml
@@ -289,7 +289,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
   parallelism: 4
 
 - label: PyTorch Fullgraph Smoke Test # 9min
@@ -602,8 +602,6 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_minicpmv_tp.py
-    - pytest -v -s -x lora/test_transfomers_model.py
 
 
 - label: Weight Loading Multiple GPU Test # 33min
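A note on the sharded command above: `--shard-id`/`--num-shards` come from the pytest-shard plugin, fed by Buildkite's parallel-job environment variables (the `$$` escapes them in the pipeline YAML, and `parallelism: 4` spawns the four jobs). A minimal sketch of the contract, assuming simple index-modulo assignment (the plugin's actual distribution strategy may differ):

# Sketch: each of N parallel jobs collects the same test list and
# keeps a disjoint slice, so every test runs exactly once overall.
def select_shard(items: list[str], shard_id: int, num_shards: int) -> list[str]:
    assert 0 <= shard_id < num_shards
    return [item for i, item in enumerate(items) if i % num_shards == shard_id]

tests = [f"lora/test_{i}.py" for i in range(10)]
shards = [select_shard(tests, s, 4) for s in range(4)]  # parallelism: 4
assert sorted(t for shard in shards for t in shard) == sorted(tests)

The four TP tests are `--ignore`d here (and, after this commit, two of them dropped entirely) because single-GPU sharded jobs cannot exercise them; the tensor-parallel variants run in the dedicated multi-GPU step below.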
tests/lora/conftest.py
@@ -2,7 +2,6 @@
 
 import tempfile
 from collections import OrderedDict
-from typing import TypedDict
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -26,28 +25,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA
 from vllm.platforms import current_platform
 
 
-class ContextIDInfo(TypedDict):
-    lora_id: int
-    context_length: str
-
-
-class ContextInfo(TypedDict):
-    lora: str
-    context_length: str
-
-
-LONG_LORA_INFOS: list[ContextIDInfo] = [{
-    "lora_id": 1,
-    "context_length": "16k",
-}, {
-    "lora_id": 2,
-    "context_length": "16k",
-}, {
-    "lora_id": 3,
-    "context_length": "32k",
-}]
-
-
 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
     """Allow subdirectories to skip global cleanup by overriding this fixture.
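The `should_do_global_cleanup_after_test` fixture that survives this hunk is an override hook: the parent conftest defines the default, an autouse fixture consults it, and a subdirectory's conftest can redefine it to opt out. A minimal sketch of the pattern, with a hypothetical `cleanup()` standing in for vllm's real teardown logic:

import pytest

def cleanup():
    # hypothetical stand-in for vllm's real GPU/distributed teardown
    pass

@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    return True  # default: clean up after every test

@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test):
    yield
    if should_do_global_cleanup_after_test:
        cleanup()

# A subdirectory's conftest.py opts out by redefining the hook:
# @pytest.fixture()
# def should_do_global_cleanup_after_test(request) -> bool:
#     return False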
tests/lora/test_layers.py
@@ -59,7 +59,7 @@ DEVICES = ([
 # prefill stage(True) or decode stage(False)
 STAGES = [True, False]
 
-NUM_RANDOM_SEEDS = 10
+NUM_RANDOM_SEEDS = 6
 
 VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 
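Dropping NUM_RANDOM_SEEDS from 10 to 6 directly shrinks the parametrize grid of the randomized layer tests, which is where this commit buys most of its CI time. A sketch of the typical consumption pattern (assumed, not vllm's exact code; the real tests seed torch and exercise LoRA layers):

import random
import pytest

NUM_RANDOM_SEEDS = 6  # was 10; each seed multiplies the test matrix

@pytest.mark.parametrize("seed", range(NUM_RANDOM_SEEDS))
def test_randomized_layer(seed: int) -> None:
    random.seed(seed)  # the real tests use torch.manual_seed(seed)
    values = [random.random() for _ in range(8)]
    assert all(0.0 <= v < 1.0 for v in values)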
tests/lora/test_llama_tp.py
@@ -153,20 +153,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
         enable_chunked_prefill=True,
     )
     generate_and_test(llm, sql_lora_files)
-
-
-@multi_gpu_test(num_gpus=4)
-@create_new_process_for_each_test()
-def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
-
-    llm = vllm.LLM(
-        MODEL_PATH,
-        enable_lora=True,
-        max_num_seqs=16,
-        max_loras=4,
-        tensor_parallel_size=4,
-        fully_sharded_loras=True,
-        enable_lora_bias=True,
-        enable_chunked_prefill=True,
-    )
-    generate_and_test(llm, sql_lora_files)
tests/lora/test_minicpmv_tp.py
@@ -58,7 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@create_new_process_for_each_test()
 def test_minicpmv_lora(minicpmv_lora_files):
     llm = vllm.LLM(
         MODEL_PATH,
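`@create_new_process_for_each_test()` (removed here) isolates a test in a freshly spawned process so CUDA and distributed state cannot leak between tests; dropping it trades that isolation for less per-test startup overhead. An illustrative sketch of the decorator pattern, not vllm's actual implementation:

import functools
import multiprocessing

def create_new_process_for_each_test():
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # "spawn" gives a clean interpreter with no inherited CUDA
            # context; fn and its arguments must be picklable for this.
            ctx = multiprocessing.get_context("spawn")
            proc = ctx.Process(target=fn, args=args, kwargs=kwargs)
            proc.start()
            proc.join()
            assert proc.exitcode == 0, f"{fn.__name__} failed in subprocess"
        return wrapper
    return decorator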
tests/lora/test_transfomers_model.py
@@ -1,7 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 
 from ..utils import create_new_process_for_each_test, multi_gpu_test
 
@@ -44,7 +47,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
-@create_new_process_for_each_test()
 def test_ilama_lora(ilama_lora_files):
     llm = vllm.LLM(MODEL_PATH,
                    max_model_len=1024,
@@ -63,6 +65,8 @@ def test_ilama_lora(ilama_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4(ilama_lora_files):
@@ -84,6 +88,8 @@ def test_ilama_lora_tp4(ilama_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
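The new guard skips these ilama TP tests on CUDA-like platforms, presumably because (per the reason string) equivalent model coverage already runs elsewhere in the suite. Since the same two-line marker is now repeated on both tests, it could be hoisted into a shared constant; a sketch of that refactor, with the hypothetical name `skip_on_cuda_alike`:

import pytest
from vllm.platforms import current_platform

# Hypothetical shared marker; the diff inlines this on each test instead.
skip_on_cuda_alike = pytest.mark.skipif(
    current_platform.is_cuda_alike(),
    reason="Skipping to avoid redundant model tests")

@skip_on_cuda_alike
def test_example():
    ...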