Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-13 23:45:01 +08:00)
[CI/Build] Clean up LoRA tests (#15867)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
commit dfa82e2a3d
parent e59ca942f5
@@ -11,14 +11,6 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -1,65 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-import vllm
-from vllm.lora.request import LoRARequest
-from vllm.platforms import current_platform
-
-MODEL_PATH = "google/gemma-7b"
-
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
-    prompts = [
-        "Quote: Imagination is",
-        "Quote: Be yourself;",
-        "Quote: Painting is poetry that is seen rather than felt,",
-    ]
-    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
-    # Print the outputs.
-    generated_texts: list[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-# The V1 lora test for this model requires more than 24GB.
-@pytest.mark.skip_v1
-@pytest.mark.xfail(current_platform.is_rocm(),
-                   reason="There can be output mismatch on ROCm")
-def test_gemma_lora(gemma_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   enable_chunked_prefill=True)
-
-    expected_lora_output = [
-        "more important than knowledge.\nAuthor: Albert Einstein\n",
-        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
-        "and poetry is painting that is felt rather than seen.\n"
-        "Author: Leonardo da Vinci\n",
-    ]
-
-    output1 = do_sample(llm, gemma_lora_files, lora_id=1)
-    for i in range(len(expected_lora_output)):
-        assert output1[i].startswith(expected_lora_output[i])
-    output2 = do_sample(llm, gemma_lora_files, lora_id=2)
-    for i in range(len(expected_lora_output)):
-        assert output2[i].startswith(expected_lora_output[i])
@@ -19,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
-                              LinearScalingRotaryEmbeddingWithLoRA,
                               LogitsProcessorWithLoRA, LoRAMapping,
                               MergedColumnParallelLinearWithLoRA,
                               MergedQKVParallelLinearWithLoRA,
@@ -28,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA,
                               VocabParallelEmbeddingWithLoRA)
 # yapf: enable
-from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights,
-                              PackedLoRALayerWeights)
+from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
@@ -37,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
 from vllm.model_executor.utils import set_random_seed
@@ -59,28 +56,16 @@ DEVICES = ([
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ] if current_platform.is_cuda_alike() else ["cpu"])
 
-#For GPU, we will launch different triton kernels between the prefill and decode
-# stages, so we need to verify this. prefill stage(True) or decode stage(False)
+# prefill stage(True) or decode stage(False)
 STAGES = [True, False]
 
-# With the inclusion of V1 tests (look at the run_with_both_engines_lora),
-# the tests in this file run twice, once with the V0 engine and then with
-# the V1 engine.
-# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
-# with the inclusion of V1 tests to maintain the CI test times.
-NUM_RANDOM_SEEDS = 5
-# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
-# 256 before. It is cut to half with the inclusion of V1 tests to maintain
-# the CI test times.
+NUM_RANDOM_SEEDS = 10
 VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 
 
 @pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
+def clean_cache():
 
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)
@@ -90,6 +75,24 @@ def v1(run_with_both_engines_lora):
     yield
 
 
+@pytest.fixture(autouse=True)
+def skip_cuda_with_stage_false(request):
+    """
+    On cuda-like platforms, we use the same kernels for prefill and decode
+    stage, and 'stage' is generally ignored, so we only need to test once.
+    """
+    if current_platform.is_cuda_alike():
+        try:
+            if hasattr(request.node, "callspec") and hasattr(
+                    request.node.callspec, "params"):
+                params = request.node.callspec.params
+                if "stage" in params and params["stage"] is False:
+                    pytest.skip("Skip test when stage=False")
+        except Exception:
+            pass
+    yield
+
+
 def get_random_id_to_index(num_loras: int,
                            num_slots: int,
                            log: bool = True) -> list[Optional[int]]:
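For context, the new autouse fixture above relies on pytest exposing parametrized arguments through request.node.callspec.params. Below is a minimal, self-contained sketch of the same skip pattern; the fixture and test names are illustrative only and are not part of this change.

import pytest


# Hypothetical illustration: an autouse fixture reads the parametrized
# "stage" value and skips decode-stage runs where prefill and decode
# share the same kernels.
@pytest.fixture(autouse=True)
def skip_decode_stage(request):
    # callspec only exists on parametrized test items; fall back to {}.
    params = getattr(getattr(request.node, "callspec", None), "params", {})
    if params.get("stage") is False:
        pytest.skip("prefill and decode share kernels; stage=False is redundant")
    yield


@pytest.mark.parametrize("stage", [True, False])
def test_stage_example(stage):
    # Under the fixture above, only the stage=True case actually runs.
    assert stage is True

Because the fixture is autouse, every parametrized case still passes through it, so the stage=False cases are reported as skipped rather than silently dropped.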
@@ -1011,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                        atol=atol)
 
 
-@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 8])
-@pytest.mark.parametrize("device", ["cuda"])
-@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0),
-                                             (6.0, 1.0)])
-@pytest.mark.parametrize("max_position", [11, 4096, 32768])
-@pytest.mark.parametrize("is_neox_style", [True, False])
-@pytest.mark.parametrize("rotary_dim", [None, 32])
-@pytest.mark.parametrize("head_size", [32, 108])
-@pytest.mark.parametrize("seq_len", [11, 1024])
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Only CUDA backends are supported")
-def test_rotary_embedding_long_context(dist_init, num_loras, device,
-                                       scaling_factors, max_position,
-                                       is_neox_style, rotary_dim, head_size,
-                                       seq_len) -> None:
-    dtype = torch.float16
-    max_loras = 8
-    seed = 0
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             long_lora_scaling_factors=scaling_factors,
-                             lora_dtype=dtype)
-
-    if rotary_dim is None:
-        rotary_dim = head_size
-    base = 10000
-    batch_size = 5 * num_loras
-    num_heads = 7
-
-    # Verify lora is equivalent to linear scaling rotary embedding.
-    rope = get_rope(
-        head_size,
-        rotary_dim,
-        max_position,
-        base,
-        is_neox_style,
-    )
-    lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
-    lora_rope.set_mapping(punica_wrapper)
-    lora_rope.create_lora_weights(max_loras, lora_config)
-    linear_rope = get_rope(head_size, rotary_dim, max_position, base,
-                           is_neox_style, {
-                               "rope_type": "linear",
-                               "factor": scaling_factors
-                           })
-    linear_rope = linear_rope.to(dtype=dtype)
-    id_to_index = get_random_id_to_index(num_loras, max_loras)
-    _, index_mapping, prompt_mapping = create_random_inputs(
-        active_lora_ids=[0],
-        num_inputs=batch_size,
-        input_size=(1, max_position),
-        input_range=(0, lora_config.lora_extra_vocab_size),
-        input_type=torch.float16,
-        device=device)
-
-    lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
-    long_lora_context = LongContextLoRAContext(list(scaling_factors),
-                                               rotary_dim)
-
-    next_expected_offset = 0
-    # Make sure the offset is correct.
-    scaling_factor_to_offset = lora_rope.scaling_factor_to_offset
-    for scaling_factor, offset in scaling_factor_to_offset.items():
-        assert offset == next_expected_offset
-        next_expected_offset += scaling_factor * max_position
-
-    for i in range(len(scaling_factors)):
-        long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get(
-            scaling_factors[i], 0)
-    punica_wrapper.update_metadata(
-        lora_mapping,
-        id_to_index,
-        max_loras,
-        512,
-        lora_config.lora_extra_vocab_size,
-        long_lora_context=long_lora_context,
-    )
-    # lora_rope.set_mapping(*mapping_info)
-
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
-    key = torch.randn_like(query)
-    ref_q, ref_k = linear_rope(positions, query, key)
-    actual_q, actual_k = lora_rope(positions, query, key)
-
-    torch.allclose(ref_q, actual_q)
-    torch.allclose(ref_k, actual_k)
-
-
 @pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
 @pytest.mark.parametrize(
     "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)))
@@ -78,6 +78,8 @@ def test_minicpmv_lora(minicpmv_lora_files):
         assert EXPECTED_OUTPUT[i].startswith(output2[i])
 
 
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm")
@@ -99,6 +101,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
         assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
 
 
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm")
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import pytest
-
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -18,14 +16,6 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),