diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 3aa30b7b3c72..4dacbe26f3d9 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -11,14 +11,6 @@ MODEL_PATH = "baichuan-inc/Baichuan-7B"
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py
deleted file mode 100644
index 610bc405ede5..000000000000
--- a/tests/lora/test_gemma.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import pytest
-
-import vllm
-from vllm.lora.request import LoRARequest
-from vllm.platforms import current_platform
-
-MODEL_PATH = "google/gemma-7b"
-
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
-    prompts = [
-        "Quote: Imagination is",
-        "Quote: Be yourself;",
-        "Quote: Painting is poetry that is seen rather than felt,",
-    ]
-    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
-    # Print the outputs.
-    generated_texts: list[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-# The V1 lora test for this model requires more than 24GB.
-@pytest.mark.skip_v1
-@pytest.mark.xfail(current_platform.is_rocm(),
-                   reason="There can be output mismatch on ROCm")
-def test_gemma_lora(gemma_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   enable_chunked_prefill=True)
-
-    expected_lora_output = [
-        "more important than knowledge.\nAuthor: Albert Einstein\n",
-        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
-        "and poetry is painting that is felt rather than seen.\n"
-        "Author: Leonardo da Vinci\n",
-    ]
-
-    output1 = do_sample(llm, gemma_lora_files, lora_id=1)
-    for i in range(len(expected_lora_output)):
-        assert output1[i].startswith(expected_lora_output[i])
-    output2 = do_sample(llm, gemma_lora_files, lora_id=2)
-    for i in range(len(expected_lora_output)):
-        assert output2[i].startswith(expected_lora_output[i])
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 56da97b6a06d..99d60b332e65 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -19,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
-                              LinearScalingRotaryEmbeddingWithLoRA,
                               LogitsProcessorWithLoRA, LoRAMapping,
                               MergedColumnParallelLinearWithLoRA,
                               MergedQKVParallelLinearWithLoRA,
@@ -28,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA,
                               VocabParallelEmbeddingWithLoRA)
 # yapf: enable
-from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights,
-                              PackedLoRALayerWeights)
+from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
@@ -37,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
 from vllm.model_executor.utils import set_random_seed
@@ -59,28 +56,16 @@ DEVICES = ([
     f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ] if current_platform.is_cuda_alike() else ["cpu"])
 
-#For GPU, we will launch different triton kernels between the prefill and decode
-# stages, so we need to verify this. prefill stage(True) or decode stage(False)
+# prefill stage(True) or decode stage(False)
 STAGES = [True, False]
 
-# With the inclusion of V1 tests (look at the run_with_both_engines_lora),
-# the tests in this file run twice, once with the V0 engine and then with
-# the V1 engine.
-# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
-# with the inclusion of V1 tests to maintain the CI test times.
-NUM_RANDOM_SEEDS = 5
-# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
-# 256 before. It is cut to half with the inclusion of V1 tests to maintain
-# the CI test times.
+NUM_RANDOM_SEEDS = 10
+
 VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
 
 
 @pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-
+def clean_cache():
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)
@@ -90,6 +75,24 @@ def v1(run_with_both_engines_lora):
     yield
 
 
+@pytest.fixture(autouse=True)
+def skip_cuda_with_stage_false(request):
+    """
+    On cuda-like platforms, we use the same kernels for prefill and decode
+    stage, and 'stage' is generally ignored, so we only need to test once.
+    """
+    if current_platform.is_cuda_alike():
+        try:
+            if hasattr(request.node, "callspec") and hasattr(
+                    request.node.callspec, "params"):
+                params = request.node.callspec.params
+                if "stage" in params and params["stage"] is False:
+                    pytest.skip("Skip test when stage=False")
+        except Exception:
+            pass
+    yield
+
+
 def get_random_id_to_index(num_loras: int,
                            num_slots: int,
                            log: bool = True) -> list[Optional[int]]:
@@ -1011,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                             atol=atol)
 
 
-@torch.inference_mode()
-@pytest.mark.parametrize("num_loras", [1, 8])
-@pytest.mark.parametrize("device", ["cuda"])
-@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0),
-                                              (6.0, 1.0)])
-@pytest.mark.parametrize("max_position", [11, 4096, 32768])
-@pytest.mark.parametrize("is_neox_style", [True, False])
-@pytest.mark.parametrize("rotary_dim", [None, 32])
-@pytest.mark.parametrize("head_size", [32, 108])
-@pytest.mark.parametrize("seq_len", [11, 1024])
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Only CUDA backends are supported")
-def test_rotary_embedding_long_context(dist_init, num_loras, device,
-                                       scaling_factors, max_position,
-                                       is_neox_style, rotary_dim, head_size,
-                                       seq_len) -> None:
-    dtype = torch.float16
-    max_loras = 8
-    seed = 0
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
-    assert check_punica_wrapper(punica_wrapper)
-    lora_config = LoRAConfig(max_loras=max_loras,
-                             max_lora_rank=8,
-                             long_lora_scaling_factors=scaling_factors,
-                             lora_dtype=dtype)
-
-    if rotary_dim is None:
-        rotary_dim = head_size
-    base = 10000
-    batch_size = 5 * num_loras
-    num_heads = 7
-
-    # Verify lora is equivalent to linear scaling rotary embedding.
-    rope = get_rope(
-        head_size,
-        rotary_dim,
-        max_position,
-        base,
-        is_neox_style,
-    )
-    lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
-    lora_rope.set_mapping(punica_wrapper)
-    lora_rope.create_lora_weights(max_loras, lora_config)
-    linear_rope = get_rope(head_size, rotary_dim, max_position, base,
-                           is_neox_style, {
-                               "rope_type": "linear",
-                               "factor": scaling_factors
-                           })
-    linear_rope = linear_rope.to(dtype=dtype)
-    id_to_index = get_random_id_to_index(num_loras, max_loras)
-    _, index_mapping, prompt_mapping = create_random_inputs(
-        active_lora_ids=[0],
-        num_inputs=batch_size,
-        input_size=(1, max_position),
-        input_range=(0, lora_config.lora_extra_vocab_size),
-        input_type=torch.float16,
-        device=device)
-
-    lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
-    long_lora_context = LongContextLoRAContext(list(scaling_factors),
-                                               rotary_dim)
-
-    next_expected_offset = 0
-    # Make sure the offset is correct.
-    scaling_factor_to_offset = lora_rope.scaling_factor_to_offset
-    for scaling_factor, offset in scaling_factor_to_offset.items():
-        assert offset == next_expected_offset
-        next_expected_offset += scaling_factor * max_position
-
-    for i in range(len(scaling_factors)):
-        long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get(
-            scaling_factors[i], 0)
-    punica_wrapper.update_metadata(
-        lora_mapping,
-        id_to_index,
-        max_loras,
-        512,
-        lora_config.lora_extra_vocab_size,
-        long_lora_context=long_lora_context,
-    )
-    # lora_rope.set_mapping(*mapping_info)
-
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
-    key = torch.randn_like(query)
-    ref_q, ref_k = linear_rope(positions, query, key)
-    actual_q, actual_k = lora_rope(positions, query, key)
-
-    torch.allclose(ref_q, actual_q)
-    torch.allclose(ref_k, actual_k)
-
-
 @pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
 @pytest.mark.parametrize(
     "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)))
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index ee0d7b5da3a9..00e6fe7c61de 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -78,6 +78,8 @@ def test_minicpmv_lora(minicpmv_lora_files):
         assert EXPECTED_OUTPUT[i].startswith(output2[i])
 
 
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm")
@@ -99,6 +101,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
         assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
 
 
+@pytest.mark.skipif(current_platform.is_cuda_alike(),
+                    reason="Skipping to avoid redundant model tests")
 @pytest.mark.xfail(
     current_platform.is_rocm(),
     reason="MiniCPM-V dependency xformers incompatible with ROCm")
diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py
index 0f18de42cd9c..87db0b4bbde0 100644
--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import pytest
-
 import vllm
 from vllm.lora.request import LoRARequest
 
@@ -18,14 +16,6 @@ EXPECTED_LORA_OUTPUT = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),