vllm/tests/v1/e2e/test_lora_with_spec_decode.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script contains:
1. test lora with speculative decoding for batch inference
"""

import random

import numpy as np
import pytest
import torch

from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform

LORA_TEST_PROMPT_MAP: dict[str, str] = {}

LORA_TEST_PROMPT_MAP["premjatin/qwen-linear-algebra-coder"] = """
### INSTRUCTION:
You are an AI assistant that generates Python code to solve linear
algebra problems.

### PROBLEM:
Find the eigenvalues and eigenvectors of the following 3x3 matrix:
[[3, 2, 0],
 [2, 3, 0],
 [0, 0, 2]]

### OUTPUT FORMAT (STRICT):
Numbers should be represented as integers only.

### PYTHON SOLUTION:
"""

SEED = 42


@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
@pytest.mark.parametrize(
    "model_setup",
    [
        (
            "eagle3",
            "Qwen/Qwen3-1.7B",
            "AngelSlim/Qwen3-1.7B_eagle3",
            "premjatin/qwen-linear-algebra-coder",
            1,
        )
    ],
)
def test_batch_inference_correctness(
    monkeypatch: pytest.MonkeyPatch,
    model_setup: tuple[str, str, str, str, int],
):
    """
    Compare the outputs of a LLM with only Lora and a LLM with both SD and Lora.
    Should be the same and no failure when doing batch inference.
    model_setup: (method, model_name, spec_model_name, lora_path, tp_size)
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        # Disable randomness
        m.setenv("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
        torch.manual_seed(SEED)
        np.random.seed(SEED)
        random.seed(SEED)
        torch.cuda.manual_seed_all(SEED)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

        method, model_name, spec_model_name, lora_path, tp_size = model_setup

        # without speculative decoding
        ref_llm = LLM(
            model=model_name,
            trust_remote_code=True,
            tensor_parallel_size=tp_size,
            max_model_len=2048,
            max_num_seqs=4,
            enable_lora=True,
            max_loras=1,
            max_cpu_loras=1,
            max_lora_rank=16,
        )

        prompts = [LORA_TEST_PROMPT_MAP[lora_path]] * 100
        lora_request = LoRARequest("adapter", 1, lora_path)
        sampling_params = SamplingParams(
            temperature=0.0, top_p=1.0, top_k=-1, seed=SEED, max_tokens=128
        )

        ref_outputs = ref_llm.generate(
            prompts, sampling_params, lora_request=lora_request
        )
        del ref_llm
        torch.cuda.empty_cache()
        cleanup_dist_env_and_memory()

        lora_spec_llm = LLM(
            model=model_name,
            trust_remote_code=True,
            tensor_parallel_size=tp_size,
            speculative_config={
                "method": method,
                "model": spec_model_name,
                "num_speculative_tokens": 3,
                "max_model_len": 2048,
            },
            max_model_len=2048,
            max_num_seqs=4,
            enable_lora=True,
            max_loras=1,
            max_cpu_loras=1,
            max_lora_rank=16,
        )

        lora_spec_outputs = lora_spec_llm.generate(
            prompts, sampling_params, lora_request=lora_request
        )

        matches = 0
        misses = 0
        for ref_output, spec_output in zip(ref_outputs, lora_spec_outputs):
            if ref_output.outputs[0].text == spec_output.outputs[0].text:
                matches += 1
            else:
                misses += 1
                print(f"ref_output: {ref_output.outputs[0].text}")
                print(f"spec_output: {spec_output.outputs[0].text}")

        # Heuristic: expect at least 90% of the prompts to match exactly
        # Upon failure, inspect the outputs to check for inaccuracy.
        print(f"match ratio: {matches}/{len(ref_outputs)}")
        assert matches > int(0.90 * len(ref_outputs))
        del lora_spec_llm
        torch.cuda.empty_cache()
        cleanup_dist_env_and_memory()