mirror of https://git.datalinker.icu/vllm-project/vllm.git
[V0 Deprecation] Remove misc V0 tests (#25118)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent 2fc24e94f9
commit 6c036615dc
@@ -1,98 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import random
from unittest.mock import patch

import pytest
import torch

from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_pin_memory_available


class MockLogitsProcessor(LogitsProcessor):

    def __init__(self, vocab_size: int, scale: float,
                 fake_logits: torch.Tensor):
        super().__init__(vocab_size=vocab_size, scale=scale)
        self.fake_logits = fake_logits.clone()

    def forward(self, *args, **kwargs):
        with patch(
                "vllm.model_executor.layers.logits_processor._prune_hidden_states",
                lambda x, y: x
        ), patch(
                "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
                lambda *args, **kwargs: self.fake_logits):
            return super().forward(*args, **kwargs)


def _prepare_test(
        batch_size: int
) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
    vocab_size = 32000
    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
    fake_logits = torch.full((batch_size, vocab_size),
                             1e-2,
                             dtype=input_tensor.dtype)
    logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits)
    return input_tensor, fake_logits, logits_processor


RANDOM_SEEDS = list(range(128))
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_logits_processors(seed: int, device: str):
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the input sequence.
    # We therefore expect the output token sequence to be [0, 1, 2, ...]
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    seq_group_metadata_list = []
    seq_lens = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0,
                                               logits_processors=[pick_ith]),
                block_tables={0: [1]},
            ))
        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

    sampling_metadata = SamplingMetadata.prepare(
        seq_group_metadata_list,
        seq_lens,
        query_lens=seq_lens,
        device=device,
        pin_memory=is_pin_memory_available())
    logits_processor_output = logits_processor(
        lm_head=None,
        hidden_states=input_tensor,
        sampling_metadata=sampling_metadata)

    assert torch.isinf(logits_processor_output[:, 0]).all()

    fake_logits *= logits_processor.scale
    torch.testing.assert_close(logits_processor_output[:, 1],
                               fake_logits[:, 1],
                               rtol=1e-4,
                               atol=0.0)
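For context on what the removed test covered, here is a minimal standalone sketch (not part of this commit; plain Python/PyTorch only, no vLLM imports): a per-request logits processor such as pick_ith receives the token IDs generated so far plus the logits row for the next step, and pins the next token by giving it an infinite logit, so greedy decoding emits 0, 1, 2, ... in order.

# Minimal sketch, not part of this commit: plain-PyTorch illustration of the
# behavior exercised by the removed test. pick_ith mirrors the processor above.
import torch

def pick_ith(token_ids, logits):
    # Give an infinite score to the token at index len(token_ids), so greedy
    # sampling deterministically picks 0, 1, 2, ... on successive steps.
    logits[len(token_ids)] = float("inf")
    return logits

vocab_size = 32000
generated: list[int] = []  # token IDs generated so far for one request
for _ in range(4):
    logits = torch.full((vocab_size,), 1e-2)
    logits = pick_ith(generated, logits)
    generated.append(int(torch.argmax(logits)))

assert generated == [0, 1, 2, 3]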
@@ -1,92 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test hashing of cache blocks.

Run `pytest tests/test_cache_block_hashing.py`.
"""
from typing import Optional

import pytest

from vllm.inputs import token_inputs
from vllm.lora.request import LoRARequest
from vllm.sequence import Sequence
from vllm.transformers_utils.tokenizer import get_tokenizer

# Make two prefixes with different first blocks.
prefix_start = [("You are an expert"), ("You are a")]
prefix_common = (
    " school principal, skilled in effectively managing "
    "faculty and staff. Draft 10-15 questions for a potential first grade "
    "Head Teacher for my K-12, all-girls', independent school that emphasizes "
    "community, joyful discovery, and life-long learning. The candidate is "
    "coming in for a first-round panel interview for a 8th grade Math "
    "teaching role. They have 5 years of previous teaching experience "
    "as an assistant teacher at a co-ed, public school with experience "
    "in middle school math teaching. Based on this, fulfill "
    "the following: ")
prefixes = [start + prefix_common for start in prefix_start]

# Sample prompts.
sample_prompts = [
    "Hello, my name is", "The president of the United States is",
    "The capital of France is", "The future of AI is"
]


# Helper function.
def flatten_2d(li):
    return [lss for ls in li for lss in ls]


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("max_num_seqs", [256])
@pytest.mark.parametrize("concurrent_lora_int_ids",
                         [[None], [1], [None, 1], [None, 1, 2], [1, 2]])
def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
                             concurrent_lora_int_ids: list[Optional[int]]):

    tokenizer = get_tokenizer("facebook/opt-125m")

    hashes: list[list[list[int]]] = []

    for prefix in prefixes:
        for lora_int_id in concurrent_lora_int_ids:
            lora_request = None

            if lora_int_id is not None:
                lora_request = LoRARequest(
                    f"example_lora_{lora_int_id}",
                    lora_int_id,
                    f"example/path/to/lora_{lora_int_id}",
                )

            hashes.append([])
            prompts = [prefix + prompt for prompt in sample_prompts]
            for seq_id, prompt in enumerate(prompts):
                hashes[-1].append([])
                prompt_token_ids = tokenizer.encode(prompt)
                seq = Sequence(seq_id,
                               inputs=token_inputs(prompt_token_ids,
                                                   prompt=prompt),
                               block_size=block_size,
                               eos_token_id=tokenizer.eos_token_id,
                               lora_request=lora_request)

                num_blocks = len(prompt_token_ids) // block_size
                for idx in range(num_blocks):
                    hashes[-1][-1].append(seq.hash_of_block(idx))

    # Check that hashes made with two prefixes with different first blocks are
    # different everywhere.
    for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])):
        assert (hash0 != hash1)

    # Check that hashes of different prompts made with the same prefix are the
    # same until the hashes that contain the prompt.
    for hash_pref in hashes:
        same_hashes = [tuple(h[:-1]) for h in hash_pref]
        different_hashes = [h[-1] for h in hash_pref]
        assert (len(set(same_hashes)) == 1)
        assert (len(set(different_hashes)) == len(different_hashes))
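As background for the property the removed test asserted, here is a minimal standalone sketch (not part of this commit; no vLLM imports, and the helper name hash_of_block only mimics the Sequence API for illustration): chaining each block's hash over every token up to and including that block means prompts sharing a prefix get identical hashes for the shared full blocks and diverge only once a block contains differing tokens.

# Minimal sketch, not part of this commit: plain-Python illustration of
# chained block hashing; hash_of_block here only mimics Sequence.hash_of_block.
BLOCK_SIZE = 4

def hash_of_block(token_ids: list[int], idx: int) -> int:
    # A block's hash covers all tokens up to and including block `idx`,
    # so it depends on the full prefix, not just the tokens in that block.
    return hash(tuple(token_ids[:(idx + 1) * BLOCK_SIZE]))

shared_prefix = list(range(8))                   # two full blocks in common
prompt_a = shared_prefix + [100, 101, 102, 103]
prompt_b = shared_prefix + [200, 201, 202, 203]

hashes_a = [hash_of_block(prompt_a, i) for i in range(len(prompt_a) // BLOCK_SIZE)]
hashes_b = [hash_of_block(prompt_b, i) for i in range(len(prompt_b) // BLOCK_SIZE)]

assert hashes_a[:2] == hashes_b[:2]   # shared-prefix blocks hash identically
assert hashes_a[2] != hashes_b[2]     # the block holding the suffix differs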