[Testing] Fix core tests (#3224)

This commit is contained in:
Cade Daniel 2024-03-06 01:04:23 -08:00 committed by GitHub
parent 24aecf421a
commit a33ce60c66
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 36 additions and 23 deletions

View File

@@ -6,7 +6,7 @@ from vllm import SamplingParams
from vllm.block import PhysicalTokenBlock from vllm.block import PhysicalTokenBlock
from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus
from vllm.utils import Device from vllm.utils import Device
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
from .utils import create_dummy_prompt from .utils import create_dummy_prompt
@@ -22,7 +22,8 @@ def test_block_allocator_allocate():
for _ in range(num_cpu_blocks): for _ in range(num_cpu_blocks):
block = cpu_allocator.allocate() block = cpu_allocator.allocate()
num_free -= 1 num_free -= 1
assert block not in cpu_allocator.free_blocks
assert block.block_hash not in cpu_allocator.evictor
assert cpu_allocator.get_num_free_blocks() == num_free assert cpu_allocator.get_num_free_blocks() == num_free
with pytest.raises(ValueError): with pytest.raises(ValueError):
@@ -39,7 +40,7 @@ def test_block_allocator_free():
for _ in range(num_cpu_blocks): for _ in range(num_cpu_blocks):
block = cpu_allocator.allocate() block = cpu_allocator.allocate()
blocks.append(block) blocks.append(block)
assert block not in cpu_allocator.free_blocks assert block.block_hash not in cpu_allocator.evictor
# Free all allocated cpu blocks. # Free all allocated cpu blocks.
num_free = 0 num_free = 0
@@ -47,7 +48,7 @@ def test_block_allocator_free():
for block in blocks: for block in blocks:
cpu_allocator.free(block) cpu_allocator.free(block)
num_free += 1 num_free += 1
assert block in cpu_allocator.free_blocks assert block.block_hash in cpu_allocator.evictor
assert cpu_allocator.get_num_free_blocks() == num_free assert cpu_allocator.get_num_free_blocks() == num_free
with pytest.raises(ValueError): with pytest.raises(ValueError):
@@ -106,7 +107,7 @@ def test_append_slot_single_seq():
# Add block_size number of new tokens and append slot. # Add block_size number of new tokens and append slot.
for i in range(block_size): for i in range(block_size):
token_id = i + 5 token_id = i + 5
prompt.append_token_id(token_id, {token_id: 0.0}) prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
assert block_manager.can_append_slot(seq_group) assert block_manager.can_append_slot(seq_group)
before_blocks = block_manager.get_num_free_gpu_blocks() before_blocks = block_manager.get_num_free_gpu_blocks()
@@ -119,25 +120,37 @@ def test_append_slot_cow():
block_size = 4 block_size = 4
num_cpu_blocks = 4 num_cpu_blocks = 4
num_gpu_blocks = 4 num_gpu_blocks = 4
block_manager = BlockSpaceManager(block_size, block_manager = BlockSpaceManager(block_size=block_size,
num_cpu_blocks, num_cpu_blocks=num_cpu_blocks,
num_gpu_blocks, num_gpu_blocks=num_gpu_blocks,
watermark=0) watermark=0)
# Allocate prompt to gpu block. # Allocate prompt to gpu block. There is one slot left in the block.
prompt = Sequence(1, "one two three", [1, 2, 3], block_size) prompt = Sequence(seq_id=1,
child = prompt.fork(2) prompt="one two three",
token_id = 4 prompt_token_ids=[1, 2, 3],
child.append_token_id(token_id, {token_id: 0.0}) block_size=block_size)
# Fork the sequence, such that a COW will be required when we append a new
# token id.
child = prompt.fork(new_seq_id=2)
# Allocate space for the sequence group.
seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
time.time(), time.perf_counter) time.time(), time.perf_counter)
block_manager.allocate(seq_group) block_manager.allocate(seq_group)
# Append slot for child token. # Fork and append a new token id. We expect a COW to be scheduled.
# Last block being modified is shared. Copy on write occurs. token_id = 4
child.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.fork(prompt, child)
assert block_manager.can_append_slot(seq_group) assert block_manager.can_append_slot(seq_group)
before_blocks = block_manager.get_num_free_gpu_blocks() before_blocks = block_manager.get_num_free_gpu_blocks()
src_block, dst_block = block_manager.append_slot(child)
maybe_src_dst_block = block_manager.append_slot(child)
assert maybe_src_dst_block is not None
src_block, dst_block = maybe_src_dst_block
assert src_block != dst_block assert src_block != dst_block
after_blocks = block_manager.get_num_free_gpu_blocks() after_blocks = block_manager.get_num_free_gpu_blocks()
@@ -165,7 +178,7 @@ def test_fork():
prompt) == block_manager.get_block_table(child) prompt) == block_manager.get_block_table(child)
token_id = 4 token_id = 4
# Append token to child. Block is shared so copy on write occurs. # Append token to child. Block is shared so copy on write occurs.
child.append_token_id(token_id, {token_id: 0.0}) child.append_token_id(token_id, {token_id: Logprob(0.0)})
block_manager.append_slot(child) block_manager.append_slot(child)
assert block_manager.get_block_table( assert block_manager.get_block_table(
prompt) != block_manager.get_block_table(child) prompt) != block_manager.get_block_table(child)
@@ -189,7 +202,7 @@ def test_swap():
# tokens will be written in the next forward pass. # tokens will be written in the next forward pass.
token_id = 0 token_id = 0
prompt.status = SequenceStatus.RUNNING prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: 0.0}) prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU. # Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt) gpu_blocks = block_manager.get_block_table(prompt)

View File

@@ -3,7 +3,7 @@ import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.sequence import SequenceGroup from vllm.sequence import SequenceGroup, Logprob
from .utils import create_dummy_prompt from .utils import create_dummy_prompt
@@ -108,8 +108,8 @@ def test_scheduler_schedule_preempt_abort():
# Append "generated" tokens, allowing the sequence to mark prompt tokens as # Append "generated" tokens, allowing the sequence to mark prompt tokens as
# processed. # processed.
token_id = 0 token_id = 0
seq_a.append_token_id(token_id, {token_id: 0.0}) seq_a.append_token_id(token_id, {token_id: Logprob(0.0)})
seq_b.append_token_id(token_id, {token_id: 0.0}) seq_b.append_token_id(token_id, {token_id: Logprob(0.0)})
# Schedule seq groups generation and preempt seq group b. # Schedule seq groups generation and preempt seq group b.
seq_group_meta, out = scheduler.schedule() seq_group_meta, out = scheduler.schedule()

View File

@@ -18,7 +18,7 @@ def create_dummy_prompt(
prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt_str = " ".join([str(t) for t in prompt_tokens])
prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), seq_group = SequenceGroup(request_id, [prompt], SamplingParams(),
time.time(), None, None) time.time(), None)
return prompt, seq_group return prompt, seq_group

View File

@@ -142,7 +142,7 @@ class Sequence:
prompt: str, prompt: str,
prompt_token_ids: List[int], prompt_token_ids: List[int],
block_size: int, block_size: int,
eos_token_id: int, eos_token_id: Optional[int] = None,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
) -> None: ) -> None:
self.seq_id = seq_id self.seq_id = seq_id