mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 23:45:01 +08:00
[Testing] Fix core tests (#3224)
This commit is contained in:
parent
24aecf421a
commit
a33ce60c66
@ -6,7 +6,7 @@ from vllm import SamplingParams
|
|||||||
from vllm.block import PhysicalTokenBlock
|
from vllm.block import PhysicalTokenBlock
|
||||||
from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus
|
from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus
|
||||||
from vllm.utils import Device
|
from vllm.utils import Device
|
||||||
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
|
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob
|
||||||
|
|
||||||
from .utils import create_dummy_prompt
|
from .utils import create_dummy_prompt
|
||||||
|
|
||||||
@ -22,7 +22,8 @@ def test_block_allocator_allocate():
|
|||||||
for _ in range(num_cpu_blocks):
|
for _ in range(num_cpu_blocks):
|
||||||
block = cpu_allocator.allocate()
|
block = cpu_allocator.allocate()
|
||||||
num_free -= 1
|
num_free -= 1
|
||||||
assert block not in cpu_allocator.free_blocks
|
|
||||||
|
assert block.block_hash not in cpu_allocator.evictor
|
||||||
assert cpu_allocator.get_num_free_blocks() == num_free
|
assert cpu_allocator.get_num_free_blocks() == num_free
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
@ -39,7 +40,7 @@ def test_block_allocator_free():
|
|||||||
for _ in range(num_cpu_blocks):
|
for _ in range(num_cpu_blocks):
|
||||||
block = cpu_allocator.allocate()
|
block = cpu_allocator.allocate()
|
||||||
blocks.append(block)
|
blocks.append(block)
|
||||||
assert block not in cpu_allocator.free_blocks
|
assert block.block_hash not in cpu_allocator.evictor
|
||||||
|
|
||||||
# Free all allocated cpu blocks.
|
# Free all allocated cpu blocks.
|
||||||
num_free = 0
|
num_free = 0
|
||||||
@ -47,7 +48,7 @@ def test_block_allocator_free():
|
|||||||
for block in blocks:
|
for block in blocks:
|
||||||
cpu_allocator.free(block)
|
cpu_allocator.free(block)
|
||||||
num_free += 1
|
num_free += 1
|
||||||
assert block in cpu_allocator.free_blocks
|
assert block.block_hash in cpu_allocator.evictor
|
||||||
assert cpu_allocator.get_num_free_blocks() == num_free
|
assert cpu_allocator.get_num_free_blocks() == num_free
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
@ -106,7 +107,7 @@ def test_append_slot_single_seq():
|
|||||||
# Add block_size number of new tokens and append slot.
|
# Add block_size number of new tokens and append slot.
|
||||||
for i in range(block_size):
|
for i in range(block_size):
|
||||||
token_id = i + 5
|
token_id = i + 5
|
||||||
prompt.append_token_id(token_id, {token_id: 0.0})
|
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||||
|
|
||||||
assert block_manager.can_append_slot(seq_group)
|
assert block_manager.can_append_slot(seq_group)
|
||||||
before_blocks = block_manager.get_num_free_gpu_blocks()
|
before_blocks = block_manager.get_num_free_gpu_blocks()
|
||||||
@ -119,25 +120,37 @@ def test_append_slot_cow():
|
|||||||
block_size = 4
|
block_size = 4
|
||||||
num_cpu_blocks = 4
|
num_cpu_blocks = 4
|
||||||
num_gpu_blocks = 4
|
num_gpu_blocks = 4
|
||||||
block_manager = BlockSpaceManager(block_size,
|
block_manager = BlockSpaceManager(block_size=block_size,
|
||||||
num_cpu_blocks,
|
num_cpu_blocks=num_cpu_blocks,
|
||||||
num_gpu_blocks,
|
num_gpu_blocks=num_gpu_blocks,
|
||||||
watermark=0)
|
watermark=0)
|
||||||
|
|
||||||
# Allocate prompt to gpu block.
|
# Allocate prompt to gpu block. There is one slot left in the block.
|
||||||
prompt = Sequence(1, "one two three", [1, 2, 3], block_size)
|
prompt = Sequence(seq_id=1,
|
||||||
child = prompt.fork(2)
|
prompt="one two three",
|
||||||
token_id = 4
|
prompt_token_ids=[1, 2, 3],
|
||||||
child.append_token_id(token_id, {token_id: 0.0})
|
block_size=block_size)
|
||||||
|
|
||||||
|
# Fork the sequence, such that a COW will be required when we append a new
|
||||||
|
# token id.
|
||||||
|
child = prompt.fork(new_seq_id=2)
|
||||||
|
|
||||||
|
# Allocate space for the sequence group.
|
||||||
seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
|
seq_group = SequenceGroup("1", [prompt, child], SamplingParams(),
|
||||||
time.time(), time.perf_counter)
|
time.time(), time.perf_counter)
|
||||||
block_manager.allocate(seq_group)
|
block_manager.allocate(seq_group)
|
||||||
|
|
||||||
# Append slot for child token.
|
# Fork and append a new token id. We expect a COW to be scheduled.
|
||||||
# Last block being modified is shared. Copy on write occurs.
|
token_id = 4
|
||||||
|
child.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||||
|
block_manager.fork(prompt, child)
|
||||||
|
|
||||||
assert block_manager.can_append_slot(seq_group)
|
assert block_manager.can_append_slot(seq_group)
|
||||||
before_blocks = block_manager.get_num_free_gpu_blocks()
|
before_blocks = block_manager.get_num_free_gpu_blocks()
|
||||||
src_block, dst_block = block_manager.append_slot(child)
|
|
||||||
|
maybe_src_dst_block = block_manager.append_slot(child)
|
||||||
|
assert maybe_src_dst_block is not None
|
||||||
|
src_block, dst_block = maybe_src_dst_block
|
||||||
assert src_block != dst_block
|
assert src_block != dst_block
|
||||||
|
|
||||||
after_blocks = block_manager.get_num_free_gpu_blocks()
|
after_blocks = block_manager.get_num_free_gpu_blocks()
|
||||||
@ -165,7 +178,7 @@ def test_fork():
|
|||||||
prompt) == block_manager.get_block_table(child)
|
prompt) == block_manager.get_block_table(child)
|
||||||
token_id = 4
|
token_id = 4
|
||||||
# Append token to child. Block is shared so copy on write occurs.
|
# Append token to child. Block is shared so copy on write occurs.
|
||||||
child.append_token_id(token_id, {token_id: 0.0})
|
child.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||||
block_manager.append_slot(child)
|
block_manager.append_slot(child)
|
||||||
assert block_manager.get_block_table(
|
assert block_manager.get_block_table(
|
||||||
prompt) != block_manager.get_block_table(child)
|
prompt) != block_manager.get_block_table(child)
|
||||||
@ -189,7 +202,7 @@ def test_swap():
|
|||||||
# tokens will be written in the next forward pass.
|
# tokens will be written in the next forward pass.
|
||||||
token_id = 0
|
token_id = 0
|
||||||
prompt.status = SequenceStatus.RUNNING
|
prompt.status = SequenceStatus.RUNNING
|
||||||
prompt.append_token_id(token_id, {token_id: 0.0})
|
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||||
|
|
||||||
# Swap seq group from GPU -> CPU.
|
# Swap seq group from GPU -> CPU.
|
||||||
gpu_blocks = block_manager.get_block_table(prompt)
|
gpu_blocks = block_manager.get_block_table(prompt)
|
||||||
|
|||||||
@ -3,7 +3,7 @@ import pytest # noqa
|
|||||||
|
|
||||||
from vllm.config import CacheConfig, SchedulerConfig
|
from vllm.config import CacheConfig, SchedulerConfig
|
||||||
from vllm.core.scheduler import Scheduler
|
from vllm.core.scheduler import Scheduler
|
||||||
from vllm.sequence import SequenceGroup
|
from vllm.sequence import SequenceGroup, Logprob
|
||||||
|
|
||||||
from .utils import create_dummy_prompt
|
from .utils import create_dummy_prompt
|
||||||
|
|
||||||
@ -108,8 +108,8 @@ def test_scheduler_schedule_preempt_abort():
|
|||||||
# Append "generated" tokens, allowing the sequence to mark prompt tokens as
|
# Append "generated" tokens, allowing the sequence to mark prompt tokens as
|
||||||
# processed.
|
# processed.
|
||||||
token_id = 0
|
token_id = 0
|
||||||
seq_a.append_token_id(token_id, {token_id: 0.0})
|
seq_a.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||||
seq_b.append_token_id(token_id, {token_id: 0.0})
|
seq_b.append_token_id(token_id, {token_id: Logprob(0.0)})
|
||||||
|
|
||||||
# Schedule seq groups generation and preempt seq group b.
|
# Schedule seq groups generation and preempt seq group b.
|
||||||
seq_group_meta, out = scheduler.schedule()
|
seq_group_meta, out = scheduler.schedule()
|
||||||
|
|||||||
@ -18,7 +18,7 @@ def create_dummy_prompt(
|
|||||||
prompt_str = " ".join([str(t) for t in prompt_tokens])
|
prompt_str = " ".join([str(t) for t in prompt_tokens])
|
||||||
prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
|
prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
|
||||||
seq_group = SequenceGroup(request_id, [prompt], SamplingParams(),
|
seq_group = SequenceGroup(request_id, [prompt], SamplingParams(),
|
||||||
time.time(), None, None)
|
time.time(), None)
|
||||||
|
|
||||||
return prompt, seq_group
|
return prompt, seq_group
|
||||||
|
|
||||||
|
|||||||
@ -142,7 +142,7 @@ class Sequence:
|
|||||||
prompt: str,
|
prompt: str,
|
||||||
prompt_token_ids: List[int],
|
prompt_token_ids: List[int],
|
||||||
block_size: int,
|
block_size: int,
|
||||||
eos_token_id: int,
|
eos_token_id: Optional[int] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.seq_id = seq_id
|
self.seq_id = seq_id
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user