Merge branch 'main' into woosuk/model-runner-v2
commit 33672774f5
@@ -24,7 +24,7 @@ outlines_core == 0.2.11
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.24; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs

@@ -76,11 +76,6 @@ def test_models(
     model_executor: str,
     enable_prompt_embeds: bool,
 ) -> None:
-
-    if enable_prompt_embeds and envs.is_set(
-            "VLLM_USE_V1") and envs.VLLM_USE_V1:
-        pytest.skip("enable_prompt_embeds is not supported in v1.")
-
     if not envs.VLLM_USE_V1:
         if async_scheduling:
             pytest.skip("async_scheduling only supported in v1.")
@@ -164,11 +159,6 @@ def test_models_distributed(
     extra_env: dict[str, str],
     enable_prompt_embeds: bool,
 ) -> None:
-
-    if enable_prompt_embeds and envs.is_set(
-            "VLLM_USE_V1") and envs.VLLM_USE_V1:
-        pytest.skip("enable_prompt_embeds is not supported in v1.")
-
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")

@@ -36,7 +36,6 @@ def default_server_args() -> list[str]:
         "--enforce-eager",
         # Prompt Embeds server args
         "--enable-prompt-embeds",
-        "--no-enable-chunked-prefill",
     ]


@@ -287,6 +287,57 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
     assert response3.status == "completed"


+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_streaming_types(client: OpenAI, model_name: str):
+    prompts = [
+        "tell me a story about a cat in 20 words",
+    ]
+
+    # this links the "done" type with the "start" type
+    # so every "done" type should have a corresponding "start" type
+    # and every open block should be closed by the end of the stream
+    pairs_of_event_types = {
+        "response.completed": "response.created",
+        "response.output_item.done": "response.output_item.added",
+        "response.content_part.done": "response.content_part.added",
+        "response.output_text.done": "response.output_text.delta",
+        "response.web_search_call.done": "response.web_search_call.added",
+        "response.reasoning_text.done": "response.reasoning_text.delta",
+        "response.reasoning_part.done": "response.reasoning_part.added",
+    }
+
+    for prompt in prompts:
+        response = await client.responses.create(
+            model=model_name,
+            input=prompt,
+            reasoning={"effort": "low"},
+            tools=[],
+            stream=True,
+            background=False,
+        )
+
+        stack_of_event_types = []
+        async for event in response:
+            if event.type == 'response.created':
+                stack_of_event_types.append(event.type)
+            elif event.type == 'response.completed':
+                assert stack_of_event_types[-1] == pairs_of_event_types[
+                    event.type]
+                stack_of_event_types.pop()
+            if event.type.endswith("added"):
+                stack_of_event_types.append(event.type)
+            elif event.type.endswith("delta"):
+                if stack_of_event_types[-1] == event.type:
+                    continue
+                stack_of_event_types.append(event.type)
+            elif event.type.endswith("done"):
+                assert stack_of_event_types[-1] == pairs_of_event_types[
+                    event.type]
+                stack_of_event_types.pop()
+        assert len(stack_of_event_types) == 0
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("background", [True, False])
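The added test treats the stream as a bracket-matching problem: every "*.added" (or first "*.delta") event opens a block that the paired "*.done" event must close, and the stack must be empty when the stream ends. A minimal standalone sketch of the same invariant check, using made-up event names rather than a live response stream:

    # Sketch only: "block.added"/"block.done" are hypothetical event types.
    pairs = {"block.done": "block.added"}
    stack = []
    for event_type in ["block.added", "block.done"]:
        if event_type.endswith("added"):
            stack.append(event_type)
        elif event_type.endswith("done"):
            assert stack.pop() == pairs[event_type]
    assert not stack  # every opened block was closed
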
@@ -343,7 +394,10 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
             assert event.item_id == current_item_id

         # verify content_index_id is correct
-        if event.type == "response.content_part.added":
+        if event.type in [
+                "response.content_part.added",
+                "response.reasoning_part.added"
+        ]:
             assert event.content_index != current_content_index
             current_content_index = event.content_index
         elif event.type in [

@@ -125,12 +125,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
         # in parts of the operators
         pytest.skip(f"Skipping '{model}' model test with AITER kernel.")

-    # Note: can be removed when
-    # https://github.com/vllm-project/vllm/pull/24278 finished
-    if current_platform.is_cpu() and use_prompt_embeds:
-        pytest.skip("Skipping use_prompt_embeds=True with "
-                    "V1-only CPU backend.")
-
     with hf_runner(model) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)

tests/v1/kv_offload/test_cpu.py (new file, 175 lines)
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional
+
+import numpy as np
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
+                                          PrepareStoreOutput)
+from vllm.v1.kv_offload.backends.cpu import CPUBackend
+from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
+
+
+@dataclass
+class ExpectedPrepareStoreOutput:
+    block_hashes_to_store: list[int]
+    store_block_ids: list[int]
+    block_hashes_evicted: list[int]
+
+
+def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
+    return [BlockHash(str(i).encode()) for i in int_hashes]
+
+
+def verify_store_output(
+        prepare_store_output: Optional[PrepareStoreOutput],
+        expected_prepare_store_output: ExpectedPrepareStoreOutput):
+    assert prepare_store_output is not None
+    assert (prepare_store_output.block_hashes_to_store == to_hashes(
+        expected_prepare_store_output.block_hashes_to_store))
+    assert (prepare_store_output.block_hashes_evicted == to_hashes(
+        expected_prepare_store_output.block_hashes_evicted))
+    store_spec = prepare_store_output.store_spec
+    assert isinstance(store_spec, CPULoadStoreSpec)
+    expected_array = np.array(expected_prepare_store_output.store_block_ids,
+                              dtype=np.int64)
+    assert np.array_equal(expected_array, store_spec.block_ids)
+
+
+def verify_load_output(prepare_load_output: LoadStoreSpec,
+                       expected_prepare_load_output: list[int]):
+    assert isinstance(prepare_load_output, CPULoadStoreSpec)
+    expected_array = np.array(expected_prepare_load_output, dtype=np.int64)
+    assert np.array_equal(expected_array, prepare_load_output.block_ids)
+
+
+def verify_events(events: Iterable[OffloadingEvent],
+                  block_size: int,
+                  expected_stores: tuple[set[int], ...] = (),
+                  expected_evictions: tuple[set[int], ...] = ()):
+    stores: list[set[BlockHash]] = []
+    evictions: list[set[BlockHash]] = []
+    for event in events:
+        assert event.medium == CPULoadStoreSpec.medium()
+        assert event.block_size == block_size
+        if event.removed:
+            evictions.append(set(event.block_hashes))
+        else:
+            stores.append(set(event.block_hashes))
+
+    def to_hash_sets(
+            int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
+        return tuple([set(to_hashes(list(int_set))) for int_set in int_sets])
+
+    assert tuple(evictions) == to_hash_sets(expected_evictions)
+    assert tuple(stores) == to_hash_sets(expected_stores)
+
+
+def test_cpu_manager():
+    """
+    Tests LRUOffloadingManager with a CPUBackend.
+    """
+    # initialize a CPU backend with a capacity of 4 blocks
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    cpu_manager = LRUOffloadingManager(cpu_backend, enable_events=True)
+
+    # prepare store [1, 2]
+    prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[1, 2],
+            store_block_ids=[0, 1],
+            block_hashes_evicted=[],
+        ))
+
+    # lookup [1, 2] -> not ready
+    assert cpu_manager.lookup(to_hashes([1, 2])) == 0
+
+    # no events so far
+    assert list(cpu_manager.take_events()) == []
+
+    # complete store [1, 2]
+    cpu_manager.complete_store(to_hashes([1, 2]))
+    verify_events(cpu_manager.take_events(),
+                  block_size=block_size,
+                  expected_stores=({1, 2}, ))
+
+    # lookup [1, 2]
+    assert cpu_manager.lookup(to_hashes([1])) == 1
+    assert cpu_manager.lookup(to_hashes([1, 2])) == 2
+    assert cpu_manager.lookup(to_hashes([1, 2, 3])) == 2
+
+    # prepare store [2, 3, 4, 5] -> evicts [1]
+    prepare_store_output = cpu_manager.prepare_store(to_hashes([2, 3, 4, 5]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[3, 4, 5],
+            store_block_ids=[2, 3, 0],
+            block_hashes_evicted=[1],
+        ))
+
+    # verify eviction event
+    verify_events(cpu_manager.take_events(),
+                  block_size=block_size,
+                  expected_evictions=({1}, ))
+
+    # prepare store with no space
+    assert cpu_manager.prepare_store(to_hashes([1, 6])) is None
+
+    # complete store [2, 3, 4, 5]
+    cpu_manager.complete_store(to_hashes([2, 3, 4, 5]))
+
+    # prepare load [2, 3]
+    prepare_load_output = cpu_manager.prepare_load(to_hashes([2, 3]))
+    verify_load_output(prepare_load_output, [1, 2])
+
+    # prepare store with no space ([2, 3] is being loaded)
+    assert cpu_manager.prepare_store(to_hashes([6, 7, 8])) is None
+
+    # complete load [2, 3]
+    cpu_manager.complete_load(to_hashes([2, 3]))
+
+    # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest)
+    prepare_store_output = cpu_manager.prepare_store(to_hashes([6, 7, 8]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[6, 7, 8],
+            store_block_ids=[3, 2, 1],
+            block_hashes_evicted=[2, 3, 4],
+        ))
+
+    # complete store [6, 7, 8]
+    cpu_manager.complete_store(to_hashes([6, 7, 8]))
+
+    # touch [5, 6, 7] (move to end of LRU order)
+    cpu_manager.touch(to_hashes([5, 6, 7]))
+
+    # prepare store [7, 9] -> evicts [8] (oldest following previous touch)
+    prepare_store_output = cpu_manager.prepare_store(to_hashes([9]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[9],
+            store_block_ids=[1],
+            block_hashes_evicted=[8],
+        ))
+
+    # complete store [7, 9] with failure
+    cpu_manager.complete_store(to_hashes([7, 9]), success=False)
+
+    # assert [7] is still stored, but [9] is not
+    assert cpu_manager.lookup(to_hashes([7])) == 1
+    assert cpu_manager.lookup(to_hashes([9])) == 0
+
+    verify_events(cpu_manager.take_events(),
+                  block_size=block_size,
+                  expected_stores=({3, 4, 5}, {6, 7, 8}),
+                  expected_evictions=({2, 3, 4}, {8}))

@@ -1513,12 +1513,6 @@ class EngineArgs:
                                recommend_to_remove=False)
             return False

-        # No text embedding inputs so far.
-        if self.enable_prompt_embeds:
-            _raise_or_fallback(feature_name="--enable-prompt-embeds",
-                               recommend_to_remove=False)
-            return False
-
         # No Mamba or Encoder-Decoder so far.
         if not model_config.is_v1_compatible:
             _raise_or_fallback(feature_name=model_config.architectures,
@@ -1651,6 +1645,13 @@ class EngineArgs:
                     "models in V0 and has been disabled.")
                 self.enable_prefix_caching = False

+            if self.enable_prompt_embeds:
+                logger.warning(
+                    "--enable-prompt-embeds and --enable-prefix-caching "
+                    "are not supported together in V0. Prefix caching has "
+                    "been disabled.")
+                self.enable_prefix_caching = False
+
             # Set max_num_seqs to 256 for VLLM_V0.
             if self.max_num_seqs is None:
                 self.max_num_seqs = 256
@@ -1664,6 +1665,17 @@ class EngineArgs:
             # For pooling tasks the default is False
             if model_config.runner_type != "pooling":
                 self.enable_chunked_prefill = True
+
+            # TODO: When prefix caching supports prompt embeds inputs, this
+            # check can be removed.
+            if (self.enable_prompt_embeds
+                    and self.enable_prefix_caching is not False):
+                logger.warning(
+                    "--enable-prompt-embeds and --enable-prefix-caching "
+                    "are not supported together in V1. Prefix caching has "
+                    "been disabled.")
+                self.enable_prefix_caching = False
+
             if self.enable_prefix_caching is None:
                 self.enable_prefix_caching = True
         else:

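The V1 branch above only disables prefix caching when it was explicitly requested together with prompt embeds, so the combination produces a warning and a fallback rather than a hard failure. A hedged example of the argument combination that would hit this path, assuming the offline LLM entrypoint forwards these EngineArgs fields as it does for other flags:

    # Illustrative only: expected to log the warning above and fall back to
    # enable_prefix_caching=False rather than raise.
    from vllm import LLM

    llm = LLM(
        model="facebook/opt-125m",   # placeholder model name
        enable_prompt_embeds=True,
        enable_prefix_caching=True,
    )
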
@@ -31,6 +31,8 @@ from openai.types.responses import (
     ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
     ResponseStatus, ResponseWebSearchCallCompletedEvent,
     ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent)

 # Backward compatibility for OpenAI client versions
 try: # For older openai versions (< 1.100.0)
@@ -260,26 +262,6 @@ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
                                            ResponseReasoningItem,
                                            ResponseFunctionToolCall]

-StreamingResponsesResponse: TypeAlias = Union[
-    ResponseCreatedEvent,
-    ResponseInProgressEvent,
-    ResponseCompletedEvent,
-    ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent,
-    ResponseContentPartAddedEvent,
-    ResponseContentPartDoneEvent,
-    ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent,
-    ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallCodeDeltaEvent,
-    ResponseWebSearchCallInProgressEvent,
-    ResponseWebSearchCallSearchingEvent,
-    ResponseWebSearchCallCompletedEvent,
-    ResponseCodeInterpreterCallCodeDoneEvent,
-    ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterCallCompletedEvent,
-]
-

 class ResponsesRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
@@ -973,7 +955,6 @@ class CompletionRequest(OpenAIBaseModel):
     # https://platform.openai.com/docs/api-reference/completions/create
     model: Optional[str] = None
     prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
-    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
     best_of: Optional[int] = None
     echo: Optional[bool] = False
     frequency_penalty: Optional[float] = 0.0
@@ -1009,6 +990,7 @@ class CompletionRequest(OpenAIBaseModel):
     # --8<-- [end:completion-sampling-params]

     # --8<-- [start:completion-extra-params]
+    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
     add_special_tokens: bool = Field(
         default=True,
         description=(
@@ -1916,6 +1898,72 @@ class ResponsesResponse(OpenAIBaseModel):
         )


+# TODO: this code can be removed once
+# https://github.com/openai/openai-python/issues/2634 has been resolved
+class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
+    content_index: int
+    """The index of the content part that is done."""
+
+    item_id: str
+    """The ID of the output item that the content part was added to."""
+
+    output_index: int
+    """The index of the output item that the content part was added to."""
+
+    part: ResponseReasoningTextContent
+    """The content part that is done."""
+
+    sequence_number: int
+    """The sequence number of this event."""
+
+    type: Literal["response.reasoning_part.done"]
+    """The type of the event. Always `response.reasoning_part.done`."""
+
+
+# TODO: this code can be removed once
+# https://github.com/openai/openai-python/issues/2634 has been resolved
+class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
+    content_index: int
+    """The index of the content part that is done."""
+
+    item_id: str
+    """The ID of the output item that the content part was added to."""
+
+    output_index: int
+    """The index of the output item that the content part was added to."""
+
+    part: ResponseReasoningTextContent
+    """The content part that is done."""
+
+    sequence_number: int
+    """The sequence number of this event."""
+
+    type: Literal["response.reasoning_part.added"]
+    """The type of the event. Always `response.reasoning_part.added`."""
+
+
+StreamingResponsesResponse: TypeAlias = Union[
+    ResponseCreatedEvent,
+    ResponseInProgressEvent,
+    ResponseCompletedEvent,
+    ResponseOutputItemAddedEvent,
+    ResponseOutputItemDoneEvent,
+    ResponseContentPartAddedEvent,
+    ResponseContentPartDoneEvent,
+    ResponseReasoningTextDeltaEvent,
+    ResponseReasoningTextDoneEvent,
+    ResponseReasoningPartAddedEvent,
+    ResponseReasoningPartDoneEvent,
+    ResponseCodeInterpreterCallInProgressEvent,
+    ResponseCodeInterpreterCallCodeDeltaEvent,
+    ResponseWebSearchCallInProgressEvent,
+    ResponseWebSearchCallSearchingEvent,
+    ResponseWebSearchCallCompletedEvent,
+    ResponseCodeInterpreterCallCodeDoneEvent,
+    ResponseCodeInterpreterCallInterpretingEvent,
+    ResponseCodeInterpreterCallCompletedEvent,
+]
+
 BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
                               ScoreRequest, RerankRequest]

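Because the interim ResponseReasoningPartAddedEvent/ResponseReasoningPartDoneEvent models are not yet part of openai-python, a client that works with the parsed stream can still dispatch on the `type` string declared above. A minimal sketch, assuming events arrive as objects with the fields defined in these models:

    # Sketch only: `events` stands in for an already-parsed stream of
    # StreamingResponsesResponse objects.
    def collect_reasoning_parts(events):
        parts = []
        for event in events:
            if event.type == "response.reasoning_part.done":
                parts.append(event.part.text)  # finished reasoning text
        return parts
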
@@ -58,6 +58,8 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
                                               InputTokensDetails,
                                               OutputTokensDetails,
                                               RequestResponseMetadata,
+                                              ResponseReasoningPartAddedEvent,
+                                              ResponseReasoningPartDoneEvent,
                                               ResponsesRequest,
                                               ResponsesResponse, ResponseUsage,
                                               StreamingResponsesResponse)
@@ -1280,14 +1282,13 @@ class OpenAIServingResponses(OpenAIServing):
                 # Deal with tool call here
                 pass
             elif previous_item.channel == "analysis":
+                content = ResponseReasoningTextContent(
+                    text=previous_item.content[0].text,
+                    type="reasoning_text",
+                )
                 reasoning_item = ResponseReasoningItem(
                     type="reasoning",
-                    content=[
-                        ResponseReasoningTextContent(
-                            text=previous_item.content[0].text,
-                            type="reasoning_text",
-                        ),
-                    ],
+                    content=[content],
                     status="completed",
                     id=current_item_id,
                     summary=[],
@@ -1301,6 +1302,15 @@ class OpenAIServingResponses(OpenAIServing):
                         content_index=current_content_index,
                         text=previous_item.content[0].text,
                     ))
+                yield _increment_sequence_number_and_return(
+                    ResponseReasoningPartDoneEvent(
+                        type="response.reasoning_part.done",
+                        sequence_number=-1,
+                        item_id=current_item_id,
+                        output_index=current_output_index,
+                        content_index=current_content_index,
+                        part=content,
+                    ))
                 yield _increment_sequence_number_and_return(
                     ResponseOutputItemDoneEvent(
                         type="response.output_item.done",
@@ -1412,17 +1422,15 @@ class OpenAIServingResponses(OpenAIServing):
                     ))
                 current_content_index += 1
                 yield _increment_sequence_number_and_return(
-                    ResponseContentPartAddedEvent(
-                        type="response.content_part.added",
+                    ResponseReasoningPartAddedEvent(
+                        type="response.reasoning_part.added",
                         sequence_number=-1,
                         output_index=current_output_index,
                         item_id=current_item_id,
                         content_index=current_content_index,
-                        part=ResponseOutputText(
-                            type="output_text",
+                        part=ResponseReasoningTextContent(
                             text="",
-                            annotations=[],
-                            logprobs=[],
+                            type="reasoning_text",
                         ),
                     ))
                 yield _increment_sequence_number_and_return(

@@ -229,14 +229,15 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM):
         return logits

     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+
+        def transform(inputs):
+            name, loaded_weight = inputs
+            if "lm_head" not in name:
+                name = "model." + name
+            return name, loaded_weight
+
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=None,
         )
-        model_weights = {}
-        for name, loaded_weight in weights:
-            if "lm_head" not in name:
-                name = "model." + name
-            model_weights[name] = loaded_weight
-        loader.load_weights(model_weights.items())
+        loader.load_weights(map(transform, weights))

@@ -205,23 +205,21 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):

     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> None:
+
+        def transform(inputs):
+            name, loaded_weight = inputs
+            name, weight = self.permute_qk_weight_for_rotary(
+                name, loaded_weight)
+            if "lm_head" not in name:
+                name = "model." + name
+            return name, weight
+
         loader = AutoWeightsLoader(
             self,
             # lm_head is tied with target model (Llama4ForCausalLM)
             skip_prefixes=(["lm_head."]),
         )
-        model_weights = {}
-        weights = [
-            self.permute_qk_weight_for_rotary(name, loaded_weight)
-            for name, loaded_weight in weights
-        ]
-        for name, loaded_weight in weights:
-            if "lm_head" not in name:
-                name = "model." + name
-            model_weights[name] = loaded_weight
-
-        loader.load_weights(model_weights.items())
+        loader.load_weights(map(transform, weights))

     def get_input_embeddings(
         self,

@@ -158,14 +158,15 @@ class EagleLlamaForCausalLM(LlamaForCausalLM):
         return self.model(input_ids, positions, hidden_states)

     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+
+        def transform(inputs):
+            name, loaded_weight = inputs
+            if "lm_head" not in name:
+                name = "model." + name
+            return name, loaded_weight
+
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=None,
         )
-        model_weights = {}
-        for name, loaded_weight in weights:
-            if "lm_head" not in name:
-                name = "model." + name
-            model_weights[name] = loaded_weight
-        loader.load_weights(model_weights.items())
+        loader.load_weights(map(transform, weights))

@@ -12,7 +12,6 @@ from torch.nn import Parameter
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.logger import init_logger
-from vllm.model_executor.utils import _make_synced_weight_loader

 __all__ = [
     "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter",
@@ -53,8 +52,9 @@ class BasevLLMParameter(Parameter):
         # This sometimes causes OOM errors during model loading. To avoid this,
         # we sync the param tensor after its weight loader is called.
         from vllm.platforms import current_platform
-        if current_platform.is_tpu():
-            weight_loader = _make_synced_weight_loader(weight_loader)
+        if current_platform.use_sync_weight_loader():
+            weight_loader = current_platform.make_synced_weight_loader(
+                weight_loader)

         self._weight_loader = weight_loader
         self.tp_rank = get_tensor_model_parallel_rank()

@@ -44,23 +44,12 @@ def set_weight_attrs(
         # TODO(woosuk): Remove this hack once we have a better solution.
         from vllm.platforms import current_platform

-        if current_platform.is_tpu() and key == "weight_loader":
-            value = _make_synced_weight_loader(value)
+        if current_platform.use_sync_weight_loader(
+        ) and key == "weight_loader":
+            value = current_platform.make_synced_weight_loader(value)
         setattr(weight, key, value)


-def _make_synced_weight_loader(original_weight_loader):
-
-    def _synced_weight_loader(param, *args, **kwargs):
-        out = original_weight_loader(param, *args, **kwargs)
-        # torch._sync doesn't support, is not needed for CPU tensors.
-        if param.device != torch.device("cpu"):
-            torch._sync(param)
-        return out
-
-    return _synced_weight_loader
-
-
 def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]:
     parent_map = getattr(model, "packed_modules_mapping", None)
     parent_map = copy.deepcopy(parent_map) if parent_map is not None else {}

@@ -569,8 +569,8 @@ class MultiModalFieldConfig:
         Args:
             modality: The modality of the multi-modal item that uses this
                 keyword argument.
-            slices: For each multi-modal item, the size of the slice that
-                is used to extract the data corresponding to it.
+            size_per_item: For each multi-modal item, the size of the slice
+                that is used to extract the data corresponding to it.
             dim: The dimension to slice, default to 0.

         Example:
@@ -590,7 +590,7 @@ class MultiModalFieldConfig:

        ```
        Given:
-           slices: [3, 4, 2]
+           size_per_item: [3, 4, 2]
            dim: 1

        Input:

@@ -234,19 +234,6 @@ class MultiModalProfiler(Generic[_I]):
         prompt_token_ids = mm_inputs["prompt_token_ids"]
         total_len = len(prompt_token_ids)

-        # V0 does not support chunked prefill.
-        if total_len > seq_len and not envs.VLLM_USE_V1:
-            # `max_num_batched_tokens` is defined by `SchedulerConfig`
-            logger.warning_once(
-                "The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = %d) "  # noqa: E501
-                "is too short to hold the multi-modal embeddings in the worst case (%d tokens in total, out of which %s are reserved for multi-modal embeddings). "  # noqa: E501
-                "This may cause certain multi-modal inputs to fail during inference, even when the input text is short. "  # noqa: E501
-                "To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.",  # noqa: E501
-                seq_len,
-                total_len,
-                str(self._get_mm_num_tokens(mm_inputs)),
-            )
-
         if total_len < seq_len:
             prompt_token_ids.extend([0] * (seq_len - total_len))

@@ -270,22 +257,6 @@ class MultiModalProfiler(Generic[_I]):
             mm_counts=mm_counts,
         )
         if max_tokens_per_item is not None:
-            if mm_counts is None:
-                total_mm_tokens = sum(max_tokens_per_item.values())
-            else:
-                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
-                                      for k in max_tokens_per_item.keys()
-                                      & mm_counts.keys())
-            if total_mm_tokens > seq_len:
-                logger.warning_once(
-                    "The sequence length (%d) is smaller than the pre-defined"
-                    " worst-case total number of multimodal tokens (%d). "
-                    "This may cause certain multi-modal inputs to fail during "
-                    "inference. To avoid this, you should increase "
-                    "`max_model_len` or reduce `mm_counts`.",
-                    seq_len,
-                    total_mm_tokens,
-                )
             return max_tokens_per_item

         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

@@ -395,7 +395,9 @@ def group_mm_kwargs_by_modality(
     modality together into the same `MultiModalKwargs` instance.

     Args:
-        mm_inputs: List of `MultiModalKwargsItem`.
+        mm_kwargs: List of `MultiModalKwargsItem`.
+        device: The device to place the grouped tensors on.
+        pin_memory: Whether to pin memory for faster host-to-device transfer.

     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.

@@ -594,6 +594,29 @@ class Platform:
         """
         return False

+    @classmethod
+    def use_sync_weight_loader(cls) -> bool:
+        """
+        Returns if the current platform needs to sync weight loader.
+        """
+        return False
+
+    @classmethod
+    def make_synced_weight_loader(cls, original_weight_loader):
+        """
+        Wrap the original weight loader to make it synced.
+        """
+        if not cls.use_sync_weight_loader():
+            return original_weight_loader
+
+        def _synced_weight_loader(param, *args, **kwargs):
+            out = original_weight_loader(param, *args, **kwargs)
+            if param.device != torch.device("cpu"):
+                torch._sync(param)
+            return out
+
+        return _synced_weight_loader
+

 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED

@@ -226,6 +226,10 @@ class TpuPlatform(Platform):
             torch.ops.xla.dynamo_set_buffer_donor_(src_cache, True)
             dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()

+    @classmethod
+    def use_sync_weight_loader(cls) -> bool:
+        return True
+

 try:
     from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform

@@ -3443,3 +3443,30 @@ def decorate_logs(process_name: Optional[str] = None) -> None:
     pid = os.getpid()
     _add_prefix(sys.stdout, process_name, pid)
     _add_prefix(sys.stderr, process_name, pid)
+
+
+def length_from_prompt_token_ids_or_embeds(
+    prompt_token_ids: Optional[list[int]],
+    prompt_embeds: Optional[torch.Tensor],
+) -> int:
+    """Calculate the request length (in number of tokens) give either
+    prompt_token_ids or prompt_embeds.
+    """
+    prompt_token_len = None if prompt_token_ids is None else len(
+        prompt_token_ids)
+    prompt_embeds_len = \
+        None if prompt_embeds is None else len(prompt_embeds)
+
+    if prompt_token_len is None:
+        if prompt_embeds_len is None:
+            raise ValueError(
+                "Neither prompt_token_ids nor prompt_embeds were defined.")
+        return prompt_embeds_len
+    else:
+        if (prompt_embeds_len is not None
+                and prompt_embeds_len != prompt_token_len):
+            raise ValueError(
+                "Prompt token ids and prompt embeds had different lengths"
+                f" prompt_token_ids={prompt_token_len}"
+                f" prompt_embeds={prompt_embeds_len}")
+        return prompt_token_len

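The new helper treats the two prompt representations as interchangeable sources of length and rejects inconsistent inputs. A quick illustration of the intended behaviour, assuming the function is importable from vllm.utils as added above:

    import torch
    from vllm.utils import length_from_prompt_token_ids_or_embeds

    # Token-id prompts report their own length.
    assert length_from_prompt_token_ids_or_embeds([1, 2, 3], None) == 3
    # Embeds-only prompts fall back to the number of embedding rows.
    assert length_from_prompt_token_ids_or_embeds(None, torch.zeros(5, 16)) == 5
    # Passing neither, or mismatched lengths, raises ValueError.
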
@@ -11,6 +11,7 @@ from vllm._bc_linter import bc_linter_include
 if TYPE_CHECKING:
     import numpy as np
     import numpy.typing as npt
+    import torch

     from vllm.distributed.kv_transfer.kv_connector.v1.base import (
         KVConnectorMetadata)
@@ -26,13 +27,14 @@ if TYPE_CHECKING:
 class NewRequestData:

     req_id: str
-    prompt_token_ids: list[int]
+    prompt_token_ids: Optional[list[int]]
     mm_features: list[MultiModalFeatureSpec]
     sampling_params: Optional[SamplingParams]
     pooling_params: Optional[PoolingParams]
     block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     lora_request: Optional[LoRARequest]
+    prompt_embeds: Optional[torch.Tensor] = None

     @classmethod
     def from_request(
@@ -49,9 +51,12 @@ class NewRequestData:
             block_ids=block_ids,
             num_computed_tokens=request.num_computed_tokens,
             lora_request=request.lora_request,
+            prompt_embeds=request.prompt_embeds,
         )

-    def __repr__(self):
+    def __repr__(self) -> str:
+        prompt_embeds_shape = (self.prompt_embeds.shape
+                               if self.prompt_embeds else None)
         return (f"NewRequestData("
                 f"req_id={self.req_id},"
                 f"prompt_token_ids={self.prompt_token_ids},"
@@ -59,19 +64,26 @@ class NewRequestData:
                 f"sampling_params={self.sampling_params},"
                 f"block_ids={self.block_ids},"
                 f"num_computed_tokens={self.num_computed_tokens},"
-                f"lora_request={self.lora_request}"
+                f"lora_request={self.lora_request},"
+                f"prompt_embeds_shape={prompt_embeds_shape}"
                 ")")

     # Version of __repr__ with the prompt data obfuscated
-    def anon_repr(self):
+    def anon_repr(self) -> str:
+        prompt_token_ids_len = len(
+            self.prompt_token_ids
+        ) if self.prompt_token_ids is not None else None
+        prompt_embeds_shape = (self.prompt_embeds.shape
+                               if self.prompt_embeds else None)
         return (f"NewRequestData("
                 f"req_id={self.req_id},"
-                f"prompt_token_ids_len={len(self.prompt_token_ids)},"
+                f"prompt_token_ids_len={prompt_token_ids_len},"
                 f"mm_features={self.mm_features},"
                 f"sampling_params={self.sampling_params},"
                 f"block_ids={self.block_ids},"
                 f"num_computed_tokens={self.num_computed_tokens},"
-                f"lora_request={self.lora_request}"
+                f"lora_request={self.lora_request},"
+                f"prompt_embeds_shape={prompt_embeds_shape}"
                 ")")


@@ -47,7 +47,7 @@ class EngineCoreRequest(
         gc=False):  # type: ignore[call-arg]

     request_id: str
-    prompt_token_ids: list[int]
+    prompt_token_ids: Optional[list[int]]
     mm_features: Optional[list[MultiModalFeatureSpec]]
     sampling_params: Optional[SamplingParams]
     pooling_params: Optional[PoolingParams]
@@ -56,6 +56,7 @@ class EngineCoreRequest(
     lora_request: Optional[LoRARequest]
     cache_salt: Optional[str]
     data_parallel_rank: Optional[int]
+    prompt_embeds: Optional[torch.Tensor] = None

     # Index of the client, used to ensure outputs are sent back to the same
     # client for this request when scaling out the front-end.

@@ -13,6 +13,7 @@ from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.logger import init_logger
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreRequest

 logger = init_logger(__name__)
@@ -179,11 +180,12 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
         self.tokenizer: Tokenizer = tokenizer._tokenizer

         # Find a safe place to start.
-        prompt_suffix = request.prompt_token_ids
+        prompt_token_ids = request.prompt_token_ids or []
+        prompt_suffix = prompt_token_ids
         prompt_len = len(prompt_suffix)
         if prompt_len > 4:
             for i in range(4, min(prompt_len + 1, 24)):
-                suffix = request.prompt_token_ids[-i:]
+                suffix = prompt_token_ids[-i:]
                 if '�' not in self.tokenizer.decode(suffix):
                     prompt_suffix = suffix
                     break
@@ -260,16 +262,25 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
         params = request.sampling_params
         assert params is not None

-        # Metadata for incremental detokenization.
-        self.tokens, self.prefix_offset, self.read_offset = (
-            convert_prompt_ids_to_tokens(
-                tokenizer=tokenizer,
-                prompt_ids=request.prompt_token_ids,
-                skip_special_tokens=params.skip_special_tokens,
-            ))
-
-        self.token_ids.extend(request.prompt_token_ids)
-        self.prompt_len = len(request.prompt_token_ids)
+        self.prompt_len = length_from_prompt_token_ids_or_embeds(
+            request.prompt_token_ids, request.prompt_embeds)
+
+        # Metadata for incremental detokenization.
+        if request.prompt_token_ids is not None:
+            self.tokens, self.prefix_offset, self.read_offset = (
+                convert_prompt_ids_to_tokens(
+                    tokenizer=tokenizer,
+                    prompt_ids=request.prompt_token_ids,
+                    skip_special_tokens=params.skip_special_tokens,
+                ))
+        else:
+            # Prompt embedding requests cannot be detokenized, in general.
+            self.tokens = [""] * self.prompt_len
+            self.prefix_offset = 0
+            self.read_offest = 0
+
+        self.token_ids.extend(request.prompt_token_ids
+                              or [0] * self.prompt_len)

         self.skip_special_tokens = params.skip_special_tokens
         self.spaces_between_special_tokens = (

@@ -14,6 +14,7 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
                           extract_trace_context)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
 from vllm.v1.engine.detokenizer import IncrementalDetokenizer
 from vllm.v1.engine.logprobs import LogprobsProcessor
@@ -86,7 +87,8 @@ class RequestState:
         lora_name: Optional[str],
         output_kind: RequestOutputKind,
         prompt: Optional[str],
-        prompt_token_ids: list[int],
+        prompt_token_ids: Optional[list[int]],
+        prompt_embeds: Optional[torch.Tensor],
         logprobs_processor: Optional[LogprobsProcessor],
         detokenizer: Optional[IncrementalDetokenizer],
         max_tokens_param: Optional[int],
@@ -104,7 +106,9 @@ class RequestState:
         self.output_kind = output_kind
         self.prompt = prompt
         self.prompt_token_ids = prompt_token_ids
-        self.prompt_len = len(prompt_token_ids)
+        self.prompt_embeds = prompt_embeds
+        self.prompt_len = length_from_prompt_token_ids_or_embeds(
+            self.prompt_token_ids, self.prompt_embeds)
         self.logprobs_processor = logprobs_processor
         self.detokenizer = detokenizer
         self.max_tokens_param = max_tokens_param
@@ -165,6 +169,7 @@ class RequestState:
             output_kind=output_kind,
             prompt=prompt,
             prompt_token_ids=request.prompt_token_ids,
+            prompt_embeds=request.prompt_embeds,
             logprobs_processor=logprobs_processor,
             detokenizer=detokenizer,
             max_tokens_param=max_tokens_param,
@@ -223,6 +228,8 @@ class RequestState:
         first_output = outputs[0]
         if isinstance(first_output, PoolingOutput):
             assert len(outputs) == 1
+            # Prompt embeddings are currently not supported by pooling requests.
+            assert self.prompt_token_ids is not None
             return PoolingRequestOutput(
                 request_id=request_id,
                 outputs=first_output,
@@ -236,10 +243,15 @@ class RequestState:
         else:
             prompt_logprobs = self.logprobs_processor.prompt_logprobs

+            # If prompt embeds were used, put placeholder prompt token ids
+            prompt_token_ids = self.prompt_token_ids
+            if prompt_token_ids is None and self.prompt_embeds is not None:
+                prompt_token_ids = [0] * len(self.prompt_embeds)
+
             return RequestOutput(
                 request_id=request_id,
                 prompt=self.prompt,
-                prompt_token_ids=self.prompt_token_ids,
+                prompt_token_ids=prompt_token_ids,
                 prompt_logprobs=prompt_logprobs,
                 outputs=cast(list[CompletionOutput], outputs),
                 finished=finished,
@@ -469,6 +481,8 @@ class OutputProcessor:

         arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
         trace_context = extract_trace_context(engine_core_output.trace_headers)
+        prompt_length = length_from_prompt_token_ids_or_embeds(
+            req_state.prompt_token_ids, req_state.prompt_embeds)
         with (self.tracer.start_as_current_span(
                 "llm_request",
                 kind=SpanKind.SERVER,
@@ -488,7 +502,7 @@ class OutputProcessor:
             span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
                                queued_time)
             span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
-                               len(req_state.prompt_token_ids))
+                               prompt_length)
             span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
                                metrics.num_generation_tokens)
             span.set_attribute(
@@ -544,7 +558,8 @@ class OutputProcessor:
             assert req_state.stats is not None
             iteration_stats.update_from_finished_request(
                 finish_reason=finish_reason,
-                num_prompt_tokens=len(req_state.prompt_token_ids),
+                num_prompt_tokens=length_from_prompt_token_ids_or_embeds(
+                    req_state.prompt_token_ids, req_state.prompt_embeds),
                 max_tokens_param=req_state.max_tokens_param,
                 req_stats=req_state.stats)
             self.lora_states.finish_request(req_state)

@ -19,6 +19,7 @@ from vllm.multimodal.utils import argsort_mm_positions
|
|||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
|
from vllm.utils import length_from_prompt_token_ids_or_embeds
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
from vllm.v1.structured_output.backend_guidance import (
|
from vllm.v1.structured_output.backend_guidance import (
|
||||||
validate_guidance_grammar)
|
validate_guidance_grammar)
|
||||||
@ -390,6 +391,16 @@ class Processor:
         self._validate_model_inputs(processed_inputs)

         encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
+        # Mypy does not always properly infer the types of some elements of
+        # discriminated unions of TypedDicts, because of how it handles
+        # inheritance of TypedDict. If we explicitly extract the items we want
+        # we can avoid type errors from using `dict.get` later in the method.
+        prompt_str: Optional[str] = None if decoder_inputs[
+            "type"] == "embeds" else decoder_inputs.get("prompt")
+        prompt_token_ids = decoder_inputs[
+            "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
+        prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
+            "type"] == "embeds" else None

         sampling_params = None
         pooling_params = None
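The comment block added above explains why the fields are pulled out of decoder_inputs up front instead of calling dict.get on the union later. A self-contained illustration of the narrowing pattern (the TypedDict names here are hypothetical stand-ins, not vLLM's real input types):

# Hedged illustration of narrowing a discriminated union of TypedDicts.
from typing import Literal, Optional, TypedDict, Union

class TokenInputs(TypedDict):
    type: Literal["token"]
    prompt: str
    prompt_token_ids: list[int]

class EmbedsInputs(TypedDict):
    type: Literal["embeds"]
    prompt_embeds: object  # a torch.Tensor in the real code

DecoderInputs = Union[TokenInputs, EmbedsInputs]

def extract(inputs: DecoderInputs) -> tuple[Optional[str], Optional[list[int]]]:
    # Comparing the "type" tag lets mypy narrow the union before indexing,
    # which a bare inputs.get(...) on the whole union would not.
    if inputs["type"] == "embeds":
        return None, None
    return inputs["prompt"], inputs["prompt_token_ids"]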
@ -398,9 +409,10 @@ class Processor:
             sampling_params = params.clone()
             # If unset max tokens, then generate up to the max_model_len.
             if sampling_params.max_tokens is None:
-                sampling_params.max_tokens = (
-                    self.model_config.max_model_len -
-                    len(decoder_inputs["prompt_token_ids"]))
+                seq_len = length_from_prompt_token_ids_or_embeds(
+                    prompt_token_ids, prompt_embeds)
+                sampling_params.max_tokens = \
+                    self.model_config.max_model_len - seq_len
             sampling_params.update_from_generation_config(
                 self.generation_config_fields, eos_token_id)
             if self.tokenizer is not None:
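The default max_tokens is now derived from the prompt length whether the prompt arrived as token ids or as embeddings. A quick illustrative calculation (the numbers are made up, not vLLM defaults):

# Illustrative arithmetic only.
max_model_len = 4096
seq_len = 1000   # length_from_prompt_token_ids_or_embeds(prompt_token_ids, prompt_embeds)
max_tokens = max_model_len - seq_len   # 3096 tokens left for generation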
@ -430,9 +442,10 @@ class Processor:
                         identifier=decoder_mm_hashes[modality][idx],
                         mm_position=decoder_mm_positions[modality][idx]))

-        return decoder_inputs.get("prompt"), EngineCoreRequest(
+        return prompt_str, EngineCoreRequest(
             request_id=request_id,
-            prompt_token_ids=decoder_inputs["prompt_token_ids"],
+            prompt_token_ids=prompt_token_ids,
+            prompt_embeds=prompt_embeds,
             mm_features=mm_features,
             sampling_params=sampling_params,
             pooling_params=pooling_params,
@ -461,10 +474,17 @@ class Processor:
     ):
         model_config = self.model_config

-        prompt_ids = prompt_inputs["prompt_token_ids"]
+        prompt_ids = None if prompt_inputs[
+            "type"] == "embeds" else prompt_inputs["prompt_token_ids"]
+        prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[
+            "type"] == "embeds" else None
+        prompt_len = length_from_prompt_token_ids_or_embeds(
+            prompt_ids, prompt_embeds)
         if not prompt_ids:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 pass  # Mllama may have empty encoder inputs for text-only data
+            elif prompt_inputs["type"] == "embeds":
+                pass  # Prompt embeds should not have prompt_ids.
             else:
                 raise ValueError(f"The {prompt_type} prompt cannot be empty")

@ -472,7 +492,7 @@ class Processor:
             tokenizer = None
         else:
             tokenizer = self.tokenizer
-            max_input_id = max(prompt_ids, default=0)
+            max_input_id = max(prompt_ids or [], default=0)

             # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
             # self.model_config.get_vocab_size() is the model’s vocab size.
@ -490,7 +510,7 @@ class Processor:
                     f"Token id {max_input_id} is out of vocabulary")

         max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) > max_prompt_len:
+        if prompt_len > max_prompt_len:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
@ -514,7 +534,7 @@ class Processor:
                     "number of text tokens.")

             raise ValueError(
-                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
+                f"The {prompt_type} prompt (length {prompt_len}) is "
                 f"longer than the maximum model length of {max_prompt_len}. "
                 f"{suggestion}")

vllm/v1/kv_offload/backend.py (new file, 96 lines)
@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ctypes
from abc import ABC, abstractmethod
from collections.abc import Iterable

from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.kv_offload.abstract import LoadStoreSpec


class BlockStatus(ctypes.Structure):
    """
    Offloading status for a single block of KV data.
    Holds the following information:

    ref_cnt - the current number of transfers using this block as a source.
        A value of -1 indicates the block is not yet ready to be read.
    load_store_spec - backend-specific information on how to actually
        read/write the block.
    """
    _fields_ = [("ref_cnt", ctypes.c_int32)]

    def __init__(self):
        super().__init__()
        # initialize block as "not ready" (ref_cnt = -1)
        self.ref_cnt = -1

    @property
    def is_ready(self) -> bool:
        """
        Returns whether the block is ready to be read.
        """
        return self.ref_cnt >= 0


class Backend(ABC):
    """
    An abstract class for allocating and returning specs for writing
    KV blocks to some backend.
    """

    def __init__(self, block_size: int, medium: str):
        self.block_size = block_size
        self.medium = medium

    @abstractmethod
    def get_num_free_blocks(self):
        """
        Returns the current number of blocks that can be allocated.
        """
        pass

    @abstractmethod
    def allocate_blocks(self,
                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
        """
        Allocate space for writing blocks.
        This method assumes there is enough space for allocation.
        It is unsafe to use without checking get_num_free_blocks beforehand.

        Args:
            block_hashes: the hashes identifying the blocks to be written.

        Returns:
            A list of BlockStatus for the allocated blocks.
            The ref_cnt of each returned item will be -1, meaning the block
            is not yet ready to be read.
        """
        pass

    @abstractmethod
    def free(self, block: BlockStatus):
        """
        Free a previously allocated block.
        You should only call this function with blocks returned by
        allocate_blocks, and only once per each block.

        Args:
            block: The block to be freed.
        """
        pass

    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
        """
        Get backend-specific information on how to read/write blocks.

        Args:
            block_hashes: the list of block hashes identifying the blocks.
            blocks: the list of blocks.

        Returns:
            A LoadStoreSpec that can be used by a worker
            to read/write the blocks.
        """
        raise NotImplementedError
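The Backend ABC above only fixes the allocation contract; how a caller is expected to drive it follows from the docstrings (check capacity, allocate, write, then mark blocks ready). A hedged usage sketch, with `backend` standing for any concrete subclass and the actual write step elided:

# Hedged sketch of the calling convention implied by the Backend docstrings;
# not code from this PR.
def store_blocks(backend, block_hashes):
    if backend.get_num_free_blocks() < len(block_hashes):
        return None  # allocate_blocks() is unsafe without this check
    blocks = backend.allocate_blocks(block_hashes)
    spec = backend.get_load_store_spec(block_hashes, blocks)
    # ... a worker writes the KV data according to `spec` ...
    for block in blocks:
        block.ref_cnt = 0  # ref_cnt moves from -1 ("not ready") to readable
    return spec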
vllm/v1/kv_offload/backends/cpu.py (new file, 61 lines)
@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ctypes
from collections.abc import Iterable

from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.kv_offload.abstract import LoadStoreSpec
from vllm.v1.kv_offload.backend import Backend, BlockStatus
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec


class CPUBlockStatus(BlockStatus):
    _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64)
                                       ]  # type: ignore

    def __init__(self, block_id: int):
        super().__init__()
        self.block_id = block_id


class CPUBackend(Backend):

    def __init__(self, block_size: int, num_blocks: int):
        super().__init__(block_size=block_size,
                         medium=CPULoadStoreSpec.medium())

        self.num_blocks: int = num_blocks
        self.num_allocated_blocks: int = 0
        self.allocated_blocks_free_list: list[int] = []

    def get_num_free_blocks(self):
        return (len(self.allocated_blocks_free_list) + self.num_blocks -
                self.num_allocated_blocks)

    def allocate_blocks(self,
                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
        num_fresh_blocks = min(len(block_hashes),
                               self.num_blocks - self.num_allocated_blocks)
        num_reused_blocks = len(block_hashes) - num_fresh_blocks
        assert len(self.allocated_blocks_free_list) >= num_reused_blocks

        # allocate fresh blocks
        blocks: list[BlockStatus] = []
        for _ in range(num_fresh_blocks):
            blocks.append(CPUBlockStatus(self.num_allocated_blocks))
            self.num_allocated_blocks += 1

        # allocate reused blocks
        for _ in range(num_reused_blocks):
            block_id = self.allocated_blocks_free_list.pop()
            blocks.append(CPUBlockStatus(block_id))

        return blocks

    def free(self, block: BlockStatus):
        assert isinstance(block, CPUBlockStatus)
        self.allocated_blocks_free_list.append(block.block_id)

    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
        return CPULoadStoreSpec([block.block_id for block in blocks])
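A short usage sketch of CPUBackend (not part of the PR; the byte strings stand in for real BlockHash values):

# Hedged example of the CPU backend's block-id bookkeeping.
backend = CPUBackend(block_size=16, num_blocks=4)
hashes = [b"h0", b"h1", b"h2"]                      # placeholder BlockHash values
blocks = backend.allocate_blocks(hashes)            # fresh block ids 0, 1, 2
spec = backend.get_load_store_spec(hashes, blocks)  # CPULoadStoreSpec([0, 1, 2])
backend.free(blocks[1])                             # id 1 goes to the free list
assert backend.get_num_free_blocks() == 2           # 1 reusable + 1 never allocated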
vllm/v1/kv_offload/lru_manager.py (new file, 132 lines)
@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections import OrderedDict
from collections.abc import Iterable
from typing import Optional

from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
                                         OffloadingManager, PrepareStoreOutput)
from vllm.v1.kv_offload.backend import Backend, BlockStatus


class LRUOffloadingManager(OffloadingManager):
    """
    An OffloadingManager with a pluggable backend, which evicts blocks by LRU.
    """

    def __init__(self, backend: Backend, enable_events: bool = False):
        self.backend: Backend = backend
        # block_hash -> BlockStatus
        self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
        self.events: Optional[list[OffloadingEvent]] = \
            [] if enable_events else None

    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
        hit_count = 0
        for block_hash in block_hashes:
            block = self.blocks.get(block_hash)
            if block is None or not block.is_ready:
                break
            hit_count += 1
        return hit_count

    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
        blocks = []
        for block_hash in block_hashes:
            block = self.blocks[block_hash]
            assert block.is_ready
            block.ref_cnt += 1
            blocks.append(block)

        return self.backend.get_load_store_spec(block_hashes, blocks)

    def touch(self, block_hashes: Iterable[BlockHash]):
        for block_hash in reversed(list(block_hashes)):
            if self.blocks.get(block_hash):
                self.blocks.move_to_end(block_hash)

    def complete_load(self, block_hashes: Iterable[BlockHash]):
        for block_hash in block_hashes:
            block = self.blocks[block_hash]
            assert block.ref_cnt > 0
            block.ref_cnt -= 1

    def prepare_store(
            self,
            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
        # filter out blocks that are already stored
        block_hashes_to_store = [
            block_hash for block_hash in block_hashes
            if block_hash not in self.blocks
        ]

        num_blocks_to_evict = (len(block_hashes_to_store) -
                               self.backend.get_num_free_blocks())

        # build list of blocks to evict
        to_evict = []
        if num_blocks_to_evict > 0:
            for block_hash, block in self.blocks.items():
                if block.ref_cnt == 0:
                    to_evict.append(block_hash)
                    num_blocks_to_evict -= 1
                    if num_blocks_to_evict == 0:
                        break
            else:
                # we could not evict enough blocks
                return None

        # evict blocks
        for block_hash in to_evict:
            self.backend.free(self.blocks.pop(block_hash))

        if to_evict and self.events is not None:
            self.events.append(
                OffloadingEvent(block_hashes=to_evict,
                                block_size=self.backend.block_size,
                                medium=self.backend.medium,
                                removed=True))

        blocks = self.backend.allocate_blocks(block_hashes_to_store)
        assert len(blocks) == len(block_hashes_to_store)

        for block_hash, block in zip(block_hashes_to_store, blocks):
            self.blocks[block_hash] = block

        # build store specs for allocated blocks
        store_spec = self.backend.get_load_store_spec(block_hashes_to_store,
                                                      blocks)

        return PrepareStoreOutput(block_hashes_to_store=block_hashes_to_store,
                                  store_spec=store_spec,
                                  block_hashes_evicted=to_evict)

    def complete_store(self,
                       block_hashes: Iterable[BlockHash],
                       success: bool = True):
        stored_block_hashes: list[BlockHash] = []
        if success:
            for block_hash in block_hashes:
                block = self.blocks[block_hash]
                if not block.is_ready:
                    block.ref_cnt = 0
                    stored_block_hashes.append(block_hash)
        else:
            for block_hash in block_hashes:
                block = self.blocks[block_hash]
                if not block.is_ready:
                    self.backend.free(block)
                    del self.blocks[block_hash]

        if stored_block_hashes and self.events is not None:
            self.events.append(
                OffloadingEvent(block_hashes=stored_block_hashes,
                                block_size=self.backend.block_size,
                                medium=self.backend.medium,
                                removed=False))

    def take_events(self) -> Iterable[OffloadingEvent]:
        if self.events is not None:
            yield from self.events
            self.events.clear()
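A hedged end-to-end sketch of how the LRU manager, the Backend contract, and the CPU backend fit together (again with placeholder hashes; real callers pass BlockHash values and workers perform the actual data movement):

# Hedged usage sketch, not code from this PR.
manager = LRUOffloadingManager(CPUBackend(block_size=16, num_blocks=2),
                               enable_events=True)
out = manager.prepare_store([b"a", b"b"])          # allocates space, returns store spec
manager.complete_store(out.block_hashes_to_store)  # blocks become ready (ref_cnt = 0)
assert manager.lookup([b"a", b"b"]) == 2           # both blocks now count as hits
spec = manager.prepare_load([b"a"])                # pins "a" while a worker reads it
manager.complete_load([b"a"])                      # unpins it
manager.prepare_store([b"c"])                      # evicts the oldest unpinned block
events = list(manager.take_events())               # one store event, one eviction event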
@ -7,9 +7,12 @@ from collections.abc import Mapping
 from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union

+import torch
+
 from vllm.multimodal.inputs import MultiModalFeatureSpec
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType,
                             EngineCoreRequest, FinishReason)
 from vllm.v1.structured_output.request import StructuredOutputRequest
@ -25,12 +28,13 @@ class Request:
     def __init__(
         self,
         request_id: str,
-        prompt_token_ids: list[int],
+        prompt_token_ids: Optional[list[int]],
         sampling_params: Optional[SamplingParams],
         pooling_params: Optional[PoolingParams],
         eos_token_id: Optional[int],
         client_index: int = 0,
         arrival_time: Optional[float] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
         mm_features: Optional[list[MultiModalFeatureSpec]] = None,
         lora_request: Optional["LoRARequest"] = None,
         structured_output_request: Optional["StructuredOutputRequest"] = None,
@ -79,9 +83,13 @@ class Request:
                 "sampling_params and pooling_params can't both be unset")

         self.prompt_token_ids = prompt_token_ids
-        self.num_prompt_tokens = len(self.prompt_token_ids)
+        self.prompt_embeds = prompt_embeds
+        self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+            prompt_token_ids, prompt_embeds)
         self._output_token_ids: list[int] = []
-        self._all_token_ids: list[int] = self.prompt_token_ids.copy()
+        self._all_token_ids: list[int] = self.prompt_token_ids.copy(
+        ) if self.prompt_token_ids is not None else [0
+                                                     ] * self.num_prompt_tokens
         self.num_output_placeholders = 0  # Used in async scheduling.
         self.spec_token_ids: list[int] = []
         self.num_computed_tokens = 0
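For an embeds-only request there are no real prompt token ids, so the running token-id list is seeded with placeholder zeros of matching length. A condensed illustration of the new initialization (values are illustrative):

# Illustrative only: bookkeeping for a request defined purely by prompt_embeds.
prompt_token_ids = None
num_prompt_tokens = 5   # length_from_prompt_token_ids_or_embeds(None, prompt_embeds)
all_token_ids = (prompt_token_ids.copy()
                 if prompt_token_ids is not None
                 else [0] * num_prompt_tokens)   # [0, 0, 0, 0, 0]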
@ -123,6 +131,7 @@ class Request:
             request_id=request.request_id,
             client_index=request.client_index,
             prompt_token_ids=request.prompt_token_ids,
+            prompt_embeds=request.prompt_embeds,
             mm_features=request.mm_features,
             sampling_params=request.sampling_params,
             pooling_params=request.pooling_params,
@ -243,7 +243,7 @@ class AdapterLogitsProcessor(LogitsProcessor):
     def _new_state(
         self,
         params: SamplingParams,
-        prompt_ids: list[int],
+        prompt_ids: Optional[list[int]],
         output_ids: list[int],
     ) -> Optional[partial[torch.Tensor]]:
         """Return state representation for new request
@ -187,7 +187,8 @@ class MinTokensLogitsProcessor(LogitsProcessor):

     @staticmethod
     def add_request(
-        params: SamplingParams, _: list[int], output_tok_ids: list[int]
+        params: SamplingParams, _: Optional[list[int]],
+        output_tok_ids: list[int]
     ) -> Optional[tuple[int, Sequence[int], set[int]]]:
         min_tokens = params.min_tokens
         if not min_tokens or len(output_tok_ids) >= min_tokens:
@ -234,7 +235,8 @@ class MinTokensLogitsProcessor(LogitsProcessor):

 def process_dict_updates(
     req_entries: dict[int, T], batch_update: Optional[BatchUpdate],
-    new_state: Callable[[SamplingParams, list[int], list[int]], Optional[T]]
+    new_state: Callable[[SamplingParams, Optional[list[int]], list[int]],
+                        Optional[T]]
 ) -> bool:
     """Utility function to update dict state for sparse LogitsProcessors."""

@ -26,7 +26,7 @@ RemovedRequest = int

 # (index, params, prompt_tok_ids, output_tok_ids) tuples for new
 # requests added to the batch.
-AddedRequest = tuple[int, SamplingParams, list[int], list[int]]
+AddedRequest = tuple[int, SamplingParams, Optional[list[int]], list[int]]

 # (index 1, index 2, directionality) tuples representing
 # one-way moves or two-way swaps of requests in batch
@ -174,7 +174,7 @@ class MsgpackEncoder:
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
         # view the tensor as a contiguous 1D array of bytes
-        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
+        arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy()
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
             data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)
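The added .cpu() matters because NumPy can only view host-memory tensors, and prompt embeddings may arrive on an accelerator device. A small stand-alone illustration (not the encoder itself):

# Illustrative only: .numpy() raises on a CUDA tensor, so the encoder copies
# to host memory before taking the uint8 byte view.
import torch

def to_byte_view(t: torch.Tensor):
    return t.flatten().contiguous().cpu().view(torch.uint8).numpy()

# e.g. to_byte_view(torch.randn(4, 8, device="cuda")) works, while the same
# chain without .cpu() fails with "can't convert cuda:0 device type tensor to numpy".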
@ -13,7 +13,7 @@ from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams, SamplingType
-from vllm.utils import swap_dict_values
+from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
@ -29,7 +29,7 @@ from vllm.v1.worker.block_table import MultiGroupBlockTable
 class CachedRequestState:

     req_id: str
-    prompt_token_ids: list[int]
+    prompt_token_ids: Optional[list[int]]
     mm_features: list[MultiModalFeatureSpec]
     sampling_params: Optional[SamplingParams]
     pooling_params: Optional[PoolingParams]
@ -43,9 +43,11 @@ class CachedRequestState:
     mrope_position_delta: Optional[int] = None

     lora_request: Optional[LoRARequest] = None
+    prompt_embeds: Optional[torch.Tensor] = None

     def __post_init__(self):
-        self.num_prompt_tokens = len(self.prompt_token_ids)
+        self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+            self.prompt_token_ids, self.prompt_embeds)

     @property
     def num_tokens(self) -> int:
@ -63,6 +65,10 @@ class CachedRequestState:

     def get_token_id(self, idx: int) -> int:
         if idx < self.num_prompt_tokens:
+            if self.prompt_token_ids is None:
+                raise ValueError(
+                    f"Tried to access token index {idx}, but that token was "
+                    "provided via prompt_embeds, and its ID is unknown.")
             return self.prompt_token_ids[idx]
         elif idx - self.num_prompt_tokens < len(self.output_token_ids):
             return self.output_token_ids[idx - self.num_prompt_tokens]
@ -109,6 +115,14 @@ class InputBatch:
             pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
+        self.is_token_ids = torch.zeros((max_num_reqs, max_model_len),
+                                        device="cpu",
+                                        dtype=bool,
+                                        pin_memory=False)
+        # Store prompt embeddings per request to avoid OOM from large upfront
+        # allocation if max_model_len is big.
+        # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
+        self.req_prompt_embeds: dict[int, torch.Tensor] = {}
         self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
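The comment above motivates a dict keyed by request index instead of one dense preallocated embeddings buffer. Rough illustrative sizing (hypothetical parameters, not vLLM defaults):

# Illustrative arithmetic only.
max_num_reqs, max_model_len, hidden_size, bytes_per_elem = 1024, 32768, 4096, 2
dense_bytes = max_num_reqs * max_model_len * hidden_size * bytes_per_elem
print(dense_bytes / 2**30)   # 256.0 GiB if fully preallocated, versus only the
                             # (num_prompt_tokens, hidden_size) tensors in use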
@ -310,15 +324,23 @@ class InputBatch:
         self.req_id_to_index[req_id] = req_index

         # Copy the prompt token ids and output token ids.
-        num_prompt_tokens = len(request.prompt_token_ids)
+        num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+            request.prompt_token_ids, request.prompt_embeds)
         self.num_prompt_tokens[req_index] = num_prompt_tokens
-        self.token_ids_cpu[
-            req_index, :num_prompt_tokens] = request.prompt_token_ids
         start_idx = num_prompt_tokens
         end_idx = start_idx + len(request.output_token_ids)
+        if request.prompt_token_ids is not None:
+            self.token_ids_cpu[
+                req_index, :num_prompt_tokens] = request.prompt_token_ids
+            self.is_token_ids[req_index, :num_prompt_tokens] = True
+        else:
+            self.is_token_ids[req_index, :num_prompt_tokens] = False
+        if request.prompt_embeds is not None:
+            self.req_prompt_embeds[req_index] = request.prompt_embeds
         self.token_ids_cpu[req_index,
                            start_idx:end_idx] = request.output_token_ids
-        # Number of token ids in token_ids_cpu.
+        self.is_token_ids[req_index, start_idx:end_idx] = True
+        # Number of token ids in prompt (token_ids_cpu or prompt_embeds).
         # NOTE(woosuk): This may include spec decode tokens.
         self.num_tokens[req_index] = request.num_tokens
         # Number of tokens without spec decode tokens.
@ -503,6 +525,20 @@ class InputBatch:
         self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
         self.token_ids_cpu[i2, ...] = tmp

+        self.is_token_ids[[i1, i2], ...] = self.is_token_ids[[i2, i1], ...]
+
+        # Swap prompt embeddings if they exist
+        embeds_i1 = self.req_prompt_embeds.get(i1)
+        embeds_i2 = self.req_prompt_embeds.get(i2)
+        if embeds_i1 is not None:
+            self.req_prompt_embeds[i2] = embeds_i1
+        else:
+            self.req_prompt_embeds.pop(i2, None)
+        if embeds_i2 is not None:
+            self.req_prompt_embeds[i1] = embeds_i2
+        else:
+            self.req_prompt_embeds.pop(i1, None)
+
         self.block_table.swap_row(i1, i2)

         self.request_lora_mapping[i1], self.request_lora_mapping[i2] = \
@ -592,6 +628,11 @@ class InputBatch:
             num_tokens = self.num_tokens[last_req_index]
             self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
                 last_req_index, :num_tokens]
+            self.is_token_ids[empty_index, :num_tokens] = self.is_token_ids[
+                last_req_index, :num_tokens]
+            if last_req_index in self.req_prompt_embeds:
+                self.req_prompt_embeds[
+                    empty_index] = self.req_prompt_embeds.pop(last_req_index)
             self.num_tokens[empty_index] = num_tokens
             self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[
                 last_req_index]
@ -9,7 +9,7 @@ import torch

 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingType
-from vllm.utils import swap_dict_values
+from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.worker.block_table import MultiGroupBlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState
@ -213,7 +213,9 @@ class InputBatch:
         self.req_id_to_index[req_id] = req_index

         # Copy the prompt token ids and output token ids.
-        num_prompt_tokens = len(request.prompt_token_ids)
+        num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+            request.prompt_token_ids, request.prompt_embeds)
+        # TODO: copy prompt_embeds
         self.num_prompt_tokens[req_index] = num_prompt_tokens
         self.token_ids_cpu[
             req_index, :num_prompt_tokens] = request.prompt_token_ids
@ -387,6 +387,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
                 prompt_token_ids=new_req_data.prompt_token_ids,
+                prompt_embeds=new_req_data.prompt_embeds,
                 mm_features=new_req_data.mm_features,
                 sampling_params=sampling_params,
                 pooling_params=None,