mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-20 15:27:07 +08:00
[Input] Remove unused prompt field (#26097)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
7e4b1861c3
commit
ae03f4c010
@ -37,4 +37,5 @@ def test_multimodal_processor(model_id):
|
|||||||
hf_processor_mm_kwargs={},
|
hf_processor_mm_kwargs={},
|
||||||
)
|
)
|
||||||
|
|
||||||
assert str_processed_inputs["prompt"] == ids_processed_inputs["prompt"]
|
assert (str_processed_inputs["prompt_token_ids"]
|
||||||
|
== ids_processed_inputs["prompt_token_ids"])
|
||||||
|
|||||||
@ -94,10 +94,15 @@ class EngineClient(ABC):
|
|||||||
# this happens again in generation, so the double expansion causes
|
# this happens again in generation, so the double expansion causes
|
||||||
# a mismatch.
|
# a mismatch.
|
||||||
# TODO - would be ideal to handle this more gracefully.
|
# TODO - would be ideal to handle this more gracefully.
|
||||||
prompt_token_ids = prompt.get("prompt_token_ids")
|
if isinstance(prompt, str):
|
||||||
multi_modal_data = prompt.get("multi_modal_data")
|
prompt_text = prompt
|
||||||
|
prompt_token_ids = []
|
||||||
|
multi_modal_data = None
|
||||||
|
else:
|
||||||
|
prompt_text = prompt.get("prompt")
|
||||||
|
prompt_token_ids = prompt.get("prompt_token_ids", [])
|
||||||
|
multi_modal_data = prompt.get("multi_modal_data")
|
||||||
|
|
||||||
prompt_text = processed_inputs.get("prompt")
|
|
||||||
mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs")
|
mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs")
|
||||||
|
|
||||||
tokenized_length = len(prompt_token_ids)
|
tokenized_length = len(prompt_token_ids)
|
||||||
|
|||||||
@ -205,11 +205,6 @@ class TokenInputs(TypedDict):
|
|||||||
prompt_token_ids: list[int]
|
prompt_token_ids: list[int]
|
||||||
"""The token IDs of the prompt."""
|
"""The token IDs of the prompt."""
|
||||||
|
|
||||||
prompt: NotRequired[str]
|
|
||||||
"""
|
|
||||||
The original prompt text corresponding to the token IDs, if available.
|
|
||||||
"""
|
|
||||||
|
|
||||||
cache_salt: NotRequired[str]
|
cache_salt: NotRequired[str]
|
||||||
"""
|
"""
|
||||||
Optional cache salt to be used for prefix caching.
|
Optional cache salt to be used for prefix caching.
|
||||||
@ -218,15 +213,12 @@ class TokenInputs(TypedDict):
|
|||||||
|
|
||||||
def token_inputs(
|
def token_inputs(
|
||||||
prompt_token_ids: list[int],
|
prompt_token_ids: list[int],
|
||||||
prompt: Optional[str] = None,
|
|
||||||
cache_salt: Optional[str] = None,
|
cache_salt: Optional[str] = None,
|
||||||
) -> TokenInputs:
|
) -> TokenInputs:
|
||||||
"""Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
|
"""Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
|
||||||
values."""
|
values."""
|
||||||
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
|
inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
|
||||||
|
|
||||||
if prompt is not None:
|
|
||||||
inputs["prompt"] = prompt
|
|
||||||
if cache_salt is not None:
|
if cache_salt is not None:
|
||||||
inputs["cache_salt"] = cache_salt
|
inputs["cache_salt"] = cache_salt
|
||||||
|
|
||||||
|
|||||||
@ -16,9 +16,10 @@ from vllm.multimodal.processing import BaseMultiModalProcessor
|
|||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
|
|
||||||
from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
|
from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
|
||||||
EncoderDecoderInputs, ProcessorInputs, PromptType,
|
EncoderDecoderInputs, ExplicitEncoderDecoderPrompt,
|
||||||
SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs,
|
ProcessorInputs, PromptType, SingletonInputs,
|
||||||
TokensPrompt, embeds_inputs, token_inputs)
|
SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
|
||||||
|
embeds_inputs, token_inputs)
|
||||||
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
|
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@ -322,7 +323,7 @@ class InputPreprocessor:
|
|||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
inputs = token_inputs(prompt_token_ids=prompt_token_ids)
|
inputs = token_inputs(prompt_token_ids)
|
||||||
|
|
||||||
if cache_salt := parsed_content.get("cache_salt"):
|
if cache_salt := parsed_content.get("cache_salt"):
|
||||||
inputs["cache_salt"] = cache_salt
|
inputs["cache_salt"] = cache_salt
|
||||||
@ -352,10 +353,7 @@ class InputPreprocessor:
|
|||||||
prompt_text,
|
prompt_text,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
)
|
)
|
||||||
inputs = token_inputs(
|
inputs = token_inputs(prompt_token_ids)
|
||||||
prompt=prompt_text,
|
|
||||||
prompt_token_ids=prompt_token_ids,
|
|
||||||
)
|
|
||||||
|
|
||||||
if cache_salt := parsed_content.get("cache_salt"):
|
if cache_salt := parsed_content.get("cache_salt"):
|
||||||
inputs["cache_salt"] = cache_salt
|
inputs["cache_salt"] = cache_salt
|
||||||
@ -473,22 +471,17 @@ class InputPreprocessor:
|
|||||||
decoder_inputs: SingletonInputs
|
decoder_inputs: SingletonInputs
|
||||||
|
|
||||||
if inputs["type"] == "multimodal": # Multimodal data inputs
|
if inputs["type"] == "multimodal": # Multimodal data inputs
|
||||||
if not ("encoder_prompt" in inputs
|
if "encoder_prompt_token_ids" not in inputs:
|
||||||
and "encoder_prompt_token_ids" in inputs):
|
|
||||||
raise RuntimeError("You should register an encoder-decoder "
|
raise RuntimeError("You should register an encoder-decoder "
|
||||||
"multi-modal processor for encoder-decoder "
|
"multi-modal processor for encoder-decoder "
|
||||||
"models.")
|
"models.")
|
||||||
inputs = cast(MultiModalEncDecInputs, inputs)
|
inputs = cast(MultiModalEncDecInputs, inputs)
|
||||||
|
|
||||||
encoder_inputs = token_inputs(
|
encoder_inputs = token_inputs(inputs["encoder_prompt_token_ids"])
|
||||||
prompt=inputs["encoder_prompt"],
|
|
||||||
prompt_token_ids=inputs["encoder_prompt_token_ids"],
|
|
||||||
)
|
|
||||||
|
|
||||||
decoder_prompt_inputs = decoder_inputs_to_override or inputs
|
decoder_prompt_inputs = decoder_inputs_to_override or inputs
|
||||||
decoder_inputs = MultiModalInputs(
|
decoder_inputs = MultiModalInputs(
|
||||||
type="multimodal",
|
type="multimodal",
|
||||||
prompt=decoder_prompt_inputs.get("prompt", ""),
|
|
||||||
prompt_token_ids=decoder_prompt_inputs["prompt_token_ids"],
|
prompt_token_ids=decoder_prompt_inputs["prompt_token_ids"],
|
||||||
mm_kwargs=inputs["mm_kwargs"],
|
mm_kwargs=inputs["mm_kwargs"],
|
||||||
mm_hashes=inputs["mm_hashes"],
|
mm_hashes=inputs["mm_hashes"],
|
||||||
@ -498,7 +491,7 @@ class InputPreprocessor:
|
|||||||
decoder_inputs["cache_salt"] = cache_salt
|
decoder_inputs["cache_salt"] = cache_salt
|
||||||
|
|
||||||
elif inputs["type"] == "token": # Text-only inputs
|
elif inputs["type"] == "token": # Text-only inputs
|
||||||
encoder_inputs = token_inputs(prompt="", prompt_token_ids=[])
|
encoder_inputs = token_inputs(prompt_token_ids=[])
|
||||||
decoder_inputs = decoder_inputs_to_override or inputs
|
decoder_inputs = decoder_inputs_to_override or inputs
|
||||||
else:
|
else:
|
||||||
assert_never(inputs) # type: ignore[arg-type]
|
assert_never(inputs) # type: ignore[arg-type]
|
||||||
@ -549,12 +542,14 @@ class InputPreprocessor:
|
|||||||
decoder_inputs: Optional[SingletonInputs]
|
decoder_inputs: Optional[SingletonInputs]
|
||||||
|
|
||||||
if is_explicit_encoder_decoder_prompt(prompt):
|
if is_explicit_encoder_decoder_prompt(prompt):
|
||||||
|
# `cast` is needed for mypy, but not pyright
|
||||||
|
prompt_ = cast(ExplicitEncoderDecoderPrompt, prompt)
|
||||||
encoder_inputs = self._prompt_to_llm_inputs(
|
encoder_inputs = self._prompt_to_llm_inputs(
|
||||||
prompt["encoder_prompt"],
|
prompt_["encoder_prompt"],
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
if (decoder_input := prompt["decoder_prompt"]) is None:
|
if (decoder_input := prompt_["decoder_prompt"]) is None:
|
||||||
decoder_inputs = None
|
decoder_inputs = None
|
||||||
else:
|
else:
|
||||||
decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
|
decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
|
||||||
@ -565,8 +560,9 @@ class InputPreprocessor:
|
|||||||
self._split_enc_dec_mm_inputs(encoder_inputs,
|
self._split_enc_dec_mm_inputs(encoder_inputs,
|
||||||
decoder_inputs))
|
decoder_inputs))
|
||||||
else:
|
else:
|
||||||
|
# `cast` is needed for mypy, but not pyright
|
||||||
inputs = self._prompt_to_llm_inputs(
|
inputs = self._prompt_to_llm_inputs(
|
||||||
prompt,
|
cast(SingletonPrompt, prompt),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
@ -641,8 +637,9 @@ class InputPreprocessor:
|
|||||||
"to decoder-only models")
|
"to decoder-only models")
|
||||||
|
|
||||||
# Decoder-only operation
|
# Decoder-only operation
|
||||||
|
# `cast` is needed for mypy, but not pyright
|
||||||
return self._process_decoder_only_prompt(
|
return self._process_decoder_only_prompt(
|
||||||
prompt,
|
cast(SingletonPrompt, prompt),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
mm_uuids=mm_uuids,
|
mm_uuids=mm_uuids,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -778,7 +778,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
|||||||
)
|
)
|
||||||
], mm_item_counts)
|
], mm_item_counts)
|
||||||
|
|
||||||
prompt_ids, prompt, _ = self._apply_prompt_updates(
|
prompt_ids, _ = self._apply_prompt_updates(
|
||||||
result["prompt_token_ids"],
|
result["prompt_token_ids"],
|
||||||
mantis_mm_repls,
|
mantis_mm_repls,
|
||||||
)
|
)
|
||||||
@ -798,7 +798,6 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
|||||||
|
|
||||||
return MultiModalInputs(
|
return MultiModalInputs(
|
||||||
type="multimodal",
|
type="multimodal",
|
||||||
prompt=prompt,
|
|
||||||
prompt_token_ids=prompt_ids,
|
prompt_token_ids=prompt_ids,
|
||||||
mm_kwargs=mm_kwargs,
|
mm_kwargs=mm_kwargs,
|
||||||
mm_hashes=mm_hashes,
|
mm_hashes=mm_hashes,
|
||||||
|
|||||||
@ -219,7 +219,6 @@ class PaliGemmaMultiModalProcessor(
|
|||||||
if len(prompt_token_ids) and prompt_token_ids[-1] != newline_token_id:
|
if len(prompt_token_ids) and prompt_token_ids[-1] != newline_token_id:
|
||||||
prompt_token_ids.append(newline_token_id)
|
prompt_token_ids.append(newline_token_id)
|
||||||
mm_inputs["prompt_token_ids"] = prompt_token_ids
|
mm_inputs["prompt_token_ids"] = prompt_token_ids
|
||||||
mm_inputs["prompt"] += newline_prompt
|
|
||||||
|
|
||||||
return mm_inputs
|
return mm_inputs
|
||||||
|
|
||||||
|
|||||||
@ -461,7 +461,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
|
|||||||
self,
|
self,
|
||||||
token_ids: list[int],
|
token_ids: list[int],
|
||||||
mm_prompt_updates: MultiModalPromptUpdates,
|
mm_prompt_updates: MultiModalPromptUpdates,
|
||||||
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
||||||
# align to hf behavior when there are images
|
# align to hf behavior when there are images
|
||||||
if len(mm_prompt_updates):
|
if len(mm_prompt_updates):
|
||||||
tokenizer = self.info.get_tokenizer()
|
tokenizer = self.info.get_tokenizer()
|
||||||
@ -496,14 +496,14 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
|
|||||||
for ele in sublist for e in ele
|
for ele in sublist for e in ele
|
||||||
]
|
]
|
||||||
|
|
||||||
token_ids, text, placeholders = super()._apply_prompt_updates(
|
token_ids, placeholders = super()._apply_prompt_updates(
|
||||||
token_ids=token_ids,
|
token_ids=token_ids,
|
||||||
mm_prompt_updates=mm_prompt_updates,
|
mm_prompt_updates=mm_prompt_updates,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Keep the behavior in line with HF processor
|
# Keep the behavior in line with HF processor
|
||||||
if text.startswith("<s> <|image|>"):
|
if token_ids[:2] == tokenizer.encode("<s> <|image|>",
|
||||||
text = text.replace("<s> <|image|>", "<s><|image|>", 1)
|
add_special_tokens=False):
|
||||||
token_ids = [token_ids[0], *token_ids[2:]]
|
token_ids = [token_ids[0], *token_ids[2:]]
|
||||||
placeholders = {
|
placeholders = {
|
||||||
modality: [
|
modality: [
|
||||||
@ -518,7 +518,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
|
|||||||
for modality, ps in placeholders.items()
|
for modality, ps in placeholders.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
return token_ids, text, placeholders
|
return token_ids, placeholders
|
||||||
|
|
||||||
|
|
||||||
@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor,
|
@MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor,
|
||||||
|
|||||||
@ -63,7 +63,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
|||||||
PromptReplacement, PromptUpdate)
|
PromptReplacement, PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens
|
from vllm.transformers_utils.tokenizer import encode_tokens
|
||||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||||
@ -316,7 +316,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
|||||||
mm_kwargs: MultiModalKwargsItems,
|
mm_kwargs: MultiModalKwargsItems,
|
||||||
mm_prompt_updates: MultiModalPromptUpdates,
|
mm_prompt_updates: MultiModalPromptUpdates,
|
||||||
is_update_applied: bool,
|
is_update_applied: bool,
|
||||||
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
||||||
"""
|
"""
|
||||||
Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`.
|
Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`.
|
||||||
"""
|
"""
|
||||||
@ -341,28 +341,20 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
|||||||
self._validate_mm_placeholders(
|
self._validate_mm_placeholders(
|
||||||
mm_placeholders,
|
mm_placeholders,
|
||||||
mm_item_counts,
|
mm_item_counts,
|
||||||
use_audio_in_video=use_audio_in_video)
|
use_audio_in_video=use_audio_in_video,
|
||||||
|
)
|
||||||
tokenizer = self.info.get_tokenizer()
|
|
||||||
prompt = decode_tokens(tokenizer, prompt_ids)
|
|
||||||
else:
|
else:
|
||||||
(
|
prompt_ids, mm_placeholders = self._apply_prompt_updates(
|
||||||
prompt_ids,
|
|
||||||
prompt,
|
|
||||||
mm_placeholders,
|
|
||||||
) = self._apply_prompt_updates(
|
|
||||||
prompt_ids,
|
prompt_ids,
|
||||||
mm_prompt_updates,
|
mm_prompt_updates,
|
||||||
)
|
)
|
||||||
self._validate_mm_placeholders(
|
self._validate_mm_placeholders(
|
||||||
mm_placeholders,
|
mm_placeholders,
|
||||||
mm_item_counts,
|
mm_item_counts,
|
||||||
use_audio_in_video=use_audio_in_video)
|
use_audio_in_video=use_audio_in_video,
|
||||||
|
)
|
||||||
|
|
||||||
tokenizer = self.info.get_tokenizer()
|
return prompt_ids, mm_placeholders
|
||||||
prompt = decode_tokens(tokenizer, prompt_ids)
|
|
||||||
|
|
||||||
return prompt_ids, prompt, mm_placeholders
|
|
||||||
|
|
||||||
def _get_prompt_updates(
|
def _get_prompt_updates(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -190,7 +190,6 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
|
|||||||
|
|
||||||
return MultiModalInputs(
|
return MultiModalInputs(
|
||||||
type="multimodal",
|
type="multimodal",
|
||||||
prompt=prompt,
|
|
||||||
prompt_token_ids=[1],
|
prompt_token_ids=[1],
|
||||||
mm_kwargs=mm_kwargs,
|
mm_kwargs=mm_kwargs,
|
||||||
mm_hashes=mm_hashes,
|
mm_hashes=mm_hashes,
|
||||||
|
|||||||
@ -453,7 +453,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
|
|||||||
|
|
||||||
return MultiModalInputs(
|
return MultiModalInputs(
|
||||||
type="multimodal",
|
type="multimodal",
|
||||||
prompt=prompt,
|
|
||||||
prompt_token_ids=prompt_ids,
|
prompt_token_ids=prompt_ids,
|
||||||
mm_kwargs=mm_kwargs,
|
mm_kwargs=mm_kwargs,
|
||||||
mm_hashes=mm_hashes,
|
mm_hashes=mm_hashes,
|
||||||
|
|||||||
@ -949,9 +949,6 @@ class MultiModalInputs(TypedDict):
|
|||||||
type: Literal["multimodal"]
|
type: Literal["multimodal"]
|
||||||
"""The type of inputs."""
|
"""The type of inputs."""
|
||||||
|
|
||||||
prompt: str
|
|
||||||
"""The processed prompt text."""
|
|
||||||
|
|
||||||
prompt_token_ids: list[int]
|
prompt_token_ids: list[int]
|
||||||
"""The processed token IDs which includes placeholder tokens."""
|
"""The processed token IDs which includes placeholder tokens."""
|
||||||
|
|
||||||
@ -980,8 +977,5 @@ class MultiModalEncDecInputs(MultiModalInputs):
|
|||||||
ready to be passed to vLLM internals.
|
ready to be passed to vLLM internals.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
encoder_prompt: str
|
|
||||||
"""The processed encoder prompt text."""
|
|
||||||
|
|
||||||
encoder_prompt_token_ids: list[int]
|
encoder_prompt_token_ids: list[int]
|
||||||
"""The processed token IDs of the encoder prompt."""
|
"""The processed token IDs of the encoder prompt."""
|
||||||
|
|||||||
@ -1878,7 +1878,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
self,
|
self,
|
||||||
token_ids: list[int],
|
token_ids: list[int],
|
||||||
mm_prompt_updates: MultiModalPromptUpdates,
|
mm_prompt_updates: MultiModalPromptUpdates,
|
||||||
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
||||||
tokenizer = self.info.get_tokenizer()
|
tokenizer = self.info.get_tokenizer()
|
||||||
|
|
||||||
new_token_ids, match_result = self._apply_token_matches(
|
new_token_ids, match_result = self._apply_token_matches(
|
||||||
@ -1896,11 +1896,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
# Since it is inefficient to search for all possible tokenizations
|
# Since it is inefficient to search for all possible tokenizations
|
||||||
# of the search text in the prompt, we instead perform string-based
|
# of the search text in the prompt, we instead perform string-based
|
||||||
# updates on the decoded token IDs, then encode them back.
|
# updates on the decoded token IDs, then encode them back.
|
||||||
if all(
|
if not all(
|
||||||
all(update_idx is not None for update_idx in update_idxs)
|
all(update_idx is not None for update_idx in update_idxs)
|
||||||
for update_idxs in match_result.values()):
|
for update_idxs in match_result.values()):
|
||||||
new_text = decode_tokens(tokenizer, new_token_ids)
|
|
||||||
else:
|
|
||||||
new_text, match_result = self._apply_text_matches(
|
new_text, match_result = self._apply_text_matches(
|
||||||
decode_tokens(tokenizer, token_ids),
|
decode_tokens(tokenizer, token_ids),
|
||||||
mm_prompt_updates,
|
mm_prompt_updates,
|
||||||
@ -1928,7 +1926,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
dict(matched_updates),
|
dict(matched_updates),
|
||||||
)
|
)
|
||||||
|
|
||||||
return new_token_ids, new_text, placeholders
|
return new_token_ids, placeholders
|
||||||
|
|
||||||
def _validate_mm_kwargs(
|
def _validate_mm_kwargs(
|
||||||
self,
|
self,
|
||||||
@ -1976,7 +1974,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_kwargs: MultiModalKwargsOptionalItems,
|
mm_kwargs: MultiModalKwargsOptionalItems,
|
||||||
mm_prompt_updates: MultiModalPromptUpdates,
|
mm_prompt_updates: MultiModalPromptUpdates,
|
||||||
is_update_applied: bool,
|
is_update_applied: bool,
|
||||||
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
||||||
mm_item_counts = mm_items.get_all_counts()
|
mm_item_counts = mm_items.get_all_counts()
|
||||||
self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
|
self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
|
||||||
|
|
||||||
@ -1986,21 +1984,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_prompt_updates,
|
mm_prompt_updates,
|
||||||
)
|
)
|
||||||
self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
|
self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
|
||||||
|
|
||||||
tokenizer = self.info.get_tokenizer()
|
|
||||||
prompt = decode_tokens(tokenizer, prompt_ids)
|
|
||||||
else:
|
else:
|
||||||
(
|
prompt_ids, mm_placeholders = self._apply_prompt_updates(
|
||||||
prompt_ids,
|
|
||||||
prompt,
|
|
||||||
mm_placeholders,
|
|
||||||
) = self._apply_prompt_updates(
|
|
||||||
prompt_ids,
|
prompt_ids,
|
||||||
mm_prompt_updates,
|
mm_prompt_updates,
|
||||||
)
|
)
|
||||||
self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
|
self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
|
||||||
|
|
||||||
return prompt_ids, prompt, mm_placeholders
|
return prompt_ids, mm_placeholders
|
||||||
|
|
||||||
def apply(
|
def apply(
|
||||||
self,
|
self,
|
||||||
@ -2042,7 +2033,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# NOTE: tokenization_kwargs are not required to init processor
|
# NOTE: tokenization_kwargs are not required to init processor
|
||||||
prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates(
|
prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates(
|
||||||
mm_items=mm_items,
|
mm_items=mm_items,
|
||||||
prompt_ids=prompt_ids,
|
prompt_ids=prompt_ids,
|
||||||
mm_kwargs=mm_info.kwargs,
|
mm_kwargs=mm_info.kwargs,
|
||||||
@ -2057,7 +2048,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
|
|
||||||
return MultiModalInputs(
|
return MultiModalInputs(
|
||||||
type="multimodal",
|
type="multimodal",
|
||||||
prompt=prompt,
|
|
||||||
prompt_token_ids=prompt_ids,
|
prompt_token_ids=prompt_ids,
|
||||||
mm_kwargs=mm_info.kwargs,
|
mm_kwargs=mm_info.kwargs,
|
||||||
mm_hashes=mm_info.hashes,
|
mm_hashes=mm_info.hashes,
|
||||||
@ -2100,19 +2090,15 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
|||||||
tokenizer = self.info.get_tokenizer()
|
tokenizer = self.info.get_tokenizer()
|
||||||
decoder_prompt_raw = self.create_decoder_prompt(prompt, mm_data)
|
decoder_prompt_raw = self.create_decoder_prompt(prompt, mm_data)
|
||||||
if isinstance(decoder_prompt_raw, str):
|
if isinstance(decoder_prompt_raw, str):
|
||||||
decoder_prompt = decoder_prompt_raw
|
|
||||||
decoder_prompt_ids = encode_tokens(tokenizer,
|
decoder_prompt_ids = encode_tokens(tokenizer,
|
||||||
decoder_prompt_raw,
|
decoder_prompt_raw,
|
||||||
add_special_tokens=False)
|
add_special_tokens=False)
|
||||||
else:
|
else:
|
||||||
decoder_prompt = decode_tokens(tokenizer, decoder_prompt_raw)
|
|
||||||
decoder_prompt_ids = decoder_prompt_raw
|
decoder_prompt_ids = decoder_prompt_raw
|
||||||
|
|
||||||
mm_inputs = MultiModalEncDecInputs(
|
mm_inputs = MultiModalEncDecInputs(
|
||||||
encoder_prompt=encoder_inputs["prompt"],
|
|
||||||
encoder_prompt_token_ids=encoder_inputs["prompt_token_ids"],
|
encoder_prompt_token_ids=encoder_inputs["prompt_token_ids"],
|
||||||
**encoder_inputs)
|
**encoder_inputs)
|
||||||
mm_inputs["prompt"] = decoder_prompt
|
|
||||||
mm_inputs["prompt_token_ids"] = decoder_prompt_ids
|
mm_inputs["prompt_token_ids"] = decoder_prompt_ids
|
||||||
return mm_inputs
|
return mm_inputs
|
||||||
|
|
||||||
|
|||||||
@ -281,12 +281,16 @@ class AsyncLLM(EngineClient):
|
|||||||
queue = RequestOutputCollector(output_kind=params.output_kind)
|
queue = RequestOutputCollector(output_kind=params.output_kind)
|
||||||
|
|
||||||
# Convert Input --> Request.
|
# Convert Input --> Request.
|
||||||
prompt_str, request = self.processor.process_inputs(
|
request = self.processor.process_inputs(request_id, prompt, params,
|
||||||
request_id, prompt, params, arrival_time, lora_request,
|
arrival_time, lora_request,
|
||||||
tokenization_kwargs, trace_headers, priority, data_parallel_rank)
|
tokenization_kwargs,
|
||||||
|
trace_headers, priority,
|
||||||
|
data_parallel_rank)
|
||||||
|
prompt_text = prompt if isinstance(prompt,
|
||||||
|
str) else prompt.get("prompt")
|
||||||
|
|
||||||
if is_pooling or params.n == 1:
|
if is_pooling or params.n == 1:
|
||||||
await self._add_request(request, prompt_str, None, 0, queue)
|
await self._add_request(request, prompt_text, None, 0, queue)
|
||||||
return queue
|
return queue
|
||||||
|
|
||||||
# Get the updated SamplingParams from the request, which
|
# Get the updated SamplingParams from the request, which
|
||||||
@ -302,7 +306,7 @@ class AsyncLLM(EngineClient):
|
|||||||
request)
|
request)
|
||||||
child_request.request_id = request_id
|
child_request.request_id = request_id
|
||||||
child_request.sampling_params = child_params
|
child_request.sampling_params = child_params
|
||||||
await self._add_request(child_request, prompt_str, parent_request,
|
await self._add_request(child_request, prompt_text, parent_request,
|
||||||
idx, queue)
|
idx, queue)
|
||||||
return queue
|
return queue
|
||||||
|
|
||||||
|
|||||||
@ -227,15 +227,18 @@ class LLMEngine:
|
|||||||
f"request_id must be a string, got {type(request_id)}")
|
f"request_id must be a string, got {type(request_id)}")
|
||||||
|
|
||||||
# Process raw inputs into the request.
|
# Process raw inputs into the request.
|
||||||
prompt_str, request = self.processor.process_inputs(
|
request = self.processor.process_inputs(request_id, prompt, params,
|
||||||
request_id, prompt, params, arrival_time, lora_request,
|
arrival_time, lora_request,
|
||||||
tokenization_kwargs, trace_headers, priority)
|
tokenization_kwargs,
|
||||||
|
trace_headers, priority)
|
||||||
|
prompt_text = prompt if isinstance(prompt,
|
||||||
|
str) else prompt.get("prompt")
|
||||||
|
|
||||||
n = params.n if isinstance(params, SamplingParams) else 1
|
n = params.n if isinstance(params, SamplingParams) else 1
|
||||||
|
|
||||||
if n == 1:
|
if n == 1:
|
||||||
# Make a new RequestState and queue.
|
# Make a new RequestState and queue.
|
||||||
self.output_processor.add_request(request, prompt_str, None, 0)
|
self.output_processor.add_request(request, prompt_text, None, 0)
|
||||||
# Add the request to EngineCore.
|
# Add the request to EngineCore.
|
||||||
self.engine_core.add_request(request)
|
self.engine_core.add_request(request)
|
||||||
return
|
return
|
||||||
@ -249,7 +252,7 @@ class LLMEngine:
|
|||||||
child_request.sampling_params = params
|
child_request.sampling_params = params
|
||||||
|
|
||||||
# Make a new RequestState and queue.
|
# Make a new RequestState and queue.
|
||||||
self.output_processor.add_request(child_request, prompt_str,
|
self.output_processor.add_request(child_request, prompt_text,
|
||||||
parent_req, idx)
|
parent_req, idx)
|
||||||
# Add the request to EngineCore.
|
# Add the request to EngineCore.
|
||||||
self.engine_core.add_request(child_request)
|
self.engine_core.add_request(child_request)
|
||||||
|
|||||||
@ -334,9 +334,7 @@ class Processor:
|
|||||||
trace_headers: Optional[Mapping[str, str]] = None,
|
trace_headers: Optional[Mapping[str, str]] = None,
|
||||||
priority: int = 0,
|
priority: int = 0,
|
||||||
data_parallel_rank: Optional[int] = None,
|
data_parallel_rank: Optional[int] = None,
|
||||||
) -> tuple[Optional[str], EngineCoreRequest]:
|
) -> EngineCoreRequest:
|
||||||
|
|
||||||
# TODO(woosuk): Support pooling models.
|
|
||||||
self._validate_lora(lora_request)
|
self._validate_lora(lora_request)
|
||||||
self._validate_params(params)
|
self._validate_params(params)
|
||||||
|
|
||||||
@ -395,8 +393,6 @@ class Processor:
|
|||||||
# discriminated unions of TypedDicts, because of how it handles
|
# discriminated unions of TypedDicts, because of how it handles
|
||||||
# inheritance of TypedDict. If we explicitly extract the items we want
|
# inheritance of TypedDict. If we explicitly extract the items we want
|
||||||
# we can avoid type errors from using `dict.get` later in the method.
|
# we can avoid type errors from using `dict.get` later in the method.
|
||||||
prompt_str: Optional[str] = None if decoder_inputs[
|
|
||||||
"type"] == "embeds" else decoder_inputs.get("prompt")
|
|
||||||
prompt_token_ids = decoder_inputs[
|
prompt_token_ids = decoder_inputs[
|
||||||
"prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
|
"prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
|
||||||
prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
|
prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
|
||||||
@ -442,7 +438,7 @@ class Processor:
|
|||||||
identifier=decoder_mm_hashes[modality][idx],
|
identifier=decoder_mm_hashes[modality][idx],
|
||||||
mm_position=decoder_mm_positions[modality][idx]))
|
mm_position=decoder_mm_positions[modality][idx]))
|
||||||
|
|
||||||
return prompt_str, EngineCoreRequest(
|
return EngineCoreRequest(
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
prompt_token_ids=prompt_token_ids,
|
prompt_token_ids=prompt_token_ids,
|
||||||
prompt_embeds=prompt_embeds,
|
prompt_embeds=prompt_embeds,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user