[Frontend] Avoid list copies in serving_chat.py (#22947)

Signed-off-by: Nick Hill <nhill@redhat.com>
Author: Nick Hill <nhill@redhat.com>
Date: 2025-08-15 19:06:30 -07:00 (committed by GitHub)
Parent: fbd88728b3
Commit: f6b5040590
2 changed files with 16 additions and 15 deletions
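
The change swaps unconditional list(...) copies for as_list, which returns its argument unchanged when it is already a list and only materializes a copy otherwise. A minimal sketch of that helper, assuming the obvious implementation (the real vllm.utils.as_list may differ in details):

from collections.abc import Iterable
from typing import TypeVar

T = TypeVar("T")

def as_list(maybe_list: Iterable[T]) -> list[T]:
    # Already a list: return it as-is, avoiding an O(n) copy.
    # Any other iterable (tuple, array, generator): materialize once.
    return maybe_list if isinstance(maybe_list, list) else list(maybe_list)

# The fast path returns the same object, not a copy:
ids = [1, 2, 3]
assert as_list(ids) is ids
assert as_list(range(3)) == [0, 1, 2]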

vllm/entrypoints/openai/serving_chat.py

@@ -50,6 +50,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls,
                                                 truncate_tool_call_ids,
                                                 validate_request_params)
+from vllm.utils import as_list

 logger = init_logger(__name__)
@@ -670,10 +671,10 @@ class OpenAIServingChat(OpenAIServing):
                 # avoid the None + list error.
                 if previous_token_ids:
-                    current_token_ids = previous_token_ids + list(
+                    current_token_ids = previous_token_ids + as_list(
                         output.token_ids)
                 else:
-                    current_token_ids = list(output.token_ids)
+                    current_token_ids = as_list(output.token_ids)

             if self.use_harmony:
                 if is_final:
@@ -703,11 +704,10 @@ class OpenAIServingChat(OpenAIServing):
                         # set reasoning status to end.
                         # Only keep 'content', remove 'reasoning_content'.
                         if reasoning_parser.is_reasoning_end(
-                                list(output.token_ids)) or \
-                            (res.prompt_token_ids and
-                             reasoning_parser.is_reasoning_end(
-                                 list(res.prompt_token_ids)
-                             )):
+                                as_list(output.token_ids)) or (
+                                    res.prompt_token_ids
+                                    and reasoning_parser.is_reasoning_end(
+                                        res.prompt_token_ids)):
                             reasoning_end_arr[i] = True
                             if delta_message and delta_message.content:
                                 # This need to be added to next `delta_text`
@@ -771,6 +771,7 @@ class OpenAIServingChat(OpenAIServing):
                     assert reasoning_parser is not None
                     assert added_content_delta_arr is not None
                     assert reasoning_end_arr is not None
+                    output_token_ids = as_list(output.token_ids)
                     if not reasoning_end_arr[i]:
                         delta_message = (
                             reasoning_parser.
@@ -780,7 +781,7 @@ class OpenAIServingChat(OpenAIServing):
                                 delta_text,
                                 previous_token_ids,
                                 current_token_ids,
-                                output.token_ids,
+                                output_token_ids,
                             ))

                         # When encountering think end id in prompt_token_ids
                         # i.e {"enable_thinking": False},
@@ -789,9 +790,9 @@ class OpenAIServingChat(OpenAIServing):
                         # to 'reasoning_content'.
                         if res.prompt_token_ids and \
                                 reasoning_parser.is_reasoning_end(
-                                    list(res.prompt_token_ids)):
+                                    res.prompt_token_ids):
                             reasoning_end_arr[i] = True
-                            current_token_ids = list(output.token_ids)
+                            current_token_ids = output_token_ids
                             if delta_message and delta_message.content:
                                 current_text = delta_message.content
                                 delta_message.content = None
@@ -802,11 +803,11 @@ class OpenAIServingChat(OpenAIServing):
                         # Remove the text and token ids related
                         # to 'reasoning_content'.
                         if reasoning_parser.is_reasoning_end(
-                                list(output.token_ids)):
+                                output_token_ids):
                             reasoning_end_arr[i] = True
                             current_token_ids = \
                                 reasoning_parser.extract_content_ids(
-                                    list(output.token_ids))
+                                    output_token_ids)
                             if delta_message and delta_message.content:
                                 current_text = delta_message.content
                                 delta_message.content = None
@@ -815,7 +816,7 @@ class OpenAIServingChat(OpenAIServing):
                         # handle tool calls only after reasoning is done,
                         else:
-                            delta_token_ids = list(output.token_ids)
+                            delta_token_ids = output_token_ids
                             # First time to tool call,
                             # add the remaining text and token ids
                             # to delta from previous
@@ -899,7 +900,7 @@ class OpenAIServingChat(OpenAIServing):
                 self.request_logger.log_outputs(
                     request_id=request_id,
                     outputs=delta_content,
-                    output_token_ids=list(output.token_ids),
+                    output_token_ids=as_list(output.token_ids),
                     finish_reason=output.finish_reason,
                     is_streaming=True,
                     delta=True,
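
Within the streaming loop, output.token_ids is now converted once per output via output_token_ids = as_list(output.token_ids), and that list is reused at every call site below it, so the token sequence is materialized at most once per streamed chunk instead of once per use.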

vllm/reasoning/abs_reasoning_parsers.py

@@ -44,7 +44,7 @@ class ReasoningParser:
         return self.model_tokenizer.get_vocab()

     @abstractmethod
-    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
         """
         Check if the reasoning content ends in the input_ids.
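
Since all call sites now pass concrete lists, the parameter is narrowed from Sequence[int] to list[int], and concrete parsers may rely on list semantics. A hypothetical minimal override, just to illustrate the contract (the SimpleReasoningParser name and end_think_id value are illustrative, not part of the commit):

class SimpleReasoningParser(ReasoningParser):
    # Hypothetical id of the token that closes the reasoning block.
    end_think_id = 128003

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
        # Reasoning is over once the closing token has been emitted.
        return self.end_think_id in input_ids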