[Frontend] Avoid list copies in serving_chat.py (#22947)

Signed-off-by: Nick Hill <nhill@redhat.com>
Author: Nick Hill <nhill@redhat.com>
Date: 2025-08-15 19:06:30 -07:00 (committed by GitHub)
commit f6b5040590
parent fbd88728b3
GPG Key ID: B5690EEEBB952194
2 changed files with 16 additions and 15 deletions
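The diff replaces bare list(...) conversions with vllm.utils.as_list, which copies only when its argument is not already a list. A minimal sketch of that behavior, assuming the helper follows the usual reuse-or-materialize pattern (an illustration, not the helper's actual source):

    from typing import Iterable, TypeVar

    T = TypeVar("T")

    def as_list(maybe_list: Iterable[T]) -> list[T]:
        # Reuse the object when it is already a list; the built-in
        # list(x) would allocate a fresh copy unconditionally.
        return maybe_list if isinstance(maybe_list, list) else list(maybe_list)

    ids = [1, 2, 3]
    assert as_list(ids) is ids             # already a list: no copy
    assert list(ids) is not ids            # list() always copies
    assert as_list(range(3)) == [0, 1, 2]  # other iterables are materialized

In the streaming chat path this runs once per delta chunk, so skipping the redundant copies is a small but repeated saving.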

vllm/entrypoints/openai/serving_chat.py

@@ -50,6 +50,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls,
                                                 truncate_tool_call_ids,
                                                 validate_request_params)
+from vllm.utils import as_list

 logger = init_logger(__name__)
@@ -670,10 +671,10 @@ class OpenAIServingChat(OpenAIServing):
                 # avoid the None + list error.
                 if previous_token_ids:
-                    current_token_ids = previous_token_ids + list(
+                    current_token_ids = previous_token_ids + as_list(
                         output.token_ids)
                 else:
-                    current_token_ids = list(output.token_ids)
+                    current_token_ids = as_list(output.token_ids)

                 if self.use_harmony:
                     if is_final:
@@ -703,11 +704,10 @@ class OpenAIServingChat(OpenAIServing):
                         # set reasoning status to end.
                         # Only keep 'content', remove 'reasoning_content'.
                         if reasoning_parser.is_reasoning_end(
-                                list(output.token_ids)) or \
-                            (res.prompt_token_ids and
-                             reasoning_parser.is_reasoning_end(
-                                 list(res.prompt_token_ids)
-                             )):
+                                as_list(output.token_ids)) or (
+                                    res.prompt_token_ids
+                                    and reasoning_parser.is_reasoning_end(
+                                        res.prompt_token_ids)):
                             reasoning_end_arr[i] = True
                             if delta_message and delta_message.content:
                                 # This need to be added to next `delta_text`
@@ -771,6 +771,7 @@ class OpenAIServingChat(OpenAIServing):
                     assert reasoning_parser is not None
                     assert added_content_delta_arr is not None
                     assert reasoning_end_arr is not None
+                    output_token_ids = as_list(output.token_ids)
                     if not reasoning_end_arr[i]:
                         delta_message = (
                             reasoning_parser.
@@ -780,7 +781,7 @@ class OpenAIServingChat(OpenAIServing):
                                 delta_text,
                                 previous_token_ids,
                                 current_token_ids,
-                                output.token_ids,
+                                output_token_ids,
                             ))
                         # When encountering think end id in prompt_token_ids
                         # i.e {"enable_thinking": False},
@@ -789,9 +790,9 @@ class OpenAIServingChat(OpenAIServing):
                         # to 'reasoning_content'.
                         if res.prompt_token_ids and \
                             reasoning_parser.is_reasoning_end(
-                                list(res.prompt_token_ids)):
+                                res.prompt_token_ids):
                             reasoning_end_arr[i] = True
-                            current_token_ids = list(output.token_ids)
+                            current_token_ids = output_token_ids
                             if delta_message and delta_message.content:
                                 current_text = delta_message.content
                                 delta_message.content = None
@@ -802,11 +803,11 @@ class OpenAIServingChat(OpenAIServing):
                         # Remove the text and token ids related
                         # to 'reasoning_content'.
                         if reasoning_parser.is_reasoning_end(
-                                list(output.token_ids)):
+                                output_token_ids):
                             reasoning_end_arr[i] = True
                             current_token_ids = \
                                 reasoning_parser.extract_content_ids(
-                                    list(output.token_ids))
+                                    output_token_ids)
                             if delta_message and delta_message.content:
                                 current_text = delta_message.content
                                 delta_message.content = None
@@ -815,7 +816,7 @@ class OpenAIServingChat(OpenAIServing):
                     # handle tool calls only after reasoning is done,
                     else:
-                        delta_token_ids = list(output.token_ids)
+                        delta_token_ids = output_token_ids
                         # First time to tool call,
                         # add the remaining text and token ids
                         # to delta from previous
@@ -899,7 +900,7 @@ class OpenAIServingChat(OpenAIServing):
                         self.request_logger.log_outputs(
                             request_id=request_id,
                             outputs=delta_content,
-                            output_token_ids=list(output.token_ids),
+                            output_token_ids=as_list(output.token_ids),
                             finish_reason=output.finish_reason,
                             is_streaming=True,
                             delta=True,
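Taken together, the serving_chat.py hunks follow one pattern: output.token_ids is converted at most once per output (the hoisted output_token_ids local), and that local is reused by the reasoning-end checks, content-id extraction, tool-call delta, and output logging, where each site previously made its own list(output.token_ids) copy.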

vllm/reasoning/abs_reasoning_parsers.py

@@ -44,7 +44,7 @@ class ReasoningParser:
         return self.model_tokenizer.get_vocab()

     @abstractmethod
-    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
         """
         Check if the reasoning content ends in the input_ids.
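The parameter annotation is narrowed from Sequence[int] to list[int]: every call site now passes a real list (via as_list, or res.prompt_token_ids, which is already one), so implementations can rely on list semantics without defensive copies. A standalone sketch of the kind of check a concrete parser performs, using a made-up end-of-reasoning token id (real parsers derive theirs from the tokenizer):

    THINK_END_ID = 128003  # hypothetical "</think>" token id, for illustration

    def is_reasoning_end(input_ids: list[int]) -> bool:
        # Plain membership scan; callers guarantee a list, so no
        # list(input_ids) copy is needed here.
        return THINK_END_ID in input_ids

    assert is_reasoning_end([1, 2, THINK_END_ID, 7])
    assert not is_reasoning_end([1, 2, 3])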