[Refactor] Reduce duplicate code in serving_chat (#26627)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Chauncey 2025-10-11 20:04:49 +08:00 committed by GitHub
parent f7ee69868a
commit d0bed837ac
3 changed files with 56 additions and 49 deletions

vllm/entrypoints/openai/serving_chat.py

@@ -6,7 +6,7 @@ import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Callable, Final, Optional, Union
+from typing import Final, Optional, Union
 
 import jinja2
 import partial_json_parser
@@ -56,14 +56,13 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.entrypoints.openai.tool_parsers import ToolParser
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
 from vllm.entrypoints.utils import get_max_tokens
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.transformers_utils.tokenizers import (
@@ -112,42 +111,15 @@ class OpenAIServingChat(OpenAIServing):
         self.trust_request_chat_template = trust_request_chat_template
         self.enable_log_outputs = enable_log_outputs
 
+        # set up reasoning parser
+        self.reasoning_parser = self._get_reasoning_parser(
+            reasoning_parser_name=reasoning_parser
+        )
         # set up tool use
         self.enable_auto_tools: bool = enable_auto_tools
-        if self.enable_auto_tools:
-            logger.info(
-                '"auto" tool choice has been enabled please note that while'
-                " the parallel_tool_calls client option is preset for "
-                "compatibility reasons, it will be ignored."
-            )
-
-        self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = (
-            None
-        )
-        if reasoning_parser:
-            try:
-                self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                assert self.reasoning_parser is not None
-            except Exception as e:
-                raise TypeError(f"{reasoning_parser=} has not been registered") from e
-
-        self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
-        if self.enable_auto_tools:
-            try:
-                if tool_parser == "pythonic" and self.model_config.model.startswith(
-                    "meta-llama/Llama-3.2"
-                ):
-                    logger.warning(
-                        "Llama3.2 models may struggle to emit valid pythonic tool calls"
-                    )
-                self.tool_parser = ToolParserManager.get_tool_parser(tool_parser)
-            except Exception as e:
-                raise TypeError(
-                    "Error: --enable-auto-tool-choice requires "
-                    f"tool_parser:'{tool_parser}' which has not "
-                    "been registered"
-                ) from e
+        self.tool_parser = self._get_tool_parser(
+            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
+        )
+
         self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
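A note on the types involved: per the annotations removed above, self.reasoning_parser and self.tool_parser hold Optional[Callable[[AnyTokenizer], ...]] factories rather than parser instances, so the request-handling code builds a fresh parser from each request's tokenizer. Below is a minimal, runnable sketch of that consumption pattern; DummyTokenizer and DummyParser are illustrative stand-ins, not vLLM classes.

from typing import Callable, Optional

class DummyTokenizer:  # stand-in for vLLM's AnyTokenizer
    pass

class DummyParser:  # stand-in for ToolParser / ReasoningParser
    def __init__(self, tokenizer: DummyTokenizer) -> None:
        self.tokenizer = tokenizer

# What __init__ now stores: a factory or None, never a parser instance.
reasoning_factory: Optional[Callable[[DummyTokenizer], DummyParser]] = DummyParser
tool_factory: Optional[Callable[[DummyTokenizer], DummyParser]] = None

# Per request: instantiate a parser only when a factory was configured.
tokenizer = DummyTokenizer()
reasoning_parser = reasoning_factory(tokenizer) if reasoning_factory else None
tool_parser = tool_factory(tokenizer) if tool_factory else None
assert reasoning_parser is not None and tool_parser is None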

vllm/entrypoints/openai/serving_engine.py

@@ -63,7 +63,7 @@ from vllm.entrypoints.openai.protocol import (
     TranslationRequest,
 )
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.openai.tool_parsers import ToolParser
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import PromptType
@@ -82,6 +82,7 @@ from vllm.multimodal import ( # noqa: F401 - Required to resolve Pydantic error
 )
 from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tracing import (
     contains_trace_headers,
@@ -274,6 +275,50 @@ class OpenAIServing:
         self.model_config = self.models.model_config
         self.max_model_len = self.model_config.max_model_len
 
+    def _get_tool_parser(
+        self, tool_parser_name: Optional[str] = None, enable_auto_tools: bool = False
+    ) -> Optional[Callable[[AnyTokenizer], ToolParser]]:
+        """Get the tool parser based on the name."""
+        parser = None
+        if not enable_auto_tools or tool_parser_name is None:
+            return parser
+        logger.info(
+            '"auto" tool choice has been enabled please note that while'
+            " the parallel_tool_calls client option is preset for "
+            "compatibility reasons, it will be ignored."
+        )
+        try:
+            if tool_parser_name == "pythonic" and self.model_config.model.startswith(
+                "meta-llama/Llama-3.2"
+            ):
+                logger.warning(
+                    "Llama3.2 models may struggle to emit valid pythonic tool calls"
+                )
+            parser = ToolParserManager.get_tool_parser(tool_parser_name)
+        except Exception as e:
+            raise TypeError(
+                "Error: --enable-auto-tool-choice requires "
+                f"tool_parser:'{tool_parser_name}' which has not "
+                "been registered"
+            ) from e
+        return parser
+
+    def _get_reasoning_parser(
+        self,
+        reasoning_parser_name: str,
+    ) -> Optional[Callable[[AnyTokenizer], ReasoningParser]]:
+        """Get the reasoning parser based on the name."""
+        parser = None
+        if not reasoning_parser_name:
+            return None
+        try:
+            parser = ReasoningParserManager.get_reasoning_parser(reasoning_parser_name)
+            assert parser is not None
+        except Exception as e:
+            raise TypeError(f"{reasoning_parser_name=} has not been registered") from e
+        return parser
+
     async def reset_mm_cache(self) -> None:
         self.processor.clear_mm_cache()
         await self.engine_client.reset_mm_cache()
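The two new helpers share one contract: return None when parsing is disabled or no name is given, return the registered factory callable otherwise, and raise TypeError (chained from the registry lookup error) for an unknown name. One nuance visible in the diff: the '"auto" tool choice has been enabled' info log now fires only when a tool parser name is actually supplied, because _get_tool_parser returns early otherwise. A hedged, self-contained restatement of that contract follows; the plain dict stands in for ToolParserManager's registry, and "hermes" is an assumed registered name.

from typing import Callable, Optional

_REGISTRY: dict[str, Callable] = {"hermes": object}  # assumed registered name

def get_tool_parser(name: Optional[str], enable_auto_tools: bool) -> Optional[Callable]:
    if not enable_auto_tools or name is None:
        return None  # auto tool choice off, or no parser requested
    try:
        return _REGISTRY[name]  # registered name -> factory callable
    except Exception as e:
        raise TypeError(
            f"Error: --enable-auto-tool-choice requires tool_parser:'{name}' "
            "which has not been registered"
        ) from e

assert get_tool_parser("hermes", enable_auto_tools=False) is None
assert get_tool_parser(None, enable_auto_tools=True) is None
assert get_tool_parser("hermes", enable_auto_tools=True) is object
try:
    get_tool_parser("nonexistent", enable_auto_tools=True)
except TypeError as e:
    print(e)  # ...tool_parser:'nonexistent' which has not been registered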

vllm/entrypoints/openai/serving_responses.py

@@ -96,7 +96,6 @@ from vllm.logger import init_logger
 from vllm.logprobs import Logprob as SampleLogprob
 from vllm.logprobs import SampleLogprobs
 from vllm.outputs import CompletionOutput
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
@@ -136,18 +135,9 @@ class OpenAIServingResponses(OpenAIServing):
         self.chat_template_content_format: Final = chat_template_content_format
         self.enable_log_outputs = enable_log_outputs
 
-        self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = (
-            None
-        )
-        if reasoning_parser:
-            try:
-                self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                assert self.reasoning_parser is not None
-            except Exception as e:
-                raise TypeError(f"{reasoning_parser=} has not been registered") from e
-
+        self.reasoning_parser = self._get_reasoning_parser(
+            reasoning_parser_name=reasoning_parser
+        )
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
         self.default_sampling_params = self.model_config.get_diff_sampling_param()
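Net effect of the commit: the reasoning-parser lookup that OpenAIServingChat and OpenAIServingResponses each implemented inline now lives once on the OpenAIServing base class and is inherited by both subclasses. A compressed, runnable sketch of the resulting shape; this is a hypothetical minimal mirror (real constructors take many more arguments, and the dict stands in for ReasoningParserManager).

from typing import Callable, Optional

_REASONING_PARSERS: dict[str, Callable[[str], object]] = {"demo": lambda tok: object()}

class ServingBase:  # mirrors OpenAIServing after this commit
    def _get_reasoning_parser(self, reasoning_parser_name: str) -> Optional[Callable]:
        if not reasoning_parser_name:
            return None
        try:
            return _REASONING_PARSERS[reasoning_parser_name]
        except Exception as e:
            raise TypeError(f"{reasoning_parser_name=} has not been registered") from e

class ChatServing(ServingBase):  # mirrors OpenAIServingChat
    def __init__(self, reasoning_parser: str = "") -> None:
        self.reasoning_parser = self._get_reasoning_parser(reasoning_parser)

class ResponsesServing(ServingBase):  # mirrors OpenAIServingResponses
    def __init__(self, reasoning_parser: str = "") -> None:
        self.reasoning_parser = self._get_reasoning_parser(reasoning_parser)

# One code path, two consumers: identical behavior in both subclasses.
assert ChatServing().reasoning_parser is None
assert ResponsesServing("demo").reasoning_parser is not None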