[Refactor] Reduce duplicate code in serving_chat (#26627)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>

parent f7ee69868a
commit d0bed837ac
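
This commit hoists the duplicated reasoning-parser and tool-parser setup out of OpenAIServingChat and OpenAIServingResponses into two shared helpers on the OpenAIServing base class, _get_reasoning_parser and _get_tool_parser, so each subclass constructor shrinks to a single helper call. A minimal sketch of that pattern, using hypothetical names (PARSER_REGISTRY, ServingBase) rather than vLLM's real classes:

# Illustrative sketch only, not vLLM code: parser lookup that used to be
# copy-pasted into each serving class moves into one base-class helper.
from typing import Callable, Optional

# Toy stand-in for vLLM's parser manager registries.
PARSER_REGISTRY: dict[str, Callable[[str], list[str]]] = {
    "whitespace": str.split,
}


class ServingBase:
    def _get_parser(self, name: Optional[str]) -> Optional[Callable[[str], list[str]]]:
        """Shared helper: resolve a parser by name, raising if unregistered."""
        if not name:
            return None
        try:
            return PARSER_REGISTRY[name]
        except KeyError as e:
            raise TypeError(f"{name=} has not been registered") from e


class ServingChat(ServingBase):
    def __init__(self, parser_name: Optional[str]) -> None:
        self.parser = self._get_parser(parser_name)  # no duplicated try/except


class ServingResponses(ServingBase):
    def __init__(self, parser_name: Optional[str]) -> None:
        self.parser = self._get_parser(parser_name)  # same single call here


if __name__ == "__main__":
    print(ServingChat("whitespace").parser("hello world"))  # ['hello', 'world']

The design choice mirrors the diff below: the helper owns the lookup and the error message, and each subclass only decides which name to pass.
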
vllm/entrypoints/openai/serving_chat.py

@@ -6,7 +6,7 @@ import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Callable, Final, Optional, Union
+from typing import Final, Optional, Union
 
 import jinja2
 import partial_json_parser
@@ -56,14 +56,13 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.entrypoints.openai.tool_parsers import ToolParser
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
 from vllm.entrypoints.utils import get_max_tokens
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.transformers_utils.tokenizers import (
@@ -112,42 +111,15 @@ class OpenAIServingChat(OpenAIServing):
         self.trust_request_chat_template = trust_request_chat_template
         self.enable_log_outputs = enable_log_outputs
 
+        # set up reasoning parser
+        self.reasoning_parser = self._get_reasoning_parser(
+            reasoning_parser_name=reasoning_parser
+        )
         # set up tool use
         self.enable_auto_tools: bool = enable_auto_tools
-        if self.enable_auto_tools:
-            logger.info(
-                '"auto" tool choice has been enabled please note that while'
-                " the parallel_tool_calls client option is preset for "
-                "compatibility reasons, it will be ignored."
-            )
-
-        self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = (
-            None
+        self.tool_parser = self._get_tool_parser(
+            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
         )
-        if reasoning_parser:
-            try:
-                self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                assert self.reasoning_parser is not None
-            except Exception as e:
-                raise TypeError(f"{reasoning_parser=} has not been registered") from e
-        self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
-        if self.enable_auto_tools:
-            try:
-                if tool_parser == "pythonic" and self.model_config.model.startswith(
-                    "meta-llama/Llama-3.2"
-                ):
-                    logger.warning(
-                        "Llama3.2 models may struggle to emit valid pythonic tool calls"
-                    )
-                self.tool_parser = ToolParserManager.get_tool_parser(tool_parser)
-            except Exception as e:
-                raise TypeError(
-                    "Error: --enable-auto-tool-choice requires "
-                    f"tool_parser:'{tool_parser}' which has not "
-                    "been registered"
-                ) from e
         self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
 
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
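
Net result in OpenAIServingChat.__init__ once the hunk above is applied (new side only, diff markers stripped):

        # set up reasoning parser
        self.reasoning_parser = self._get_reasoning_parser(
            reasoning_parser_name=reasoning_parser
        )
        # set up tool use
        self.enable_auto_tools: bool = enable_auto_tools
        self.tool_parser = self._get_tool_parser(
            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
        )
        self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
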
vllm/entrypoints/openai/serving_engine.py

@@ -63,7 +63,7 @@ from vllm.entrypoints.openai.protocol import (
     TranslationRequest,
 )
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.openai.tool_parsers import ToolParser
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import PromptType
@@ -82,6 +82,7 @@ from vllm.multimodal import ( # noqa: F401 - Required to resolve Pydantic error
 )
 from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tracing import (
     contains_trace_headers,
@@ -274,6 +275,50 @@ class OpenAIServing:
         self.model_config = self.models.model_config
         self.max_model_len = self.model_config.max_model_len
 
+    def _get_tool_parser(
+        self, tool_parser_name: Optional[str] = None, enable_auto_tools: bool = False
+    ) -> Optional[Callable[[AnyTokenizer], ToolParser]]:
+        """Get the tool parser based on the name."""
+        parser = None
+        if not enable_auto_tools or tool_parser_name is None:
+            return parser
+        logger.info(
+            '"auto" tool choice has been enabled please note that while'
+            " the parallel_tool_calls client option is preset for "
+            "compatibility reasons, it will be ignored."
+        )
+
+        try:
+            if tool_parser_name == "pythonic" and self.model_config.model.startswith(
+                "meta-llama/Llama-3.2"
+            ):
+                logger.warning(
+                    "Llama3.2 models may struggle to emit valid pythonic tool calls"
+                )
+            parser = ToolParserManager.get_tool_parser(tool_parser_name)
+        except Exception as e:
+            raise TypeError(
+                "Error: --enable-auto-tool-choice requires "
+                f"tool_parser:'{tool_parser_name}' which has not "
+                "been registered"
+            ) from e
+        return parser
+
+    def _get_reasoning_parser(
+        self,
+        reasoning_parser_name: str,
+    ) -> Optional[Callable[[AnyTokenizer], ReasoningParser]]:
+        """Get the reasoning parser based on the name."""
+        parser = None
+        if not reasoning_parser_name:
+            return None
+        try:
+            parser = ReasoningParserManager.get_reasoning_parser(reasoning_parser_name)
+            assert parser is not None
+        except Exception as e:
+            raise TypeError(f"{reasoning_parser_name=} has not been registered") from e
+        return parser
+
     async def reset_mm_cache(self) -> None:
         self.processor.clear_mm_cache()
         await self.engine_client.reset_mm_cache()
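
The new helpers keep the behavior that previously lived in each subclass: return None when the feature is disabled or no name is given, otherwise resolve the name through the manager and raise TypeError for anything unregistered. A self-contained toy illustrating that contract (REGISTRY and get_parser are stand-ins, not vLLM APIs):

# Toy illustration of the helpers' contract: disabled or empty name -> None,
# unknown name -> TypeError chained from the lookup error.
from typing import Callable, Optional

REGISTRY: dict[str, Callable[[str], str]] = {"upper": str.upper}


def get_parser(name: Optional[str], enabled: bool = True) -> Optional[Callable[[str], str]]:
    if not enabled or not name:
        return None  # mirrors _get_tool_parser when auto tool choice is off
    try:
        return REGISTRY[name]
    except KeyError as e:
        raise TypeError(f"tool_parser:'{name}' has not been registered") from e


assert get_parser("upper", enabled=False) is None   # feature disabled
assert get_parser("upper")("abc") == "ABC"          # registered parser resolved
try:
    get_parser("nope")
except TypeError as e:
    print(e)  # tool_parser:'nope' has not been registered
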
vllm/entrypoints/openai/serving_responses.py

@@ -96,7 +96,6 @@ from vllm.logger import init_logger
 from vllm.logprobs import Logprob as SampleLogprob
 from vllm.logprobs import SampleLogprobs
 from vllm.outputs import CompletionOutput
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
@@ -136,18 +135,9 @@ class OpenAIServingResponses(OpenAIServing):
         self.chat_template_content_format: Final = chat_template_content_format
         self.enable_log_outputs = enable_log_outputs
 
-        self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = (
-            None
+        self.reasoning_parser = self._get_reasoning_parser(
+            reasoning_parser_name=reasoning_parser
         )
-        if reasoning_parser:
-            try:
-                self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                assert self.reasoning_parser is not None
-            except Exception as e:
-                raise TypeError(f"{reasoning_parser=} has not been registered") from e
-
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
         self.default_sampling_params = self.model_config.get_diff_sampling_param()
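
The matching net change in OpenAIServingResponses.__init__ (new side of the hunk above): the inline try/except collapses to one helper call:

        self.reasoning_parser = self._get_reasoning_parser(
            reasoning_parser_name=reasoning_parser
        )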