Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 23:45:39 +08:00)
[BugFix] Use async Mistral Tokenizer in Chat Completions (#26134)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

parent 67bc0c003e
commit ea25a76c05
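For context, the change offloads the blocking apply_mistral_chat_template call onto the existing single-thread tokenizer executor via make_async, so the asyncio event loop serving chat completions stays responsive while Mistral chat templates are rendered. Below is a minimal, self-contained sketch of that offloading pattern; make_async_sketch and blocking_tokenize are illustrative stand-ins, not vLLM's actual implementation.

# Illustrative sketch: run a blocking call on a worker thread so the asyncio
# event loop stays free. Not vLLM's actual make_async implementation.
import asyncio
import functools
import time
from concurrent.futures import ThreadPoolExecutor


def make_async_sketch(func, executor=None):
    """Wrap a blocking callable so that awaiting it runs in a worker thread."""

    async def _wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        # run_in_executor schedules func on the executor and yields control
        # back to the event loop until the result is ready.
        return await loop.run_in_executor(
            executor, functools.partial(func, *args, **kwargs))

    return _wrapper


def blocking_tokenize(text: str) -> list[int]:
    time.sleep(1)  # stand-in for a slow, blocking tokenizer call
    return [len(text)]


async def main():
    executor = ThreadPoolExecutor(max_workers=1)
    tokenize_async = make_async_sketch(blocking_tokenize, executor=executor)
    # gather shows another coroutine progressing while the worker thread
    # sleeps; the event loop is never blocked for the full second.
    tokens, _ = await asyncio.gather(tokenize_async("hello"),
                                     asyncio.sleep(0.1))
    print(tokens)


if __name__ == "__main__":
    asyncio.run(main())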
tests/entrypoints/openai/test_serving_engine.py (new file, 69 lines)
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import time
from unittest.mock import Mock

import pytest

from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer


@pytest.fixture()
def serving() -> OpenAIServing:
    """Create a minimal OpenAIServing instance for testing."""

    # Create minimal mocks
    engine_client = Mock()
    model_config = Mock(spec=ModelConfig)
    model_config.max_model_len = 32768
    models = Mock(spec=OpenAIServingModels)

    serving = OpenAIServing(
        engine_client=engine_client,
        model_config=model_config,
        models=models,
        request_logger=None,
    )
    return serving


@pytest.mark.asyncio
async def test_async_mistral_tokenizer_does_not_block_event_loop(
        serving: OpenAIServing):
    expected_tokens = [1, 2, 3]

    # Mock the blocking version to sleep
    def mocked_apply_chat_template(*_args, **_kwargs):
        time.sleep(2)
        return expected_tokens

    mock_tokenizer = Mock(spec=MistralTokenizer)
    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template

    task = serving._apply_mistral_chat_template_async(tokenizer=mock_tokenizer,
                                                      messages=[],
                                                      chat_template=None,
                                                      tools=[])

    # Ensure the event loop is not blocked
    blocked_count = 0
    for _i in range(20):  # Check over ~2 seconds
        start = time.perf_counter()
        await asyncio.sleep(0)
        elapsed = time.perf_counter() - start

        # an overly generous elapsed time for slow machines
        if elapsed >= 0.5:
            blocked_count += 1

        await asyncio.sleep(0.1)

    # Ensure task completes
    tokens = await task
    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
    assert blocked_count == 0, ("Event loop blocked during tokenization")
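If it helps to verify the fix locally, the new test can presumably be run on its own with pytest (it relies on the pytest-asyncio plugin for the @pytest.mark.asyncio marker, which vLLM's test suite already uses):

    pytest tests/entrypoints/openai/test_serving_engine.py -v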
vllm/entrypoints/openai/serving_engine.py
@@ -80,7 +80,7 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of,
+from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of, make_async,
                         merge_async_iterators, random_uuid)

 logger = init_logger(__name__)
@@ -240,6 +240,8 @@ class OpenAIServing:
         self.enable_force_include_usage = enable_force_include_usage

         self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
+        self._apply_mistral_chat_template_async = make_async(
+            apply_mistral_chat_template, executor=self._tokenizer_executor)

         self._async_tokenizer_pool: dict[AnyTokenizer,
                                          AsyncMicrobatchTokenizer] = {}
@@ -798,7 +800,7 @@ class OpenAIServing:
         if tokenizer is None:
             request_prompt = "placeholder"
         elif isinstance(tokenizer, MistralTokenizer):
-            request_prompt = apply_mistral_chat_template(
+            request_prompt = await self._apply_mistral_chat_template_async(
                 tokenizer,
                 messages=messages,
                 **_chat_template_kwargs,
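A note on the design, as far as the diff shows: make_async reuses the pre-existing single-worker _tokenizer_executor, so Mistral chat-template rendering is serialized on one background thread rather than parallelized. The point of the change is responsiveness of the event loop during slow template application, not tokenizer throughput, and the new test above asserts exactly that property.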