diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py
new file mode 100644
index 000000000000..ba6f10891159
--- /dev/null
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import time
+from unittest.mock import Mock
+
+import pytest
+
+from vllm.config import ModelConfig
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
+
+
+@pytest.fixture()
+def serving() -> OpenAIServing:
+    """Create a minimal OpenAIServing instance for testing."""
+
+    # Create minimal mocks
+    engine_client = Mock()
+    model_config = Mock(spec=ModelConfig)
+    model_config.max_model_len = 32768
+    models = Mock(spec=OpenAIServingModels)
+
+    serving = OpenAIServing(
+        engine_client=engine_client,
+        model_config=model_config,
+        models=models,
+        request_logger=None,
+    )
+    return serving
+
+
+@pytest.mark.asyncio
+async def test_async_mistral_tokenizer_does_not_block_event_loop(
+        serving: OpenAIServing):
+    expected_tokens = [1, 2, 3]
+
+    # Mock the blocking version to sleep
+    def mocked_apply_chat_template(*_args, **_kwargs):
+        time.sleep(2)
+        return expected_tokens
+
+    mock_tokenizer = Mock(spec=MistralTokenizer)
+    mock_tokenizer.apply_chat_template.side_effect = mocked_apply_chat_template
+
+    task = serving._apply_mistral_chat_template_async(tokenizer=mock_tokenizer,
+                                                      messages=[],
+                                                      chat_template=None,
+                                                      tools=[])
+
+    # Ensure the event loop is not blocked
+    blocked_count = 0
+    for _i in range(20):  # Check over ~2 seconds
+        start = time.perf_counter()
+        await asyncio.sleep(0)
+        elapsed = time.perf_counter() - start
+
+        # an overly generous elapsed time for slow machines
+        if elapsed >= 0.5:
+            blocked_count += 1
+
+        await asyncio.sleep(0.1)
+
+    # Ensure task completes
+    tokens = await task
+    assert tokens == expected_tokens, "Mocked blocking tokenizer was not called"
+    assert blocked_count == 0, ("Event loop blocked during tokenization")
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 0e5279baed29..e58d943d3f7f 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -80,7 +80,7 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of,
+from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of, make_async,
                         merge_async_iterators, random_uuid)
 
 logger = init_logger(__name__)
@@ -240,6 +240,8 @@ class OpenAIServing:
         self.enable_force_include_usage = enable_force_include_usage
 
         self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
+        self._apply_mistral_chat_template_async = make_async(
+            apply_mistral_chat_template, executor=self._tokenizer_executor)
 
         self._async_tokenizer_pool: dict[AnyTokenizer,
                                          AsyncMicrobatchTokenizer] = {}
@@ -798,7 +800,7 @@ class OpenAIServing:
         if tokenizer is None:
             request_prompt = "placeholder"
         elif isinstance(tokenizer, MistralTokenizer):
-            request_prompt = apply_mistral_chat_template(
+            request_prompt = await self._apply_mistral_chat_template_async(
                 tokenizer,
                 messages=messages,
                 **_chat_template_kwargs,
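For context, the change above offloads the blocking `apply_mistral_chat_template` call onto the serving engine's single-worker tokenizer executor via `make_async`, which is what the new test exercises. The snippet below is a minimal, self-contained sketch of that offloading pattern, not vLLM's actual `make_async` implementation: the name `make_async_sketch` and the toy `blocking_tokenize` helper are illustrative only. A blocking callable is wrapped so it runs on a `ThreadPoolExecutor` through `loop.run_in_executor`, keeping the asyncio event loop responsive.

```python
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def make_async_sketch(func, executor=None):
    """Wrap a blocking callable so awaiting it runs it on `executor`."""

    async def _wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        # Run the blocking call on the executor thread, not the event loop.
        return await loop.run_in_executor(executor,
                                          partial(func, *args, **kwargs))

    return _wrapper


def blocking_tokenize(text: str) -> list[int]:
    # Stand-in for a CPU-bound / blocking tokenizer call.
    time.sleep(1)
    return [len(text)]


async def main() -> None:
    executor = ThreadPoolExecutor(max_workers=1)
    tokenize_async = make_async_sketch(blocking_tokenize, executor=executor)

    # The event loop keeps ticking while the blocking call runs on the thread.
    task = asyncio.ensure_future(tokenize_async("hello world"))
    ticks = 0
    while not task.done():
        ticks += 1
        await asyncio.sleep(0.1)
    print(await task, f"(event loop ticked {ticks} times while waiting)")


if __name__ == "__main__":
    asyncio.run(main())
```

This is the same property the test asserts: while the mocked tokenizer sleeps on the executor thread, `await asyncio.sleep(0)` on the event loop should return promptly, so `blocked_count` stays at zero.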