diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index e17f25afe4c9..5a97739e5a34 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -39,6 +39,9 @@ INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/re
 {"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
 {"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
 
+INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
+
 
 def test_empty_file():
     with (
@@ -188,3 +191,50 @@ def test_score(input_batch):
         line_dict = json.loads(line)
         assert isinstance(line_dict, dict)
         assert line_dict["error"] is None
+
+
+def test_reasoning_parser():
+    """
+    Test that reasoning_parser parameter works correctly in run_batch.
+    """
+    with (
+        tempfile.NamedTemporaryFile("w") as input_file,
+        tempfile.NamedTemporaryFile("r") as output_file,
+    ):
+        input_file.write(INPUT_REASONING_BATCH)
+        input_file.flush()
+        proc = subprocess.Popen(
+            [
+                "vllm",
+                "run-batch",
+                "-i",
+                input_file.name,
+                "-o",
+                output_file.name,
+                "--model",
+                "Qwen/Qwen3-0.6B",
+                "--reasoning-parser",
+                "qwen3",
+            ],
+        )
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+
+        contents = output_file.read()
+        for line in contents.strip().split("\n"):
+            # Ensure that the output format conforms to the openai api.
+            # Validation should throw if the schema is wrong.
+            BatchRequestOutput.model_validate_json(line)
+
+            # Ensure that there is no error in the response.
+            line_dict = json.loads(line)
+            assert isinstance(line_dict, dict)
+            assert line_dict["error"] is None
+
+            # Check that reasoning_content is present and not empty
+            reasoning_content = line_dict["response"]["body"]["choices"][0]["message"][
+                "reasoning_content"
+            ]
+            assert reasoning_content is not None
+            assert len(reasoning_content) > 0
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index c8ca6e7d29ba..da036e30ba7e 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -31,6 +31,7 @@ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.entrypoints.openai.serving_score import ServingScores
 from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParserManager
 from vllm.utils import FlexibleArgumentParser, random_uuid
 from vllm.version import __version__ as VLLM_VERSION
 
@@ -331,6 +332,17 @@ async def run_request(
     return batch_output
 
 
+def validate_run_batch_args(args):
+    valid_reasoning_parsers = ReasoningParserManager.reasoning_parsers.keys()
+    if (
+        reasoning_parser := args.structured_outputs_config.reasoning_parser
+    ) and reasoning_parser not in valid_reasoning_parsers:
+        raise KeyError(
+            f"invalid reasoning parser: {reasoning_parser} "
+            f"(choose from {{ {','.join(valid_reasoning_parsers)} }})"
+        )
+
+
 async def run_batch(
     engine_client: EngineClient,
     args: Namespace,
@@ -359,6 +371,7 @@ async def run_batch(
         base_model_paths=base_model_paths,
         lora_modules=None,
     )
+
     openai_serving_chat = (
         OpenAIServingChat(
             engine_client,
@@ -367,12 +380,14 @@ async def run_batch(
             request_logger=request_logger,
             chat_template=None,
             chat_template_content_format="auto",
+            reasoning_parser=args.structured_outputs_config.reasoning_parser,
             enable_prompt_tokens_details=args.enable_prompt_tokens_details,
             enable_force_include_usage=args.enable_force_include_usage,
         )
         if "generate" in supported_tasks
         else None
     )
+
     openai_serving_embedding = (
         OpenAIServingEmbedding(
             engine_client,
@@ -504,6 +519,8 @@ async def main(args: Namespace):
     from vllm.entrypoints.openai.api_server import build_async_engine_client
     from vllm.usage.usage_lib import UsageContext
 
+    validate_run_batch_args(args)
+
     async with build_async_engine_client(
         args,
         usage_context=UsageContext.OPENAI_BATCH_RUNNER,
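
For context, a minimal end-to-end sketch of how the new --reasoning-parser flag is exercised (not part of the patch): the CLI arguments and the output field layout mirror test_reasoning_parser above, while the input/output file names are placeholders.

    import json
    import subprocess

    # Run the batch with a reasoning parser so the model's <think> section is
    # split out of the chat content (file paths below are hypothetical).
    subprocess.run(
        [
            "vllm", "run-batch",
            "-i", "batch_input.jsonl",
            "-o", "batch_output.jsonl",
            "--model", "Qwen/Qwen3-0.6B",
            "--reasoning-parser", "qwen3",
        ],
        check=True,
    )

    # Each output line is a BatchRequestOutput; with the parser enabled the
    # message carries reasoning_content alongside the regular content.
    with open("batch_output.jsonl") as f:
        for line in f:
            message = json.loads(line)["response"]["body"]["choices"][0]["message"]
            print(message["reasoning_content"])
            print(message["content"])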