[Bugfix] reasoning_parser parameter handling in run_batch.py (#26225)

Signed-off-by: inc-jeong <inc.jeong@navercorp.com>
Signed-off-by: InChang Jeong <inc.jeong@navercorp.com>
Co-authored-by: USER <user@AL02367916.local>
This commit is contained in:
InChang Jeong 2025-10-16 11:24:05 +09:00 committed by GitHub
parent f96bc3649c
commit 0ecc553ee6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 67 additions and 0 deletions

View File

@ -39,6 +39,9 @@ INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/re
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
def test_empty_file():
with (
@ -188,3 +191,50 @@ def test_score(input_batch):
line_dict = json.loads(line)
assert isinstance(line_dict, dict)
assert line_dict["error"] is None
def test_reasoning_parser():
"""
Test that reasoning_parser parameter works correctly in run_batch.
"""
with (
tempfile.NamedTemporaryFile("w") as input_file,
tempfile.NamedTemporaryFile("r") as output_file,
):
input_file.write(INPUT_REASONING_BATCH)
input_file.flush()
proc = subprocess.Popen(
[
"vllm",
"run-batch",
"-i",
input_file.name,
"-o",
output_file.name,
"--model",
"Qwen/Qwen3-0.6B",
"--reasoning-parser",
"qwen3",
],
)
proc.communicate()
proc.wait()
assert proc.returncode == 0, f"{proc=}"
contents = output_file.read()
for line in contents.strip().split("\n"):
# Ensure that the output format conforms to the openai api.
# Validation should throw if the schema is wrong.
BatchRequestOutput.model_validate_json(line)
# Ensure that there is no error in the response.
line_dict = json.loads(line)
assert isinstance(line_dict, dict)
assert line_dict["error"] is None
# Check that reasoning_content is present and not empty
reasoning_content = line_dict["response"]["body"]["choices"][0]["message"][
"reasoning_content"
]
assert reasoning_content is not None
assert len(reasoning_content) > 0

View File

@ -31,6 +31,7 @@ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.serving_score import ServingScores
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.utils import FlexibleArgumentParser, random_uuid
from vllm.version import __version__ as VLLM_VERSION
@ -331,6 +332,17 @@ async def run_request(
return batch_output
def validate_run_batch_args(args):
valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
if (
reasoning_parser := args.structured_outputs_config.reasoning_parser
) and reasoning_parser not in valid_reasoning_parses:
raise KeyError(
f"invalid reasoning parser: {reasoning_parser} "
f"(chose from {{ {','.join(valid_reasoning_parses)} }})"
)
async def run_batch(
engine_client: EngineClient,
args: Namespace,
@ -359,6 +371,7 @@ async def run_batch(
base_model_paths=base_model_paths,
lora_modules=None,
)
openai_serving_chat = (
OpenAIServingChat(
engine_client,
@ -367,12 +380,14 @@ async def run_batch(
request_logger=request_logger,
chat_template=None,
chat_template_content_format="auto",
reasoning_parser=args.structured_outputs_config.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
)
if "generate" in supported_tasks
else None
)
openai_serving_embedding = (
OpenAIServingEmbedding(
engine_client,
@ -504,6 +519,8 @@ async def main(args: Namespace):
from vllm.entrypoints.openai.api_server import build_async_engine_client
from vllm.usage.usage_lib import UsageContext
validate_run_batch_args(args)
async with build_async_engine_client(
args,
usage_context=UsageContext.OPENAI_BATCH_RUNNER,