[BugFix] Fix duplicate id tool-call race condition (#29355)
Signed-off-by: Nick Hill <nhill@redhat.com>
parent b8328b49fb
commit a178a0b40b
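The patch touches OpenAIServingChat and the base OpenAIServing class so that every engine-level sub-request spawned from a single API request gets its own id instead of reusing the parent request_id, which could previously collide when a chat request fanned out into several generations (e.g. parallel tool calls). As a rough illustration of the per-prompt scheme the first hunks introduce, here is a minimal standalone Python sketch; sub_request_ids is a hypothetical helper written for this note, not part of vLLM:

# Hypothetical helper mirroring the per-prompt id scheme in the patch:
# a single prompt keeps the original id (so single-prompt behaviour is
# unchanged), while multiple prompts each get an index suffix.
def sub_request_ids(request_id: str, num_prompts: int) -> list[str]:
    if num_prompts == 1:
        return [request_id]
    return [f"{request_id}_{i}" for i in range(num_prompts)]


if __name__ == "__main__":
    print(sub_request_ids("chatcmpl-abc123", 1))
    # ['chatcmpl-abc123']
    print(sub_request_ids("chatcmpl-abc123", 3))
    # ['chatcmpl-abc123_0', 'chatcmpl-abc123_1', 'chatcmpl-abc123_2']

Because every in-flight engine request now carries a distinct id, two concurrent generations from the same API call can no longer race on the same identifier.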
@@ -273,6 +273,11 @@ class OpenAIServingChat(OpenAIServing):
         try:
             for i, engine_prompt in enumerate(engine_prompts):
                 prompt_text, _, _ = self._get_prompt_components(request_prompts[i])
+                # If we are creating sub requests for multiple prompts, ensure that they
+                # have unique request ids.
+                sub_request_id = (
+                    request_id if len(engine_prompts) == 1 else f"{request_id}_{i}"
+                )
 
                 if self.default_sampling_params is None:
                     self.default_sampling_params = {}
@@ -301,7 +306,7 @@ class OpenAIServingChat(OpenAIServing):
                     )
 
                 self._log_inputs(
-                    request_id,
+                    sub_request_id,
                     request_prompts[i],
                     params=sampling_params,
                     lora_request=lora_request,
@@ -316,14 +321,14 @@ class OpenAIServingChat(OpenAIServing):
                 if isinstance(sampling_params, BeamSearchParams):
                     generator = self.beam_search(
                         prompt=engine_prompt,
-                        request_id=request_id,
+                        request_id=sub_request_id,
                         params=sampling_params,
                         lora_request=lora_request,
                         trace_headers=trace_headers,
                     )
                 else:
                     engine_request, tokenization_kwargs = await self._process_inputs(
-                        request_id,
+                        sub_request_id,
                         engine_prompt,
                         sampling_params,
                         lora_request=lora_request,
@@ -334,7 +339,7 @@ class OpenAIServingChat(OpenAIServing):
                     generator = self.engine_client.generate(
                         engine_request,
                         sampling_params,
-                        request_id,
+                        sub_request_id,
                         lora_request=lora_request,
                         trace_headers=trace_headers,
                         priority=request.priority,

@@ -1242,16 +1242,19 @@ class OpenAIServing:
     ):
         prompt_text, _, _ = self._get_prompt_components(request_prompt)
         orig_priority = priority
+        sub_request = 0
         while True:
+            # Ensure that each sub-request has a unique request id.
+            sub_request_id = f"{request_id}_{sub_request}"
             self._log_inputs(
-                request_id,
+                sub_request_id,
                 request_prompt,
                 params=sampling_params,
                 lora_request=lora_request,
             )
             trace_headers = kwargs.get("trace_headers")
             engine_request, tokenization_kwargs = await self._process_inputs(
-                request_id,
+                sub_request_id,
                 engine_prompt,
                 sampling_params,
                 lora_request=lora_request,
@@ -1262,7 +1265,7 @@ class OpenAIServing:
             generator = self.engine_client.generate(
                 engine_request,
                 sampling_params,
-                request_id,
+                sub_request_id,
                 lora_request=lora_request,
                 priority=priority,
                 prompt_text=prompt_text,
@@ -1295,6 +1298,7 @@ class OpenAIServing:
             sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids)
             # OPTIMIZATION
             priority = orig_priority - 1
+            sub_request += 1
 
     def _get_prompt_components(
         self,
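The OpenAIServing hunks apply the same idea to the internal retry loop: a sub_request counter is bumped on every pass through the `while True` loop, so a resubmitted generation never reuses the id of the attempt it replaces. A minimal standalone sketch of that counter scheme follows; retry_ids is a hypothetical helper and the attempts bound exists only to keep the example finite, unlike the open-ended loop in the patch:

# Hypothetical helper mirroring the retry-loop id scheme in the patch.
def retry_ids(request_id: str, attempts: int) -> list[str]:
    ids: list[str] = []
    sub_request = 0
    while sub_request < attempts:  # the patch uses `while True` and exits elsewhere
        # Each pass gets a unique id, as in f"{request_id}_{sub_request}".
        ids.append(f"{request_id}_{sub_request}")
        sub_request += 1  # incremented at the end of each iteration, as in the patch
    return ids


if __name__ == "__main__":
    print(retry_ids("chatcmpl-xyz789", 3))
    # ['chatcmpl-xyz789_0', 'chatcmpl-xyz789_1', 'chatcmpl-xyz789_2']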