diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 6cc685acd672..2a870dbc3afa 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -273,6 +273,11 @@ class OpenAIServingChat(OpenAIServing):
         try:
             for i, engine_prompt in enumerate(engine_prompts):
                 prompt_text, _, _ = self._get_prompt_components(request_prompts[i])
+                # If we are creating sub requests for multiple prompts, ensure that they
+                # have unique request ids.
+                sub_request_id = (
+                    request_id if len(engine_prompts) == 1 else f"{request_id}_{i}"
+                )
 
                 if self.default_sampling_params is None:
                     self.default_sampling_params = {}
@@ -301,7 +306,7 @@ class OpenAIServingChat(OpenAIServing):
                     )
 
                 self._log_inputs(
-                    request_id,
+                    sub_request_id,
                     request_prompts[i],
                     params=sampling_params,
                     lora_request=lora_request,
@@ -316,14 +321,14 @@ class OpenAIServingChat(OpenAIServing):
                 if isinstance(sampling_params, BeamSearchParams):
                     generator = self.beam_search(
                         prompt=engine_prompt,
-                        request_id=request_id,
+                        request_id=sub_request_id,
                         params=sampling_params,
                         lora_request=lora_request,
                         trace_headers=trace_headers,
                     )
                 else:
                     engine_request, tokenization_kwargs = await self._process_inputs(
-                        request_id,
+                        sub_request_id,
                         engine_prompt,
                         sampling_params,
                         lora_request=lora_request,
@@ -334,7 +339,7 @@ class OpenAIServingChat(OpenAIServing):
                     generator = self.engine_client.generate(
                         engine_request,
                         sampling_params,
-                        request_id,
+                        sub_request_id,
                         lora_request=lora_request,
                         trace_headers=trace_headers,
                         priority=request.priority,
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 7dab5dbacd28..de22c48809dc 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1242,16 +1242,19 @@ class OpenAIServing:
     ):
         prompt_text, _, _ = self._get_prompt_components(request_prompt)
         orig_priority = priority
+        sub_request = 0
         while True:
+            # Ensure that each sub-request has a unique request id.
+            sub_request_id = f"{request_id}_{sub_request}"
             self._log_inputs(
-                request_id,
+                sub_request_id,
                 request_prompt,
                 params=sampling_params,
                 lora_request=lora_request,
             )
             trace_headers = kwargs.get("trace_headers")
             engine_request, tokenization_kwargs = await self._process_inputs(
-                request_id,
+                sub_request_id,
                 engine_prompt,
                 sampling_params,
                 lora_request=lora_request,
@@ -1262,7 +1265,7 @@ class OpenAIServing:
             generator = self.engine_client.generate(
                 engine_request,
                 sampling_params,
-                request_id,
+                sub_request_id,
                 lora_request=lora_request,
                 priority=priority,
                 prompt_text=prompt_text,
@@ -1295,6 +1298,7 @@ class OpenAIServing:
             sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids)
             # OPTIMIZATION
             priority = orig_priority - 1
+            sub_request += 1
 
     def _get_prompt_components(
         self,
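
Note for reviewers: the sketch below is not part of the patch; it only illustrates the id scheme the diff applies. `derive_sub_request_ids` is a hypothetical helper written for this example. A single prompt keeps the original `request_id` unchanged, while multiple prompts in `serving_chat.py` (and successive tool-calling turns in `serving_engine.py`) each get a `_<index>` suffix so every engine request carries a unique id.

```python
# Illustrative sketch only; mirrors the patch's id derivation, not vLLM's API.


def derive_sub_request_ids(request_id: str, num_prompts: int) -> list[str]:
    """Return one id per prompt: the original id when there is a single
    prompt, or "<request_id>_<i>" for each prompt when there are several."""
    if num_prompts == 1:
        return [request_id]
    return [f"{request_id}_{i}" for i in range(num_prompts)]


if __name__ == "__main__":
    # A single prompt keeps the original request id.
    assert derive_sub_request_ids("chatcmpl-abc", 1) == ["chatcmpl-abc"]
    # Multiple prompts get suffixed, mutually unique ids.
    assert derive_sub_request_ids("chatcmpl-abc", 3) == [
        "chatcmpl-abc_0",
        "chatcmpl-abc_1",
        "chatcmpl-abc_2",
    ]
```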