diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 6c6ec207a3cac..b6b3bf3f530e3 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel):
             "default: 0). Any priority other than 0 will raise an error "
             "if the served model does not use priority scheduling."),
     )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."),
+    )
     logits_processors: Optional[LogitsProcessors] = Field(
         default=None,
         description=(
@@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
             "default: 0). Any priority other than 0 will raise an error "
             "if the served model does not use priority scheduling."),
     )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."),
+    )
 
     # --8<-- [end:embedding-extra-params]
 
@@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "default: 0). Any priority other than 0 will raise an error "
             "if the served model does not use priority scheduling."),
     )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."),
+    )
     # --8<-- [end:chat-embedding-extra-params]
 
     @model_validator(mode="before")
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 323795ca4372d..22c6b6250394c 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing):
             return self.create_error_response(
                 "Echo is unsupported with prompt embeds.")
 
-        request_id = f"cmpl-{self._base_request_id(raw_request)}"
+        request_id = (
+            f"cmpl-"
+            f"{self._base_request_id(raw_request, request.request_id)}")
         created_time = int(time.time())
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 697f43c018b27..84ba00873103d 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin):
         for the API specification. This API mimics the OpenAI Embedding API.
         """
         model_name = self._get_model_name(request.model)
-        request_id = (f"{self.request_id_prefix}-"
-                      f"{self._base_request_id(raw_request)}")
+        request_id = (
+            f"{self.request_id_prefix}-"
+            f"{self._base_request_id(raw_request, request.request_id)}")
 
         ctx = EmbeddingServeContext(
            request=request,
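
For context, a minimal client-side sketch of how a caller might exercise the new field. The server URL and model name below are placeholders, and the exact precedence between an `X-Request-Id` header and the body field is determined by `_base_request_id`; this is an illustration under those assumptions, not part of the diff.

# Assumes a vLLM OpenAI-compatible server at localhost:8000 serving "my-model".
import requests

payload = {
    "model": "my-model",
    "prompt": "Hello",
    "max_tokens": 8,
    # New extra parameter added by this diff; if omitted, the server
    # falls back to a randomly generated UUID.
    "request_id": "trace-1234",
}

resp = requests.post("http://localhost:8000/v1/completions", json=payload)
# With this change, the response id should carry the caller-supplied id,
# e.g. "cmpl-trace-1234", unless an X-Request-Id header takes precedence.
print(resp.json()["id"])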