diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index b59f7120551e0..7512723515e0c 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -60,7 +60,8 @@ async def generate(request: Request) -> Response:
 async def _generate(request_dict: dict, raw_request: Request) -> Response:
     prompt = request_dict.pop("prompt")
     stream = request_dict.pop("stream", False)
-    sampling_params = SamplingParams(**request_dict)
+    # Since SamplingParams is created fresh per request, safe to skip clone
+    sampling_params = SamplingParams(**request_dict, skip_clone=True)
     request_id = random_uuid()
 
     assert engine is not None
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 6be1f1a126f55..1a1874b7d6d22 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -642,7 +642,10 @@ class LLM:
         # following the huggingface transformers implementation
         # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
         beam_search_params = SamplingParams(
-            logprobs=2 * beam_width, max_tokens=1, temperature=temperature
+            logprobs=2 * beam_width,
+            max_tokens=1,
+            temperature=temperature,
+            skip_clone=True,  # Internal beam search, safe to skip clone
         )
 
         instances: list[BeamSearchInstance] = []
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 982f5533ad7f9..a705ab0d4ddb9 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -474,6 +474,7 @@ class ResponsesRequest(OpenAIBaseModel):
             ),
             structured_outputs=structured_outputs,
             logit_bias=self.logit_bias,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     def is_include_output_logprobs(self) -> bool:
@@ -876,6 +877,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             bad_words=self.bad_words,
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     @model_validator(mode="before")
@@ -1316,6 +1318,7 @@ class CompletionRequest(OpenAIBaseModel):
             logit_bias=self.logit_bias,
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     @model_validator(mode="before")
@@ -2182,6 +2185,7 @@ class TranscriptionRequest(OpenAIBaseModel):
             if self.stream
             else RequestOutputKind.FINAL_ONLY,
             extra_args=self.vllm_xargs,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     @model_validator(mode="before")
@@ -2409,6 +2413,7 @@ class TranslationRequest(OpenAIBaseModel):
             output_kind=RequestOutputKind.DELTA
             if self.stream
             else RequestOutputKind.FINAL_ONLY,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
         )
 
     @model_validator(mode="before")
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index 22da46902da14..acfc8160a6d77 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -219,6 +219,7 @@ class OpenAISpeechToText(OpenAIServing):
         dummy_params = SamplingParams(
             max_tokens=1,
             temperature=0.0,
+            skip_clone=True,  # Internal warmup, safe to skip clone
         )
 
         # Process the dummy input through the input processor
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 453100f2e5135..541e6fbcdcfe3 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -211,6 +211,12 @@ class SamplingParams(
     set to an integer k, will use only the last k tokens from the prompt
     (i.e., left truncation). If set to `None`, truncation is disabled."""
     output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE
+    skip_clone: bool = False
+    """Internal flag indicating that this SamplingParams instance is safe to
+    reuse without cloning. When True, clone() returns a shallow copy of self
+    instead of performing a deep copy. This should only be set when the params
+    object is guaranteed to be dedicated to a single request and won't be
+    modified in ways that would affect other uses."""
 
     # The below fields are not supposed to be used as an input.
     # They are set in post_init.
@@ -270,6 +276,7 @@ class SamplingParams(
         logit_bias: dict[int, float] | dict[str, float] | None = None,
         allowed_token_ids: list[int] | None = None,
         extra_args: dict[str, Any] | None = None,
+        skip_clone: bool = False,
     ) -> "SamplingParams":
         if logit_bias is not None:
             # Convert token_id to integer
@@ -310,6 +317,7 @@ class SamplingParams(
             logit_bias=logit_bias,
             allowed_token_ids=allowed_token_ids,
             extra_args=extra_args,
+            skip_clone=skip_clone,
         )
 
     def __post_init__(self) -> None:
@@ -540,8 +548,13 @@ class SamplingParams(
         data that is expensive to copy. However, if not copied, the
         processor needs to support parallel decoding for multiple sequences
         See https://github.com/vllm-project/vllm/issues/3087
+
+        If skip_clone is True, uses shallow copy instead of deep copy.
         """
+        if self.skip_clone:
+            return copy.copy(self)
+
         logit_processor_refs = (
             None
             if self.logits_processors is None
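
Illustrative usage (a minimal sketch of the skip_clone fast path introduced above; the specific sampling values are made up for illustration and are not part of this change):

    from vllm import SamplingParams

    # Per-request params are constructed fresh, so cloning can be skipped.
    params = SamplingParams(temperature=0.8, max_tokens=64, skip_clone=True)

    # With skip_clone=True, clone() returns a cheap shallow copy instead of
    # deep-copying every field (e.g. logits processors).
    cloned = params.clone()
    assert cloned is not params        # still a distinct object
    assert cloned.max_tokens == 64     # field values are preserved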