Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 06:45:01 +08:00)
[Misc] Remove yapf directives (#29675)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 460d8bbf2d
commit 0808eb813b
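For context, a minimal sketch (not taken from the diff; the function and argument names are illustrative) of the two call-layout styles this change converts between: the yapf style that aligns continuation arguments under the opening parenthesis, and the layout used once the directives are removed, with one argument per line, a trailing comma, and the closing parenthesis on its own line.

# Old, yapf-aligned continuation style: arguments line up under the opening paren.
def usage_old(prompt_tokens: int, completion_tokens: int) -> dict:
    return dict(prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens +
                completion_tokens)


# New style: one argument per line, trailing comma, closing paren on its own line,
# so no `# yapf: disable` / `# yapf: enable` fences are needed to protect the layout.
def usage_new(prompt_tokens: int, completion_tokens: int) -> dict:
    return dict(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )


# Both layouts produce the same result; only the formatting differs.
assert usage_old(3, 5) == usage_new(3, 5)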
@@ -5,15 +5,12 @@ import importlib
 from collections.abc import Callable
 from typing import TYPE_CHECKING
 
-# yapf: disable
 from vllm.distributed.ec_transfer.ec_connector.base import (
     ECConnectorBase,
     ECConnectorRole,
 )
 from vllm.logger import init_logger
 
-# yapf: enable
-
 if TYPE_CHECKING:
     from vllm.config import ECTransferConfig, VllmConfig
 
@@ -7,7 +7,6 @@ from collections.abc import Sequence as GenericSequence
 
 from fastapi import Request
 
-# yapf: disable
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
@@ -49,22 +48,26 @@ class ServingTokens(OpenAIServing):
         enable_prompt_tokens_details: bool = False,
         enable_log_outputs: bool = False,
     ):
-        super().__init__(engine_client=engine_client,
-                         models=models,
-                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            log_error_stack=log_error_stack,
+        )
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_log_outputs = enable_log_outputs
         self.force_no_detokenize = force_no_detokenize
         if force_no_detokenize:
-            logger.info("Tokens-only mode is enabled, skipping detokenization "
-                        "step for incoming requests.")
+            logger.info(
+                "Tokens-only mode is enabled, skipping detokenization "
+                "step for incoming requests."
+            )
 
     async def serve_tokens(
         self,
         request: GenerateRequest,
-        raw_request: Request | None = None
+        raw_request: Request | None = None,
     ) -> GenerateResponse | ErrorResponse:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
@@ -78,13 +81,13 @@ class ServingTokens(OpenAIServing):
             raise self.engine_client.dead_error
 
         lora_request = None
-        lora_request = self._maybe_get_adapters(request,
-                                                supports_default_mm_loras=True)
+        lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True)
 
         model_name = self.models.model_name(lora_request)
 
-        request_id = "generate-tokens-" \
-            f"{self._base_request_id(raw_request, request.request_id)}"
+        request_id = (
+            f"generate-tokens-{self._base_request_id(raw_request, request.request_id)}"
+        )
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
@@ -106,13 +109,18 @@ class ServingTokens(OpenAIServing):
         if self.force_no_detokenize:
             sampling_params.detokenize = False
 
-        self._log_inputs(request_id,
-                         request.token_ids,
-                         params=sampling_params,
-                         lora_request=lora_request)
+        self._log_inputs(
+            request_id,
+            request.token_ids,
+            params=sampling_params,
+            lora_request=lora_request,
+        )
 
-        trace_headers = (None if raw_request is None else await
-                         self._get_trace_headers(raw_request.headers))
+        trace_headers = (
+            None
+            if raw_request is None
+            else await self._get_trace_headers(raw_request.headers)
+        )
 
         result_generator = self.engine_client.generate(
             engine_prompt,
@@ -131,8 +139,8 @@ class ServingTokens(OpenAIServing):
         try:
             assert result_generator is not None
             return await self.serve_tokens_full_generator(
-                request, result_generator, request_id, model_name,
-                request_metadata)
+                request, result_generator, request_id, model_name, request_metadata
+            )
         except ValueError as e:
             return self.create_error_response(str(e))
 
@@ -144,7 +152,6 @@ class ServingTokens(OpenAIServing):
         model_name: str,
         request_metadata: RequestResponseMetadata,
     ) -> ErrorResponse | GenerateResponse:
-
         created_time = int(time.time())
         final_res: RequestOutput | None = None
         sampling_params: SamplingParams = request.sampling_params
@@ -179,9 +186,9 @@ class ServingTokens(OpenAIServing):
             choice_data = GenerateResponseChoice(
                 index=output.index,
                 logprobs=logprobs,
-                finish_reason=output.finish_reason
-                if output.finish_reason else "stop",
-                token_ids=as_list(output.token_ids))
+                finish_reason=output.finish_reason if output.finish_reason else "stop",
+                token_ids=as_list(output.token_ids),
+            )
 
             choices.append(choice_data)
             num_generated_tokens += len(output.token_ids)
@@ -191,14 +198,16 @@ class ServingTokens(OpenAIServing):
         if final_res.encoder_prompt_token_ids is not None:
             num_prompt_tokens += len(final_res.encoder_prompt_token_ids)
 
-        usage = UsageInfo(prompt_tokens=num_prompt_tokens,
-                          completion_tokens=num_generated_tokens,
-                          total_tokens=num_prompt_tokens +
-                          num_generated_tokens)
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
         if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
             # This info is not available at the /coordinator level
             usage.prompt_tokens_details = PromptTokenUsageInfo(
-                cached_tokens=final_res.num_cached_tokens)
+                cached_tokens=final_res.num_cached_tokens
+            )
 
         request_metadata.final_usage_info = usage
 
@@ -218,8 +227,7 @@ class ServingTokens(OpenAIServing):
                 # Get the corresponding output token IDs
                 output_token_ids = None
                 if choice.index < len(final_res.outputs):
-                    output_token_ids = final_res.outputs[
-                        choice.index].token_ids
+                    output_token_ids = final_res.outputs[choice.index].token_ids
 
                 if output_token_ids:
                     # Log token_ids only.
@@ -246,10 +254,12 @@ class ServingTokens(OpenAIServing):
         for i, token_id in enumerate(token_ids):
             token = f"token_id:{token_id}"
             step_top_logprobs = top_logprobs[i]
-            if step_top_logprobs is None or step_top_logprobs.get(
-                    token_id) is None:
+            if step_top_logprobs is None or step_top_logprobs.get(token_id) is None:
                 logprobs_content.append(
-                    ChatCompletionLogProbsContent(token=token, ))
+                    ChatCompletionLogProbsContent(
+                        token=token,
+                    )
+                )
             else:
                 step_token = step_top_logprobs[token_id]
 
@@ -261,9 +271,11 @@ class ServingTokens(OpenAIServing):
                             ChatCompletionLogProb(
                                 token=token,
                                 logprob=max(p[1].logprob, -9999.0),
-                            ) for i, p in enumerate(step_top_logprobs.items())
-                            if num_output_top_logprobs
-                            and i < num_output_top_logprobs
-                        ]))
+                            )
+                            for i, p in enumerate(step_top_logprobs.items())
+                            if num_output_top_logprobs and i < num_output_top_logprobs
+                        ],
+                    )
+                )
 
         return ChatCompletionLogProbs(content=logprobs_content)
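The reformatted code also leans on two plain-Python constructs worth noting: a conditional expression wrapped in parentheses so the condition and each branch sit on their own lines, and implicit concatenation of adjacent string literals inside a parenthesized call. A small self-contained sketch (the names here are illustrative, not vllm APIs):

def describe_request(raw_request: dict | None) -> str:
    # Parenthesized conditional expression: condition and both branches each get
    # their own line, mirroring how trace_headers is computed in the diff above.
    headers = (
        None
        if raw_request is None
        else raw_request.get("headers")
    )

    # Implicit concatenation of adjacent string literals split across lines inside
    # parentheses, mirroring the reformatted logger.info() message.
    message = (
        "Tokens-only mode is enabled, skipping detokenization "
        "step for incoming requests."
    )
    return f"{message} (headers={headers})"


print(describe_request(None))
print(describe_request({"headers": {"x-request-id": "abc123"}}))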