[Misc] Remove yapf directives (#29675)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date:   2025-11-28 23:07:23 +08:00 (committed by GitHub)
Commit: 0808eb813b
Parent: 460d8bbf2d
2 changed files with 52 additions and 43 deletions
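
For context: a "# yapf: disable" / "# yapf: enable" pair tells the yapf formatter to leave everything between the two comments untouched, and was typically used to protect hand-wrapped blocks such as parenthesized imports. Since the surrounding code is now wrapped in the trailing-comma, one-argument-per-line style of an auto-formatter (black/ruff-style in the new hunks), the guards no longer change anything, so this commit simply deletes the comment lines. An illustrative sketch of the pattern being removed, using the import from the first hunk below:

    # yapf: disable  <- yapf stops formatting here
    from vllm.distributed.ec_transfer.ec_connector.base import (
        ECConnectorBase,
        ECConnectorRole,
    )
    # yapf: enable   <- formatting resumes here; only these two comment lines are deleted,
                     #   the import itself is left unchanged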

File 1 of 2:

@@ -5,15 +5,12 @@ import importlib
 from collections.abc import Callable
 from typing import TYPE_CHECKING
-# yapf: disable
 from vllm.distributed.ec_transfer.ec_connector.base import (
     ECConnectorBase,
     ECConnectorRole,
 )
 from vllm.logger import init_logger
-# yapf: enable
 if TYPE_CHECKING:
     from vllm.config import ECTransferConfig, VllmConfig

File 2 of 2:

@@ -7,7 +7,6 @@ from collections.abc import Sequence as GenericSequence
 from fastapi import Request
-# yapf: disable
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
@@ -49,22 +48,26 @@ class ServingTokens(OpenAIServing):
         enable_prompt_tokens_details: bool = False,
         enable_log_outputs: bool = False,
     ):
-        super().__init__(engine_client=engine_client,
-                         models=models,
-                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            log_error_stack=log_error_stack,
+        )
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_log_outputs = enable_log_outputs
         self.force_no_detokenize = force_no_detokenize
         if force_no_detokenize:
-            logger.info("Tokens-only mode is enabled, skipping detokenization "
-                        "step for incoming requests.")
+            logger.info(
+                "Tokens-only mode is enabled, skipping detokenization "
+                "step for incoming requests."
+            )
     async def serve_tokens(
         self,
         request: GenerateRequest,
-        raw_request: Request | None = None
+        raw_request: Request | None = None,
     ) -> GenerateResponse | ErrorResponse:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
@@ -78,13 +81,13 @@ class ServingTokens(OpenAIServing):
             raise self.engine_client.dead_error
         lora_request = None
-        lora_request = self._maybe_get_adapters(request,
-                                                supports_default_mm_loras=True)
+        lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True)
         model_name = self.models.model_name(lora_request)
-        request_id = "generate-tokens-" \
-            f"{self._base_request_id(raw_request, request.request_id)}"
+        request_id = (
+            f"generate-tokens-{self._base_request_id(raw_request, request.request_id)}"
+        )
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
@@ -106,13 +109,18 @@ class ServingTokens(OpenAIServing):
         if self.force_no_detokenize:
             sampling_params.detokenize = False
-        self._log_inputs(request_id,
-                         request.token_ids,
-                         params=sampling_params,
-                         lora_request=lora_request)
+        self._log_inputs(
+            request_id,
+            request.token_ids,
+            params=sampling_params,
+            lora_request=lora_request,
+        )
-        trace_headers = (None if raw_request is None else await
-                         self._get_trace_headers(raw_request.headers))
+        trace_headers = (
+            None
+            if raw_request is None
+            else await self._get_trace_headers(raw_request.headers)
+        )
         result_generator = self.engine_client.generate(
             engine_prompt,
@@ -131,8 +139,8 @@ class ServingTokens(OpenAIServing):
         try:
             assert result_generator is not None
             return await self.serve_tokens_full_generator(
-                request, result_generator, request_id, model_name,
-                request_metadata)
+                request, result_generator, request_id, model_name, request_metadata
+            )
         except ValueError as e:
             return self.create_error_response(str(e))
@@ -144,7 +152,6 @@ class ServingTokens(OpenAIServing):
         model_name: str,
         request_metadata: RequestResponseMetadata,
     ) -> ErrorResponse | GenerateResponse:
         created_time = int(time.time())
         final_res: RequestOutput | None = None
         sampling_params: SamplingParams = request.sampling_params
@@ -179,9 +186,9 @@ class ServingTokens(OpenAIServing):
             choice_data = GenerateResponseChoice(
                 index=output.index,
                 logprobs=logprobs,
-                finish_reason=output.finish_reason
-                if output.finish_reason else "stop",
-                token_ids=as_list(output.token_ids))
+                finish_reason=output.finish_reason if output.finish_reason else "stop",
+                token_ids=as_list(output.token_ids),
+            )
             choices.append(choice_data)
             num_generated_tokens += len(output.token_ids)
@@ -191,14 +198,16 @@ class ServingTokens(OpenAIServing):
         if final_res.encoder_prompt_token_ids is not None:
             num_prompt_tokens += len(final_res.encoder_prompt_token_ids)
-        usage = UsageInfo(prompt_tokens=num_prompt_tokens,
-                          completion_tokens=num_generated_tokens,
-                          total_tokens=num_prompt_tokens +
-                          num_generated_tokens)
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
         if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
             # This info is not available at the /coordinator level
             usage.prompt_tokens_details = PromptTokenUsageInfo(
-                cached_tokens=final_res.num_cached_tokens)
+                cached_tokens=final_res.num_cached_tokens
+            )
         request_metadata.final_usage_info = usage
@@ -218,14 +227,13 @@ class ServingTokens(OpenAIServing):
                 # Get the corresponding output token IDs
                 output_token_ids = None
                 if choice.index < len(final_res.outputs):
-                    output_token_ids = final_res.outputs[
-                        choice.index].token_ids
+                    output_token_ids = final_res.outputs[choice.index].token_ids
                 if output_token_ids:
                     # Log token_ids only.
                     self.request_logger.log_outputs(
                         request_id=request_id,
                         outputs="",
                         output_token_ids=output_token_ids,
                         finish_reason=choice.finish_reason,
                         is_streaming=False,
@@ -246,10 +254,12 @@
         for i, token_id in enumerate(token_ids):
             token = f"token_id:{token_id}"
             step_top_logprobs = top_logprobs[i]
-            if step_top_logprobs is None or step_top_logprobs.get(
-                    token_id) is None:
+            if step_top_logprobs is None or step_top_logprobs.get(token_id) is None:
                 logprobs_content.append(
-                    ChatCompletionLogProbsContent(token=token, ))
+                    ChatCompletionLogProbsContent(
+                        token=token,
+                    )
+                )
             else:
                 step_token = step_top_logprobs[token_id]
@@ -261,9 +271,11 @@
                             ChatCompletionLogProb(
                                 token=token,
                                 logprob=max(p[1].logprob, -9999.0),
-                            ) for i, p in enumerate(step_top_logprobs.items())
-                            if num_output_top_logprobs
-                            and i < num_output_top_logprobs
-                        ]))
+                            )
+                            for i, p in enumerate(step_top_logprobs.items())
+                            if num_output_top_logprobs and i < num_output_top_logprobs
+                        ],
+                    )
+                )
         return ChatCompletionLogProbs(content=logprobs_content)