[Misc] Remove yapf directives (#29675)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung, 2025-11-28 23:07:23 +08:00 (committed by GitHub)
parent 460d8bbf2d
commit 0808eb813b
2 changed files with 52 additions and 43 deletions
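
For context: "# yapf: disable" and "# yapf: enable" are paired comment directives that tell the yapf formatter to leave the region between them exactly as written. Since the affected blocks are already laid out the way the project's current formatter would emit them (the new hunks below look like black/ruff-format output, though this commit does not say so explicitly), the markers are inert and can simply be deleted. A minimal sketch of what such a guarded region does, using made-up data rather than vLLM code:

```python
# Illustrative only -- not taken from the vLLM source.
# yapf leaves everything between the two directives untouched,
# preserving the hand-aligned layout it would otherwise collapse.

# yapf: disable
LOOKUP = {
    "a":    1,
    "bb":   22,
    "ccc":  333,
}
# yapf: enable

print(LOOKUP["bb"])  # the directives are plain comments at runtime: prints 22
```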

View File

@@ -5,15 +5,12 @@ import importlib
 from collections.abc import Callable
 from typing import TYPE_CHECKING
 
-# yapf: disable
 from vllm.distributed.ec_transfer.ec_connector.base import (
     ECConnectorBase,
     ECConnectorRole,
 )
 from vllm.logger import init_logger
-# yapf: enable
 
 if TYPE_CHECKING:
     from vllm.config import ECTransferConfig, VllmConfig

View File

@@ -7,7 +7,6 @@ from collections.abc import Sequence as GenericSequence
 from fastapi import Request
 
-# yapf: disable
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
@@ -49,22 +48,26 @@ class ServingTokens(OpenAIServing):
         enable_prompt_tokens_details: bool = False,
         enable_log_outputs: bool = False,
     ):
-        super().__init__(engine_client=engine_client,
-                         models=models,
-                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            log_error_stack=log_error_stack,
+        )
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_log_outputs = enable_log_outputs
         self.force_no_detokenize = force_no_detokenize
         if force_no_detokenize:
-            logger.info("Tokens-only mode is enabled, skipping detokenization "
-                        "step for incoming requests.")
+            logger.info(
+                "Tokens-only mode is enabled, skipping detokenization "
+                "step for incoming requests."
+            )
 
     async def serve_tokens(
         self,
         request: GenerateRequest,
-        raw_request: Request | None = None
+        raw_request: Request | None = None,
     ) -> GenerateResponse | ErrorResponse:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
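
The reflowed super().__init__(...) and logger.info(...) calls above follow the "magic trailing comma" convention of black/ruff-format style formatters (an assumption about which formatter the project now runs, based on the output style): a trailing comma after the last argument pins the call in its expanded one-argument-per-line form, while a call without one is collapsed onto a single line whenever it fits. A small runnable sketch with a hypothetical helper:

```python
# Hypothetical helper, shown only to illustrate the trailing-comma behaviour.
def configure(host, port, timeout):
    return {"host": host, "port": port, "timeout": timeout}

# No trailing comma: a formatter may join this call onto one line if it fits.
cfg_short = configure("localhost", 8000, 30.0)

# Trailing comma after the last argument: the call stays exploded, one
# argument per line, like the super().__init__() call in the hunk above.
cfg_long = configure(
    "localhost",
    8000,
    30.0,
)

print(cfg_short == cfg_long)  # -> True; only the layout differs
```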
@@ -78,13 +81,13 @@ class ServingTokens(OpenAIServing):
             raise self.engine_client.dead_error
 
         lora_request = None
-        lora_request = self._maybe_get_adapters(request,
-                                                supports_default_mm_loras=True)
+        lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True)
 
         model_name = self.models.model_name(lora_request)
-        request_id = "generate-tokens-" \
-            f"{self._base_request_id(raw_request, request.request_id)}"
+        request_id = (
+            f"generate-tokens-{self._base_request_id(raw_request, request.request_id)}"
+        )
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
@@ -106,13 +109,18 @@ class ServingTokens(OpenAIServing):
         if self.force_no_detokenize:
             sampling_params.detokenize = False
 
-        self._log_inputs(request_id,
-                         request.token_ids,
-                         params=sampling_params,
-                         lora_request=lora_request)
+        self._log_inputs(
+            request_id,
+            request.token_ids,
+            params=sampling_params,
+            lora_request=lora_request,
+        )
 
-        trace_headers = (None if raw_request is None else await
-                         self._get_trace_headers(raw_request.headers))
+        trace_headers = (
+            None
+            if raw_request is None
+            else await self._get_trace_headers(raw_request.headers)
+        )
 
         result_generator = self.engine_client.generate(
             engine_prompt,
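
The trace_headers change above shows how an over-long conditional expression is wrapped: the whole expression goes in parentheses, and the value, the if clause, and the else clause each take their own line. A tiny sketch with stand-in objects (nothing here is vLLM API):

```python
# Stand-in value; in the real code this is the incoming FastAPI Request or None.
raw_request = None

trace_headers = (
    None
    if raw_request is None
    else dict(raw_request.headers)  # hypothetical fallback branch, never reached here
)
print(trace_headers)  # -> None
```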
@@ -131,8 +139,8 @@ class ServingTokens(OpenAIServing):
         try:
             assert result_generator is not None
             return await self.serve_tokens_full_generator(
-                request, result_generator, request_id, model_name,
-                request_metadata)
+                request, result_generator, request_id, model_name, request_metadata
+            )
         except ValueError as e:
             return self.create_error_response(str(e))
@@ -144,7 +152,6 @@ class ServingTokens(OpenAIServing):
         model_name: str,
         request_metadata: RequestResponseMetadata,
     ) -> ErrorResponse | GenerateResponse:
-
         created_time = int(time.time())
         final_res: RequestOutput | None = None
         sampling_params: SamplingParams = request.sampling_params
@@ -179,9 +186,9 @@ class ServingTokens(OpenAIServing):
             choice_data = GenerateResponseChoice(
                 index=output.index,
                 logprobs=logprobs,
-                finish_reason=output.finish_reason
-                if output.finish_reason else "stop",
-                token_ids=as_list(output.token_ids))
+                finish_reason=output.finish_reason if output.finish_reason else "stop",
+                token_ids=as_list(output.token_ids),
+            )
             choices.append(choice_data)
 
             num_generated_tokens += len(output.token_ids)
@@ -191,14 +198,16 @@ class ServingTokens(OpenAIServing):
         if final_res.encoder_prompt_token_ids is not None:
             num_prompt_tokens += len(final_res.encoder_prompt_token_ids)
 
-        usage = UsageInfo(prompt_tokens=num_prompt_tokens,
-                          completion_tokens=num_generated_tokens,
-                          total_tokens=num_prompt_tokens +
-                          num_generated_tokens)
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
 
         if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
             # This info is not available at the /coordinator level
             usage.prompt_tokens_details = PromptTokenUsageInfo(
-                cached_tokens=final_res.num_cached_tokens)
+                cached_tokens=final_res.num_cached_tokens
+            )
 
         request_metadata.final_usage_info = usage
@@ -218,14 +227,13 @@ class ServingTokens(OpenAIServing):
                 # Get the corresponding output token IDs
                 output_token_ids = None
                 if choice.index < len(final_res.outputs):
-                    output_token_ids = final_res.outputs[
-                        choice.index].token_ids
+                    output_token_ids = final_res.outputs[choice.index].token_ids
 
                 if output_token_ids:
                     # Log token_ids only.
                     self.request_logger.log_outputs(
                         request_id=request_id,
                         outputs="",
                         output_token_ids=output_token_ids,
                         finish_reason=choice.finish_reason,
                         is_streaming=False,
@@ -246,10 +254,12 @@ class ServingTokens(OpenAIServing):
         for i, token_id in enumerate(token_ids):
             token = f"token_id:{token_id}"
             step_top_logprobs = top_logprobs[i]
-            if step_top_logprobs is None or step_top_logprobs.get(
-                    token_id) is None:
+            if step_top_logprobs is None or step_top_logprobs.get(token_id) is None:
                 logprobs_content.append(
-                    ChatCompletionLogProbsContent(token=token, ))
+                    ChatCompletionLogProbsContent(
+                        token=token,
+                    )
+                )
             else:
                 step_token = step_top_logprobs[token_id]
@@ -261,9 +271,11 @@ class ServingTokens(OpenAIServing):
                             ChatCompletionLogProb(
                                 token=token,
                                 logprob=max(p[1].logprob, -9999.0),
-                            ) for i, p in enumerate(step_top_logprobs.items())
-                            if num_output_top_logprobs
-                            and i < num_output_top_logprobs
-                        ]))
+                            )
+                            for i, p in enumerate(step_top_logprobs.items())
+                            if num_output_top_logprobs and i < num_output_top_logprobs
+                        ],
+                    )
+                )
 
         return ChatCompletionLogProbs(content=logprobs_content)
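
The final hunk reflows the list comprehension that builds the per-token ChatCompletionLogProb entries: the element expression, the for clause, and the merged if clause each get their own line, and the closing bracket and parentheses stack underneath. The same shape with self-contained, made-up data (the token-to-logprob mapping and the cap are hypothetical, not vLLM values):

```python
# Illustrative data only; in the real code these come from the sampler output.
step_top_logprobs = {101: -0.1, 102: -1.5, 103: -3.2}  # token id -> logprob
num_output_top_logprobs = 2  # hypothetical cap on how many entries to keep

top_logprobs = [
    (token_id, max(logprob, -9999.0))  # clamp very low values, as in the hunk
    for i, (token_id, logprob) in enumerate(step_top_logprobs.items())
    if num_output_top_logprobs and i < num_output_top_logprobs
]
print(top_logprobs)  # -> [(101, -0.1), (102, -1.5)]
```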