[Misc] Remove yapf directives (#29675)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
commit 0808eb813b (parent 460d8bbf2d)
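For context on what is being removed: yapf honors the comment pair "# yapf: disable" / "# yapf: enable" and leaves the code between them exactly as written instead of reformatting it. In the hunks below, the pair brackets import blocks whose layout was being protected from yapf. A minimal standalone sketch of the directive semantics (illustrative example, not code from this commit):

# Illustrative only: yapf skips any region bracketed by the two comments.
# yapf: disable
TRANSITION = [
    [0.9, 0.1],
    [0.2, 0.8],
]  # hand-aligned rows inside the disabled region are preserved
# yapf: enable


def row_sum(i: int) -> float:
    # Code outside the directives is still subject to yapf's normal rules.
    return sum(TRANSITION[i])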
@@ -5,15 +5,12 @@ import importlib
 from collections.abc import Callable
 from typing import TYPE_CHECKING
 
-# yapf: disable
 from vllm.distributed.ec_transfer.ec_connector.base import (
     ECConnectorBase,
     ECConnectorRole,
 )
 from vllm.logger import init_logger
 
-# yapf: enable
-
 if TYPE_CHECKING:
     from vllm.config import ECTransferConfig, VllmConfig
 
@@ -7,7 +7,6 @@ from collections.abc import Sequence as GenericSequence
 
 from fastapi import Request
 
-# yapf: disable
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
@@ -49,22 +48,26 @@ class ServingTokens(OpenAIServing):
         enable_prompt_tokens_details: bool = False,
         enable_log_outputs: bool = False,
     ):
-        super().__init__(engine_client=engine_client,
-                         models=models,
-                         request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids,
-                         log_error_stack=log_error_stack)
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            log_error_stack=log_error_stack,
+        )
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_log_outputs = enable_log_outputs
         self.force_no_detokenize = force_no_detokenize
         if force_no_detokenize:
-            logger.info("Tokens-only mode is enabled, skipping detokenization "
-                        "step for incoming requests.")
+            logger.info(
+                "Tokens-only mode is enabled, skipping detokenization "
+                "step for incoming requests."
+            )
 
     async def serve_tokens(
         self,
         request: GenerateRequest,
-        raw_request: Request | None = None
+        raw_request: Request | None = None,
     ) -> GenerateResponse | ErrorResponse:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
@@ -78,13 +81,13 @@ class ServingTokens(OpenAIServing):
             raise self.engine_client.dead_error
 
         lora_request = None
-        lora_request = self._maybe_get_adapters(request,
-                                                supports_default_mm_loras=True)
+        lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True)
 
         model_name = self.models.model_name(lora_request)
 
-        request_id = "generate-tokens-" \
-            f"{self._base_request_id(raw_request, request.request_id)}"
+        request_id = (
+            f"generate-tokens-{self._base_request_id(raw_request, request.request_id)}"
+        )
 
         request_metadata = RequestResponseMetadata(request_id=request_id)
         if raw_request:
@@ -106,13 +109,18 @@ class ServingTokens(OpenAIServing):
         if self.force_no_detokenize:
             sampling_params.detokenize = False
 
-        self._log_inputs(request_id,
-                         request.token_ids,
-                         params=sampling_params,
-                         lora_request=lora_request)
+        self._log_inputs(
+            request_id,
+            request.token_ids,
+            params=sampling_params,
+            lora_request=lora_request,
+        )
 
-        trace_headers = (None if raw_request is None else await
-                         self._get_trace_headers(raw_request.headers))
+        trace_headers = (
+            None
+            if raw_request is None
+            else await self._get_trace_headers(raw_request.headers)
+        )
 
         result_generator = self.engine_client.generate(
             engine_prompt,
@@ -131,8 +139,8 @@ class ServingTokens(OpenAIServing):
         try:
             assert result_generator is not None
             return await self.serve_tokens_full_generator(
-                request, result_generator, request_id, model_name,
-                request_metadata)
+                request, result_generator, request_id, model_name, request_metadata
+            )
         except ValueError as e:
             return self.create_error_response(str(e))
 
@@ -144,7 +152,6 @@ class ServingTokens(OpenAIServing):
         model_name: str,
         request_metadata: RequestResponseMetadata,
     ) -> ErrorResponse | GenerateResponse:
-
         created_time = int(time.time())
         final_res: RequestOutput | None = None
         sampling_params: SamplingParams = request.sampling_params
@@ -179,9 +186,9 @@ class ServingTokens(OpenAIServing):
             choice_data = GenerateResponseChoice(
                 index=output.index,
                 logprobs=logprobs,
-                finish_reason=output.finish_reason
-                if output.finish_reason else "stop",
-                token_ids=as_list(output.token_ids))
+                finish_reason=output.finish_reason if output.finish_reason else "stop",
+                token_ids=as_list(output.token_ids),
+            )
 
             choices.append(choice_data)
             num_generated_tokens += len(output.token_ids)
@@ -191,14 +198,16 @@ class ServingTokens(OpenAIServing):
         if final_res.encoder_prompt_token_ids is not None:
             num_prompt_tokens += len(final_res.encoder_prompt_token_ids)
 
-        usage = UsageInfo(prompt_tokens=num_prompt_tokens,
-                          completion_tokens=num_generated_tokens,
-                          total_tokens=num_prompt_tokens +
-                          num_generated_tokens)
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
         if self.enable_prompt_tokens_details and final_res.num_cached_tokens:
             # This info is not available at the /coordinator level
             usage.prompt_tokens_details = PromptTokenUsageInfo(
-                cached_tokens=final_res.num_cached_tokens)
+                cached_tokens=final_res.num_cached_tokens
+            )
 
         request_metadata.final_usage_info = usage
 
@@ -218,8 +227,7 @@ class ServingTokens(OpenAIServing):
                 # Get the corresponding output token IDs
                 output_token_ids = None
                 if choice.index < len(final_res.outputs):
-                    output_token_ids = final_res.outputs[
-                        choice.index].token_ids
+                    output_token_ids = final_res.outputs[choice.index].token_ids
 
                 if output_token_ids:
                     # Log token_ids only.
@@ -246,10 +254,12 @@ class ServingTokens(OpenAIServing):
         for i, token_id in enumerate(token_ids):
             token = f"token_id:{token_id}"
             step_top_logprobs = top_logprobs[i]
-            if step_top_logprobs is None or step_top_logprobs.get(
-                    token_id) is None:
+            if step_top_logprobs is None or step_top_logprobs.get(token_id) is None:
                 logprobs_content.append(
-                    ChatCompletionLogProbsContent(token=token, ))
+                    ChatCompletionLogProbsContent(
+                        token=token,
+                    )
+                )
             else:
                 step_token = step_top_logprobs[token_id]
 
@@ -261,9 +271,11 @@ class ServingTokens(OpenAIServing):
                             ChatCompletionLogProb(
                                 token=token,
                                 logprob=max(p[1].logprob, -9999.0),
-                            ) for i, p in enumerate(step_top_logprobs.items())
-                            if num_output_top_logprobs
-                            and i < num_output_top_logprobs
-                        ]))
+                            )
+                            for i, p in enumerate(step_top_logprobs.items())
+                            if num_output_top_logprobs and i < num_output_top_logprobs
+                        ],
+                    )
+                )
 
         return ChatCompletionLogProbs(content=logprobs_content)
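A note on the reflowed call sites above: each multi-line call now places one argument per line with a hanging indent and ends the argument list with a trailing comma before a closing parenthesis on its own line. Formatters in the black/ruff-format family treat that trailing ("magic") comma as a signal to keep the call expanded one-argument-per-line on future runs, so the layout stays stable. A small standalone sketch of the resulting style; the submit helper is hypothetical and exists only for illustration:

# Hypothetical helper, not part of vLLM; it only demonstrates the layout.
def submit(request_id: str, token_ids: list[int], params: dict | None = None) -> str:
    return f"{request_id}: {len(token_ids)} tokens"


# Black/ruff-format style call: hanging indent, one argument per line, and a
# trailing ("magic") comma that keeps the call in this expanded form.
result = submit(
    "generate-tokens-123",
    [1, 2, 3],
    params={"max_tokens": 8},
)
print(result)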