add(v1): RequestStateStats to RequestOutput (#24947)

Signed-off-by: huijjj <huijong.jeong@squeezebits.com>
Author: HUIJONG JEONG, 2025-10-03 17:56:25 +09:00 (committed by GitHub)
parent eb0fa43868
commit 3e70e3d4d5
3 changed files with 24 additions and 11 deletions


@@ -86,3 +86,16 @@ def test_max_model_len():
     # It can be less if generation finishes due to other reasons (e.g., EOS)
     # before reaching the absolute model length limit.
     assert num_total_tokens <= max_model_len
+
+
+def test_log_stats():
+    llm = LLM(
+        model=MODEL_NAME,
+        disable_log_stats=False,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,  # reduce test time
+    )
+    outputs = llm.generate(PROMPTS, sampling_params=None)
+
+    # disable_log_stats is False, so every output should have metrics
+    assert all(output.metrics is not None for output in outputs)
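For reference, a minimal sketch of how a caller might exercise this path outside the test suite; the model name and prompt below are placeholders, not taken from this commit:

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m", disable_log_stats=False, enforce_eager=True)
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))
    for output in outputs:
        # With stats logging enabled, the v1 engine now attaches its
        # per-request stats object to output.metrics.
        assert output.metrics is not None
        print(type(output.metrics).__name__)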


@@ -14,6 +14,7 @@ from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalPlaceholderDict
 from vllm.sequence import RequestMetrics
+from vllm.v1.metrics.stats import RequestStateStats
 
 logger = init_logger(__name__)
@@ -108,7 +109,7 @@ class RequestOutput:
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: list[CompletionOutput],
         finished: bool,
-        metrics: Optional[RequestMetrics] = None,
+        metrics: Optional[Union[RequestMetrics, RequestStateStats]] = None,
         lora_request: Optional[LoRARequest] = None,
         encoder_prompt: Optional[str] = None,
         encoder_prompt_token_ids: Optional[list[int]] = None,
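Because metrics is now typed as a Union, downstream code that inspects it may need to narrow the type before touching engine-specific fields. A hypothetical helper (describe_metrics is not part of this change) might look like:

    from vllm.outputs import RequestOutput
    from vllm.v1.metrics.stats import RequestStateStats

    def describe_metrics(output: RequestOutput) -> str:
        # Narrow the Union before relying on either concrete type.
        m = output.metrics
        if m is None:
            return "no metrics (stats logging disabled)"
        if isinstance(m, RequestStateStats):
            return "v1 engine per-request stats"
        return "v0 RequestMetrics"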


@@ -248,16 +248,15 @@ class RequestState:
         if prompt_token_ids is None and self.prompt_embeds is not None:
             prompt_token_ids = [0] * len(self.prompt_embeds)
-        return RequestOutput(
-            request_id=request_id,
-            prompt=self.prompt,
-            prompt_token_ids=prompt_token_ids,
-            prompt_logprobs=prompt_logprobs,
-            outputs=cast(list[CompletionOutput], outputs),
-            finished=finished,
-            kv_transfer_params=kv_transfer_params,
-            num_cached_tokens=self.num_cached_tokens,
-        )
+        return RequestOutput(request_id=request_id,
+                             prompt=self.prompt,
+                             prompt_token_ids=prompt_token_ids,
+                             prompt_logprobs=prompt_logprobs,
+                             outputs=cast(list[CompletionOutput], outputs),
+                             finished=finished,
+                             kv_transfer_params=kv_transfer_params,
+                             num_cached_tokens=self.num_cached_tokens,
+                             metrics=self.stats)
 
     def _new_completion_output(
         self,