add(v1): RequestStateStats to RequestOutput (#24947)

Signed-off-by: huijjj <huijong.jeong@squeezebits.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
HUIJONG JEONG 2025-10-03 17:56:25 +09:00 committed by yewentao256
parent 5b80f22087
commit edaae1825f
3 changed files with 24 additions and 11 deletions


@@ -86,3 +86,16 @@ def test_max_model_len():
     # It can be less if generation finishes due to other reasons (e.g., EOS)
     # before reaching the absolute model length limit.
     assert num_total_tokens <= max_model_len
+
+
+def test_log_stats():
+    llm = LLM(
+        model=MODEL_NAME,
+        disable_log_stats=False,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,  # reduce test time
+    )
+
+    outputs = llm.generate(PROMPTS, sampling_params=None)
+    # disable_log_stats is False, every output should have metrics
+    assert all(output.metrics is not None for output in outputs)


@@ -14,6 +14,7 @@ from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalPlaceholderDict
 from vllm.sequence import RequestMetrics
+from vllm.v1.metrics.stats import RequestStateStats

 logger = init_logger(__name__)
@@ -108,7 +109,7 @@ class RequestOutput:
         prompt_logprobs: Optional[PromptLogprobs],
         outputs: list[CompletionOutput],
         finished: bool,
-        metrics: Optional[RequestMetrics] = None,
+        metrics: Optional[Union[RequestMetrics, RequestStateStats]] = None,
         lora_request: Optional[LoRARequest] = None,
         encoder_prompt: Optional[str] = None,
         encoder_prompt_token_ids: Optional[list[int]] = None,
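Note: after this hunk, RequestOutput.metrics can hold either the legacy RequestMetrics or the v1 RequestStateStats. A minimal, hedged sketch of how downstream code might branch on the two types; only the imports visible in this diff are assumed, and the helper describe_metrics is hypothetical:

from typing import Optional, Union

from vllm.sequence import RequestMetrics
from vllm.v1.metrics.stats import RequestStateStats


def describe_metrics(
    metrics: Optional[Union[RequestMetrics, RequestStateStats]],
) -> str:
    # Hypothetical helper: check the concrete type before reading
    # engine-specific attributes, since the two classes differ.
    if metrics is None:
        return "no metrics (stats logging disabled)"
    if isinstance(metrics, RequestStateStats):
        return "v1 per-request state stats"
    return "legacy RequestMetrics"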


@@ -248,8 +248,7 @@ class RequestState:
         if prompt_token_ids is None and self.prompt_embeds is not None:
             prompt_token_ids = [0] * len(self.prompt_embeds)
-        return RequestOutput(
-            request_id=request_id,
+        return RequestOutput(request_id=request_id,
             prompt=self.prompt,
             prompt_token_ids=prompt_token_ids,
             prompt_logprobs=prompt_logprobs,
@@ -257,7 +256,7 @@ class RequestState:
             finished=finished,
             kv_transfer_params=kv_transfer_params,
             num_cached_tokens=self.num_cached_tokens,
-        )
+            metrics=self.stats)

     def _new_completion_output(
         self,
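For context, a hedged end-to-end sketch of how the change surfaces to users, mirroring the added test; the model name and prompt are illustrative, not taken from this commit:

from vllm import LLM, SamplingParams

# Illustrative model and prompt; any small model works for a smoke check.
llm = LLM(model="facebook/opt-125m", disable_log_stats=False, enforce_eager=True)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))

for output in outputs:
    # With stats logging enabled, each RequestOutput should now carry the
    # v1 engine's per-request stats in its metrics field.
    print(output.request_id, type(output.metrics).__name__)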