Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-21 06:25:01 +08:00)
[Bugfix] Last token measurement fix (#11376)
Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
parent df04dffade
commit b5cbe8eeb3
@@ -1124,6 +1124,8 @@ class LLMEngine:
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
@@ -1166,6 +1168,8 @@ class LLMEngine:
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
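
Both hunks above make the same change in the engine's two output-processing paths: the engine now stamps the last-token time itself, once per step, at the moment outputs are produced, rather than leaving the timestamp update to the stats reader. A minimal sketch of the intended call pattern, assuming only the SequenceGroup methods introduced in this commit; the loop and function name here are illustrative, not vLLM's actual code:

import time

def process_step_outputs(scheduled_seq_groups) -> None:
    # Illustrative driver loop: one wall-clock sample per engine step.
    now = time.time()
    for scheduled_seq_group in scheduled_seq_groups:
        seq_group = scheduled_seq_group.seq_group
        seq_group.maybe_set_first_token_time(now)
        if not seq_group.is_prefill():
            # Writer side of the fix: record the inter-token interval
            # exactly where the token is actually produced.
            seq_group.set_last_token_time(now)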
@@ -1686,7 +1690,7 @@ class LLMEngine:
                     # If the seq_group just finished the prefill state
                     # get TTFT.
                     if not seq_group.is_prefill():
-                        latency = seq_group.get_last_latency(now)
+                        latency = seq_group.get_last_token_latency()
                         time_to_first_tokens_iter.append(latency)

                         # One generation token per finished prefill.
@@ -1694,7 +1698,7 @@ class LLMEngine:
                             seq_group.num_seqs())
                 else:
                     # TPOTs.
-                    latency = seq_group.get_last_latency(now)
+                    latency = seq_group.get_last_token_latency()
                     time_per_output_tokens_iter.append(latency)
                     if seq_group.state.current_step == 0:
                         # For async_output_proc, the do_log_stats()
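
On the metrics side, get_last_token_latency() is now a pure read, so the logging path can classify the same stored interval as TTFT (when the step just finished prefill) or as a TPOT sample (every later decode step) without resetting it. A hedged sketch of that classification; record_token_latency and the bare *_iter lists are illustrative stand-ins for the per-iteration buffers in LLMEngine._get_stats:

from typing import List

time_to_first_tokens_iter: List[float] = []
time_per_output_tokens_iter: List[float] = []

def record_token_latency(seq_group, group_was_prefill: bool) -> None:
    # Mirrors the branch structure in the two hunks above.
    if group_was_prefill:
        if not seq_group.is_prefill():
            # Prefill just finished: the stored interval is the TTFT.
            time_to_first_tokens_iter.append(
                seq_group.get_last_token_latency())
    else:
        # Regular decode step: the stored interval is one TPOT sample.
        time_per_output_tokens_iter.append(
            seq_group.get_last_token_latency())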
@@ -667,6 +667,7 @@ class SequenceGroup:
                                       first_scheduled_time=None,
                                       first_token_time=None,
                                       time_in_queue=None)
+        self.last_token_latency = 0.0
         self.lora_request = lora_request
         self.prompt_logprobs: Optional[PromptLogprobs] = None
         self.state = SequenceGroupState()
@@ -762,18 +763,21 @@ class SequenceGroup:
         assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill
         self.init_multi_step(num_steps=num_lookahead_slots + 1)

-    def get_last_latency(self, now: float) -> float:
+    def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
-        # If still in prefill phase, raise Error.
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.get_last_latency() should not be called "
-                "if the seq_group is in prefill phase.")
-
-        # Otherwise return token latency.
-        latency = now - self.metrics.last_token_time
+        # If still in prefill phase, assertion fails.
+        assert not self.is_prefill(), (
+            "seq_group.set_last_token_time() should not be called "
+            "if the seq_group is in prefill phase.")
+        self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
-        return latency

+    def get_last_token_latency(self) -> float:
+        """Returns the latency of the last token."""
+        assert not self.is_prefill(), (
+            "seq_group.get_last_token_latency() should not be called "
+            "if the seq_group is in prefill phase.")
+        return self.last_token_latency
+
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""
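
This hunk carries the core of the fix: the old get_last_latency(now) both computed the interval and advanced metrics.last_token_time as a side effect, so the measurement depended on when (and whether) the stats path called it. Splitting it into a writer (set_last_token_time) and a side-effect-free reader (get_last_token_latency) makes the read idempotent. A self-contained toy reproduction, where _TimedGroup is a hypothetical stand-in for SequenceGroup, not vLLM code:

import time

class _TimedGroup:
    """Toy stand-in for SequenceGroup's timing fields (illustration only)."""

    def __init__(self, arrival: float) -> None:
        self.last_token_time = arrival  # mirrors metrics.last_token_time
        self.last_token_latency = 0.0
        self._prefill = True

    def is_prefill(self) -> bool:
        return self._prefill

    def finish_prefill(self) -> None:
        self._prefill = False

    def set_last_token_time(self, now: float) -> None:
        # Writer: called once per engine step when a token is produced.
        assert not self.is_prefill()
        self.last_token_latency = now - self.last_token_time
        self.last_token_time = now

    def get_last_token_latency(self) -> float:
        # Reader: pure, so stats logging can sample it any number of
        # times without perturbing the next measurement.
        assert not self.is_prefill()
        return self.last_token_latency

g = _TimedGroup(arrival=time.monotonic())
g.finish_prefill()
g.set_last_token_time(time.monotonic())
assert g.get_last_token_latency() == g.get_last_token_latency()  # stable read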