[V0 deprecation] Clean up num_prefill_tokens logic for V0 (#28203)

Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Authored by Canlin Guo on 2025-11-12 02:57:12 +08:00; committed by GitHub
parent 4228be7959
commit de120bc94f

vllm/forward_context.py

```diff
@@ -5,7 +5,7 @@ import time
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, NamedTuple, Union
+from typing import TYPE_CHECKING, Any, NamedTuple

 import torch
```
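
Dropping the `Union` import works because the annotation in the next hunk moves to PEP 604 union syntax (`X | Y`), which needs no `typing` import on Python 3.10+. A minimal standalone sketch of the equivalence (not taken from this diff):

```python
from typing import Union

# Old spelling, requires the typing import:
OldStyle = Union[dict[str, int], list[dict[str, int]]]

# PEP 604 spelling, no typing import required (Python 3.10+):
NewStyle = dict[str, int] | list[dict[str, int]]

# CPython treats the two spellings as the same union type.
assert OldStyle == NewStyle
```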
```diff
@@ -185,18 +185,13 @@ class ForwardContext:
     # copy from vllm_config.compilation_config.static_forward_context
     no_compile_layers: dict[str, Any]
     """
-    Type AttentionMetadata for v0,
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
     attention layer to its attention metadata
     Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
     for each microbatch.
     Set dynamically for each forward pass
     """
-    attn_metadata: Union[
-        "AttentionMetadata",
-        dict[str, "AttentionMetadata"],
-        list[dict[str, "AttentionMetadata"]],
-    ]
+    attn_metadata: dict[str, "AttentionMetadata"] | list[dict[str, "AttentionMetadata"]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
```
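
For context on the narrowed field, here is a minimal sketch of how a consumer might read per-layer metadata from the forward context after this change. `get_forward_context` is the real accessor in `vllm.forward_context` (it requires an active forward pass); the layer name and microbatch index below are illustrative:

```python
from vllm.forward_context import get_forward_context

ctx = get_forward_context()
meta = ctx.attn_metadata

# DBO case: a list of size two, one dict per microbatch.
if isinstance(meta, list):
    meta = meta[0]  # illustrative: pick the first microbatch

# Regular v1 case: dict mapping layer_name -> AttentionMetadata.
layer_meta = meta["model.layers.0.self_attn.attn"]  # hypothetical layer name
```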
```diff
@@ -324,14 +319,7 @@ def set_forward_context(
     finally:
         global last_logging_time, batchsize_logging_interval
         if need_to_track_batchsize:
-            if hasattr(attn_metadata, "num_prefill_tokens"):
-                # for v0 attention backends
-                batchsize = (
-                    attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
-                )
-            else:
-                # for v1 attention backends
-                batchsize = num_tokens
+            batchsize = num_tokens
             # we use synchronous scheduling right now,
             # adding a sync point here should not affect
             # scheduling of the next batch
```
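
To see why the removed `hasattr` branch was dead: under V1, `attn_metadata` is always a plain `dict` (or a list of dicts for DBO), and neither container has a `num_prefill_tokens` attribute, so the check could never be true. A self-contained sketch using stand-in values:

```python
# Stand-ins for V1-style metadata containers; a dict (or list of dicts)
# has no num_prefill_tokens attribute, so the removed branch never fired.
v1_meta = {"layer_0": object()}
dbo_meta = [{"layer_0": object()}, {"layer_0": object()}]

assert not hasattr(v1_meta, "num_prefill_tokens")
assert not hasattr(dbo_meta, "num_prefill_tokens")
# Hence batchsize == num_tokens unconditionally, which is exactly
# what the simplified code now encodes.
```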