[V0 deprecation] Clean up num_prefill_tokens logic for V0 (#28203)

Signed-off-by: gcanlin <canlinguosdu@gmail.com>
This commit is contained in:
Canlin Guo 2025-11-12 02:57:12 +08:00 committed by GitHub
parent 4228be7959
commit de120bc94f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -5,7 +5,7 @@ import time
from collections import defaultdict from collections import defaultdict
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, NamedTuple, Union from typing import TYPE_CHECKING, Any, NamedTuple
import torch import torch
@@ -185,18 +185,13 @@ class ForwardContext:
# copy from vllm_config.compilation_config.static_forward_context # copy from vllm_config.compilation_config.static_forward_context
no_compile_layers: dict[str, Any] no_compile_layers: dict[str, Any]
""" """
Type AttentionMetadata for v0,
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
attention layer to its attention metadata attention layer to its attention metadata
Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
for each microbatch. for each microbatch.
Set dynamically for each forward pass Set dynamically for each forward pass
""" """
attn_metadata: Union[ attn_metadata: dict[str, "AttentionMetadata"] | list[dict[str, "AttentionMetadata"]]
"AttentionMetadata",
dict[str, "AttentionMetadata"],
list[dict[str, "AttentionMetadata"]],
]
# TODO: remove after making all virtual_engines share the same kv cache # TODO: remove after making all virtual_engines share the same kv cache
virtual_engine: int # set dynamically for each forward pass virtual_engine: int # set dynamically for each forward pass
# set dynamically for each forward pass # set dynamically for each forward pass
@@ -324,14 +319,7 @@ def set_forward_context(
finally: finally:
global last_logging_time, batchsize_logging_interval global last_logging_time, batchsize_logging_interval
if need_to_track_batchsize: if need_to_track_batchsize:
if hasattr(attn_metadata, "num_prefill_tokens"): batchsize = num_tokens
# for v0 attention backends
batchsize = (
attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
)
else:
# for v1 attention backends
batchsize = num_tokens
# we use synchronous scheduling right now, # we use synchronous scheduling right now,
# adding a sync point here should not affect # adding a sync point here should not affect
# scheduling of the next batch # scheduling of the next batch