mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-06 11:29:08 +08:00
[V0 deprecation] Clean up num_prefill_tokens logic for V0 (#28203)
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
This commit is contained in:
parent
4228be7959
commit
de120bc94f
@ -5,7 +5,7 @@ import time
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import TYPE_CHECKING, Any, NamedTuple, Union
|
from typing import TYPE_CHECKING, Any, NamedTuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -185,18 +185,13 @@ class ForwardContext:
|
|||||||
# copy from vllm_config.compilation_config.static_forward_context
|
# copy from vllm_config.compilation_config.static_forward_context
|
||||||
no_compile_layers: dict[str, Any]
|
no_compile_layers: dict[str, Any]
|
||||||
"""
|
"""
|
||||||
Type AttentionMetadata for v0,
|
|
||||||
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
|
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
|
||||||
attention layer to its attention metadata
|
attention layer to its attention metadata
|
||||||
Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
|
Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
|
||||||
for each microbatch.
|
for each microbatch.
|
||||||
Set dynamically for each forward pass
|
Set dynamically for each forward pass
|
||||||
"""
|
"""
|
||||||
attn_metadata: Union[
|
attn_metadata: dict[str, "AttentionMetadata"] | list[dict[str, "AttentionMetadata"]]
|
||||||
"AttentionMetadata",
|
|
||||||
dict[str, "AttentionMetadata"],
|
|
||||||
list[dict[str, "AttentionMetadata"]],
|
|
||||||
]
|
|
||||||
# TODO: remove after making all virtual_engines share the same kv cache
|
# TODO: remove after making all virtual_engines share the same kv cache
|
||||||
virtual_engine: int # set dynamically for each forward pass
|
virtual_engine: int # set dynamically for each forward pass
|
||||||
# set dynamically for each forward pass
|
# set dynamically for each forward pass
|
||||||
@ -324,14 +319,7 @@ def set_forward_context(
|
|||||||
finally:
|
finally:
|
||||||
global last_logging_time, batchsize_logging_interval
|
global last_logging_time, batchsize_logging_interval
|
||||||
if need_to_track_batchsize:
|
if need_to_track_batchsize:
|
||||||
if hasattr(attn_metadata, "num_prefill_tokens"):
|
batchsize = num_tokens
|
||||||
# for v0 attention backends
|
|
||||||
batchsize = (
|
|
||||||
attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# for v1 attention backends
|
|
||||||
batchsize = num_tokens
|
|
||||||
# we use synchronous scheduling right now,
|
# we use synchronous scheduling right now,
|
||||||
# adding a sync point here should not affect
|
# adding a sync point here should not affect
|
||||||
# scheduling of the next batch
|
# scheduling of the next batch
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user