From de120bc94f2e51633824093c626423ec8e7cb3a9 Mon Sep 17 00:00:00 2001
From: Canlin Guo <961750412@qq.com>
Date: Wed, 12 Nov 2025 02:57:12 +0800
Subject: [PATCH] [V0 deprecation] Clean up num_prefill_tokens logic for V0
 (#28203)

Signed-off-by: gcanlin <canlinguosdu@gmail.com>
---
 vllm/forward_context.py | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index ef37cf862c9fe..44bc2a4cda311 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -5,7 +5,7 @@ import time
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, NamedTuple, Union
+from typing import TYPE_CHECKING, Any, NamedTuple
 
 import torch
 
@@ -185,18 +185,13 @@ class ForwardContext:
     # copy from vllm_config.compilation_config.static_forward_context
     no_compile_layers: dict[str, Any]
     """
-    Type AttentionMetadata for v0, 
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each 
     attention layer to its attention metadata
     Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
     for each microbatch.
     Set dynamically for each forward pass
     """
-    attn_metadata: Union[
-        "AttentionMetadata",
-        dict[str, "AttentionMetadata"],
-        list[dict[str, "AttentionMetadata"]],
-    ]
+    attn_metadata: dict[str, "AttentionMetadata"] | list[dict[str, "AttentionMetadata"]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
@@ -324,14 +319,7 @@ def set_forward_context(
     finally:
         global last_logging_time, batchsize_logging_interval
         if need_to_track_batchsize:
-            if hasattr(attn_metadata, "num_prefill_tokens"):
-                # for v0 attention backends
-                batchsize = (
-                    attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
-                )
-            else:
-                # for v1 attention backends
-                batchsize = num_tokens
+            batchsize = num_tokens
             # we use synchronous scheduling right now,
             # adding a sync point here should not affect
             # scheduling of the next batch