[V1] Move usage stats to worker and start logging TPU hardware (#16211)

Daniel Li 2025-04-25 13:06:01 -07:00 committed by GitHub
parent a5450f11c9
commit 48cb2109b6
6 changed files with 22 additions and 10 deletions


@@ -174,6 +174,15 @@ class UsageMessage:
                     cuda_get_device_properties(0, ("name", "total_memory")))
             if current_platform.is_cuda():
                 self.cuda_runtime = torch.version.cuda
+            if current_platform.is_tpu():
+                try:
+                    import torch_xla
+                    self.gpu_count = torch_xla.runtime.world_size()
+                    self.gpu_type = torch_xla.tpu.get_tpu_type()
+                    self.gpu_memory_per_device = (
+                        torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
+                except Exception:
+                    pass
             self.provider = _detect_cloud_provider()
             self.architecture = platform.machine()
             self.platform = platform.platform()
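
For readers unfamiliar with these torch_xla entry points, here is a minimal standalone sketch (not part of the commit) of the same queries the new branch performs. It assumes torch_xla is installed and the process is attached to a TPU; on any other host these calls fail, which is presumably why the commit wraps them in a broad try/except and silently skips the fields.

import torch_xla

# Number of TPU devices visible to the runtime (what the commit stores in gpu_count).
tpu_count = torch_xla.runtime.world_size()
# TPU generation/type string reported by the runtime (stored in gpu_type).
tpu_type = torch_xla.tpu.get_tpu_type()
# Per-device memory limit in bytes (stored in gpu_memory_per_device).
tpu_memory = torch_xla.core.xla_model.get_memory_info()["bytes_limit"]
print(tpu_count, tpu_type, tpu_memory)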


@@ -36,7 +36,6 @@ from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
                                      StatLoggerBase)
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
@@ -113,9 +112,6 @@ class AsyncLLM(EngineClient):
         except RuntimeError:
             pass
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
         cls,


@@ -28,7 +28,6 @@ from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
@@ -97,9 +96,6 @@ class LLMEngine:
         # for v0 compatibility
         self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
         cls,


@@ -205,7 +205,9 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
     return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)
 
 
-def report_usage_stats(vllm_config, usage_context: UsageContext) -> None:
+def report_usage_stats(
+        vllm_config,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None:
     """Report usage statistics if enabled."""
     if not is_usage_stats_enabled():
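
With the default added above, call sites no longer have to thread a UsageContext through to the worker. A hedged sketch of the two calling conventions follows; vllm_config stands for an already-constructed engine config and UsageContext.LLM_CLASS is used only as an example of an explicit context value, so this is illustrative rather than runnable as-is.

from vllm.usage.usage_lib import UsageContext
from vllm.v1.utils import report_usage_stats

# Entry points that know their context can still pass it explicitly ...
report_usage_stats(vllm_config, usage_context=UsageContext.LLM_CLASS)

# ... while the workers (see the hunks below) simply rely on the ENGINE_CONTEXT default.
report_usage_stats(vllm_config)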


@@ -23,6 +23,7 @@ from vllm.platforms import current_platform
 from vllm.utils import GiB_bytes
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 from vllm.v1.worker.worker_base import WorkerBase
@@ -141,6 +142,10 @@ class Worker(WorkerBase):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             self.vllm_config, self.device)
 
+        if self.rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
     # to hijack tensor allocation.
     def load_model(self) -> None:
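
The report is gated to rank 0 so a multi-GPU engine is counted once rather than once per worker process. A purely illustrative toy (not vLLM code) of that gating:

def fake_init_device(rank: int) -> None:
    # Every worker runs its constructor, but only the first one reports.
    if rank == 0:
        print(f"rank {rank}: reporting usage stats")

for rank in range(4):  # e.g. tensor_parallel_size == 4
    fake_init_device(rank)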


@@ -21,7 +21,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.utils import bind_kv_cache
+from vllm.v1.utils import bind_kv_cache, report_usage_stats
 from vllm.v1.worker.tpu_model_runner import TPUModelRunner
 
 logger = init_logger(__name__)
@@ -133,6 +133,10 @@ class TPUWorker:
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = TPUModelRunner(self.vllm_config, self.device)
 
+        if rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     def determine_available_memory(self) -> int:
         kv_caches: dict[str, torch.Tensor] = {}
         kv_cache_spec = self.model_runner.get_kv_cache_spec()
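
Both worker call sites still go through the is_usage_stats_enabled() gate shown earlier, so opting out is unchanged. As a usage note, the opt-out environment variable below is taken from vLLM's usage-stats documentation and is worth double-checking against your version:

import os

# Must be set before vLLM starts its workers so is_usage_stats_enabled() sees it.
os.environ["VLLM_NO_USAGE_STATS"] = "1"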