[V1] Move usage stats to worker and start logging TPU hardware (#16211)
This commit is contained in:
parent a5450f11c9
commit 48cb2109b6
@@ -174,6 +174,15 @@ class UsageMessage:
                 cuda_get_device_properties(0, ("name", "total_memory")))
         if current_platform.is_cuda():
             self.cuda_runtime = torch.version.cuda
+        if current_platform.is_tpu():
+            try:
+                import torch_xla
+                self.gpu_count = torch_xla.runtime.world_size()
+                self.gpu_type = torch_xla.tpu.get_tpu_type()
+                self.gpu_memory_per_device = (
+                    torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
+            except Exception:
+                pass
         self.provider = _detect_cloud_provider()
         self.architecture = platform.machine()
         self.platform = platform.platform()
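The TPU branch reuses the existing gpu_count / gpu_type / gpu_memory_per_device fields of UsageMessage and wraps the probes in a broad try/except so telemetry can never break engine startup. Below is a minimal standalone sketch of the same torch_xla queries, assuming a host with torch_xla installed; the variable names are illustrative and not part of the patch.

# Standalone sketch: probe TPU hardware the same way the patch does.
# Assumes torch_xla is importable; everything is best-effort, mirroring
# the broad `except Exception` in the diff above.
try:
    import torch_xla

    tpu_device_count = torch_xla.runtime.world_size()
    tpu_type = torch_xla.tpu.get_tpu_type()
    hbm_bytes_per_device = (
        torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
    print(f"{tpu_device_count} x {tpu_type}, "
          f"{hbm_bytes_per_device / 1024**3:.1f} GiB HBM per device")
except Exception:
    # No TPU / no torch_xla: report nothing rather than fail.
    pass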
@@ -36,7 +36,6 @@ from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
                                      StatLoggerBase)
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -113,9 +112,6 @@ class AsyncLLM(EngineClient):
         except RuntimeError:
             pass
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
         cls,
@@ -28,7 +28,6 @@ from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -97,9 +96,6 @@ class LLMEngine:
         # for v0 compatibility
         self.model_executor = self.engine_core.engine_core.model_executor # type: ignore
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
         cls,
@@ -205,7 +205,9 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
     return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)
 
 
-def report_usage_stats(vllm_config, usage_context: UsageContext) -> None:
+def report_usage_stats(
+        vllm_config,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None:
     """Report usage statistics if enabled."""
 
     if not is_usage_stats_enabled():
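The new default for usage_context is what allows the worker-side call sites added below to pass only the config. A small sketch of both call styles, assuming vllm_config is an already-constructed config object and that UsageContext is importable from vllm.usage.usage_lib:

from vllm.usage.usage_lib import UsageContext
from vllm.v1.utils import report_usage_stats


def ping_usage(vllm_config) -> None:
    """Sketch: both call styles are accepted after this change."""
    # Front-end callers can still pass an explicit context ...
    report_usage_stats(vllm_config, usage_context=UsageContext.ENGINE_CONTEXT)
    # ... while worker-side callers rely on the new default.
    report_usage_stats(vllm_config)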
@@ -23,6 +23,7 @@ from vllm.platforms import current_platform
 from vllm.utils import GiB_bytes
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 from vllm.v1.worker.worker_base import WorkerBase
 
@@ -141,6 +142,10 @@ class Worker(WorkerBase):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             self.vllm_config, self.device)
 
+        if self.rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
     # to hijack tensor allocation.
     def load_model(self) -> None:
@@ -21,7 +21,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.utils import bind_kv_cache
+from vllm.v1.utils import bind_kv_cache, report_usage_stats
 from vllm.v1.worker.tpu_model_runner import TPUModelRunner
 
 logger = init_logger(__name__)
@@ -133,6 +133,10 @@ class TPUWorker:
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = TPUModelRunner(self.vllm_config, self.device)
 
+        if rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     def determine_available_memory(self) -> int:
         kv_caches: dict[str, torch.Tensor] = {}
         kv_cache_spec = self.model_runner.get_kv_cache_spec()
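Both the GPU and the TPU worker gate the call on rank 0, so an engine backed by several worker processes (for example with tensor parallelism) emits one usage ping instead of one per worker; report_usage_stats itself returns early when collection is disabled, per the is_usage_stats_enabled() check shown earlier. A generic sketch of that gating pattern, not vLLM's actual worker code:

from vllm.v1.utils import report_usage_stats


def maybe_report_usage(rank: int, vllm_config) -> None:
    """Sketch of the rank-0 gating used by both workers in this commit."""
    if rank != 0:
        # Non-zero ranks run the same model shards but stay silent, so a
        # multi-worker deployment is counted once, not once per process.
        return
    # Safe to call unconditionally: report_usage_stats() is a no-op when
    # usage-stats collection is disabled.
    report_usage_stats(vllm_config)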