[V1] Move usage stats to worker and start logging TPU hardware (#16211)
This commit is contained in:
parent a5450f11c9
commit 48cb2109b6
@@ -174,6 +174,15 @@ class UsageMessage:
                 cuda_get_device_properties(0, ("name", "total_memory")))
         if current_platform.is_cuda():
             self.cuda_runtime = torch.version.cuda
+        if current_platform.is_tpu():
+            try:
+                import torch_xla
+                self.gpu_count = torch_xla.runtime.world_size()
+                self.gpu_type = torch_xla.tpu.get_tpu_type()
+                self.gpu_memory_per_device = (
+                    torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
+            except Exception:
+                pass
         self.provider = _detect_cloud_provider()
         self.architecture = platform.machine()
         self.platform = platform.platform()
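The TPU branch above can be exercised on its own. Below is a minimal sketch, assuming a host where torch_xla is importable; it mirrors the exact calls from the hunk and, like the production path, swallows any failure so hardware probing can never break startup. The probe_tpu_hardware name is made up for illustration.

# Sketch only: probe TPU hardware the same way the usage collector above does.
# Assumes torch_xla is installed and a TPU runtime is reachable.
def probe_tpu_hardware() -> dict:
    info: dict = {}
    try:
        import torch_xla

        # Same fields the usage message reuses for TPU hosts.
        info["gpu_count"] = torch_xla.runtime.world_size()
        info["gpu_type"] = torch_xla.tpu.get_tpu_type()
        info["gpu_memory_per_device"] = (
            torch_xla.core.xla_model.get_memory_info()["bytes_limit"])
    except Exception:
        # Match the diff: never let hardware probing raise.
        pass
    return info


if __name__ == "__main__":
    print(probe_tpu_hardware())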
@@ -36,7 +36,6 @@ from vllm.v1.executor.abstract import Executor
 from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger,
                                      StatLoggerBase)
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -113,9 +112,6 @@ class AsyncLLM(EngineClient):
         except RuntimeError:
             pass
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
         cls,
@@ -28,7 +28,6 @@ from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.utils import report_usage_stats
 
 logger = init_logger(__name__)
 
@@ -97,9 +96,6 @@ class LLMEngine:
         # for v0 compatibility
         self.model_executor = self.engine_core.engine_core.model_executor  # type: ignore
 
-        # If usage stat is enabled, collect relevant info.
-        report_usage_stats(vllm_config, usage_context)
-
     @classmethod
     def from_vllm_config(
         cls,
@@ -205,7 +205,9 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
     return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True)
 
 
-def report_usage_stats(vllm_config, usage_context: UsageContext) -> None:
+def report_usage_stats(
+        vllm_config,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT) -> None:
     """Report usage statistics if enabled."""
 
     if not is_usage_stats_enabled():
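A small sketch of how the new default plays out at call sites. The wrapper function names below are hypothetical, and UsageContext.OPENAI_API_SERVER is assumed to be one of the existing enum members; only ENGINE_CONTEXT is confirmed by the hunk above.

from vllm.usage.usage_lib import UsageContext
from vllm.v1.utils import report_usage_stats


def report_from_worker(vllm_config) -> None:
    # Workers can now omit the context; the new default,
    # UsageContext.ENGINE_CONTEXT, is filled in for them.
    report_usage_stats(vllm_config)


def report_from_api_server(vllm_config) -> None:
    # Entry points that know their context can still pass it explicitly,
    # exactly as the engines did before this change.
    report_usage_stats(vllm_config, UsageContext.OPENAI_API_SERVER)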
@@ -23,6 +23,7 @@ from vllm.platforms import current_platform
 from vllm.utils import GiB_bytes
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 from vllm.v1.worker.worker_base import WorkerBase
 
@@ -141,6 +142,10 @@ class Worker(WorkerBase):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             self.vllm_config, self.device)
 
+        if self.rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
     # to hijack tensor allocation.
     def load_model(self) -> None:
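The rank check above is what keeps a multi-GPU deployment from emitting one usage record per worker process. A minimal sketch of the pattern, with a hypothetical helper name:

from vllm.v1.utils import report_usage_stats


def maybe_report_usage(rank: int, vllm_config) -> None:
    # Every worker builds its model runner, but only rank 0 reports,
    # so tensor- or pipeline-parallel setups still produce a single
    # usage record.
    if rank == 0:
        report_usage_stats(vllm_config)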
@@ -21,7 +21,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import (AttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import ModelRunnerOutput
-from vllm.v1.utils import bind_kv_cache
+from vllm.v1.utils import bind_kv_cache, report_usage_stats
 from vllm.v1.worker.tpu_model_runner import TPUModelRunner
 
 logger = init_logger(__name__)
@@ -133,6 +133,10 @@ class TPUWorker:
         # Init ModelRunner here, so that we have access to self.device.
         self.model_runner = TPUModelRunner(self.vllm_config, self.device)
 
+        if rank == 0:
+            # If usage stat is enabled, collect relevant info.
+            report_usage_stats(self.vllm_config)
+
     def determine_available_memory(self) -> int:
         kv_caches: dict[str, torch.Tensor] = {}
         kv_cache_spec = self.model_runner.get_kv_cache_spec()