diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index 92f6c6f0e89cd..0c9f83f049e4d 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -8,7 +8,8 @@ import ray from vllm.config import ModelDType from vllm.sampling_params import SamplingParams from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM -from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger +from vllm.v1.metrics.ray_wrappers import (RayPrometheusMetric, + RayPrometheusStatLogger) @pytest.fixture(scope="function", autouse=True) @@ -65,3 +66,39 @@ def test_engine_log_metrics_ray( # Create the actor and call the async method actor = EngineTestActor.remote() # type: ignore[attr-defined] ray.get(actor.run.remote()) + + +def test_sanitized_opentelemetry_name(): + """Test the metric name sanitization logic for Ray.""" + + # Only a-z, A-Z, 0-9, _, test valid characters are preserved + valid_name = "valid_metric_123_abcDEF" + assert RayPrometheusMetric._get_sanitized_opentelemetry_name( + valid_name) == valid_name + + # Test dash, dot, are replaced + name_with_dash_dot = "metric-name.test" + expected = "metric_name_test" + assert RayPrometheusMetric._get_sanitized_opentelemetry_name( + name_with_dash_dot) == expected + + # Test colon is replaced with underscore + name_with_colon = "metric:name" + expected = "metric_name" + assert RayPrometheusMetric._get_sanitized_opentelemetry_name( + name_with_colon) == expected + + # Test multiple invalid characters are replaced + name_with_invalid = "metric:name@with#special%chars" + expected = "metric_name_with_special_chars" + assert RayPrometheusMetric._get_sanitized_opentelemetry_name( + name_with_invalid) == expected + + # Test mixed valid and invalid characters + complex_name = "vllm:engine_stats/time.latency_ms-99p" + expected = "vllm_engine_stats_time_latency_ms_99p" + assert RayPrometheusMetric._get_sanitized_opentelemetry_name( + complex_name) == expected + + # Test empty string + assert RayPrometheusMetric._get_sanitized_opentelemetry_name("") == "" diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index ae8f9447e9c8b..6091857538609 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -11,6 +11,7 @@ try: from ray.util.metrics import Metric except ImportError: ray_metrics = None +import regex as re class RayPrometheusMetric: @@ -42,6 +43,21 @@ class RayPrometheusMetric: return self + @staticmethod + def _get_sanitized_opentelemetry_name(name: str) -> str: + """ + For compatibility with Ray + OpenTelemetry, the metric name must be + sanitized. In particular, this replaces disallowed character (e.g., ':') + with '_' in the metric name. + Allowed characters: a-z, A-Z, 0-9, _ + + # ruff: noqa: E501 + Ref: https://github.com/open-telemetry/opentelemetry-cpp/blob/main/sdk/src/metrics/instrument_metadata_validator.cc#L22-L23 + Ref: https://github.com/ray-project/ray/blob/master/src/ray/stats/metric.cc#L107 + """ + + return re.sub(r"[^a-zA-Z0-9_]", "_", name) + class RayGaugeWrapper(RayPrometheusMetric): """Wraps around ray.util.metrics.Gauge to provide same API as @@ -58,6 +74,7 @@ class RayGaugeWrapper(RayPrometheusMetric): # implemented at the observability layer (Prometheus/Grafana). del multiprocess_mode labelnames_tuple = tuple(labelnames) if labelnames else None + name = self._get_sanitized_opentelemetry_name(name) self.metric = ray_metrics.Gauge(name=name, description=documentation, tag_keys=labelnames_tuple) @@ -79,6 +96,7 @@ class RayCounterWrapper(RayPrometheusMetric): documentation: Optional[str] = "", labelnames: Optional[list[str]] = None): labelnames_tuple = tuple(labelnames) if labelnames else None + name = self._get_sanitized_opentelemetry_name(name) self.metric = ray_metrics.Counter(name=name, description=documentation, tag_keys=labelnames_tuple) @@ -99,6 +117,7 @@ class RayHistogramWrapper(RayPrometheusMetric): labelnames: Optional[list[str]] = None, buckets: Optional[list[float]] = None): labelnames_tuple = tuple(labelnames) if labelnames else None + name = self._get_sanitized_opentelemetry_name(name) boundaries = buckets if buckets else [] self.metric = ray_metrics.Histogram(name=name, description=documentation,