[ray][metrics] Replace ':' with '_' for OpenTelemetry compatibility in Ray (#25439)

Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
This commit is contained in:
Seiji Eicher 2025-09-26 09:43:30 -07:00 committed by GitHub
parent 984d18498a
commit 8d52f2b3a7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 57 additions and 1 deletions

View File

@ -8,7 +8,8 @@ import ray
from vllm.config import ModelDType
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
from vllm.v1.metrics.ray_wrappers import (RayPrometheusMetric,
RayPrometheusStatLogger)
@pytest.fixture(scope="function", autouse=True)
@ -65,3 +66,39 @@ def test_engine_log_metrics_ray(
# Create the actor and call the async method
actor = EngineTestActor.remote() # type: ignore[attr-defined]
ray.get(actor.run.remote())
def test_sanitized_opentelemetry_name():
    """Verify how metric names are sanitized before being handed to Ray.

    Each case pairs a raw metric name with the name expected back from
    ``RayPrometheusMetric._get_sanitized_opentelemetry_name``.
    """
    sanitize = RayPrometheusMetric._get_sanitized_opentelemetry_name

    cases = [
        # Letters, digits and underscores are the only characters kept as-is.
        ("valid_metric_123_abcDEF", "valid_metric_123_abcDEF"),
        # Dashes and dots become underscores.
        ("metric-name.test", "metric_name_test"),
        # A colon becomes an underscore.
        ("metric:name", "metric_name"),
        # Several different invalid characters are each replaced.
        ("metric:name@with#special%chars", "metric_name_with_special_chars"),
        # Valid and invalid characters mixed in one realistic name.
        ("vllm:engine_stats/time.latency_ms-99p",
         "vllm_engine_stats_time_latency_ms_99p"),
        # The empty string stays empty.
        ("", ""),
    ]

    for raw_name, expected in cases:
        assert sanitize(raw_name) == expected

View File

@ -11,6 +11,7 @@ try:
from ray.util.metrics import Metric
except ImportError:
ray_metrics = None
import regex as re
class RayPrometheusMetric:
@ -42,6 +43,21 @@ class RayPrometheusMetric:
return self
@staticmethod
def _get_sanitized_opentelemetry_name(name: str) -> str:
"""
For compatibility with Ray + OpenTelemetry, the metric name must be
sanitized. In particular, this replaces disallowed character (e.g., ':')
with '_' in the metric name.
Allowed characters: a-z, A-Z, 0-9, _
# ruff: noqa: E501
Ref: https://github.com/open-telemetry/opentelemetry-cpp/blob/main/sdk/src/metrics/instrument_metadata_validator.cc#L22-L23
Ref: https://github.com/ray-project/ray/blob/master/src/ray/stats/metric.cc#L107
"""
return re.sub(r"[^a-zA-Z0-9_]", "_", name)
class RayGaugeWrapper(RayPrometheusMetric):
"""Wraps around ray.util.metrics.Gauge to provide same API as
@ -58,6 +74,7 @@ class RayGaugeWrapper(RayPrometheusMetric):
# implemented at the observability layer (Prometheus/Grafana).
del multiprocess_mode
labelnames_tuple = tuple(labelnames) if labelnames else None
name = self._get_sanitized_opentelemetry_name(name)
self.metric = ray_metrics.Gauge(name=name,
description=documentation,
tag_keys=labelnames_tuple)
@ -79,6 +96,7 @@ class RayCounterWrapper(RayPrometheusMetric):
documentation: Optional[str] = "",
labelnames: Optional[list[str]] = None):
labelnames_tuple = tuple(labelnames) if labelnames else None
name = self._get_sanitized_opentelemetry_name(name)
self.metric = ray_metrics.Counter(name=name,
description=documentation,
tag_keys=labelnames_tuple)
@ -99,6 +117,7 @@ class RayHistogramWrapper(RayPrometheusMetric):
labelnames: Optional[list[str]] = None,
buckets: Optional[list[float]] = None):
labelnames_tuple = tuple(labelnames) if labelnames else None
name = self._get_sanitized_opentelemetry_name(name)
boundaries = buckets if buckets else []
self.metric = ray_metrics.Histogram(name=name,
description=documentation,