[Misc] Remove experimental dep from tracing.py (#12007)

Signed-off-by: Adrian Cole <adrian.cole@elastic.co>
Author: Adrian Cole
Date: 2025-01-21 11:51:55 -08:00 (committed by GitHub)
parent 18fd4a8331
commit 347eeebe3b
3 changed files with 66 additions and 60 deletions
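
The change renames the tracing span-attribute constants from LLM_* to GEN_AI_* and drops the import of the experimental opentelemetry-semantic-conventions-ai package; the attribute keys recorded on spans keep the same string values. A small illustrative mapping (constant names and keys are taken from the diff below; the snippet itself is not part of the commit):

# Illustrative only: old constant name -> (new constant name, unchanged attribute key).
# The keys are copied from the tracing.py hunk further down.
RENAMED_CONSTANTS = {
    "LLM_REQUEST_ID": ("GEN_AI_REQUEST_ID", "gen_ai.request.id"),
    "LLM_REQUEST_N": ("GEN_AI_REQUEST_N", "gen_ai.request.n"),
    "LLM_LATENCY_E2E": ("GEN_AI_LATENCY_E2E", "gen_ai.latency.e2e"),
}
for old, (new, key) in RENAMED_CONSTANTS.items():
    print(f"SpanAttributes.{old} -> SpanAttributes.{new} ({key!r})")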


@@ -100,32 +100,32 @@ def test_traces(trace_service):
     attributes = decode_attributes(
         request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
         outputs[0].prompt_token_ids)
     completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
     assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
     metrics = outputs[0].metrics
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
     ttft = metrics.first_token_time - metrics.arrival_time
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
     e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
     assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
     # Model forward and model execute should be none, since detailed traces is
     # not enabled.
     assert metrics.model_forward_time is None
@@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service):
     attributes = decode_attributes(
         request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
        outputs[0].prompt_token_ids)
     completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
     assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
     metrics = outputs[0].metrics
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
     ttft = metrics.first_token_time - metrics.arrival_time
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
     e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
     assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
     assert metrics.model_forward_time > 0
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
             metrics.model_forward_time / 1000)
     assert metrics.model_execute_time > 0
-    assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
                           ) == metrics.model_execute_time
     assert metrics.model_forward_time < 1000 * metrics.model_execute_time
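
The decode_attributes helper used above is not part of this diff. As a rough sketch (an assumption, not the repository's implementation), it flattens the repeated OTLP KeyValue protos on a span into a plain dict so the assertions can look attributes up by key:

# Sketch only (assumption): turn OTLP KeyValue protos into {key: python value}.
# AnyValue is a protobuf oneof, so read whichever field is actually set.
def decode_attributes(key_values):
    decoded = {}
    for kv in key_values:
        which = kv.value.WhichOneof("value")
        decoded[kv.key] = getattr(kv.value, which) if which else None
    return decoded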


@@ -1857,46 +1857,44 @@ class LLMEngine:
             metrics = seq_group.metrics
             ttft = metrics.first_token_time - metrics.arrival_time
             e2e_time = metrics.finished_time - metrics.arrival_time
-            # attribute names are based on
-            # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
-            seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
                                    self.model_config.model)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
                                    seq_group.request_id)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
                                    seq_group.sampling_params.temperature)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
                                    seq_group.sampling_params.top_p)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
                                    seq_group.sampling_params.max_tokens)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
                                    seq_group.sampling_params.n)
-            seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES,
                                    seq_group.num_seqs())
-            seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
                                    len(seq_group.prompt_token_ids))
             seq_span.set_attribute(
-                SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
+                SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
                 sum([
                     seq.get_output_len()
                     for seq in seq_group.get_finished_seqs()
                 ]))
-            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
                                    metrics.time_in_queue)
             seq_span.set_attribute(
-                SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
-            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time)
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
+            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
             if metrics.scheduler_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER,
                     metrics.scheduler_time)
             if metrics.model_forward_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD,
                     metrics.model_forward_time / 1000.0)
             if metrics.model_execute_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
                     metrics.model_execute_time)

     def _validate_model_inputs(self, inputs: ProcessorInputs,
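
One detail worth noting in the hunk above: model_forward_time appears to be tracked in milliseconds and is divided by 1000.0 before being set on the span, while model_execute_time is recorded as-is; the test hunks make the same assumption (pytest.approx(metrics.model_forward_time / 1000) and the final metrics.model_forward_time < 1000 * metrics.model_execute_time check). A tiny sketch of that conversion (values are made up):

# Sketch (assumption about units): forward time is accumulated in milliseconds,
# so it is converted to seconds before being recorded as a span attribute.
model_forward_time_ms = 37.5
recorded_on_span = model_forward_time_ms / 1000.0  # 0.0375 seconds
assert recorded_on_span == 0.0375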


@@ -16,7 +16,6 @@ try:
         OTEL_EXPORTER_OTLP_TRACES_PROTOCOL)
     from opentelemetry.sdk.trace import TracerProvider
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
-    from opentelemetry.semconv_ai import SpanAttributes as BaseSpanAttributes
     from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider
     from opentelemetry.trace.propagation.tracecontext import (
         TraceContextTextMapPropagator)
@@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
     return {h: headers[h] for h in TRACE_HEADERS if h in headers}

-class SpanAttributes(BaseSpanAttributes):
-    # The following span attribute names are added here because they are missing
-    # from the Semantic Conventions for LLM.
-    LLM_REQUEST_ID = "gen_ai.request.id"
-    LLM_REQUEST_N = "gen_ai.request.n"
-    LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
-    LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
-    LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
-    LLM_LATENCY_E2E = "gen_ai.latency.e2e"
-    LLM_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
+class SpanAttributes:
+    # Attribute names copied from here to avoid version conflicts:
+    # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md
+    GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
+    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
+    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
+    GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"
+    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
+    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
+    # Attribute names added until they are added to the semantic conventions:
+    GEN_AI_REQUEST_ID = "gen_ai.request.id"
+    GEN_AI_REQUEST_N = "gen_ai.request.n"
+    GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
+    GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
+    GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
+    GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
+    GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
     # Time taken in the forward pass for this across all workers
-    LLM_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward"
+    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = (
+        "gen_ai.latency.time_in_model_forward")
     # Time taken in the model execute function. This will include model
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
-    LLM_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute"
+    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
+        "gen_ai.latency.time_in_model_execute")

 def contains_trace_headers(headers: Mapping[str, str]) -> bool:
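
With the base class gone, SpanAttributes is a plain namespace of attribute-key strings, so recording these attributes only requires the core opentelemetry-api and opentelemetry-sdk packages. A minimal, self-contained sketch (not part of the diff; the exporter choice, span name, and values are illustrative):

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

# Hypothetical stand-in for the SpanAttributes class after this change.
class SpanAttributes:
    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"

provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer("example")
with tracer.start_as_current_span("llm_request") as span:
    span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, "example-model")
    span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, 42)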