[Misc] Remove experimental dep from tracing.py (#12007)
Signed-off-by: Adrian Cole <adrian.cole@elastic.co>
parent 18fd4a8331
commit 347eeebe3b
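For context, the change drops the opentelemetry.semconv_ai import and turns SpanAttributes into a plain class of string constants, so spans can be annotated with nothing beyond the OpenTelemetry SDK and exporter packages. A minimal, standalone sketch of what that means for a caller (not vLLM code; the class body is trimmed to two constants and the span name and model string are illustrative):

# Minimal sketch, assuming only opentelemetry-sdk is installed.
# The constants mirror the renamed names in tracing.py below.
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (ConsoleSpanExporter,
                                             SimpleSpanProcessor)


class SpanAttributes:
    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
    GEN_AI_REQUEST_ID = "gen_ai.request.id"


provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)
tracer = trace.get_tracer("vllm.tracing.demo")

with tracer.start_as_current_span("llm_request") as span:
    # Attribute keys are plain strings, so no semconv package is required.
    span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, "facebook/opt-125m")
    span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, "request-0")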
@@ -100,32 +100,32 @@ def test_traces(trace_service):

     attributes = decode_attributes(
         request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
+    assert attributes.get(
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+    assert attributes.get(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
         outputs[0].prompt_token_ids)
     completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
     assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
     metrics = outputs[0].metrics
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
     ttft = metrics.first_token_time - metrics.arrival_time
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
     e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
     assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
     # Model forward and model execute should be none, since detailed traces is
     # not enabled.
     assert metrics.model_forward_time is None
@@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service):

     attributes = decode_attributes(
         request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
     assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
+    assert attributes.get(
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+    assert attributes.get(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
         outputs[0].prompt_token_ids)
     completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
     assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
     metrics = outputs[0].metrics
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
     ttft = metrics.first_token_time - metrics.arrival_time
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
     e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
     assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
     assert metrics.model_forward_time > 0
     assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
             metrics.model_forward_time / 1000)
     assert metrics.model_execute_time > 0
-    assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
                           ) == metrics.model_execute_time
     assert metrics.model_forward_time < 1000 * metrics.model_execute_time
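The assertions above read attributes off the OTLP export request captured by the fake trace service. The decode_attributes helper is presumably defined elsewhere in the test module; a hedged sketch of what such a helper looks like, assuming the opentelemetry-proto package that provides the OTLP protobuf messages:

# Sketch of a decode_attributes-style helper (assumption: the real helper in
# the test module may differ). It flattens OTLP protobuf KeyValue pairs into
# a plain dict keyed by attribute name.
from typing import Iterable

from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue


def decode_value(value: AnyValue):
    # AnyValue is a oneof; return whichever scalar field is actually set.
    for field in ("bool_value", "string_value", "int_value", "double_value"):
        if value.HasField(field):
            return getattr(value, field)
    if value.HasField("array_value"):
        return [decode_value(item) for item in value.array_value.values]
    raise ValueError(f"Unsupported attribute value: {value}")


def decode_attributes(attributes: Iterable[KeyValue]) -> dict:
    return {kv.key: decode_value(kv.value) for kv in attributes}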
@@ -1857,46 +1857,44 @@ class LLMEngine:
             metrics = seq_group.metrics
             ttft = metrics.first_token_time - metrics.arrival_time
             e2e_time = metrics.finished_time - metrics.arrival_time
-            # attribute names are based on
-            # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md
-            seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
                                    self.model_config.model)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
                                    seq_group.request_id)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
                                    seq_group.sampling_params.temperature)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
                                    seq_group.sampling_params.top_p)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
                                    seq_group.sampling_params.max_tokens)
-            seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
                                    seq_group.sampling_params.n)
-            seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES,
                                    seq_group.num_seqs())
-            seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
                                    len(seq_group.prompt_token_ids))
             seq_span.set_attribute(
-                SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
+                SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
                 sum([
                     seq.get_output_len()
                     for seq in seq_group.get_finished_seqs()
                 ]))
-            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE,
+            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
                                    metrics.time_in_queue)
             seq_span.set_attribute(
-                SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
-            seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time)
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
+            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
             if metrics.scheduler_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER,
                     metrics.scheduler_time)
             if metrics.model_forward_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD,
                     metrics.model_forward_time / 1000.0)
             if metrics.model_execute_time is not None:
                 seq_span.set_attribute(
-                    SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE,
+                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
                     metrics.model_execute_time)

     def _validate_model_inputs(self, inputs: ProcessorInputs,
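These attributes are only emitted when the engine is started with an OTLP traces endpoint. A hedged usage sketch; the otlp_traces_endpoint and collect_detailed_traces argument names are assumptions based on vLLM's engine arguments around the time of this commit and may differ in other versions:

# Usage sketch (assumption: argument names pass through to vLLM's engine args).
# Each finished request then produces one span whose attributes use the
# gen_ai.* keys set in LLMEngine above.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",
    otlp_traces_endpoint="http://localhost:4317",  # OTLP/gRPC collector (assumed running)
    collect_detailed_traces="all",  # also record model forward/execute timings
)
outputs = llm.generate(["San Francisco is a"],
                       SamplingParams(temperature=0.0, max_tokens=8))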
@@ -16,7 +16,6 @@ try:
         OTEL_EXPORTER_OTLP_TRACES_PROTOCOL)
     from opentelemetry.sdk.trace import TracerProvider
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
-    from opentelemetry.semconv_ai import SpanAttributes as BaseSpanAttributes
    from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider
     from opentelemetry.trace.propagation.tracecontext import (
         TraceContextTextMapPropagator)
@@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]:
     return {h: headers[h] for h in TRACE_HEADERS if h in headers}


-class SpanAttributes(BaseSpanAttributes):
-    # The following span attribute names are added here because they are missing
-    # from the Semantic Conventions for LLM.
-    LLM_REQUEST_ID = "gen_ai.request.id"
-    LLM_REQUEST_N = "gen_ai.request.n"
-    LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
-    LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
-    LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
-    LLM_LATENCY_E2E = "gen_ai.latency.e2e"
-    LLM_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
+class SpanAttributes:
+    # Attribute names copied from here to avoid version conflicts:
+    # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md
+    GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens"
+    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
+    GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
+    GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"
+    GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
+    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
+    # Attribute names added until they are added to the semantic conventions:
+    GEN_AI_REQUEST_ID = "gen_ai.request.id"
+    GEN_AI_REQUEST_N = "gen_ai.request.n"
+    GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences"
+    GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue"
+    GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token"
+    GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"
+    GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler"
     # Time taken in the forward pass for this across all workers
-    LLM_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward"
+    GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = (
+        "gen_ai.latency.time_in_model_forward")
     # Time taken in the model execute function. This will include model
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
-    LLM_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute"
+    GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
+        "gen_ai.latency.time_in_model_execute")


 def contains_trace_headers(headers: Mapping[str, str]) -> bool:
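Downstream Python code that imported the old LLM_* constant names will need to switch to the GEN_AI_* names; the attribute keys themselves are now pinned as plain strings in the class above rather than inherited from the experimental semconv_ai package. A quick standalone check of the values shown in the diff (assuming the patched vllm.tracing module is importable):

# Hedged sanity check: the renamed constants carry exactly the string keys
# listed in the diff above.
from vllm.tracing import SpanAttributes

assert SpanAttributes.GEN_AI_REQUEST_ID == "gen_ai.request.id"
assert SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS == "gen_ai.usage.prompt_tokens"
assert SpanAttributes.GEN_AI_LATENCY_E2E == "gen_ai.latency.e2e"
assert SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD == (
    "gen_ai.latency.time_in_model_forward")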