[V1] [2/n] Logging and Metrics - OutputProcessor Abstraction (#11973)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Robert Shaw 2025-01-12 23:54:10 -05:00 committed by GitHub
parent d14e98d924
commit 619ae268c3
8 changed files with 449 additions and 209 deletions
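Before the per-file diffs, a rough sketch of the abstraction this commit introduces: EngineCoreOutputs now flow through a single OutputProcessor that detokenizes, builds RequestOutputs, and accumulates IterationStats. The construction below mirrors the updated tests; the empty outputs list stands in for what EngineCore would normally return, so treat this as an illustrative sketch rather than code taken from the diff.

# Sketch only: OutputProcessor / process_outputs / IterationStats come from this
# diff; the model name and tokenizer-group setup are copied from the tests below.
from vllm.engine.arg_utils import EngineArgs
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.v1.engine.output_processor import OutputProcessor

config = EngineArgs(model="mistralai/Mistral-7B-Instruct-v0.3").create_engine_config()
tokenizer_group = init_tokenizer_from_configs(config.model_config,
                                              config.scheduler_config,
                                              config.parallel_config,
                                              config.lora_config)
output_processor = OutputProcessor(tokenizer_group, log_stats=True)

engine_core_outputs = []  # normally pulled from EngineCore each step
processed = output_processor.process_outputs(engine_core_outputs)
# processed.request_outputs: RequestOutputs for the LLMEngine path
#   (empty when per-request asyncio queues are used, as in AsyncLLM).
# processed.reqs_to_abort: request ids that stopped on a stop string and still
#   need to be aborted in EngineCore.
# processed.iteration_stats: IterationStats consumed by the stat loggers.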


@ -1,5 +1,5 @@
import asyncio
from typing import Tuple
from typing import List, Tuple
import pytest
@ -13,6 +13,7 @@ if not current_platform.is_cuda():
allow_module_level=True)
ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
enforce_eager=True,
disable_log_requests=True)
@ -53,17 +54,63 @@ async def test_load(monkeypatch):
generate(engine, request_id, NUM_EXPECTED_TOKENS)))
# Confirm that we got all the EXPECTED tokens from the requests.
failed_request_id = None
tokens = None
for task in tasks:
num_generated_tokens, request_id = await task
if (num_generated_tokens != NUM_EXPECTED_TOKENS
and failed_request_id is None):
failed_request_id = request_id
tokens = num_generated_tokens
assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
f"{request_id} generated {num_generated_tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")
assert failed_request_id is None, (
f"{failed_request_id} generated {tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")
assert not engine.output_processor.has_unfinished_requests()
engine.shutdown()
@pytest.mark.asyncio
async def test_abort(monkeypatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
NUM_REQUESTS = 100
NUM_EXPECTED_TOKENS = 100
REQUEST_IDS_TO_ABORT = range(1, 100, 10)
request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
# Create concurrent requests.
tasks: List[asyncio.Task] = []
for request_id in request_ids:
tasks.append(
asyncio.create_task(
generate(engine, request_id, NUM_EXPECTED_TOKENS)))
# API server cancels requests when they disconnect.
for idx in REQUEST_IDS_TO_ABORT:
tasks[idx].cancel()
await asyncio.sleep(0.1)
# Confirm the other requests are okay.
for idx, task in enumerate(tasks):
# Confirm that it was actually canceled.
if idx in REQUEST_IDS_TO_ABORT:
with pytest.raises(asyncio.CancelledError):
await task
else:
# Otherwise, make sure the request was not impacted.
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
f"{request_id} generated {num_generated_tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")
assert not engine.output_processor.has_unfinished_requests()
# Confirm we can do another generation.
request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
task = asyncio.create_task(
generate(engine, request_id, NUM_EXPECTED_TOKENS))
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS
assert not engine.output_processor.has_unfinished_requests()
engine.shutdown()
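These tests await a generate(engine, request_id, NUM_EXPECTED_TOKENS) helper defined outside the hunks shown here. The sketch below is an assumption inferred only from how its result is unpacked into (num_generated_tokens, request_id); the prompt text and sampling settings are illustrative, not taken from the diff.

from typing import Tuple

from vllm import SamplingParams


async def generate(engine, request_id: str, max_tokens: int) -> Tuple[int, str]:
    # Hypothetical helper: stream one request and report how many tokens came back.
    num_tokens = 0
    async for out in engine.generate(request_id=request_id,
                                     prompt="Hello my name is",  # illustrative
                                     sampling_params=SamplingParams(
                                         max_tokens=max_tokens, temperature=0)):
        # RequestOutput.outputs[0].token_ids is cumulative by default.
        num_tokens = len(out.outputs[0].token_ids)
    return num_tokens, request_id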


@ -3,11 +3,18 @@ from typing import List
import pytest
from transformers import AutoTokenizer
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.engine.detokenizer import Detokenizer
from vllm.v1.engine.output_processor import OutputProcessor
TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config,
VLLM_CONFIG.scheduler_config,
VLLM_CONFIG.parallel_config,
VLLM_CONFIG.lora_config)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
FULL_STRINGS = [
@ -66,7 +73,7 @@ class MockEngineCore:
"request_output_kind",
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
def test_incremental_detokenization(request_output_kind: RequestOutputKind):
detokenizer = Detokenizer(TOKENIZER_NAME)
output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
engine_core = MockEngineCore(GENERATION_TOKENS)
# Make N requests.
@ -93,7 +100,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
# Add requests to the detokenizer.
for request in requests:
detokenizer.add_request(request)
output_processor.add_request(request)
gen_strings = {}
gen_tokens = {}
@ -104,7 +111,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
break
# Step the Detokenizer.
request_outputs, requests_to_abort = detokenizer.step(outputs)
processed_outputs = output_processor.process_outputs(outputs)
request_outputs = processed_outputs.request_outputs
requests_to_abort = processed_outputs.reqs_to_abort
assert len(requests_to_abort) == 0
# Update tracking.
@ -128,13 +137,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"
assert detokenizer.get_num_unfinished_requests() == 0
assert not detokenizer.has_unfinished_requests()
assert output_processor.get_num_unfinished_requests() == 0
assert not output_processor.has_unfinished_requests()
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
def test_stop_string(include_stop_str_in_output: bool):
detokenizer = Detokenizer(TOKENIZER_NAME)
output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
engine_core = MockEngineCore(GENERATION_TOKENS)
# Make N requests.
@ -162,7 +171,7 @@ def test_stop_string(include_stop_str_in_output: bool):
# Add requests to the detokenizer.
for request in requests:
detokenizer.add_request(request)
output_processor.add_request(request)
gen_strings = {}
aborted = []
@ -173,7 +182,9 @@ def test_stop_string(include_stop_str_in_output: bool):
break
# Step the Detokenizer.
request_outputs, requests_to_abort = detokenizer.step(outputs)
processed_outputs = output_processor.process_outputs(outputs)
request_outputs = processed_outputs.request_outputs
requests_to_abort = processed_outputs.reqs_to_abort
for request_output in request_outputs:
# If aborted, we should not get a request output.
assert request_output.request_id not in aborted
@ -214,5 +225,71 @@ def test_stop_string(include_stop_str_in_output: bool):
assert gen_str == ref_str_exc_stop, (
f"{gen_str=}, {ref_str_exc_stop=}")
assert detokenizer.get_num_unfinished_requests() == 0
assert not detokenizer.has_unfinished_requests()
assert output_processor.get_num_unfinished_requests() == 0
assert not output_processor.has_unfinished_requests()
def test_iteration_stats():
output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True)
engine_core = MockEngineCore(GENERATION_TOKENS)
# Make N requests.
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
eos_token_id=None,
lora_request=None,
sampling_params=SamplingParams(),
) for idx, (
prompt,
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
]
# Add all requests except one to the OutputProcessor.
num_active = len(GENERATION_TOKENS) - 1
for request in requests[:num_active]:
output_processor.add_request(request)
inactive_request = requests[num_active]
# First iteration has 2 prefills.
outputs = engine_core.get_outputs()[:num_active]
processed_outputs = output_processor.process_outputs(outputs)
iteration_stats = processed_outputs.iteration_stats
total_prompt_tokens = sum(
[len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]])
assert iteration_stats.num_prompt_tokens == total_prompt_tokens
assert iteration_stats.num_generation_tokens == num_active
# Just decodes in this step.
outputs = engine_core.get_outputs()[:num_active]
processed_outputs = output_processor.process_outputs(outputs)
iteration_stats = processed_outputs.iteration_stats
assert iteration_stats.num_prompt_tokens == 0
assert iteration_stats.num_generation_tokens == num_active
# Add a new request - prefill and 2 decodes in this step.
output_processor.add_request(inactive_request)
num_active += 1
outputs = engine_core.get_outputs()[:num_active]
processed_outputs = output_processor.process_outputs(outputs)
iteration_stats = processed_outputs.iteration_stats
total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
assert iteration_stats.num_prompt_tokens == total_prompt_tokens
assert iteration_stats.num_generation_tokens == num_active
# Just decodes in this step.
outputs = engine_core.get_outputs()[:num_active]
processed_outputs = output_processor.process_outputs(outputs)
iteration_stats = processed_outputs.iteration_stats
assert iteration_stats.num_prompt_tokens == 0
assert iteration_stats.num_generation_tokens == num_active
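The stats tests drive the OutputProcessor with engine_core.get_outputs(), a MockEngineCore kept in the unchanged part of this file. A plausible shape is sketched below (one new token per still-active request per call); the EngineCoreOutput constructor arguments are an assumption based on the field names used elsewhere in this diff.

from typing import List

from vllm.v1.engine import EngineCoreOutput


class MockEngineCore:
    """Assumed mock: emits one pre-made token per request per get_outputs() call."""

    def __init__(self, tokens_list: List[List[int]]):
        self.tokens_list = tokens_list
        self.current_idx = 0

    def get_outputs(self) -> List[EngineCoreOutput]:
        token_idx = self.current_idx
        self.current_idx += 1
        outputs = []
        for req_idx, token_ids in enumerate(self.tokens_list):
            if token_idx < len(token_ids):
                outputs.append(
                    EngineCoreOutput(
                        request_id=f"request-{req_idx}",
                        new_token_ids=[token_ids[token_idx]],
                        # Finished once the last pre-made token has been emitted.
                        finished=(token_idx == len(token_ids) - 1)))
        return outputs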


@ -1,6 +1,6 @@
import asyncio
import os
from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
from typing import AsyncGenerator, List, Mapping, Optional, Type, Union
from vllm.config import ModelConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
@ -18,11 +18,11 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import kill_process_tree
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.detokenizer import Detokenizer
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.processor import Processor
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
from vllm.v1.metrics.stats import SchedulerStats
from vllm.v1.metrics.stats import IterationStats, SchedulerStats
logger = init_logger(__name__)
@ -59,9 +59,6 @@ class AsyncLLM(EngineClient):
lora_config=vllm_config.lora_config)
self.tokenizer.ping()
# Request streams (map of request_id -> queue).
self.rid_to_queue: Dict[str, asyncio.Queue] = {}
# Processor (converts Inputs --> EngineCoreRequests).
self.processor = Processor(
model_config=vllm_config.model_config,
@ -71,13 +68,9 @@ class AsyncLLM(EngineClient):
input_registry=input_registry,
)
# Detokenizer (converts EngineCoreOutputs --> RequestOutput).
self.detokenizer = Detokenizer(
tokenizer_name=vllm_config.model_config.tokenizer,
tokenizer_mode=vllm_config.model_config.tokenizer_mode,
trust_remote_code=vllm_config.model_config.trust_remote_code,
revision=vllm_config.model_config.tokenizer_revision,
)
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
self.output_processor = OutputProcessor(self.tokenizer,
log_stats=self.log_stats)
# EngineCore (starts the engine in background process).
self.engine_core = EngineCoreClient.make_client(
@ -140,9 +133,9 @@ class AsyncLLM(EngineClient):
"""Add new request to the AsyncLLM."""
# 1) Create a new output queue for the request.
if request_id in self.rid_to_queue:
if self.output_processor.is_request_active(request_id):
raise ValueError(f"Request id {request_id} already running.")
self.rid_to_queue[request_id] = asyncio.Queue()
queue: asyncio.Queue[RequestOutput] = asyncio.Queue()
# 2) Convert Input --> Request.
request = self.processor.process_inputs(request_id, prompt, params,
@ -151,8 +144,8 @@ class AsyncLLM(EngineClient):
prompt_adapter_request,
priority)
# 3) Add the request to Detokenizer (this process).
self.detokenizer.add_request(request)
# 3) Add the request to OutputProcessor (this process).
self.output_processor.add_request(request, queue)
# 4) Add the EngineCoreRequest to EngineCore (separate process).
await self.engine_core.add_request_async(request)
@ -160,7 +153,7 @@ class AsyncLLM(EngineClient):
if self.log_requests:
logger.info("Added request %s.", request_id)
return self.rid_to_queue[request_id]
return queue
# TODO: we should support multiple prompts in one call, as you
# can do with LLM.generate. So that for multi-prompt completion
@ -217,10 +210,9 @@ class AsyncLLM(EngineClient):
# task switching under load which helps performance).
out = q.get_nowait() if q.qsize() > 0 else await q.get()
# Note: both Detokenizer and EngineCore handle their
# Note: both OutputProcessor and EngineCore handle their
# own request cleanup based on finished.
if out.finished:
del self.rid_to_queue[request_id]
yield out
break
@ -233,57 +225,51 @@ class AsyncLLM(EngineClient):
await self.abort(request_id)
raise
def _process_request_outputs(self, request_outputs: List[RequestOutput]):
"""Process outputs by putting them into per-request queues."""
for request_output in request_outputs:
request_id = request_output.request_id
# Note: it is possible a request was aborted and removed from
# the state due to client cancellations, so if we encounter a
# request id not in the state, we skip.
if request_id in self.rid_to_queue:
self.rid_to_queue[request_id].put_nowait(request_output)
async def _run_output_handler(self):
"""Background loop: pulls from EngineCore and pushes to AsyncStreams."""
try:
while True:
# 1) Pull EngineCoreOutput from the EngineCore.
# 1) Pull EngineCoreOutputs from the EngineCore.
outputs = await self.engine_core.get_output_async()
# 2) Detokenize based on the output.
request_outputs, reqs_to_abort = self.detokenizer.step(
# 2) Process EngineCoreOutputs.
processed_outputs = self.output_processor.process_outputs(
outputs.outputs)
# NOTE: RequestOutputs are pushed to their queues.
assert len(processed_outputs.request_outputs) == 0
# 3) Put the RequestOutputs into the per-request queues.
self._process_request_outputs(request_outputs)
# 3) Abort any reqs that finished due to stop strings.
await self.engine_core.abort_requests_async(
processed_outputs.reqs_to_abort)
# 4) Abort any requests that finished due to stop strings.
await self.engine_core.abort_requests_async(reqs_to_abort)
# 5) Log any stats.
await self._log_stats(scheduler_stats=outputs.scheduler_stats)
# 4) Logging.
# TODO(rob): make into a coroutine and launch it in
# background thread once we add Prometheus.
self._log_stats(
scheduler_stats=outputs.scheduler_stats,
iteration_stats=processed_outputs.iteration_stats,
)
except Exception as e:
logger.exception("EngineCore output handler hit an error: %s", e)
kill_process_tree(os.getpid())
async def abort(self, request_id: str) -> None:
"""Abort RequestId in self, detokenizer, and engine core."""
"""Abort RequestId in OutputProcessor and EngineCore."""
request_ids = [request_id]
await self.engine_core.abort_requests_async(request_ids)
self.detokenizer.abort_requests(request_ids)
self.output_processor.abort_requests(request_ids)
# If a request finishes while we await then the request_id
# will be removed from the tracked queues before we get here.
if request_id in self.rid_to_queue:
del self.rid_to_queue[request_id]
if self.log_requests:
logger.info("Aborted request %s.", request_id)
async def _log_stats(self, scheduler_stats: SchedulerStats):
"""Log stats to the stat loggers."""
def _log_stats(
self,
scheduler_stats: SchedulerStats,
iteration_stats: IterationStats,
):
if not self.log_stats:
return
@ -314,8 +300,7 @@ class AsyncLLM(EngineClient):
self,
lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer:
assert lora_request is None
return self.detokenizer.tokenizer
return self.tokenizer.get_lora_tokenizer(lora_request)
async def is_tracing_enabled(self) -> bool:
return False
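Read together, the interleaved added and removed lines above leave the new output handler looking roughly like this; the reconstruction below is assembled from the hunks and is not a verbatim copy of the file.

    # Best-effort reconstruction of AsyncLLM._run_output_handler after this change.
    async def _run_output_handler(self):
        """Background loop: pulls from EngineCore and pushes to per-request queues."""
        try:
            while True:
                # 1) Pull EngineCoreOutputs from the EngineCore.
                outputs = await self.engine_core.get_output_async()

                # 2) Process them. RequestOutputs are pushed into the per-request
                #    queues inside process_outputs(), so nothing comes back here.
                processed_outputs = self.output_processor.process_outputs(
                    outputs.outputs)
                assert len(processed_outputs.request_outputs) == 0

                # 3) Abort any requests that finished due to stop strings.
                await self.engine_core.abort_requests_async(
                    processed_outputs.reqs_to_abort)

                # 4) Logging (synchronous for now, per the TODO above).
                self._log_stats(
                    scheduler_stats=outputs.scheduler_stats,
                    iteration_stats=processed_outputs.iteration_stats,
                )
        except Exception as e:
            logger.exception("EngineCore output handler hit an error: %s", e)
            kill_process_tree(os.getpid())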


@ -105,7 +105,8 @@ class InprocClient(EngineCoreClient):
self.engine_core.add_request(request)
def abort_requests(self, request_ids: List[str]) -> None:
self.engine_core.abort_requests(request_ids)
if len(request_ids) > 0:
self.engine_core.abort_requests(request_ids)
def shutdown(self):
self.engine_core.shutdown()
@ -221,7 +222,8 @@ class SyncMPClient(MPClient):
self._send_input(EngineCoreRequestType.ADD, request)
def abort_requests(self, request_ids: List[str]) -> None:
self._send_input(EngineCoreRequestType.ABORT, request_ids)
if len(request_ids) > 0:
self._send_input(EngineCoreRequestType.ABORT, request_ids)
def profile(self, is_start: bool = True) -> None:
self._send_input(EngineCoreRequestType.PROFILE,


@ -1,18 +1,25 @@
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Tuple, Union
from typing import List, Optional, Union
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import RequestOutputKind
from vllm.transformers_utils.detokenizer_utils import (
AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
logger = init_logger(__name__)
@dataclass
class DetokenizerOutput:
output_text: str
token_ids: List[int]
finished: bool
finish_reason: Optional[str] = None
stop_reason: Union[int, str, None] = None
@dataclass
class IncrementalDetokenizer:
@ -20,6 +27,7 @@ class IncrementalDetokenizer:
output_text: str
tokens: List[str]
token_ids: List[int]
prompt_len: int
# Stop strings
stop: List[str]
@ -34,11 +42,6 @@ class IncrementalDetokenizer:
spaces_between_special_tokens: bool
output_kind: RequestOutputKind
# TODO: Probably decouple these
request_id: str
prompt: Optional[str]
prompt_token_ids: List[int]
# Tokenizer for this request
tokenizer: AnyTokenizer
@ -48,8 +51,7 @@ class IncrementalDetokenizer:
@property
def output_token_ids(self) -> List[int]:
assert len(self.token_ids) >= len(self.prompt_token_ids)
return self.token_ids[len(self.prompt_token_ids):]
return self.token_ids[self.prompt_len:]
@classmethod
def from_new_request(
@ -87,25 +89,25 @@ class IncrementalDetokenizer:
spaces_between_special_tokens=request.sampling_params.
spaces_between_special_tokens,
output_kind=request.sampling_params.output_kind,
request_id=request.request_id,
prompt=request.prompt,
prompt_token_ids=request.prompt_token_ids,
prompt_len=len(request.prompt_token_ids),
tokenizer=tokenizer,
stop_buffer_length=stop_buffer_length,
)
def add_tokens(
def update_from_output(
self,
new_token_ids: List[int],
finish_reason: Optional[str],
stop_reason: Optional[Union[int, str, None]],
) -> Optional[RequestOutput]:
output: EngineCoreOutput,
) -> Optional[DetokenizerOutput]:
"""
Update RequestState for the request_id by:
1) Detokenize the new token ids incrementally.
2) Update the RequestOutput with the new text.
"""
new_token_ids = output.new_token_ids
finish_reason = output.finish_reason
stop_reason = output.stop_reason
# 1) Detokenize the new token ids incrementally.
# TODO(woosuk): This method becomes very inefficient when the number of
# new_token_ids is more than 1. We need to optimize this.
@ -158,21 +160,8 @@ class IncrementalDetokenizer:
output_text = self._get_next_output_text(finished, delta)
token_ids = new_token_ids if delta else self.output_token_ids
request_output = RequestOutput.new(
self.request_id,
self.prompt,
self.prompt_token_ids,
output_text,
token_ids,
finished,
)
if finished:
completion_output = request_output.outputs[0]
completion_output.finish_reason = finish_reason
completion_output.stop_reason = stop_reason
return request_output
return DetokenizerOutput(output_text, token_ids, finished,
finish_reason, stop_reason)
def _get_next_output_text(self, finished: bool, delta: bool) -> str:
"""If delta is True, only new text since the last call to
@ -189,85 +178,3 @@ class IncrementalDetokenizer:
self._last_output_text_offset = length
return self.output_text[last_offset:length]
return ""
class Detokenizer:
def __init__(self,
tokenizer_name: str,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
revision: Optional[str] = None):
# TODO: once we support LoRA, we should pass the tokenizer
# here. We currently have two copies (this + in the LLMEngine).
self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
revision=revision)
# Request id -> IncrementalDetokenizer
self.request_states: Dict[str, IncrementalDetokenizer] = {}
def is_request_active(self, request_id: str):
return request_id in self.request_states
def get_num_unfinished_requests(self):
return len(self.request_states)
def has_unfinished_requests(self) -> bool:
return len(self.request_states) > 0
def abort_requests(
self,
request_ids: Iterable[str],
) -> None:
"""Remove the request_ids from the Detokenizer."""
for request_id in request_ids:
self.request_states.pop(request_id, None)
def add_request(
self,
request: EngineCoreRequest,
):
"""Add new request to the Detokenizer."""
assert (request.request_id not in self.request_states)
request_state = IncrementalDetokenizer.from_new_request(
self.tokenizer, request)
self.request_states[request.request_id] = request_state
def step(
self, encore_core_outputs: List[EngineCoreOutput]
) -> Tuple[List[RequestOutput], List[str]]:
"""Update state and request the RequestOutputs to the LLMEngine."""
request_outputs: List[RequestOutput] = []
requests_to_abort: List[str] = []
for engine_core_output in encore_core_outputs:
request_id = engine_core_output.request_id
detokenizer = self.request_states.get(request_id)
if detokenizer is None:
# Ignore output for already-aborted request.
continue
# Detokenize and update state.
request_output = detokenizer.add_tokens(
new_token_ids=engine_core_output.new_token_ids,
finish_reason=engine_core_output.finish_reason,
stop_reason=engine_core_output.stop_reason,
)
if request_output is not None:
# Add to RequestOutputs list.
request_outputs.append(request_output)
# Free completed requests.
if request_output.finished:
self.request_states.pop(request_id)
if not engine_core_output.finished:
requests_to_abort.append(request_id)
# Return to EngineClient.
return request_outputs, requests_to_abort
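With the Detokenizer wrapper deleted, the IncrementalDetokenizer is now driven per request by the OutputProcessor via update_from_output(). A small sketch of that call pattern follows; the EngineCoreRequest fields are copied from the tests in this diff, while the EngineCoreOutput construction and the choice of token ids are assumptions for illustration.

from transformers import AutoTokenizer

from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.engine.detokenizer import IncrementalDetokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
prompt = "Hello my name is"
prompt_token_ids = tokenizer(prompt).input_ids

request = EngineCoreRequest(request_id="request-0",
                            prompt=prompt,
                            prompt_token_ids=prompt_token_ids,
                            arrival_time=0,
                            mm_inputs=None,
                            mm_hashes=None,
                            mm_placeholders=None,
                            eos_token_id=None,
                            lora_request=None,
                            sampling_params=SamplingParams())

detokenizer = IncrementalDetokenizer.from_new_request(tokenizer=tokenizer,
                                                      request=request)

# One step's worth of new tokens, as the OutputProcessor would pass them in.
# The EngineCoreOutput kwargs are assumed from field usage elsewhere in the diff.
output = EngineCoreOutput(request_id="request-0",
                          new_token_ids=prompt_token_ids[:1],
                          finished=False)
detokenizer_output = detokenizer.update_from_output(output)
if detokenizer_output is not None:
    print(detokenizer_output.output_text, detokenizer_output.finished)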


@ -18,7 +18,7 @@ from vllm.transformers_utils.tokenizer_group import (
BaseTokenizerGroup, init_tokenizer_from_configs)
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.detokenizer import Detokenizer
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.processor import Processor
from vllm.v1.executor.abstract import Executor
@ -60,13 +60,9 @@ class LLMEngine:
input_registry=input_registry,
mm_registry=mm_registry)
# Detokenizer (converts EngineCoreOutputs --> RequestOutput)
self.detokenizer = Detokenizer(
tokenizer_name=vllm_config.model_config.tokenizer,
tokenizer_mode=vllm_config.model_config.tokenizer_mode,
trust_remote_code=vllm_config.model_config.trust_remote_code,
revision=vllm_config.model_config.tokenizer_revision,
)
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
self.output_processor = OutputProcessor(self.tokenizer,
log_stats=False)
# EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
self.engine_core = EngineCoreClient.make_client(
@ -103,10 +99,10 @@ class LLMEngine:
multiprocess_mode=enable_multiprocessing)
def get_num_unfinished_requests(self) -> int:
return self.detokenizer.get_num_unfinished_requests()
return self.output_processor.get_num_unfinished_requests()
def has_unfinished_requests(self) -> bool:
return self.detokenizer.has_unfinished_requests()
return self.output_processor.has_unfinished_requests()
@classmethod
def validate_outputs(cls, outputs, output_type):
@ -116,7 +112,7 @@ class LLMEngine:
"""Remove request_ids from EngineCore and Detokenizer."""
self.engine_core.abort_requests(request_ids)
self.detokenizer.abort_requests(request_ids)
self.output_processor.abort_requests(request_ids)
def add_request(
self,
@ -137,8 +133,8 @@ class LLMEngine:
prompt_adapter_request,
priority)
# 2) Add the request to Detokenizer.
self.detokenizer.add_request(request)
# 2) Make a new RequestState and queue.
self.output_processor.add_request(request)
# 3) Add the request to EngineCore.
self.engine_core.add_request(request)
@ -148,15 +144,14 @@ class LLMEngine:
# 1) Get EngineCoreOutput from the EngineCore.
outputs = self.engine_core.get_output()
# 2) Detokenize the EngineCoreOutput.
request_outputs, requests_to_abort = self.detokenizer.step(
# 2) Process EngineCoreOutputs.
processed_outputs = self.output_processor.process_outputs(
outputs.outputs)
# 3) Abort requests that finished due to stopping criteria.
if requests_to_abort:
self.abort_request(requests_to_abort)
# 3) Abort any reqs that finished due to stop strings.
self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
return request_outputs
return processed_outputs.request_outputs
def get_model_config(self):
return self.model_config
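On the synchronous side the swap is invisible to callers: LLMEngine.step() still hands back a list of RequestOutput, now taken from processed_outputs.request_outputs. A brief usage sketch follows; the model and sampling settings are illustrative, reusing names that appear in this diff.

from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.v1.engine.llm_engine import LLMEngine

engine = LLMEngine.from_engine_args(
    EngineArgs(model="meta-llama/Llama-3.2-1B", enforce_eager=True))

engine.add_request(request_id="request-0",
                   prompt="Hello my name is",
                   params=SamplingParams(max_tokens=16))

# step() now calls OutputProcessor.process_outputs() under the hood and asks
# EngineCore to abort requests that stopped on a stop string.
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)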


@ -0,0 +1,200 @@
import asyncio
from dataclasses import dataclass
from typing import Dict, List, Optional
from vllm.outputs import RequestOutput
from vllm.transformers_utils.detokenizer_utils import AnyTokenizer
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.engine.detokenizer import (DetokenizerOutput,
IncrementalDetokenizer)
from vllm.v1.metrics.stats import IterationStats
@dataclass
class OutputProcessorOutput:
request_outputs: List[RequestOutput]
reqs_to_abort: List[str]
iteration_stats: IterationStats
class RequestState:
def __init__(
self,
request_id: str,
prompt: Optional[str],
prompt_token_ids: List[int],
detokenizer: IncrementalDetokenizer,
queue: Optional[asyncio.Queue[RequestOutput]],
):
self.request_id = request_id
self.prompt = prompt
self.prompt_token_ids = prompt_token_ids
self.prompt_len = len(prompt_token_ids)
self.detokenizer = detokenizer
self.is_prefilling = True
self.queue = queue
@classmethod
def from_new_request(
cls,
tokenizer: AnyTokenizer,
request: EngineCoreRequest,
queue: Optional[asyncio.Queue[RequestOutput]] = None,
) -> "RequestState":
return cls(
request_id=request.request_id,
prompt=request.prompt,
prompt_token_ids=request.prompt_token_ids,
detokenizer=IncrementalDetokenizer.from_new_request(
tokenizer=tokenizer,
request=request,
),
queue=queue,
)
class OutputProcessor:
"""Process EngineCoreOutputs into RequestOutputs."""
def __init__(
self,
tokenizer: BaseTokenizerGroup,
log_stats: bool,
):
self.log_stats = log_stats
self.tokenizer = tokenizer
self.request_states: Dict[str, RequestState] = {}
def is_request_active(self, request_id: str) -> bool:
return request_id in self.request_states
def get_num_unfinished_requests(self):
return len(self.request_states)
def has_unfinished_requests(self) -> bool:
return len(self.request_states) > 0
def abort_requests(
self,
request_ids: List[str],
) -> None:
for request_id in request_ids:
self.request_states.pop(request_id, None)
def add_request(
self,
request: EngineCoreRequest,
queue: Optional[asyncio.Queue[RequestOutput]] = None,
) -> None:
request_id = request.request_id
if request_id in self.request_states:
raise ValueError(f"Request id {request_id} already running.")
self.request_states[request_id] = RequestState.from_new_request(
tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
request=request,
queue=queue)
def process_outputs(
self,
engine_core_outputs: List[EngineCoreOutput],
) -> OutputProcessorOutput:
"""
Process the EngineCoreOutputs:
1) Compute stats for logging
2) Detokenize
3) Create and handle RequestOutput objects:
* If there is a queue (for usage with AsyncLLM),
put the RequestOutput objects into the queue for
handling by the per-request generate() tasks.
* If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects.
****************** NOTE FOR DEVELOPERS ******************
VLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs.
If you need to touch every element of the batch, implement a
method called XXXClass.update_from_output() to be called
within the loop below. For examples, see:
* IterationStats.update_from_output()
* Detokenizer.update_from_output()
TODO(rob): add a Protocol to make update_from_output explicit.
**********************************************************
"""
request_outputs: List[RequestOutput] = []
reqs_to_abort: List[str] = []
iteration_stats = IterationStats(self.log_stats)
for engine_core_output in engine_core_outputs:
req_id = engine_core_output.request_id
req_state = self.request_states.get(req_id)
if req_state is None:
# Ignore output for already-aborted request.
continue
# 1) Compute stats for this iteration.
iteration_stats.update_from_output(engine_core_output,
req_state.is_prefilling,
req_state.prompt_len)
req_state.is_prefilling = False
# 2) Detokenize the token ids into text.
detokenizer_output = req_state.detokenizer.update_from_output(
engine_core_output)
# 3) Create and handle RequestOutput objects.
if request_output := self._make_request_output(
req_state, detokenizer_output):
if req_state.queue is not None:
# AsyncLLM: put into queue for handling by generate().
req_state.queue.put_nowait(request_output)
else:
# LLMEngine: return list of RequestOutputs.
request_outputs.append(request_output)
# Free completed requests.
if request_output.finished:
self.request_states.pop(req_id)
if not engine_core_output.finished:
# If req not finished in EngineCore, but Detokenizer
# detected stop string, abort needed in EngineCore.
reqs_to_abort.append(req_id)
return OutputProcessorOutput(
request_outputs=request_outputs,
reqs_to_abort=reqs_to_abort,
iteration_stats=iteration_stats,
)
def _make_request_output(
self,
request_state: RequestState,
detokenizer_output: Optional[DetokenizerOutput],
) -> Optional[RequestOutput]:
if detokenizer_output is None:
return None
request_output = RequestOutput.new(
request_state.request_id,
request_state.prompt,
request_state.prompt_token_ids,
detokenizer_output.output_text,
detokenizer_output.token_ids,
detokenizer_output.finished,
)
if detokenizer_output.finished:
completion_output = request_output.outputs[0]
completion_output.finish_reason = detokenizer_output.finish_reason
completion_output.stop_reason = detokenizer_output.stop_reason
return request_output
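The NOTE FOR DEVELOPERS above leaves a TODO to make the update_from_output pattern explicit with a Protocol. One hypothetical way that could look is sketched below; this Protocol does not exist anywhere in the diff, and the two current implementers (IterationStats, which also takes is_prefilling and prompt_len, and IncrementalDetokenizer) would need a reconciled signature before anything like it could be adopted.

from typing import Any, Protocol

from vllm.v1.engine import EngineCoreOutput


class SupportsUpdateFromOutput(Protocol):
    """Hypothetical interface for per-request or per-iteration state that must
    see every EngineCoreOutput from the single loop in process_outputs()."""

    def update_from_output(self, output: EngineCoreOutput, *extra: Any) -> Any:
        ...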


@ -1,4 +1,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from vllm.v1.engine import EngineCoreOutput
@dataclass
@ -10,3 +14,26 @@ class SchedulerStats:
# gpu_cache_usage: float = 0.0
# gpu_prefix_cache_hit_rate: float = 0.0
class IterationStats:
"""Stats associated with a single set of EngineCoreOutputs."""
def __init__(self, log_stats: bool):
self.log_stats = log_stats
self.num_generation_tokens = 0
self.num_prompt_tokens = 0
def update_from_output(self, output: "EngineCoreOutput",
is_prefilling: bool, prompt_len: int):
if not self.log_stats:
return
self.num_generation_tokens += len(output.new_token_ids)
if is_prefilling:
# This relies on the invariant that EngineCore does
# not stream outputs for partially completed prefills
# (scheduler.update_from_output makes EngineCoreOutput
# iff num_computed_tokens == num_tokens).
assert (len(output.new_token_ids) > 0)
self.num_prompt_tokens += prompt_len
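To make the accounting concrete, here is a small worked example of how IterationStats accumulates over one step containing one prefill and one decode. The EngineCoreOutput constructor kwargs and the token ids are assumptions for illustration; the counting logic is exactly the update_from_output() shown above.

from vllm.v1.engine import EngineCoreOutput
from vllm.v1.metrics.stats import IterationStats

stats = IterationStats(log_stats=True)

# Request A just finished its prefill (5 prompt tokens) and sampled its first token.
stats.update_from_output(EngineCoreOutput(request_id="request-A",
                                          new_token_ids=[42],
                                          finished=False),
                         is_prefilling=True, prompt_len=5)

# Request B is mid-decode and produced one more token; its prompt is not recounted.
stats.update_from_output(EngineCoreOutput(request_id="request-B",
                                          new_token_ids=[7],
                                          finished=False),
                         is_prefilling=False, prompt_len=3)

assert stats.num_prompt_tokens == 5       # counted only on the prefill step
assert stats.num_generation_tokens == 2   # one new token from each request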