From a9e4106f28834315de4bfb1cb1186c9a2dc95856 Mon Sep 17 00:00:00 2001 From: Will Eaton Date: Wed, 10 Dec 2025 14:00:52 -0500 Subject: [PATCH 01/16] [P/D] KV Load Failure Recovery/Abort Configuration (#26813) Signed-off-by: Will Eaton Signed-off-by: Will Eaton Signed-off-by: Nick Hill Co-authored-by: Mark McLoughlin Co-authored-by: Nick Hill Co-authored-by: chaunceyjiang --- tests/entrypoints/openai/test_chat_error.py | 228 +++++++++ .../openai/test_completion_error.py | 216 +++++++++ .../openai/test_responses_error.py | 89 ++++ .../unit/test_cache_pollution_prevention.py | 163 +++++++ .../unit/test_error_propagation.py | 147 ++++++ .../unit/test_invalid_blocks_correctness.py | 454 ++++++++++++++++++ vllm/config/kv_transfer.py | 5 + vllm/entrypoints/openai/serving_chat.py | 17 +- vllm/entrypoints/openai/serving_completion.py | 15 +- vllm/entrypoints/openai/serving_engine.py | 61 +++ vllm/entrypoints/openai/serving_responses.py | 53 +- vllm/v1/core/block_pool.py | 19 + vllm/v1/core/kv_cache_manager.py | 8 + vllm/v1/core/sched/scheduler.py | 114 +++-- vllm/v1/engine/__init__.py | 9 +- vllm/v1/request.py | 2 + 16 files changed, 1552 insertions(+), 48 deletions(-) create mode 100644 tests/entrypoints/openai/test_chat_error.py create mode 100644 tests/entrypoints/openai/test_completion_error.py create mode 100644 tests/entrypoints/openai/test_responses_error.py create mode 100644 tests/v1/kv_connector/unit/test_cache_pollution_prevention.py create mode 100644 tests/v1/kv_connector/unit/test_error_propagation.py create mode 100644 tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py new file mode 100644 index 0000000000000..102eeaf614410 --- /dev/null +++ b/tests/entrypoints/openai/test_chat_error.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import 
dataclass, field +from http import HTTPStatus +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm.config.multimodal import MultiModalConfig +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM + +MODEL_NAME = "openai-community/gpt2" +MODEL_NAME_SHORT = "gpt2" +BASE_MODEL_PATHS = [ + BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), + BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT), +] + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + task = "generate" + runner_type = "generate" + tokenizer = MODEL_NAME + trust_remote_code = False + tokenizer_mode = "auto" + max_model_len = 100 + tokenizer_revision = None + multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() + logits_processor_pattern = None + logits_processors: list[str] | None = None + diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + encoder_config = None + generation_config: str = "auto" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + skip_tokenizer_init = False + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: + models = OpenAIServingModels( + engine_client=engine, + base_model_paths=BASE_MODEL_PATHS, + ) + serving_chat = OpenAIServingChat( + engine, + models, + response_role="assistant", + request_logger=None, + chat_template=None, + chat_template_content_format="auto", + ) + + async def _fake_process_inputs( + request_id, 
+ engine_prompt, + sampling_params, + *, + lora_request, + trace_headers, + priority, + ): + return dict(engine_prompt), {} + + async def _fake_preprocess_chat(*args, **kwargs): + # return conversation, request_prompts, engine_prompts + return ( + [{"role": "user", "content": "Test"}], + [[1, 2, 3]], + [{"prompt_token_ids": [1, 2, 3]}], + ) + + serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs) + serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat) + return serving_chat + + +@pytest.mark.asyncio +async def test_chat_error_non_stream(): + """test finish_reason='error' returns 500 InternalServerError (non-streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_chat = _build_serving_chat(mock_engine) + + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "Test prompt"}], + max_tokens=10, + stream=False, + ) + + response = await serving_chat.create_chat_completion(request) + + assert isinstance(response, ErrorResponse) + assert response.error.type == "InternalServerError" + assert response.error.message == "Internal server error" + assert response.error.code == 
HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_chat_error_stream(): + """test finish_reason='error' returns 500 InternalServerError (streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_chat = _build_serving_chat(mock_engine) + + completion_output_1 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ) + + request_output_1 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_1], + finished=False, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + completion_output_2 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output_2 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_2], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output_1 + yield request_output_2 + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "Test prompt"}], + max_tokens=10, + stream=True, + ) + + response = await serving_chat.create_chat_completion(request) + + chunks = [] + async for chunk in response: + chunks.append(chunk) + + assert len(chunks) >= 2 + assert any("Internal server error" in chunk for chunk in chunks), ( + f"Expected error message in chunks: {chunks}" + ) + 
assert chunks[-1] == "data: [DONE]\n\n" diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py new file mode 100644 index 0000000000000..ca56cc2ddb6a7 --- /dev/null +++ b/tests/entrypoints/openai/test_completion_error.py @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm.config.multimodal import MultiModalConfig +from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM + +MODEL_NAME = "openai-community/gpt2" +MODEL_NAME_SHORT = "gpt2" +BASE_MODEL_PATHS = [ + BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), + BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT), +] + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + task = "generate" + runner_type = "generate" + tokenizer = MODEL_NAME + trust_remote_code = False + tokenizer_mode = "auto" + max_model_len = 100 + tokenizer_revision = None + multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() + logits_processor_pattern = None + logits_processors: list[str] | None = None + diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + encoder_config = None + generation_config: str = "auto" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + skip_tokenizer_init = False + + def 
get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: + models = OpenAIServingModels( + engine_client=engine, + base_model_paths=BASE_MODEL_PATHS, + ) + serving_completion = OpenAIServingCompletion( + engine, + models, + request_logger=None, + ) + + async def _fake_process_inputs( + request_id, + engine_prompt, + sampling_params, + *, + lora_request, + trace_headers, + priority, + ): + return dict(engine_prompt), {} + + serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs) + return serving_completion + + +@pytest.mark.asyncio +async def test_completion_error_non_stream(): + """test finish_reason='error' returns 500 InternalServerError (non-streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_completion = _build_serving_completion(mock_engine) + + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = CompletionRequest( + model=MODEL_NAME, + prompt="Test prompt", + max_tokens=10, + stream=False, + ) + + response = await serving_completion.create_completion(request) + + assert isinstance(response, ErrorResponse) + assert response.error.type == "InternalServerError" + assert 
response.error.message == "Internal server error" + assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_completion_error_stream(): + """test finish_reason='error' returns 500 InternalServerError (streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_completion = _build_serving_completion(mock_engine) + + completion_output_1 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ) + + request_output_1 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_1], + finished=False, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + completion_output_2 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output_2 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_2], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output_1 + yield request_output_2 + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = CompletionRequest( + model=MODEL_NAME, + prompt="Test prompt", + max_tokens=10, + stream=True, + ) + + response = await serving_completion.create_completion(request) + + chunks = [] + async for chunk in response: + chunks.append(chunk) + + assert len(chunks) >= 2 + assert any("Internal server error" in chunk for chunk in 
chunks), ( + f"Expected error message in chunks: {chunks}" + ) + assert chunks[-1] == "data: [DONE]\n\n" diff --git a/tests/entrypoints/openai/test_responses_error.py b/tests/entrypoints/openai/test_responses_error.py new file mode 100644 index 0000000000000..f8ea178288835 --- /dev/null +++ b/tests/entrypoints/openai/test_responses_error.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from http import HTTPStatus +from unittest.mock import MagicMock + +import pytest + +from vllm.entrypoints.openai.protocol import ErrorResponse +from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing + + +@pytest.mark.asyncio +async def test_raise_if_error_raises_generation_error(): + """test _raise_if_error raises GenerationError""" + # create a minimal OpenAIServing instance + mock_engine = MagicMock() + mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # test that error finish_reason raises GenerationError + with pytest.raises(GenerationError) as exc_info: + serving._raise_if_error("error", "test-request-id") + + assert str(exc_info.value) == "Internal server error" + assert exc_info.value.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + + # test that other finish_reasons don't raise + serving._raise_if_error("stop", "test-request-id") # should not raise + serving._raise_if_error("length", "test-request-id") # should not raise + serving._raise_if_error(None, "test-request-id") # should not raise + + +@pytest.mark.asyncio +async def test_convert_generation_error_to_response(): + """test _convert_generation_error_to_response creates proper ErrorResponse""" + mock_engine = MagicMock() + mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + 
serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # create a GenerationError + gen_error = GenerationError("Internal server error") + + # convert to ErrorResponse + error_response = serving._convert_generation_error_to_response(gen_error) + + assert isinstance(error_response, ErrorResponse) + assert error_response.error.type == "InternalServerError" + assert error_response.error.message == "Internal server error" + assert error_response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_convert_generation_error_to_streaming_response(): + """test _convert_generation_error_to_streaming_response output""" + mock_engine = MagicMock() + mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # create a GenerationError + gen_error = GenerationError("Internal server error") + + # convert to streaming error response + error_json = serving._convert_generation_error_to_streaming_response(gen_error) + + assert isinstance(error_json, str) + assert "Internal server error" in error_json + assert "InternalServerError" in error_json diff --git a/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py b/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py new file mode 100644 index 0000000000000..ec3fb8231e19e --- /dev/null +++ b/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +test that invalid blocks are evicted from prefix cache to prevent pollution. + +verifies that when sync-loading fails, invalid blocks are removed from the +prefix cache hash table so future requests cannot match and reuse corrupted data. 
+""" + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +def test_invalid_blocks_evicted_prevents_cache_pollution( + fail_scheduler: Scheduler, +): + """ + verify invalid blocks are evicted to prevent future cache hits. + + scenario: + 1. request 1 loads externally-computed blocks (sync mode) + 2. some blocks fail to load and are marked invalid + 3. with fail policy, invalid blocks should be evicted from prefix cache + 4. 
request is marked as FINISHED_ERROR + """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + # request 1: will have invalid blocks + request1 = create_request(num_tokens=num_prompt_tokens, request_id=1) + fail_scheduler.add_request(request=request1) + + req_num_new_matched_tokens = { + request1.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + # request should be running with sync KV load + assert len(fail_scheduler.running) == 1 + assert request1.status == RequestStatus.RUNNING + + # get allocated block IDs + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # get the block object to verify eviction later + block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + + # cache the blocks to simulate they've been computed and cached + # (in real scenario blocks would be cached after compute) + fail_scheduler.kv_cache_manager.cache_blocks(request1, num_external_computed_tokens) + + # verify block has a hash (is cached) before reporting invalid blocks + assert block.block_hash is not None, ( + f"block {invalid_block_id} should be cached (have a hash) before " + f"eviction test, but hash is None" + ) + + # report invalid blocks + model_runner_output = create_model_runner_output( + [request1], + invalid_block_ids=invalid_block_ids, + 
use_eos=False, + ) + + fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify request finished with error (fail policy) + assert request1.status == RequestStatus.FINISHED_ERROR + + # critical assertion: invalid block and all subsequent blocks should be evicted + # all blocks from invalid_block_idx onwards become invalid since they were + # computed based on the failed block + for idx in range(invalid_block_idx, len(req_block_ids)): + block_id = req_block_ids[idx] + block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id] + assert block_obj.block_hash is None, ( + f"block {block_id} at index {idx} should have been evicted " + f"(hash reset to None), but hash is {block_obj.block_hash}. " + f"All blocks from index {invalid_block_idx} onwards should be evicted " + f"since they depend on the invalid block at index {invalid_block_idx}." + ) + + # verify cache contains exactly the valid blocks (before first affected block) + # and none of the invalid blocks (from first affected block onwards) + + # valid blocks: all blocks before invalid_block_idx should be cached + for idx in range(invalid_block_idx): + block_id = req_block_ids[idx] + block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id] + assert block_obj.block_hash is not None, ( + f"valid block {block_id} at index {idx} should still be cached " + f"(have a hash), but hash is None. Only blocks from index " + f"{invalid_block_idx} onwards should be evicted." 
+ ) + + # invalid blocks: verify they're not in the cached_block_hash_to_block map + cached_blocks = ( + fail_scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block + ) + cached_block_ids = { + b.block_id + for blocks_val in cached_blocks._cache.values() + for b in ( + [blocks_val] if not isinstance(blocks_val, dict) else blocks_val.values() + ) + } + + for idx in range(invalid_block_idx, len(req_block_ids)): + block_id = req_block_ids[idx] + assert block_id not in cached_block_ids, ( + f"invalid block {block_id} at index {idx} should not be in cache hash table" + ) diff --git a/tests/v1/kv_connector/unit/test_error_propagation.py b/tests/v1/kv_connector/unit/test_error_propagation.py new file mode 100644 index 0000000000000..20e181f379f5c --- /dev/null +++ b/tests/v1/kv_connector/unit/test_error_propagation.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import FinishReason, Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +def test_error_propagation_sync_load(fail_scheduler: Scheduler): 
+ """test invalid_block_ids with fail policy -> FINISHED_ERROR (sync load)""" + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + assert len(fail_scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + assert fail_scheduler.connector.get_num_new_matched_tokens.call_count == 1 + + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_ids = {req_block_ids[invalid_block_idx]} + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + assert len(outputs) == 1 + engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + assert len(fail_scheduler.running) == 0 + + +def test_error_propagation_async_load(fail_scheduler: Scheduler): + """test invalid_block_ids with fail policy -> FINISHED_ERROR (async load)""" + num_prompt_blocks = 
100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, True) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + assert len(fail_scheduler.waiting) == 1 + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.num_computed_tokens == 0 + + (req_block_ids,) = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id) + invalid_block_ids = {req_block_ids[invalid_block_idx]} + model_runner_output = create_model_runner_output( + reqs=[], + finished_recving=set(), + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + assert len(outputs) == 1 + engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + assert len(fail_scheduler.waiting) == 0 diff --git a/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py new file mode 100644 index 0000000000000..940f3a98308b6 --- /dev/null +++ 
b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py @@ -0,0 +1,454 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Tests for correctness in invalid block handling. + +These tests verify correct behavior in three scenarios: +1. Sync recompute case: Blocks should not be freed for running requests + that need to recompute invalid blocks +2. Sync fail case: Invalid blocks must be evicted from cache when request fails +3. Async recompute case: Invalid blocks should not be cached after transfer +""" + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import FinishReason, Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +@pytest.fixture +def recompute_scheduler(): + """scheduler with kv_load_failure_policy='recompute'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "recompute" + return create_scheduler(vllm_config) + + +def test_sync_recompute_blocks_not_freed_for_running_requests( + recompute_scheduler: Scheduler, +): + """ + Test sync recompute case - blocks must not be freed for running requests. 
+ + When a running request has invalid blocks and kv_load_failure_policy is 'recompute': + 1. Request should remain in RUNNING state + 2. num_computed_tokens should be truncated to invalid block boundary + 3. Blocks should NOT be freed (request still needs them for recomputation) + 4. Request should remain in scheduler.requests and scheduler.running + """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * recompute_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + recompute_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + recompute_scheduler.connector = Mock() + recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + recompute_scheduler.connector.request_finished.return_value = (False, None) + recompute_scheduler.connector.take_events.return_value = () + + scheduler_output = recompute_scheduler.schedule() + + # request should be running with sync KV load + assert len(recompute_scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + assert request.status == RequestStatus.RUNNING + + # get the allocated block IDs before invalid blocks are reported + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_ids = {req_block_ids[invalid_block_idx]} + + # store original num_computed_tokens for comparison + original_num_computed_tokens = request.num_computed_tokens + + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=False, # not finished - should continue running + ) + + outputs = recompute_scheduler.update_from_output( + scheduler_output, 
model_runner_output + ) + + # critical assertions for recompute case: + + # 1. request should still be RUNNING (not finished, not aborted) + assert request.status == RequestStatus.RUNNING, ( + f"Request should remain RUNNING for recompute, got {request.status}" + ) + + # 2. num_computed_tokens should be truncated to first invalid block + expected_truncated_tokens = invalid_block_idx * recompute_scheduler.block_size + assert request.num_computed_tokens == expected_truncated_tokens, ( + f"num_computed_tokens should be truncated to {expected_truncated_tokens}, " + f"got {request.num_computed_tokens}" + ) + assert request.num_computed_tokens < original_num_computed_tokens, ( + "num_computed_tokens should be reduced after invalid block detection" + ) + + # 3. no output should be generated (request is still running) + # the request should be skipped in the output loop + assert len(outputs) == 0 or request.request_id not in [ + out.request_id for outs in outputs.values() for out in outs.outputs + ], "No output should be generated for recompute requests" + + # 4. request should still be in running queue + assert request in recompute_scheduler.running, ( + "Request should remain in running queue for recomputation" + ) + + # 5. request should still be in scheduler.requests (not deleted) + assert request.request_id in recompute_scheduler.requests, ( + "Request should not be deleted from scheduler.requests" + ) + + # 6. blocks should NOT be freed - verify blocks are still allocated + try: + allocated_blocks = recompute_scheduler.kv_cache_manager.get_block_ids( + request.request_id + ) + assert allocated_blocks is not None + assert len(allocated_blocks[0]) > 0, ( + "Blocks should still be allocated for recomputation" + ) + except KeyError: + pytest.fail( + "Blocks were freed incorrectly! Running requests need their blocks " + "to recompute invalid portions." + ) + + # 7. 
verify request can be rescheduled in next step + scheduler_output_2 = recompute_scheduler.schedule() + + # request should appear in the new schedule to recompute invalid blocks + scheduled_req_ids = [ + req.request_id for req in scheduler_output_2.scheduled_new_reqs + ] + if scheduler_output_2.num_scheduled_tokens: + scheduled_req_ids.extend(scheduler_output_2.num_scheduled_tokens.keys()) + + assert ( + request.request_id in scheduled_req_ids or len(recompute_scheduler.running) > 0 + ), "Request should be reschedulable for recomputation" + + +def test_sync_fail_invalid_blocks_evicted(fail_scheduler: Scheduler): + """ + Test sync fail case - invalid blocks must be evicted from cache. + + When a request fails with policy='fail' and has invalid blocks from sync loading: + 1. Request should be finished with FINISHED_ERROR + 2. Invalid blocks should be evicted from the KV cache + 3. Valid blocks (if shared) should remain in cache + 4. Future requests should not reuse the invalid blocks + + This test verifies that invalid blocks are properly evicted to prevent + cache corruption and reuse of invalid data. 
+ """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + # request should be running with sync KV load + assert len(fail_scheduler.running) == 1 + assert request.status == RequestStatus.RUNNING + + # get allocated block IDs + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # verify the block is in the block pool before we report it as invalid + block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + assert block is not None + + # report invalid blocks - request should fail + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify request is finished with error + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + # verify output is generated + assert len(outputs) == 1 + engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert 
output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + # verify the request was removed from scheduler + assert request.request_id not in fail_scheduler.requests + assert len(fail_scheduler.running) == 0 + + # critical: verify invalid block was actually freed from cache + # this is the key assertion - the invalid block should no longer be + # tracked by the KV cache manager for this request + # if it's still there, a future request could reuse the invalid data + try: + block_ids = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id) + # if we get here, check if blocks were actually freed + if block_ids is not None and len(block_ids[0]) > 0: + pytest.fail( + f"Invalid blocks still tracked for finished request! " + f"Request {request.request_id} should have been freed but " + f"still has {len(block_ids[0])} blocks allocated." + ) + # blocks list exists but is empty - this is fine, they were freed + except KeyError: + # expected - request completely removed from tracking + pass + + # critical: verify invalid block was evicted from prefix cache + # the block should no longer have a hash (hash is reset on eviction) + assert block.block_hash is None, ( + f"Invalid block {invalid_block_id} should have been evicted from cache " + f"(hash should be None), but hash is still {block.block_hash}" + ) + + +def test_async_recompute_blocks_not_cached_when_invalid( + recompute_scheduler: Scheduler, +): + """ + Test async recompute case - invalid blocks not cached after transfer. + + When async KV loading has invalid blocks and kv_load_failure_policy='recompute': + 1. Blocks are allocated but not cached yet + 2. When async transfer completes, only valid blocks should be cached + 3. 
Invalid blocks should never enter the prefix cache + + This test verifies correctness: the failed_recving_kv_req_ids protection + ensures only valid blocks are cached when the transfer completes, and we + only evict blocks from cache that are already hashed in the block table. + """ + from unittest.mock import patch + + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * recompute_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + recompute_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating async load + recompute_scheduler.connector = Mock() + recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, True) + ) + recompute_scheduler.connector.request_finished.return_value = (False, None) + recompute_scheduler.connector.take_events.return_value = () + + scheduler_output = recompute_scheduler.schedule() + + # request should be waiting for remote KVs + assert len(recompute_scheduler.waiting) == 1 + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.num_computed_tokens == 0 + + # get the allocated block IDs + (req_block_ids,) = recompute_scheduler.kv_cache_manager.get_block_ids( + request.request_id + ) + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # get the block object to verify it's not cached yet and stays uncached + block = recompute_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + + # verify block has no hash before invalid blocks are reported + assert block.block_hash is None, ( + "Async loading blocks should not be cached yet (no hash)" + ) + + # report invalid blocks 
(transfer not finished yet) + model_runner_output = create_model_runner_output( + reqs=[], + finished_recving=None, # transfer NOT finished + invalid_block_ids=invalid_block_ids, + use_eos=False, + ) + + # critical: spy on evict_blocks to verify it's NOT called for async blocks + original_evict_blocks = recompute_scheduler.kv_cache_manager.evict_blocks + evict_blocks_calls = [] + + def evict_blocks_spy(block_ids): + evict_blocks_calls.append(set(block_ids)) + return original_evict_blocks(block_ids) + + with patch.object( + recompute_scheduler.kv_cache_manager, "evict_blocks", evict_blocks_spy + ): + recompute_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify evict_blocks was NOT called (async blocks excluded from eviction) + assert len(evict_blocks_calls) == 0, ( + f"evict_blocks should not be called for async-only invalid blocks, " + f"but was called {len(evict_blocks_calls)} time(s) with {evict_blocks_calls}" + ) + + # request should still be waiting (not finished with error due to recompute policy) + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids + + # verify num_computed_tokens was truncated to before invalid block + expected_valid_tokens = invalid_block_idx * recompute_scheduler.block_size + assert request.num_computed_tokens == expected_valid_tokens + + # verify invalid block still has no hash (was not evicted) + assert block.block_hash is None, ( + f"Async loading blocks shouldn't be cached or evicted. 
" + f"Block {invalid_block_id} hash should be None but is {block.block_hash}" + ) + + # now simulate async transfer completing + model_runner_output_2 = create_model_runner_output( + reqs=[], + finished_recving={request.request_id}, + invalid_block_ids=None, + use_eos=False, + ) + + recompute_scheduler.update_from_output(scheduler_output, model_runner_output_2) + + # verify request is now marked as finished receiving and ready to be processed + assert request.request_id in recompute_scheduler.finished_recving_kv_req_ids + assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids + + # critical: verify invalid block still has no hash before recompute + # the async transfer invalid data was never cached + assert block.block_hash is None, ( + f"Invalid block {invalid_block_id} should not be cached before recompute " + f"(hash should be None), but hash is {block.block_hash}" + ) + + # critical end-to-end test: spy on cache_blocks to verify it's called with + # the truncated num_computed_tokens value + original_cache_blocks = recompute_scheduler.kv_cache_manager.cache_blocks + cache_blocks_calls = [] + + def cache_blocks_spy(req, num_tokens): + cache_blocks_calls.append((req.request_id, num_tokens)) + return original_cache_blocks(req, num_tokens) + + with patch.object( + recompute_scheduler.kv_cache_manager, "cache_blocks", cache_blocks_spy + ): + # call schedule() again - this triggers _update_waiting_for_remote_kv() + # which should call cache_blocks with the truncated value + recompute_scheduler.schedule() + + # verify cache_blocks was called with the truncated value + assert len(cache_blocks_calls) == 1, ( + f"cache_blocks should be called exactly once, " + f"got {len(cache_blocks_calls)} calls" + ) + cached_req_id, cached_num_tokens = cache_blocks_calls[0] + assert cached_req_id == request.request_id + assert cached_num_tokens == expected_valid_tokens, ( + f"cache_blocks should be called with truncated value {expected_valid_tokens}, " + f"but was 
called with {cached_num_tokens}" + ) + + # request should now be RUNNING (scheduled immediately after transfer completes) + # the flow is: WAITING_FOR_REMOTE_KVS -> WAITING -> RUNNING in same schedule() call + assert request.status == RequestStatus.RUNNING + + # num_computed_tokens should be >= expected_valid_tokens because the scheduler + # will schedule additional new tokens (up to max_num_batched_tokens) for the request + assert request.num_computed_tokens >= expected_valid_tokens, ( + f"num_computed_tokens should be at least {expected_valid_tokens}, " + f"got {request.num_computed_tokens}" + ) + + # request should no longer be in the failed/finished receiving sets + assert request.request_id not in recompute_scheduler.failed_recving_kv_req_ids + assert request.request_id not in recompute_scheduler.finished_recving_kv_req_ids + + # request should be in the running queue + assert request in recompute_scheduler.running diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py index 88f8b91c292bb..98cea821c678e 100644 --- a/vllm/config/kv_transfer.py +++ b/vllm/config/kv_transfer.py @@ -64,6 +64,11 @@ class KVTransferConfig: enable_permute_local_kv: bool = False """Experiment feature flag to enable HND to NHD KV Transfer""" + kv_load_failure_policy: Literal["recompute", "fail"] = "recompute" + """Policy for handling KV cache load failures. 
+ 'recompute': reschedule the request to recompute failed blocks (default) + 'fail': immediately fail the request with an error finish reason""" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index c6333d170c663..2560a5b2cdf41 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -51,7 +51,11 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, + clamp_prompt_logprobs, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall @@ -380,6 +384,8 @@ class OpenAIServingChat(OpenAIServing): tokenizer, request_metadata, ) + except GenerationError as e: + return self._convert_generation_error_to_response(e) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -1120,6 +1126,10 @@ class OpenAIServingChat(OpenAIServing): # if the model is finished generating else: + # check for error finish reason and abort streaming + # finish_reason='error' indicates a retryable error + self._raise_if_error(output.finish_reason, request_id) + # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing @@ -1287,6 +1297,8 @@ class OpenAIServingChat(OpenAIServing): delta=False, ) + except GenerationError as e: + yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") @@ 
-1327,6 +1339,9 @@ class OpenAIServingChat(OpenAIServing): role = self.get_chat_request_role(request) for output in final_res.outputs: + # check for error finish reason and raise GenerationError + # finish_reason='error' indicates a retryable request-level internal error + self._raise_if_error(output.finish_reason, request_id) token_ids = output.token_ids out_logprobs = output.logprobs tool_call_info = None diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 3e421e21e3e80..1be0afc8c74e5 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -24,7 +24,11 @@ from vllm.entrypoints.openai.protocol import ( RequestResponseMetadata, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, + clamp_prompt_logprobs, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.utils import get_max_tokens, should_include_usage @@ -300,6 +304,8 @@ class OpenAIServingCompletion(OpenAIServing): ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") + except GenerationError as e: + return self._convert_generation_error_to_response(e) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -437,6 +443,8 @@ class OpenAIServingCompletion(OpenAIServing): finish_reason = output.finish_reason stop_reason = output.stop_reason + self._raise_if_error(finish_reason, request_id) + chunk = CompletionStreamResponse( id=request_id, created=created_time, @@ -498,8 +506,11 @@ class OpenAIServingCompletion(OpenAIServing): # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info + except GenerationError as e: + 
yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: # TODO: Use a vllm-specific Validation Error + logger.exception("Error in completion stream generator.") data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" yield "data: [DONE]\n\n" @@ -530,6 +541,8 @@ class OpenAIServingCompletion(OpenAIServing): out_logprobs: GenericSequence[dict[int, Logprob] | None] | None for output in final_res.outputs: + self._raise_if_error(output.finish_reason, request_id) + assert request.max_tokens is not None if request.echo: if request.return_token_ids: diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 44b0f1842a6c1..a799432baeb40 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -133,6 +133,15 @@ from vllm.utils.async_utils import ( from vllm.utils.collection_utils import is_list_of from vllm.v1.engine import EngineCoreRequest + +class GenerationError(Exception): + """raised when finish_reason indicates internal server error (500)""" + + def __init__(self, message: str = "Internal server error"): + super().__init__(message) + self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR + + logger = init_logger(__name__) CompletionLikeRequest: TypeAlias = ( @@ -456,6 +465,29 @@ class OpenAIServing: # Iterate through all beam inference results for i, result in enumerate(output): current_beam = all_beams[i] + + # check for error finish reason and abort beam search + if result.outputs[0].finish_reason == "error": + # yield error output and terminate beam search + yield RequestOutput( + request_id=request_id, + prompt=prompt_text, + outputs=[ + CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + ], + finished=True, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, + ) + return + if result.outputs[0].logprobs is not None: 
logprobs = result.outputs[0].logprobs[0] all_beams_token_id.extend(list(logprobs.keys())) @@ -780,6 +812,35 @@ class OpenAIServing: ) return json_str + def _raise_if_error(self, finish_reason: str | None, request_id: str) -> None: + """Raise GenerationError if finish_reason indicates an error.""" + if finish_reason == "error": + logger.error( + "Request %s failed with an internal error during generation", + request_id, + ) + raise GenerationError("Internal server error") + + def _convert_generation_error_to_response( + self, e: GenerationError + ) -> ErrorResponse: + """Convert GenerationError to ErrorResponse.""" + return self.create_error_response( + str(e), + err_type="InternalServerError", + status_code=e.status_code, + ) + + def _convert_generation_error_to_streaming_response( + self, e: GenerationError + ) -> str: + """Convert GenerationError to streaming error response.""" + return self.create_streaming_error_response( + str(e), + err_type="InternalServerError", + status_code=e.status_code, + ) + async def _check_model( self, request: AnyRequest, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 91616a78e11dc..60d14337dcaaf 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -50,6 +50,7 @@ from openai.types.responses.response_reasoning_item import ( ) from openai.types.responses.tool import Mcp, Tool from openai_harmony import Message as OpenAIHarmonyMessage +from pydantic import TypeAdapter from vllm import envs from vllm.engine.protocol import EngineClient @@ -94,7 +95,10 @@ from vllm.entrypoints.openai.protocol import ( ResponseUsage, StreamingResponsesResponse, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.responses_utils import ( 
construct_input_messages, @@ -541,6 +545,8 @@ class OpenAIServingResponses(OpenAIServing): tokenizer, request_metadata, ) + except GenerationError as e: + return self._convert_generation_error_to_response(e) except Exception as e: return self.create_error_response(str(e)) @@ -648,6 +654,8 @@ class OpenAIServingResponses(OpenAIServing): status = "incomplete" elif context.finish_reason == "abort": status = "cancelled" + else: + self._raise_if_error(context.finish_reason, request.request_id) else: status = "incomplete" elif isinstance(context, ParsableContext): @@ -673,6 +681,9 @@ class OpenAIServingResponses(OpenAIServing): assert len(final_res.outputs) == 1 final_output = final_res.outputs[0] + # finish_reason='error' indicates retryable internal error + self._raise_if_error(final_output.finish_reason, request.request_id) + output = self._make_response_output_items(request, final_output, tokenizer) if request.enable_response_messages: @@ -1066,6 +1077,8 @@ class OpenAIServingResponses(OpenAIServing): async for event in generator: event_deque.append(event) new_event_signal.set() # Signal new event available + except GenerationError as e: + response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) response = self.create_error_response(str(e)) @@ -1089,6 +1102,8 @@ class OpenAIServingResponses(OpenAIServing): ): try: response = await self.responses_full_generator(request, *args, **kwargs) + except GenerationError as e: + response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) response = self.create_error_response(str(e)) @@ -1227,6 +1242,8 @@ class OpenAIServingResponses(OpenAIServing): continue if ctx.last_output.outputs: output = ctx.last_output.outputs[0] + # finish_reason='error' indicates a retryable error + self._raise_if_error(output.finish_reason, request.request_id) if 
reasoning_parser: delta_message = reasoning_parser.extract_reasoning_streaming( previous_text=previous_text, @@ -1522,6 +1539,9 @@ class OpenAIServingResponses(OpenAIServing): async for ctx in result_generator: assert isinstance(ctx, StreamingHarmonyContext) + # finish_reason='error' indicates a retryable error + self._raise_if_error(ctx.finish_reason, request.request_id) + if ctx.is_expecting_start(): current_output_index += 1 sent_output_item_added = False @@ -2016,18 +2036,25 @@ class OpenAIServingResponses(OpenAIServing): ) ) - async for event_data in processer( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - created_time, - _increment_sequence_number_and_return, - ): - yield event_data + try: + async for event_data in processer( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + created_time, + _increment_sequence_number_and_return, + ): + yield event_data + except GenerationError as e: + error_json = self._convert_generation_error_to_streaming_response(e) + yield _increment_sequence_number_and_return( + TypeAdapter(StreamingResponsesResponse).validate_json(error_json) + ) + return async def empty_async_generator(): # A hack to trick Python to think this is a generator but diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index cfb2c02e00f1b..c779e3d34b3ed 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -397,6 +397,25 @@ class BlockPool: [block for block in blocks_list if block.ref_cnt == 0 and not block.is_null] ) + def evict_blocks(self, block_ids: set[int]) -> None: + """evict blocks from the prefix cache by their block IDs. + + only evicts blocks that are currently cached (have a hash). blocks + with ref_cnt > 0 are not freed from the block pool, only evicted + from the prefix cache hash table. + + Args: + block_ids: Set of block IDs to evict from cache. 
+ """ + for block_id in block_ids: + assert block_id < len(self.blocks), ( + f"Invalid block_id {block_id} >= {len(self.blocks)}. " + f"This indicates a bug in the KV connector - workers should " + f"only report block IDs that were allocated by the scheduler." + ) + block = self.blocks[block_id] + self._maybe_evict_cached_block(block) + def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF flows to invalid prefix caching after the weights are updated, diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 33e8c81514c5f..13086a66f6ea6 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -333,6 +333,14 @@ class KVCacheManager: """ self.coordinator.free(request.request_id) + def evict_blocks(self, block_ids: set[int]) -> None: + """evict blocks from the prefix cache by their block IDs. + + Args: + block_ids: Set of block IDs to evict from cache. + """ + self.block_pool.evict_blocks(block_ids) + def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF flows to invalidate prefix caching after the weights are updated, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d858e840039c4..c3d504f2e72c3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -106,6 +106,7 @@ class Scheduler(SchedulerInterface): # KV Connector pushes/pull of remote KVs for P/D and offloading. 
self.connector = None self.connector_prefix_cache_stats: PrefixCacheStats | None = None + self.recompute_kv_load_failures = True if self.vllm_config.kv_transfer_config is not None: assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" @@ -117,6 +118,10 @@ class Scheduler(SchedulerInterface): ) if self.log_stats: self.connector_prefix_cache_stats = PrefixCacheStats() + kv_load_failure_policy = ( + self.vllm_config.kv_transfer_config.kv_load_failure_policy + ) + self.recompute_kv_load_failures = kv_load_failure_policy == "recompute" self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, @@ -1066,7 +1071,7 @@ class Scheduler(SchedulerInterface): for req_id, num_tokens_scheduled in num_scheduled_tokens.items(): assert num_tokens_scheduled > 0 if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids: - # Skip requests that were recovered from KV load failure + # skip failed or rescheduled requests from KV load failure continue request = self.requests.get(req_id) if request is None: @@ -1177,6 +1182,21 @@ class Scheduler(SchedulerInterface): # This is a rare case and unlikely to impact performance. self.waiting.remove_requests(stopped_preempted_reqs) + if failed_kv_load_req_ids and not self.recompute_kv_load_failures: + requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids] + self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR) + for request in requests: + outputs[request.client_index].append( + EngineCoreOutput( + request_id=request.request_id, + new_token_ids=[], + finish_reason=request.get_finished_reason(), + events=request.take_events(), + trace_headers=request.trace_headers, + num_cached_tokens=request.num_cached_tokens, + ) + ) + # KV Connector: update state for finished KV Transfers. 
if kv_connector_output: self._update_from_kv_xfer_finished(kv_connector_output) @@ -1610,8 +1630,11 @@ class Scheduler(SchedulerInterface): self._free_blocks(self.requests[req_id]) def _update_requests_with_invalid_blocks( - self, requests: Iterable[Request], invalid_block_ids: set[int] - ) -> tuple[set[str], int]: + self, + requests: Iterable[Request], + invalid_block_ids: set[int], + evict_blocks: bool = True, + ) -> tuple[set[str], int, set[int]]: """ Identify and update requests affected by invalid KV cache blocks. @@ -1623,16 +1646,21 @@ class Scheduler(SchedulerInterface): Args: requests: The set of requests to scan for invalid blocks. invalid_block_ids: IDs of invalid blocks. + evict_blocks: Whether to collect blocks for eviction (False for + async requests which aren't cached yet). Returns: tuple: - affected_req_ids (set[str]): IDs of requests impacted by invalid blocks. - total_affected_tokens (int): Total number of tokens that must - be recomputed across all affected requests (for observability). + be recomputed across all affected requests. + - blocks_to_evict (set[int]): Block IDs to evict from cache, + including invalid blocks and downstream dependent blocks. """ affected_req_ids: set[str] = set() total_affected_tokens = 0 + blocks_to_evict: set[int] = set() # If a block is invalid and shared by multiple requests in the batch, # these requests must be rescheduled, but only the first will recompute # it. This set tracks blocks already marked for recomputation. 
@@ -1690,6 +1718,9 @@ class Scheduler(SchedulerInterface): ) total_affected_tokens += num_affected_tokens request.num_external_computed_tokens -= num_affected_tokens + # collect invalid block and all downstream dependent blocks + if evict_blocks: + blocks_to_evict.update(req_block_ids[idx:]) if is_affected: if not marked_invalid_block: @@ -1705,47 +1736,70 @@ class Scheduler(SchedulerInterface): affected_req_ids.add(request.request_id) - return affected_req_ids, total_affected_tokens + return affected_req_ids, total_affected_tokens, blocks_to_evict def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]: - total_requests_to_reschedule = 0 - total_tokens_to_reschedule = 0 + """ + Handle requests affected by invalid KV cache blocks. - # --- Handle async KV loads (WAITING_FOR_REMOTE_KVS) --- + Returns: + Set of affected request IDs to skip in update_from_output main loop. + """ + should_fail = not self.recompute_kv_load_failures + + # handle async KV loads (not cached yet, evict_blocks=False) async_load_reqs = ( req for req in self.waiting if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS ) - async_affected_req_ids, num_tokens_to_reschedule = ( + async_failed_req_ids, num_failed_tokens, _ = ( self._update_requests_with_invalid_blocks( - async_load_reqs, invalid_block_ids + async_load_reqs, invalid_block_ids, evict_blocks=False ) ) - total_requests_to_reschedule += len(async_affected_req_ids) - total_tokens_to_reschedule += num_tokens_to_reschedule + total_failed_requests = len(async_failed_req_ids) + total_failed_tokens = num_failed_tokens - # Mark requests with async KV load failures; they will be rescheduled - # once loading completes. 
- self.failed_recving_kv_req_ids |= async_affected_req_ids - - # --- Handle sync KV loads (running requests) --- - sync_affected_req_ids, num_tokens_to_reschedule = ( - self._update_requests_with_invalid_blocks(self.running, invalid_block_ids) + # handle sync loads (may be cached, collect blocks for eviction) + sync_failed_req_ids, num_failed_tokens, sync_blocks_to_evict = ( + self._update_requests_with_invalid_blocks( + self.running, invalid_block_ids, evict_blocks=True + ) ) - total_requests_to_reschedule += len(sync_affected_req_ids) - total_tokens_to_reschedule += num_tokens_to_reschedule + total_failed_requests += len(sync_failed_req_ids) + total_failed_tokens += num_failed_tokens - if total_requests_to_reschedule: - logger.warning( - "Recovered from KV load failure: " - "%d request(s) rescheduled (%d tokens affected).", - total_requests_to_reschedule, - total_tokens_to_reschedule, + if not total_failed_requests: + return set() + + # evict invalid blocks and downstream dependent blocks from cache + # only when not using recompute policy (where blocks will be recomputed + # and reused by other requests sharing them) + if sync_blocks_to_evict and not self.recompute_kv_load_failures: + self.kv_cache_manager.evict_blocks(sync_blocks_to_evict) + + if should_fail: + all_failed_req_ids = async_failed_req_ids | sync_failed_req_ids + logger.error( + "Failing %d request(s) due to KV load failure " + "(failure_policy=fail, %d tokens affected). Request IDs: %s", + total_failed_requests, + total_failed_tokens, + all_failed_req_ids, ) + return all_failed_req_ids - # Return the IDs of affected running requests to skip in - # update_from_output. 
- return sync_affected_req_ids + logger.warning( + "Recovered from KV load failure: " + "%d request(s) rescheduled (%d tokens affected).", + total_failed_requests, + total_failed_tokens, + ) + + # Mark async requests with KV load failures for retry once loading completes + self.failed_recving_kv_req_ids |= async_failed_req_ids + # Return sync affected IDs to skip in update_from_output + return sync_failed_req_ids diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index ce2aae77108da..4f54d12f4b8d0 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -19,24 +19,27 @@ from vllm.v1.serial_utils import UtilityResult # These are possible values of RequestOutput.finish_reason, # so form part of the external API. -FINISH_REASON_STRINGS = ("stop", "length", "abort") +FINISH_REASON_STRINGS = ("stop", "length", "abort", "error") class FinishReason(enum.IntEnum): """ - Reason a request finished - stop, length, or abort. + Reason a request finished - stop, length, abort, or error. Int rather than Str for more compact serialization. stop - a stop string was emitted length - max_tokens was consumed, or max_model_len was reached - abort - aborted for another reason + abort - aborted by client + error - retryable request-level internal error (e.g., KV load failure). + Invariant: always converted to 500 Internal Server Error. 
""" STOP = 0 LENGTH = 1 ABORT = 2 + ERROR = 3 def __str__(self): return FINISH_REASON_STRINGS[self.value] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 33762fe34e64f..a775e840e841c 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum): FINISHED_LENGTH_CAPPED = enum.auto() FINISHED_ABORTED = enum.auto() FINISHED_IGNORED = enum.auto() + FINISHED_ERROR = enum.auto() def __str__(self): return self.name @@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = { RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH, RequestStatus.FINISHED_ABORTED: FinishReason.ABORT, RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH, + RequestStatus.FINISHED_ERROR: FinishReason.ERROR, } From e72d65b959f759fcf56b329ecaaee7d166c012d2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 03:10:58 +0800 Subject: [PATCH 02/16] {Deprecation] Remove tokenizer setter (#30400) Signed-off-by: DarkLight1337 --- vllm/entrypoints/llm.py | 13 +------------ vllm/v1/engine/async_llm.py | 4 ---- vllm/v1/engine/input_processor.py | 4 ---- vllm/v1/engine/llm_engine.py | 4 ---- 4 files changed, 1 insertion(+), 24 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5d5c4a1cdb77b..3fce3338503ef 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,7 +9,7 @@ import cloudpickle import torch.nn as nn from pydantic import ValidationError from tqdm.auto import tqdm -from typing_extensions import TypeVar, deprecated +from typing_extensions import TypeVar from vllm.beam_search import ( BeamSearchInstance, @@ -73,7 +73,6 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams from vllm.tasks import PoolingTask from vllm.tokenizers import MistralTokenizer, TokenizerLike -from vllm.tokenizers.hf import get_cached_tokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils.collection_utils import as_iter, 
is_list_of from vllm.utils.counter import Counter @@ -367,16 +366,6 @@ class LLM: def get_tokenizer(self) -> TokenizerLike: return self.llm_engine.get_tokenizer() - @deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.") - def set_tokenizer(self, tokenizer: TokenizerLike) -> None: - # While CachedTokenizer is dynamic, have no choice but - # compare class name. Misjudgment will arise from - # user-defined tokenizer started with 'Cached' - if tokenizer.__class__.__name__.startswith("Cached"): - self.llm_engine.tokenizer = tokenizer - else: - self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer) - def reset_mm_cache(self) -> None: self.input_processor.clear_mm_cache() self.llm_engine.reset_mm_cache() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fa3fb7a18895a..8eff61563ccea 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -701,10 +701,6 @@ class AsyncLLM(EngineClient): def tokenizer(self) -> TokenizerLike | None: return self.input_processor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_processor.tokenizer = tokenizer - async def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index e6a94f4e3de5d..a3c18464d3f52 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -64,10 +64,6 @@ class InputProcessor: def tokenizer(self) -> TokenizerLike | None: return self.input_preprocessor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_preprocessor.tokenizer = tokenizer - def _validate_logprobs( self, params: SamplingParams, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1cb206c4e004c..4422eced82fea 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -358,10 +358,6 @@ 
class LLMEngine: def tokenizer(self) -> TokenizerLike | None: return self.input_processor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_processor.tokenizer = tokenizer - def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( From 9f042ba26b59e1bfc9bef031165033fa931f3457 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 10 Dec 2025 11:13:01 -0800 Subject: [PATCH 03/16] [Perf] Enable environment cache in EngineCore to enable the feature for UniProcExecutor as well (#29289) Signed-off-by: Jialin Ouyang --- tests/test_envs.py | 38 ++++++++++++++++++++++++++++++ vllm/distributed/parallel_state.py | 2 ++ vllm/envs.py | 20 ++++++++++++++++ vllm/v1/engine/core.py | 7 +++--- 4 files changed, 63 insertions(+), 4 deletions(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index 11bbec38202bf..b6b7cf38d4abc 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -8,6 +8,7 @@ import pytest import vllm.envs as envs from vllm.envs import ( + disable_envs_cache, enable_envs_cache, env_list_with_choices, env_set_with_choices, @@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch): envs.__getattr__ = envs.__getattr__.__wrapped__ +def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1") + # __getattr__ is not decorated with functools.cache + assert not hasattr(envs.__getattr__, "cache_info") + + # Enable envs cache and ignore ongoing environment changes + enable_envs_cache() + assert envs.VLLM_HOST_IP == "1.1.1.1" + # With cache enabled, the environment variable value is cached and unchanged + monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2") + assert envs.VLLM_HOST_IP == "1.1.1.1" + + disable_envs_cache() + assert envs.VLLM_HOST_IP == "2.2.2.2" + # After cache disabled, the environment variable value would be synced + # with os.environ + monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3") + assert 
envs.VLLM_HOST_IP == "3.3.3.3" + + +def test_is_envs_cache_enabled() -> None: + assert not envs._is_envs_cache_enabled() + enable_envs_cache() + assert envs._is_envs_cache_enabled() + + # Only wrap one-layer of cache, so we only need to + # call disable once to reset. + enable_envs_cache() + enable_envs_cache() + enable_envs_cache() + disable_envs_cache() + assert not envs._is_envs_cache_enabled() + + disable_envs_cache() + assert not envs._is_envs_cache_enabled() + + class TestEnvWithChoices: """Test cases for env_with_choices function.""" diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f910f10407d44..338cb1f1814b5 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1586,6 +1586,8 @@ def destroy_distributed_environment(): def cleanup_dist_env_and_memory(shutdown_ray: bool = False): + # Reset environment variable cache + envs.disable_envs_cache() # Ensure all objects are not frozen before cleanup gc.unfreeze() diff --git a/vllm/envs.py b/vllm/envs.py index 8246109eb73af..230f2cf3450a9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1580,6 +1580,12 @@ def __getattr__(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +def _is_envs_cache_enabled() -> bool: + """Checked if __getattr__ is wrapped with functools.cache""" + global __getattr__ + return hasattr(__getattr__, "cache_clear") + + def enable_envs_cache() -> None: """ Enables caching of environment variables. This is useful for performance @@ -1590,6 +1596,9 @@ def enable_envs_cache() -> None: runtime overhead. This also means that environment variables should NOT be updated after the service is initialized. 
""" + if _is_envs_cache_enabled(): + # Avoid wrapping functools.cache multiple times + return # Tag __getattr__ with functools.cache global __getattr__ __getattr__ = functools.cache(__getattr__) @@ -1599,6 +1608,17 @@ def enable_envs_cache() -> None: __getattr__(key) +def disable_envs_cache() -> None: + """ + Resets the environment variables cache. It could be used to isolate environments + between unit tests. + """ + global __getattr__ + # If __getattr__ is wrapped by functions.cache, unwrap the caching layer. + if _is_envs_cache_enabled(): + __getattr__ = __getattr__.__wrapped__ + + def __dir__(): return list(environment_variables.keys()) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 3d3a1e138ddef..0045b8c1dd3e7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -211,6 +211,9 @@ class EngineCore: freeze_gc_heap() # If enable, attach GC debugger after static variable freeze. maybe_attach_gc_debug_callback() + # Enable environment variable cache (e.g. assume no more + # environment variable overrides after this point) + enable_envs_cache() def _initialize_kv_caches( self, vllm_config: VllmConfig @@ -672,10 +675,6 @@ class EngineCoreProc(EngineCore): assert addresses.coordinator_input is not None logger.info("Waiting for READY message from DP Coordinator...") - # Enable environment variable cache (e.g. assume no more - # environment variable overrides after this point) - enable_envs_cache() - @contextmanager def _perform_handshakes( self, From eea41804a4b4f84a80f63375ce2e77668d70bda5 Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Thu, 11 Dec 2025 03:18:51 +0800 Subject: [PATCH 04/16] [bug] Fix "Current vLLM config is not set." 
warnings when FlashInfer attention is used (#30241) Signed-off-by: Po-Han Huang --- vllm/utils/flashinfer.py | 5 ++++- vllm/v1/attention/backends/flashinfer.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 7aaf690cbaa13..9a66049350cd8 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool: def force_use_trtllm_attention() -> bool | None: """ + This function should only be called during initialization stage when vllm config + is set. Return `None` if --attention-config.use_trtllm_attention is not set, return `True` if TRTLLM attention is forced to be used, return `False` if TRTLLM attention is forced to be not used. @@ -296,11 +298,12 @@ def use_trtllm_attention( kv_cache_dtype: str, q_dtype: torch.dtype, is_prefill: bool, + # None means auto-detection, True means force on, False means force off + force_use_trtllm: bool | None = None, has_sinks: bool = False, has_spec: bool = False, ) -> bool: """Return `True` if TRTLLM attention is used.""" - force_use_trtllm = force_use_trtllm_attention() # CLI argument is set to 0 - respect it if force_use_trtllm is not None and not force_use_trtllm: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 8e9d764e4a123..4174b80ee312e 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.cache_config = vllm_config.cache_config self.model_config = vllm_config.model_config + self.attention_config = vllm_config.attention_config self._workspace_buffer = None self._prefill_wrapper: ( BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None @@ -779,6 +780,7 @@ class 
FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.cache_dtype, self.q_data_type, is_prefill=True, + force_use_trtllm=self.attention_config.use_trtllm_attention, has_sinks=self.has_sinks, has_spec=uses_spec_reorder, ) From 6ccb7baeb1a124ad9b6e87fe9bbd48ae40830869 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 10 Dec 2025 11:52:01 -0800 Subject: [PATCH 05/16] [LMCache] Fix breakage due to new LMCache version (#30216) Signed-off-by: Nick Hill --- requirements/kv_connectors.txt | 2 +- .../kv_connector/v1/lmcache_integration/vllm_v1_adapter.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt index 083230c171096..f60a01a55d07c 100644 --- a/requirements/kv_connectors.txt +++ b/requirements/kv_connectors.txt @@ -1,2 +1,2 @@ -lmcache +lmcache >= 0.3.10.post1 nixl >= 0.7.1 # Required for disaggregated prefill diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index 15ac5b049fce9..cdc2969a7735e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -27,7 +27,7 @@ from lmcache.v1.lookup_client.lmcache_async_lookup_client import ( LMCacheAsyncLookupServer, ) from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer -from lmcache.v1.plugin.plugin_launcher import PluginLauncher +from lmcache.v1.plugin.runtime_plugin_launcher import RuntimePluginLauncher from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig @@ -683,7 +683,7 @@ class LMCacheConnectorV1Impl: self.api_server = InternalAPIServer(self) self.api_server.start() # Launch plugins - self.plugin_launcher = PluginLauncher( + self.plugin_launcher = RuntimePluginLauncher( self.config, role, 
self.worker_count, From fcb894222f2b8a353072e1aea33b38f4403bbd7a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 10 Dec 2025 15:56:51 -0500 Subject: [PATCH 06/16] [Docs] Update EPLB docs (#30426) Signed-off-by: mgoin --- docs/serving/expert_parallel_deployment.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index ec07896592ba3..98f242ab8b892 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -40,10 +40,12 @@ EP_SIZE = TP_SIZE × DP_SIZE Where: -- `TP_SIZE`: Tensor parallel size (always 1 for now) +- `TP_SIZE`: Tensor parallel size - `DP_SIZE`: Data parallel size - `EP_SIZE`: Expert parallel size (computed automatically) +When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`. + ### Example Command The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section. @@ -119,9 +121,6 @@ While MoE models are typically trained so that each expert receives a similar nu Enable EPLB with the `--enable-eplb` flag. -!!! note "Model Support" - Currently only DeepSeek V3 architecture is supported. - When enabled, vLLM collects load statistics with every forward pass and periodically rebalances expert distribution. ### EPLB Parameters @@ -134,6 +133,8 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. 
T | `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 | | `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` | | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` | +| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` | +| `policy` | The policy type for expert parallel load balancing | `"default"` | For example: From b9e0951f964e1b8adfebb973c30462c0e0417c1f Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Wed, 10 Dec 2025 17:15:54 -0500 Subject: [PATCH 07/16] [docs] Improve wide-EP performance + benchmarking documentation (#27933) Signed-off-by: Seiji Eicher --- docs/serving/data_parallel_deployment.md | 14 ++++++++++- docs/serving/expert_parallel_deployment.md | 28 +++++++++++++++++++++- tools/ep_kernels/README.md | 4 ++-- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/docs/serving/data_parallel_deployment.md b/docs/serving/data_parallel_deployment.md index eff9c5d5e4efa..e5954917cd790 100644 --- a/docs/serving/data_parallel_deployment.md +++ b/docs/serving/data_parallel_deployment.md @@ -24,7 +24,7 @@ There are two distinct modes supported for online deployments - self-contained w vLLM supports "self-contained" data parallel deployments that expose a single API endpoint. -It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. +It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. 
When sizing DP deployments, remember that `--max-num-seqs` applies per DP rank. Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks. @@ -80,6 +80,18 @@ When deploying large DP sizes using this method, the API server process can beco ![DP Internal LB Diagram](../assets/deployment/dp_internal_lb.png) +## Hybrid Load Balancing + +Hybrid load balancing sits between the internal and external approaches. Each node runs its own API server(s) that only queue requests to the data-parallel engines colocated on that node. An upstream load balancer (for example, an ingress controller or traffic router) spreads user requests across those per-node endpoints. + +Enable this mode with `--data-parallel-hybrid-lb` while still launching every node with the global data-parallel size. The key differences from internal load balancing are: + +- You must provide `--data-parallel-size-local` and `--data-parallel-start-rank` so each node knows which ranks it owns. +- Not compatible with `--headless` since every node exposes an API endpoint. +- Scale `--api-server-count` per node based on the number of local ranks + +In this configuration, each node keeps scheduling decisions local, which reduces cross-node traffic and avoids single node bottlenecks at larger DP sizes. + ## External Load Balancing For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally. 
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index 98f242ab8b892..923020dc88c91 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -83,7 +83,7 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \ --data-parallel-size-local 8 \ # Local DP size on this node (8 GPUs per node) --data-parallel-address 192.168.1.100 \ # Replace with actual IP of Node 1 --data-parallel-rpc-port 13345 \ # RPC communication port, can be any port as long as reachable by all nodes - --api-server-count=8 # Number of API servers for load handling (scaling this out to total ranks are recommended) + --api-server-count=8 # Number of API servers for load handling (scaling this out to # local ranks is recommended) # Node 2 (Secondary - headless mode, no API server) vllm serve deepseek-ai/DeepSeek-V3-0324 \ @@ -184,6 +184,26 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \ For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available. +## Advanced Configuration + +### Performance Optimization + +- **DeepEP kernels**: The `high_throughput` and `low_latency` kernels are optimized for disaggregated serving and may show poor performance for mixed workloads +- **Dual Batch Overlap**: Use `--enable-dbo` to overlap all-to-all communication with compute. See [Dual Batch Overlap](../design/dbo.md) for more details. +- **Async scheduling (experimental)**: Try `--async-scheduling` to overlap scheduling with model execution. + +### Troubleshooting + +- **`non-zero status: 7 cannot register cq buf`**: When using Infiniband/RoCE, make sure host VM and pods show `ulimit -l` "unlimited". +- **`init failed for transport: IBGDA`**: The InfiniBand GDA kernel modules are missing. Run `tools/ep_kernels/configure_system_drivers.sh` on each GPU node and reboot. 
Also fixes error `NVSHMEM API called before NVSHMEM initialization has completed`. +- **NVSHMEM peer disconnect**: Usually a networking misconfiguration. If deploying via Kubernetes, verify that every pod runs with `hostNetwork: true`, `securityContext.privileged: true` to access Infiniband. + +### Benchmarking + +- Use simulator flags `VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random` and `VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1` so token routing is balanced across EP ranks. + +- Increasing `VLLM_MOE_DP_CHUNK_SIZE` may increase throughput by increasing the maximum batch size for inter-rank token transfers. This may cause DeepEP to throw `assert self.nvshmem_qp_depth >= (num_max_dispatch_tokens_per_rank + 1) * 2`, which can be fixed by increasing environment variable `NVSHMEM_QP_DEPTH`. + ## Disaggregated Serving (Prefill/Decode Split) For production deployments requiring strict SLA guarantees for time-to-first-token and inter-token latency, disaggregated serving allows independent scaling of prefill and decode operations. @@ -274,3 +294,9 @@ except Exception as e: print(f"❌ Error during disaggregated serving: {e}") print("Check that both prefill and decode instances are running and accessible") ``` + +### Benchmarking + +- To simulate the decode deployment of disaggregated serving, pass `--kv-transfer-config '{"kv_connector":"DecodeBenchConnector","kv_role":"kv_both"}'` to the `vllm serve` invocation. The connector populates KV cache with random values so decode can be profiled in isolation. + +- **CUDAGraph capture**: Use `--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'` to enable CUDA graph capture for decode only and save KV cache. diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md index 85e9d2a4f8129..ab0e358802bf8 100644 --- a/tools/ep_kernels/README.md +++ b/tools/ep_kernels/README.md @@ -7,7 +7,7 @@ Here we break down the requirements in 2 steps: 1. 
Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this. 2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine. -2 is necessary for multi-node deployment. +Step 2 is necessary for multi-node deployment. All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`. @@ -23,6 +23,6 @@ TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh Additional step for multi-node deployment: ```bash -sudo bash configure_system_drivers.sh +sudo bash configure_system_drivers.sh # update-initramfs can take several minutes sudo reboot # Reboot is required to load the new driver ``` From 166ac3c94d6ee845d4d8dc1a6dced4d9033fa4e3 Mon Sep 17 00:00:00 2001 From: Christina Norman Date: Wed, 10 Dec 2025 17:01:19 -0600 Subject: [PATCH 08/16] fix(shm): Add memory barriers for cross-process shared memory visibility (#30407) Signed-off-by: Christina Holland Signed-off-by: Christina --- .../device_communicators/shm_broadcast.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 114516ff07a1f..31c6084c9b507 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import pickle +import threading import time from contextlib import contextmanager from dataclasses import dataclass, field @@ -43,6 +44,33 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL from_bytes_big = functools.partial(int.from_bytes, byteorder="big") +# Memory 
fence for cross-process shared memory visibility. +# Required for correct producer-consumer synchronization when using +# shared memory without locks. +_memory_fence_lock = threading.Lock() + + +def memory_fence(): + """ + Full memory barrier for shared memory synchronization. + + Ensures all prior memory writes are visible to other processes before + any subsequent reads. This is critical for lock-free producer-consumer + patterns using shared memory. + + Implementation acquires and immediately releases a lock. Python's + threading.Lock provides sequentially consistent memory barrier semantics + across all major platforms (POSIX, Windows). This is a lightweight + operation (~20ns) that guarantees: + - All stores before the barrier are visible to other threads/processes + - All loads after the barrier see the latest values + """ + # Lock acquire/release provides full memory barrier semantics. + # Using context manager ensures lock release even on exceptions. + with _memory_fence_lock: + pass + + def to_bytes_big(value: int, size: int) -> bytes: return value.to_bytes(size, byteorder="big") @@ -414,6 +442,10 @@ class MessageQueue: n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + # Memory fence ensures we see the latest read flags from readers. + # Without this, we may read stale flags from our CPU cache and + # spin indefinitely even though readers have completed. + memory_fence() read_count = sum(metadata_buffer[1:]) written_flag = metadata_buffer[0] if written_flag and read_count != self.buffer.n_reader: @@ -458,6 +490,10 @@ class MessageQueue: metadata_buffer[i] = 0 # mark the block as written metadata_buffer[0] = 1 + # Memory fence ensures the write is visible to readers on other cores + # before we proceed. Without this, readers may spin indefinitely + # waiting for a write that's stuck in our CPU's store buffer. 
+ memory_fence() self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks break @@ -473,6 +509,10 @@ class MessageQueue: n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + # Memory fence ensures we see the latest writes from the writer. + # Without this, we may read stale flags from our CPU cache + # and spin indefinitely even though writer has updated them. + memory_fence() read_flag = metadata_buffer[self.local_reader_rank + 1] written_flag = metadata_buffer[0] if not written_flag or read_flag: @@ -513,6 +553,10 @@ class MessageQueue: # caller has read from the buffer # set the read flag metadata_buffer[self.local_reader_rank + 1] = 1 + # Memory fence ensures the read flag is visible to the writer. + # Without this, writer may not see our read completion and + # could wait indefinitely for all readers to finish. + memory_fence() self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks self._read_spin_timer.record_activity() From 8580919ac36b9ada425668264437c70935943e05 Mon Sep 17 00:00:00 2001 From: shivampr Date: Wed, 10 Dec 2025 15:17:41 -0800 Subject: [PATCH 09/16] [Bugfix] fix confusing OOM errors during v1 init (#28051) Signed-off-by: Shivam Signed-off-by: shivampr Co-authored-by: Chen Zhang --- tests/v1/engine/test_init_error_messaging.py | 54 +++++++ vllm/v1/core/kv_cache_utils.py | 10 +- vllm/v1/worker/gpu_model_runner.py | 139 ++++++++++--------- 3 files changed, 138 insertions(+), 65 deletions(-) create mode 100644 tests/v1/engine/test_init_error_messaging.py diff --git a/tests/v1/engine/test_init_error_messaging.py b/tests/v1/engine/test_init_error_messaging.py new file mode 100644 index 0000000000000..bc23a68f9deb1 --- /dev/null +++ b/tests/v1/engine/test_init_error_messaging.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.v1.core.kv_cache_utils import 
check_enough_kv_cache_memory +from vllm.v1.kv_cache_interface import FullAttentionSpec + + +def test_kv_cache_oom_no_memory(): + from unittest.mock import MagicMock + + config = MagicMock() + config.model_config.max_model_len = 2048 + + spec = { + "layer_0": FullAttentionSpec( + block_size=16, + num_kv_heads=8, + head_size=128, + dtype="float16", + ) + } + + with pytest.raises(ValueError): + check_enough_kv_cache_memory(config, spec, 0) + + +def test_kv_cache_oom_insufficient_memory(monkeypatch): + from unittest.mock import MagicMock + + config = MagicMock() + config.model_config.max_model_len = 2048 + config.cache_config.block_size = 16 + config.parallel_config.tensor_parallel_size = 1 + config.parallel_config.pipeline_parallel_size = 1 + config.parallel_config.decode_context_parallel_size = 1 + + monkeypatch.setattr( + "vllm.v1.core.kv_cache_utils.max_memory_usage_bytes", + lambda c, s: 100 * 1024**3, # 100 GiB + ) + + spec = { + "layer_0": FullAttentionSpec( + block_size=16, + num_kv_heads=8, + head_size=128, + dtype="float16", + ) + } + + with pytest.raises(ValueError): + check_enough_kv_cache_memory(config, spec, 1024**3) # 1 GiB diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 774200deed158..e4360de3717d1 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -687,7 +687,9 @@ def check_enough_kv_cache_memory( raise ValueError( "No available memory for the cache blocks. " "Try increasing `gpu_memory_utilization` when " - "initializing the engine." + "initializing the engine. " + "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + "for more details." ) max_model_len = vllm_config.model_config.max_model_len @@ -711,8 +713,10 @@ def check_enough_kv_cache_memory( f"cache is needed, which is larger than the available KV cache " f"memory ({available_memory / GiB_bytes:.2f} GiB). 
" f"{estimated_msg} " - f"Try increasing `gpu_memory_utilization` or decreasing " - f"`max_model_len` when initializing the engine." + f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` " + f"when initializing the engine. " + f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + f"for more details." ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ca06f048f290b..7dc86f1ee4815 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3571,74 +3571,89 @@ class GPUModelRunner( if self.parallel_config.enable_eplb: self.eplb_state = EplbState(self.parallel_config, self.device) eplb_models = 0 - with DeviceMemoryProfiler() as m: - time_before_load = time.perf_counter() - model_loader = get_model_loader(self.load_config) - self.model = model_loader.load_model( - vllm_config=self.vllm_config, model_config=self.model_config - ) - if self.lora_config: - self.model = self.load_lora_model( - self.model, self.vllm_config, self.device + + try: + with DeviceMemoryProfiler() as m: + time_before_load = time.perf_counter() + model_loader = get_model_loader(self.load_config) + self.model = model_loader.load_model( + vllm_config=self.vllm_config, model_config=self.model_config ) - if hasattr(self, "drafter"): - logger.info_once("Loading drafter model...") - self.drafter.load_model(self.model) - if ( - hasattr(self.drafter, "model") - and is_mixture_of_experts(self.drafter.model) - and self.parallel_config.enable_eplb - ): - spec_config = self.vllm_config.speculative_config - assert spec_config is not None - assert spec_config.draft_model_config is not None - logger.info_once( - "EPLB is enabled for drafter model %s.", - spec_config.draft_model_config.model, + if self.lora_config: + self.model = self.load_lora_model( + self.model, self.vllm_config, self.device ) + if hasattr(self, "drafter"): + logger.info_once("Loading drafter model...") + 
self.drafter.load_model(self.model) + if ( + hasattr(self.drafter, "model") + and is_mixture_of_experts(self.drafter.model) + and self.parallel_config.enable_eplb + ): + spec_config = self.vllm_config.speculative_config + assert spec_config is not None + assert spec_config.draft_model_config is not None + logger.info_once( + "EPLB is enabled for drafter model %s.", + spec_config.draft_model_config.model, + ) - global_expert_load = ( - global_expert_loads[eplb_models] - if global_expert_loads - else None - ) - old_global_expert_indices = ( - old_global_expert_indices_per_model[eplb_models] - if old_global_expert_indices_per_model - else None - ) - if self.eplb_state is None: - self.eplb_state = EplbState(self.parallel_config, self.device) - self.eplb_state.add_model( - self.drafter.model, - spec_config.draft_model_config, - global_expert_load, - old_global_expert_indices, - rank_mapping, - ) - eplb_models += 1 + global_expert_load = ( + global_expert_loads[eplb_models] + if global_expert_loads + else None + ) + old_global_expert_indices = ( + old_global_expert_indices_per_model[eplb_models] + if old_global_expert_indices_per_model + else None + ) + if self.eplb_state is None: + self.eplb_state = EplbState( + self.parallel_config, self.device + ) + self.eplb_state.add_model( + self.drafter.model, + spec_config.draft_model_config, + global_expert_load, + old_global_expert_indices, + rank_mapping, + ) + eplb_models += 1 - if self.use_aux_hidden_state_outputs: - if not supports_eagle3(self.get_model()): - raise RuntimeError( - "Model does not support EAGLE3 interface but " - "aux_hidden_state_outputs was requested" - ) + if self.use_aux_hidden_state_outputs: + if not supports_eagle3(self.get_model()): + raise RuntimeError( + "Model does not support EAGLE3 interface but " + "aux_hidden_state_outputs was requested" + ) - # Try to get auxiliary layers from speculative config, - # otherwise use model's default layers - aux_layers = self._get_eagle3_aux_layers_from_config() 
- if aux_layers: - logger.info( - "Using auxiliary layers from speculative config: %s", - aux_layers, - ) - else: - aux_layers = self.model.get_eagle3_aux_hidden_state_layers() + # Try to get auxiliary layers from speculative config, + # otherwise use model's default layers + aux_layers = self._get_eagle3_aux_layers_from_config() + if aux_layers: + logger.info( + "Using auxiliary layers from speculative config: %s", + aux_layers, + ) + else: + aux_layers = self.model.get_eagle3_aux_hidden_state_layers() - self.model.set_aux_hidden_state_layers(aux_layers) - time_after_load = time.perf_counter() - self.model_memory_usage = m.consumed_memory + self.model.set_aux_hidden_state_layers(aux_layers) + time_after_load = time.perf_counter() + self.model_memory_usage = m.consumed_memory + except torch.cuda.OutOfMemoryError as e: + msg = ( + "Failed to load model - not enough GPU memory. " + "Try lowering --gpu-memory-utilization to free memory for weights, " + "increasing --tensor-parallel-size, or using --quantization. " + "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + "for more tips." 
+ ) + combined_msg = f"{msg} (original error: {e})" + logger.error(combined_msg) + raise e logger.info_once( "Model loading took %.4f GiB memory and %.6f seconds", self.model_memory_usage / GiB_bytes, From 25221b44bbb6856c25d7a3c01bb6f79e999927b0 Mon Sep 17 00:00:00 2001 From: Xu Song Date: Thu, 11 Dec 2025 08:12:21 +0800 Subject: [PATCH 10/16] Add more docs for regex (#30106) Signed-off-by: Xu Song Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/features/structured_outputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 7d52891bea7b9..3ac987559e622 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -61,7 +61,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as print(completion.choices[0].message.content) ``` -The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template: +The next example shows how to use the `regex`. The supported regex syntax depends on the structured output backend. For example, `xgrammar`, `guidance`, and `outlines` use Rust-style regex, while `lm-format-enforcer` uses Python's `re` module. The idea is to generate an email address, given a simple regex template: ??? 
code From b4054c8ab469a9c3c3c77a1c2f22f54a69b87145 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 10 Dec 2025 16:48:35 -0800 Subject: [PATCH 11/16] Revert "[CI] Add Async Eplb nightly CI tests (#29385)" (#30431) --- .../deepseek_v2_lite_ep_async_eplb.sh | 73 ------------------ .../deepseek_v2_lite_ep_eplb.sh | 1 - .../qwen3_next_mtp_async_eplb.sh | 74 ------------------- .buildkite/test-pipeline.yaml | 20 +---- vllm/distributed/eplb/rebalance_execute.py | 3 + 5 files changed, 4 insertions(+), 167 deletions(-) delete mode 100644 .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh delete mode 100644 .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh deleted file mode 100644 index d7167161b0059..0000000000000 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.25} -NUM_Q=${2:-1319} -PORT=${3:-8030} -OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} -mkdir -p "${OUT_DIR}" - -wait_for_server() { - local port=$1 - timeout 600 bash -c ' - until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do - sleep 1 - done' -} - -MODEL="deepseek-ai/DeepSeek-V2-lite" - -# Set BACKENDS based on platform -if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then - # ROCm platform - BACKENDS=("allgather_reducescatter") - # Disable MOE padding for ROCm since it is causing eplb to fail - export VLLM_ROCM_MOE_PADDING=0 -else - # Non-ROCm platform (CUDA/other) - BACKENDS=("deepep_high_throughput" "deepep_low_latency") -fi - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then - kill "${SERVER_PID}" 2>/dev/null || true - 
for _ in {1..20}; do - kill -0 "${SERVER_PID}" 2>/dev/null || break - sleep 0.5 - done - kill -9 "${SERVER_PID}" 2>/dev/null || true - fi -} -trap cleanup EXIT - -for BACK in "${BACKENDS[@]}"; do - VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ - vllm serve "$MODEL" \ - --enforce-eager \ - --tensor-parallel-size 2 \ - --data-parallel-size 2 \ - --enable-expert-parallel \ - --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ - --trust-remote-code \ - --max-model-len 2048 \ - --port $PORT & - SERVER_PID=$! - wait_for_server $PORT - - TAG=$(echo "$MODEL" | tr '/: \\n' '_____') - OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} - python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" -PY - - cleanup - SERVER_PID= - sleep 1 - PORT=$((PORT+1)) -done diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 693418da6093e..8106f50f18f66 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do --data-parallel-size 2 \ --enable-expert-parallel \ --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600}' \ --trust-remote-code \ --max-model-len 2048 \ --port $PORT & diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh deleted file mode 100644 index 937a43d1a3221..0000000000000 --- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.25} 
-NUM_Q=${2:-1319} -PORT=${3:-8040} -OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} -mkdir -p "${OUT_DIR}" - -wait_for_server() { - local port=$1 - timeout 600 bash -c ' - until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do - sleep 1 - done' -} - -MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" - -# Set BACKENDS based on platform -if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then - # ROCm platform - BACKENDS=("allgather_reducescatter") - # Disable MOE padding for ROCm since it is causing eplb to fail - export VLLM_ROCM_MOE_PADDING=0 -else - # Non-ROCm platform (CUDA/other) - BACKENDS=("deepep_high_throughput" "deepep_low_latency") -fi - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then - kill "${SERVER_PID}" 2>/dev/null || true - for _ in {1..20}; do - kill -0 "${SERVER_PID}" 2>/dev/null || break - sleep 0.5 - done - kill -9 "${SERVER_PID}" 2>/dev/null || true - fi -} -trap cleanup EXIT - -for BACK in "${BACKENDS[@]}"; do - VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ - vllm serve "$MODEL" \ - --enforce-eager \ - --tensor-parallel-size 4 \ - --enable-expert-parallel \ - --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ - --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ - --trust-remote-code \ - --max-model-len 2048 \ - --gpu-memory-utilization 0.9 \ - --port $PORT & - SERVER_PID=$! 
- wait_for_server $PORT - - TAG=$(echo "$MODEL" | tr '/: \\n' '_____') - OUT="${OUT_DIR}/${TAG}_${BACK}.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} - python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" -PY - - cleanup - SERVER_PID= - sleep 1 - PORT=$((PORT+1)) -done diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8fc3587f7813c..750e7c038351c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1379,22 +1379,4 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - -- label: DeepSeek V2-Lite Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index 55856d940f001..376dad8a72ef1 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -322,6 +322,9 @@ async def transfer_layer( num_local_physical_experts = next(iter(expert_weights[0])).shape[0] assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts) assert num_physical_experts == ep_size * num_local_physical_experts + # A buffer to hold the expert weights in one layer during the exchange. 
+ # NOTE: Currently we assume the same weights across different layers + # have the same shape. is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer( num_local_experts=num_local_physical_experts, From b51255f369cf45456e3062e32ecbfebd03a9f169 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Wed, 10 Dec 2025 19:12:58 -0600 Subject: [PATCH 12/16] [ROCm] Fix broken import in platform attention backend dispatching (#30432) Signed-off-by: Andreas Karatzas --- vllm/platforms/rocm.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index f7adecbd88746..876114c2d33a4 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -403,7 +403,21 @@ class RocmPlatform(Platform): compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE if cache_config and cache_config.block_size is None: - cache_config.block_size = 16 + if ( + envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER + # NOTE: This block has been deprecated + # or get_env_variable_attn_backend() + # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN + # TODO: monitor https://github.com/vllm-project/vllm/pull/30396 + # to see how we can transition to the new way of selecting + # attention backends + ): + cache_config.block_size = 64 + logger.warning( + "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64." 
+ ) + else: + cache_config.block_size = 16 if parallel_config.worker_cls == "auto": parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" From d1e1fb4363c61080b7cd20469d5a751e88a1cdb3 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Wed, 10 Dec 2025 21:47:18 -0600 Subject: [PATCH 13/16] [Bugfix] Fix grouped_topk pytorch impl when num_experts can't be grouped properly (#29439) Signed-off-by: Divakar Verma Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: TJian --- vllm/model_executor/layers/fused_moe/layer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 61dd1892d67ea..7f803720d4770 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1556,6 +1556,14 @@ class FusedMoE(CustomOp): f"EPLB is not supported for {self.quant_method.method_name}." 
) + def valid_grouping() -> bool: + # Check if num_experts is greater than num_expert_group + # and is divisible by num_expert_group + num_experts = router_logits.shape[-1] + if num_experts <= self.num_expert_group: + return False + return num_experts % self.num_expert_group == 0 + indices_type = self.quant_method.topk_indices_dtype # Check if we should use a routing simulation strategy @@ -1570,7 +1578,7 @@ class FusedMoE(CustomOp): ) # DeepSeekv2 uses grouped_top_k - elif self.use_grouped_topk: + elif self.use_grouped_topk and valid_grouping(): assert self.topk_group is not None assert self.num_expert_group is not None if rocm_aiter_ops.is_fused_moe_enabled(): From 5a87d8b9b1f357a65a9b73773178ae17fd7cd9c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 11:59:35 +0800 Subject: [PATCH 14/16] [Deprecation] Remove deprecated plugin and compilation fields for v0.13 release (#30396) Signed-off-by: DarkLight1337 --- docs/design/plugin_system.md | 4 +- tests/compile/test_config.py | 63 +--------------------- tests/kernels/moe/test_ocp_mx_moe.py | 4 +- tests/quantization/test_quark.py | 4 +- tests/test_config.py | 2 +- vllm/attention/backends/registry.py | 32 ----------- vllm/attention/selector.py | 46 +++++----------- vllm/config/compilation.py | 81 +--------------------------- vllm/config/vllm.py | 2 +- vllm/engine/arg_utils.py | 22 -------- 10 files changed, 22 insertions(+), 238 deletions(-) diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 3485c40c36811..b0ca2dad23d5b 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -152,5 +152,5 @@ The interface for the model/module may change during vLLM's development. If you ## Deprecation announcement !!! warning "Deprecations" - - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It will be removed in v0.13.0 or v1.0.0. - - `_Backend` in `vllm.attention` is deprecated. It will be removed in v0.13.0 or v1.0.0. 
Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. + - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0. + - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 0e91cf525411e..04bb56ecb6470 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -import logging from contextlib import nullcontext from unittest.mock import patch @@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig from vllm.config.compilation import CompilationMode, PassConfig from vllm.engine.arg_utils import EngineArgs -from vllm.logger import _print_warning_once from vllm.platforms import current_platform from vllm.utils.torch_utils import _is_torch_equal_or_newer @@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor(): ), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - pass_config={"enable_attn_fusion": True, "enable_noop": True}, + pass_config={"fuse_attn_quant": True, "eliminate_noops": True}, custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, ), @@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init( vllm_config.compilation_config.max_cudagraph_capture_size == expected_max_size ) - - -def test_pass_config_deprecation(caplog_vllm): - caplog_vllm.set_level(logging.WARNING) - - # Clear cache to ensure warnings are re-issued - _print_warning_once.cache_clear() - - # Test enable_fusion -> 
fuse_norm_quant, fuse_act_quant - caplog_vllm.clear() - config = PassConfig(enable_fusion=True) - assert "enable_fusion is deprecated" in caplog_vllm.text - assert config.fuse_norm_quant is True - assert config.fuse_act_quant is True - assert config.enable_fusion is True - - # Test enable_attn_fusion -> fuse_attn_quant - caplog_vllm.clear() - config = PassConfig(enable_attn_fusion=True) - assert "enable_attn_fusion is deprecated" in caplog_vllm.text - assert config.fuse_attn_quant is True - assert config.enable_attn_fusion is True - - # Test enable_noop -> eliminate_noops - caplog_vllm.clear() - config = PassConfig(enable_noop=True) - assert "enable_noop is deprecated" in caplog_vllm.text - assert config.eliminate_noops is True - assert config.enable_noop is True - - # Test enable_sequence_parallelism -> enable_sp - caplog_vllm.clear() - config = PassConfig(enable_sequence_parallelism=True) - assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text - assert config.enable_sp is True - assert config.enable_sequence_parallelism is True - - # Test enable_async_tp -> fuse_gemm_comms - caplog_vllm.clear() - config = PassConfig(enable_async_tp=True) - assert "enable_async_tp is deprecated" in caplog_vllm.text - assert config.fuse_gemm_comms is True - assert config.enable_async_tp is True - - # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms - caplog_vllm.clear() - config = PassConfig(enable_fi_allreduce_fusion=True) - assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text - assert config.fuse_allreduce_rms is True - assert config.enable_fi_allreduce_fusion is True - - # Test hash consistency - config_old = PassConfig(enable_fusion=True) - config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True) - assert config_old.compute_hash() == config_new.compute_hash() - - config_old = PassConfig(enable_async_tp=True) - config_new = PassConfig(fuse_gemm_comms=True) - assert config_old.compute_hash() == config_new.compute_hash() diff --git 
a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index 91b508d4163cc..5a850dda4f6fd 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -70,12 +70,12 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): f"{torch.cuda.device_count()}" ) - # `cuda_graph_sizes=[16]` to reduce load time. + # `cudagraph_capture_sizes=[16]` to reduce load time. with vllm_runner( model_case.model_id, tensor_parallel_size=model_case.tp, load_format="dummy", - cuda_graph_sizes=[16], + cudagraph_capture_sizes=[16], ) as llm: # Disabled as check_model is broken: https://github.com/vllm-project/vllm/pull/18465#issuecomment-3329880562 # def check_model(model): diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 334f9a65e4c03..0ff6e8407ce67 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int): task = "wikitext" rtol = 0.1 - # Smaller cuda_graph_sizes to speed up the test. + # Smaller cudagraph_capture_sizes to speed up the test. 
results = lm_eval.simple_evaluate( model="vllm", model_args=config.get_model_args( - tp_size=tp_size, kwargs={"cuda_graph_sizes": [16]} + tp_size=tp_size, kwargs={"cudagraph_capture_sizes": [16]} ), tasks=task, batch_size=64, diff --git a/tests/test_config.py b/tests/test_config.py index 77d3a7115978e..0768c6d2cddf5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1085,7 +1085,7 @@ def test_vllm_config_explicit_overrides(): ) # Override one field but not others - pass_config = PassConfig(enable_noop=False) + pass_config = PassConfig(eliminate_noops=False) compilation_config = CompilationConfig(pass_config=pass_config) config = VllmConfig( model_config=regular_model, diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index 125e4e3827747..eaa0fa1d5db39 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -252,35 +252,3 @@ def register_backend( return lambda x: x return decorator - - -# Backwards compatibility alias for plugins -class _BackendMeta(type): - """Metaclass to provide deprecation warnings when accessing _Backend.""" - - def __getattribute__(cls, name: str): - if name not in ("__class__", "__mro__", "__name__"): - logger.warning( - "_Backend has been renamed to AttentionBackendEnum. " - "Please update your code to use AttentionBackendEnum instead. " - "_Backend will be removed in a future release." - ) - return getattr(AttentionBackendEnum, name) - - def __getitem__(cls, name: str): - logger.warning( - "_Backend has been renamed to AttentionBackendEnum. " - "Please update your code to use AttentionBackendEnum instead. " - "_Backend will be removed in a future release." - ) - return AttentionBackendEnum[name] - - -class _Backend(metaclass=_BackendMeta): - """Deprecated: Use AttentionBackendEnum instead. - - This class is provided for backwards compatibility with plugins - and will be removed in a future release. 
- """ - - pass diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index f6aba271d2e96..bbf95ff009001 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import inspect from functools import cache from typing import cast, get_args @@ -73,39 +72,18 @@ def _cached_get_attn_backend( ) -> type[AttentionBackend]: from vllm.platforms import current_platform - sig = inspect.signature(current_platform.get_attn_backend_cls) - if "use_v1" in sig.parameters: - logger.warning_once( - "use_v1 parameter for get_attn_backend_cls is deprecated and will " - "be removed in v0.13.0 or v1.0.0, whichever is soonest. Please " - "remove it from your plugin code." - ) - attention_cls = current_platform.get_attn_backend_cls( - backend, - head_size, - dtype, - kv_cache_dtype, - block_size, - True, # use_v1 - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - attn_type, - ) - else: - attention_cls = current_platform.get_attn_backend_cls( - backend, - head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - attn_type, - ) + attention_cls = current_platform.get_attn_backend_cls( + backend, + head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla, + has_sink, + use_sparse, + use_mm_prefix, + attn_type, + ) if not attention_cls: raise ValueError( f"Invalid attention backend for {current_platform.device_name}" diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 51e4912aad9db..3b6cb8a343608 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -17,7 +17,6 @@ from vllm.config.utils import ( Range, config, get_hash_factors, - handle_deprecated, hash_factors, ) from vllm.logger import init_logger @@ -127,27 +126,6 @@ class PassConfig: fuse_allreduce_rms: bool = Field(default=None) """Enable flashinfer allreduce fusion.""" - # Deprecated 
flags - enable_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_norm_quant and fuse_act_quant - instead. Will be removed in v0.13.0 or v1.0.0, whichever is sooner. - """ - enable_attn_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_attn_quant instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_noop: bool = Field(default=None) - """Deprecated in: v0.12.0. Use eliminate_noops instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_sequence_parallelism: bool = Field(default=None) - """Deprecated in: v0.12.0. Use enable_sp instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_async_tp: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_gemm_comms instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_fi_allreduce_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_allreduce_rms instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - fi_allreduce_fusion_max_size_mb: float | None = None """The threshold of the communicated tensor sizes under which vllm should use flashinfer fused allreduce. Specified as a @@ -206,15 +184,7 @@ class PassConfig: Any future fields that don't affect compilation should be excluded. 
""" - ignored_fields = [ - "enable_fusion", - "enable_attn_fusion", - "enable_noop", - "enable_sequence_parallelism", - "enable_async_tp", - "enable_fi_allreduce_fusion", - ] - return hash_factors(get_hash_factors(self, ignored_factors=ignored_fields)) + return hash_factors(get_hash_factors(self, set())) @field_validator( "fuse_norm_quant", @@ -224,12 +194,6 @@ class PassConfig: "enable_sp", "fuse_gemm_comms", "fuse_allreduce_rms", - "enable_fusion", - "enable_attn_fusion", - "enable_noop", - "enable_sequence_parallelism", - "enable_async_tp", - "enable_fi_allreduce_fusion", mode="wrap", ) @classmethod @@ -242,49 +206,6 @@ class PassConfig: def __post_init__(self) -> None: # Handle deprecation and defaults - # Map old flags to new flags and issue warnings - handle_deprecated( - self, - "enable_fusion", - ["fuse_norm_quant", "fuse_act_quant"], - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_attn_fusion", - "fuse_attn_quant", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_sequence_parallelism", - "enable_sp", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_async_tp", - "fuse_gemm_comms", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_fi_allreduce_fusion", - "fuse_allreduce_rms", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_noop", - "eliminate_noops", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - if not self.eliminate_noops: if self.fuse_norm_quant or self.fuse_act_quant: logger.warning_once( diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 607bb44cddd26..a3a9eec9b3203 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1014,7 +1014,7 @@ class VllmConfig: max_graph_size = min(max_num_seqs * 2, 512) # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16 # up to max_graph_size - cuda_graph_sizes = [1, 2, 4] + list(range(8, 
256, 8)) + list( + cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list( range(256, max_graph_size + 1, 16)) In the end, `vllm_config.compilation_config.cudagraph_capture_sizes` diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2f307a7ccf16d..cbb4862434a98 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -375,7 +375,6 @@ class EngineArgs: kv_cache_dtype: CacheDType = CacheConfig.cache_dtype seed: int | None = 0 max_model_len: int | None = ModelConfig.max_model_len - cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes cudagraph_capture_sizes: list[int] | None = ( CompilationConfig.cudagraph_capture_sizes ) @@ -1121,15 +1120,6 @@ class EngineArgs: compilation_group.add_argument( "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"] ) - compilation_kwargs["cudagraph_capture_sizes"]["help"] = ( - "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or v1.0.0," - " whichever is soonest. Please use --cudagraph-capture-sizes instead." - ) - compilation_group.add_argument( - "--cuda-graph-sizes", - **compilation_kwargs["cudagraph_capture_sizes"], - deprecated=True, - ) compilation_group.add_argument( "--max-cudagraph-capture-size", **compilation_kwargs["max_cudagraph_capture_size"], @@ -1741,18 +1731,6 @@ class EngineArgs: # Compilation config overrides compilation_config = copy.deepcopy(self.compilation_config) - if self.cuda_graph_sizes is not None: - logger.warning( - "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or " - "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes " - "instead." - ) - if compilation_config.cudagraph_capture_sizes is not None: - raise ValueError( - "cuda_graph_sizes and compilation_config." 
- "cudagraph_capture_sizes are mutually exclusive" - ) - compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes if self.cudagraph_capture_sizes is not None: if compilation_config.cudagraph_capture_sizes is not None: raise ValueError( From 7e24e5d4d65abbe5ffc7e653fdfd670c7e300944 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 11:59:39 +0800 Subject: [PATCH 15/16] [Deprecation] Remove deprecated task, seed and MM settings (#30397) Signed-off-by: DarkLight1337 --- benchmarks/benchmark_ngram_proposer.py | 2 +- examples/offline_inference/audio_language.py | 2 +- .../encoder_decoder_multimodal.py | 2 +- .../qwen2_5_omni/only_thinker.py | 2 +- .../qwen3_omni/only_thinker.py | 2 +- examples/offline_inference/vision_language.py | 2 +- .../vision_language_multi_image.py | 6 +- .../plugin/prithvi_geospatial_mae_client.py | 2 +- .../pooling/vision_language_pooling.py | 6 +- tests/conftest.py | 2 +- tests/test_config.py | 58 -------- tests/utils.py | 4 +- vllm/config/model.py | 131 ------------------ vllm/engine/arg_utils.py | 73 ++-------- vllm/entrypoints/llm.py | 2 +- vllm/envs.py | 5 - 16 files changed, 25 insertions(+), 276 deletions(-) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index cac401456b62a..872a263318ff7 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -37,7 +37,7 @@ def benchmark_propose(args): tokenizer="facebook/opt-125m", tokenizer_mode="auto", dtype="auto", - seed=None, + seed=0, trust_remote_code=False, ) proposer = NgramProposer( diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index df6e96ca375fc..40462c78ae8c2 100755 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -422,7 +422,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing 
`vllm.LLM`.", ) parser.add_argument( diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index c1d6c6db53dfb..857767ac3c628 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -77,7 +77,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) return parser.parse_args() diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index ed005e6a69b80..cee83519fadcc 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -158,7 +158,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/qwen3_omni/only_thinker.py b/examples/offline_inference/qwen3_omni/only_thinker.py index 88a61ed694c2e..62131633da8aa 100644 --- a/examples/offline_inference/qwen3_omni/only_thinker.py +++ b/examples/offline_inference/qwen3_omni/only_thinker.py @@ -158,7 +158,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 22802dddf7893..9142279140e56 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -2031,7 +2031,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 
28c466c03dfa5..3c01806baa203 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1382,7 +1382,7 @@ def run_generate( model, question: str, image_urls: list[str], - seed: int | None, + seed: int, tensor_parallel_size: int | None, ): req_data = model_example_map[model](question, image_urls) @@ -1416,7 +1416,7 @@ def run_chat( model: str, question: str, image_urls: list[str], - seed: int | None, + seed: int, tensor_parallel_size: int | None, ): req_data = model_example_map[model](question, image_urls) @@ -1494,7 +1494,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) parser.add_argument( diff --git a/examples/pooling/plugin/prithvi_geospatial_mae_client.py b/examples/pooling/plugin/prithvi_geospatial_mae_client.py index a6246999c14d6..1ba1fd6a92ca4 100644 --- a/examples/pooling/plugin/prithvi_geospatial_mae_client.py +++ b/examples/pooling/plugin/prithvi_geospatial_mae_client.py @@ -16,7 +16,7 @@ import requests # - start vllm in serving mode with the below args # --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM' # --model-impl terratorch -# --task embed --trust-remote-code +# --trust-remote-code # --skip-tokenizer-init --enforce-eager # --io-processor-plugin terratorch_segmentation # --enable-mm-embeds diff --git a/examples/pooling/pooling/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py index 530aad4bc031c..dda56bc34df2e 100644 --- a/examples/pooling/pooling/vision_language_pooling.py +++ b/examples/pooling/pooling/vision_language_pooling.py @@ -305,7 +305,7 @@ def get_query(modality: QueryModality): raise ValueError(msg) -def run_encode(model: str, modality: QueryModality, seed: int | None): +def run_encode(model: str, modality: QueryModality, seed: int): query = get_query(modality) req_data = model_example_map[model](query) @@ -335,7 +335,7 @@ def 
run_encode(model: str, modality: QueryModality, seed: int | None): print("-" * 50) -def run_score(model: str, modality: QueryModality, seed: int | None): +def run_score(model: str, modality: QueryModality, seed: int): query = get_query(modality) req_data = model_example_map[model](query) @@ -390,7 +390,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) return parser.parse_args() diff --git a/tests/conftest.py b/tests/conftest.py index 9f811d5d8db2a..5b26a02823c56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -741,7 +741,7 @@ class VllmRunner: tokenizer_name: str | None = None, tokenizer_mode: str = "auto", trust_remote_code: bool = True, - seed: int | None = 0, + seed: int = 0, max_model_len: int | None = 1024, dtype: str = "auto", disable_log_stats: bool = True, diff --git a/tests/test_config.py b/tests/test_config.py index 0768c6d2cddf5..ee706ab3d9c87 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -89,64 +89,6 @@ def test_update_config(): new_config3 = update_config(config3, {"a": "new_value"}) -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("distilbert/distilgpt2", "generate", "none", "generate"), - ("intfloat/multilingual-e5-small", "pooling", "none", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"), - ("openai/whisper-small", "generate", "none", "transcription"), - ], -) -def test_auto_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="auto") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - -# Can remove once --task option is 
fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("distilbert/distilgpt2", "pooling", "embed", "embed"), - ("intfloat/multilingual-e5-small", "pooling", "embed", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"), - ("openai/whisper-small", "pooling", "embed", "embed"), - ], -) -def test_score_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="score") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("openai/whisper-small", "generate", "none", "transcription"), - ], -) -def test_transcription_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="transcription") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - @pytest.mark.parametrize( ("model_id", "expected_runner_type", "expected_convert_type"), [ diff --git a/tests/utils.py b/tests/utils.py index ea3675b1461b8..d8102331b3612 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -119,7 +119,7 @@ class RemoteOpenAIServer: vllm_serve_args: list[str], *, env_dict: dict[str, str] | None = None, - seed: int | None = 0, + seed: int = 0, auto_port: bool = True, max_wait_seconds: float | None = None, override_hf_configs: dict[str, Any] | None = None, @@ -283,7 +283,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer): child_process_fxn: Callable[[dict[str, str] | None, str, list[str]], None], *, env_dict: dict[str, str] | None = None, - seed: int | 
None = 0, + seed: int = 0, auto_port: bool = True, max_wait_seconds: float | None = None, ) -> None: diff --git a/vllm/config/model.py b/vllm/config/model.py index 764bdf7000561..bd98111ffb5db 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -73,17 +73,6 @@ logger = init_logger(__name__) RunnerOption = Literal["auto", RunnerType] ConvertType = Literal["none", "embed", "classify", "reward"] ConvertOption = Literal["auto", ConvertType] -TaskOption = Literal[ - "auto", - "generate", - "embedding", - "embed", - "classify", - "score", - "reward", - "transcription", - "draft", -] TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ @@ -93,12 +82,6 @@ HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig] ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"] LayerBlockType = Literal["attention", "linear_attention", "mamba"] -_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { - "generate": ["generate", "transcription"], - "pooling": ["embedding", "embed", "classify", "score", "reward"], - "draft": ["draft"], -} - _RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { "generate": [], "pooling": ["embed", "classify", "reward"], @@ -126,12 +109,6 @@ class ModelConfig: """Convert the model using adapters defined in [vllm.model_executor.models.adapters][]. The most common use case is to adapt a text generation model to be used for pooling tasks.""" - task: TaskOption | None = None - """[DEPRECATED] The task to use the model for. If the model supports more - than one model runner, this is used to select which model runner to run. - - Note that the model may support other tasks using the same model runner. - """ tokenizer: SkipValidation[str] = None # type: ignore """Name or path of the Hugging Face tokenizer to use. 
If unspecified, model name or path will be used.""" @@ -335,7 +312,6 @@ class ModelConfig: ignored_factors = { "runner", "convert", - "task", "tokenizer", "tokenizer_mode", "seed", @@ -510,97 +486,6 @@ class ModelConfig: is_generative_model = registry.is_text_generation_model(architectures, self) is_pooling_model = registry.is_pooling_model(architectures, self) - def _task_to_convert(task: TaskOption) -> ConvertType: - if task == "embedding" or task == "embed": - return "embed" - if task == "classify": - return "classify" - if task == "reward": - logger.warning( - "Pooling models now default support all pooling; " - "you can use it without any settings." - ) - return "embed" - if task == "score": - new_task = self._get_default_pooling_task(architectures) - return "classify" if new_task == "classify" else "embed" - - return "none" - - if self.task is not None: - runner: RunnerOption = "auto" - convert: ConvertOption = "auto" - msg_prefix = ( - "The 'task' option has been deprecated and will be " - "removed in v0.13.0 or v1.0, whichever comes first." - ) - msg_hint = "Please remove this option." - - is_generative_task = self.task in _RUNNER_TASKS["generate"] - is_pooling_task = self.task in _RUNNER_TASKS["pooling"] - - if is_generative_model and is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "generate` to continue using this model " - "as a generative model." - ) - elif is_pooling_task: - runner = "pooling" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "pooling` to continue using this model " - "as a pooling model." 
- ) - else: # task == "auto" - pass - elif is_generative_model or is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = "Please remove this option" - elif is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--convert " - f"{convert}` to continue using this model " - "as a pooling model." - ) - else: # task == "auto" - pass - else: - # Neither generative nor pooling model - try to convert if possible - if is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--runner pooling " - f"--convert {convert}` to continue using this model " - "as a pooling model." - ) - else: - debug_info = { - "architectures": architectures, - "is_generative_model": is_generative_model, - "is_pooling_model": is_pooling_model, - } - raise AssertionError( - "The model should be a generative or " - "pooling model when task is set to " - f"{self.task!r}. 
Found: {debug_info}" - ) - - self.runner = runner - self.convert = convert - - msg = f"{msg_prefix} {msg_hint}" - warnings.warn(msg, DeprecationWarning, stacklevel=2) - self.runner_type = self._get_runner_type(architectures, self.runner) self.convert_type = self._get_convert_type( architectures, self.runner_type, self.convert @@ -918,22 +803,6 @@ class ModelConfig: return convert_type - def _get_default_pooling_task( - self, - architectures: list[str], - ) -> Literal["embed", "classify", "reward"]: - if self.registry.is_cross_encoder_model(architectures, self): - return "classify" - - for arch in architectures: - match = try_match_architecture_defaults(arch, runner_type="pooling") - if match: - _, (_, convert_type) = match - assert convert_type != "none" - return convert_type - - return "embed" - def _parse_quant_hf_config(self, hf_config: PretrainedConfig): quant_cfg = getattr(hf_config, "quantization_config", None) if quant_cfg is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cbb4862434a98..f303bef17b6a9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -71,7 +71,6 @@ from vllm.config.model import ( LogprobsMode, ModelDType, RunnerOption, - TaskOption, TokenizerMode, ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode @@ -360,7 +359,6 @@ class EngineArgs: hf_config_path: str | None = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert - task: TaskOption | None = ModelConfig.task skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode @@ -373,7 +371,7 @@ class EngineArgs: config_format: str = ModelConfig.config_format dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype - seed: int | None = 0 + seed: int = ModelConfig.seed max_model_len: int | None = 
ModelConfig.max_model_len cudagraph_capture_sizes: list[int] | None = ( CompilationConfig.cudagraph_capture_sizes @@ -462,7 +460,6 @@ class EngineArgs: MultiModalConfig, "media_io_kwargs" ) mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs - disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb mm_processor_cache_type: MMCacheType | None = ( MultiModalConfig.mm_processor_cache_type @@ -558,9 +555,6 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location - # DEPRECATED - enable_multimodal_encoder_data_parallel: bool = False - logits_processors: list[str | type[LogitsProcessor]] | None = ( ModelConfig.logits_processors ) @@ -628,7 +622,6 @@ class EngineArgs: model_group.add_argument("--model", **model_kwargs["model"]) model_group.add_argument("--runner", **model_kwargs["runner"]) model_group.add_argument("--convert", **model_kwargs["convert"]) - model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) model_group.add_argument( @@ -882,11 +875,6 @@ class EngineArgs: parallel_group.add_argument( "--worker-extension-cls", **parallel_kwargs["worker_extension_cls"] ) - parallel_group.add_argument( - "--enable-multimodal-encoder-data-parallel", - action="store_true", - deprecated=True, - ) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -960,9 +948,6 @@ class EngineArgs: multimodal_group.add_argument( "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"] ) - multimodal_group.add_argument( - "--disable-mm-preprocessor-cache", action="store_true", deprecated=True - ) multimodal_group.add_argument( "--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"] ) @@ -1192,62 +1177,20 @@ 
class EngineArgs: if is_gguf(self.model): self.quantization = self.load_format = "gguf" - # NOTE(woosuk): In V1, we use separate processes for workers (unless - # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here - # doesn't affect the user process. - if self.seed is None: - logger.warning_once( - "`seed=None` is equivalent to `seed=0` in V1 Engine. " - "You will no longer be allowed to pass `None` in v0.13.", - scope="local", + if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: + logger.warning( + "The global random seed is set to %d. Since " + "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " + "affect the random state of the Python process that " + "launched vLLM.", + self.seed, ) - self.seed = 0 - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: - logger.warning( - "The global random seed is set to %d. Since " - "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " - "affect the random state of the Python process that " - "launched vLLM.", - self.seed, - ) - - if self.disable_mm_preprocessor_cache: - logger.warning_once( - "`--disable-mm-preprocessor-cache` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-processor-cache-gb 0` instead.", - scope="local", - ) - - self.mm_processor_cache_gb = 0 - elif envs.VLLM_MM_INPUT_CACHE_GIB != 4: - logger.warning_once( - "VLLM_MM_INPUT_CACHE_GIB` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-processor-cache-gb %d` instead.", - envs.VLLM_MM_INPUT_CACHE_GIB, - scope="local", - ) - - self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB - - if self.enable_multimodal_encoder_data_parallel: - logger.warning_once( - "--enable-multimodal-encoder-data-parallel` is deprecated " - "and will be removed in v0.13. 
" - "Please use `--mm-encoder-tp-mode data` instead.", - scope="local", - ) - - self.mm_encoder_tp_mode = "data" - return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, runner=self.runner, convert=self.convert, - task=self.task, tokenizer=self.tokenizer, tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3fce3338503ef..6440b702f4fa6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -198,7 +198,7 @@ class LLM: quantization: QuantizationMethods | None = None, revision: str | None = None, tokenizer_revision: str | None = None, - seed: int | None = None, + seed: int = 0, gpu_memory_utilization: float = 0.9, swap_space: float = 4, cpu_offload_gb: float = 0, diff --git a/vllm/envs.py b/vllm/envs.py index 230f2cf3450a9..0cf0408054063 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -72,7 +72,6 @@ if TYPE_CHECKING: VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MEDIA_CONNECTOR: str = "http" - VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" @@ -786,9 +785,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # imported at runtime. # If a non-existing backend is used, an AssertionError will be thrown. "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"), - # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache - # Default is 4 GiB per API process + 4 GiB per engine core process - "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), # Path to the XLA persistent cache directory. # Only used for XLA devices such as TPUs. 
"VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser( @@ -1681,7 +1677,6 @@ def compile_factors() -> dict[str, object]: "VLLM_MEDIA_CONNECTOR", "VLLM_ASSETS_CACHE", "VLLM_ASSETS_CACHE_MODEL_CLEAN", - "VLLM_MM_INPUT_CACHE_GIB", "VLLM_WORKER_MULTIPROC_METHOD", "VLLM_ENABLE_V1_MULTIPROCESSING", "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", From d6464f267979946a1c2d9c6029ef2007be73ca09 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 10 Dec 2025 23:05:56 -0500 Subject: [PATCH 16/16] [Chore] Fix torch precision warning (#30428) Signed-off-by: yewentao256 --- tests/v1/e2e/test_async_scheduling.py | 4 ++-- vllm/envs.py | 10 ++++++---- vllm/v1/worker/gpu_worker.py | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 13b36c54123ce..5cef9b33c9984 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -152,8 +152,8 @@ def run_tests( m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA") else: m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") - # lock matmul precision to full FP32 - m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest") + # lock matmul precision to full FP32 (IEEE) + m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee") # m.setenv("VLLM_BATCH_INVARIANT", "1") outputs: list[tuple[str, list, list]] = [] for n, ( diff --git a/vllm/envs.py b/vllm/envs.py index 0cf0408054063..cb75ba1a62de9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,7 +74,7 @@ if TYPE_CHECKING: VLLM_MEDIA_CONNECTOR: str = "http" VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" - VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" + VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -456,11 +456,13 @@ environment_variables: dict[str, Callable[[], Any]] = { 
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.9", # Controls PyTorch float32 matmul precision mode within vLLM workers. - # Valid options mirror torch.set_float32_matmul_precision + # Accepted values: + # - "ieee" (default): force full IEEE FP32 matmul precision. + # - "tf32": enable TensorFloat32-based fast matmul. "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices( "VLLM_FLOAT32_MATMUL_PRECISION", - "highest", - ["highest", "high", "medium"], + "ieee", + ["ieee", "tf32"], case_sensitive=False, ), # Maximum number of compilation jobs to run in parallel. diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f2b6a1f76b0b9..25ac5aaf99818 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -81,7 +81,7 @@ class Worker(WorkerBase): # configure float32 matmul precision according to vLLM env. precision = envs.VLLM_FLOAT32_MATMUL_PRECISION - torch.set_float32_matmul_precision(precision) + torch.backends.cuda.matmul.fp32_precision = precision if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing