From 53d2420b4447fbcab572dc23d2c3bb9224a8a561 Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Wed, 10 Dec 2025 13:58:35 +0100 Subject: [PATCH 01/67] [Bugfix] tpu_model_runner: set vllm config context when calling reset_dynamo_cache() (#30331) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò --- vllm/v1/worker/tpu_worker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 7a10ac198985e..5f6136b178b46 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -10,7 +10,7 @@ import torch import torch.nn as nn import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import VllmConfig, set_current_vllm_config from vllm.distributed import ( ensure_model_parallel_initialized, init_distributed_environment, @@ -207,7 +207,8 @@ class TPUWorker: # one compiled bytecode. Having one FX graph/cached bytecode per # compiled model is required for `support_torch_compile` decorator to # skip dynamo guard. - self.model_runner.reset_dynamo_cache() + with set_current_vllm_config(self.vllm_config): + self.model_runner.reset_dynamo_cache() # Get the maximum amount of memory used by the model weights and # intermediate activations. 
From cebda2a4afa9ec9c6656c0aa5e96d0003e9b185d Mon Sep 17 00:00:00 2001 From: Aditya Tewari Date: Wed, 10 Dec 2025 12:58:42 +0000 Subject: [PATCH 02/67] [CPU] Support for Whisper (#30062) Signed-off-by: Aditya Tewari --- .../scripts/hardware_ci/run-cpu-test-arm.sh | 5 +++ csrc/cpu/cpu_attn.cpp | 1 - .../multimodal/generation/test_whisper.py | 21 +++++++++- vllm/v1/attention/backends/cpu_attn.py | 38 +++++++++---------- vllm/v1/worker/utils.py | 8 +++- 5 files changed, 49 insertions(+), 24 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index 9c6e7766b2ac4..b6274d698d01a 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -36,6 +36,11 @@ function cpu_tests() { set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + # Run model tests + docker exec cpu-test bash -c " + set -e + pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model" + # Run kernel tests docker exec cpu-test bash -c " set -e diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp index 92f8bee5a47a0..02c722ba031a4 100644 --- a/csrc/cpu/cpu_attn.cpp +++ b/csrc/cpu/cpu_attn.cpp @@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata( input.casual = casual; input.isa = isa; input.enable_kv_split = enable_kv_split; - TORCH_CHECK(casual, "Only supports casual mask for now."); VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() { CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] { diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index eca2b61e37d53..8c99b6b4690a9 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -92,13 +92,14 @@ def run_test( *, tensor_parallel_size: int, distributed_executor_backend: str | None = None, + dtype: str = "half", ) -> 
None: prompt_list = PROMPTS * 10 expected_list = EXPECTED[model] * 10 with vllm_runner( model, - dtype="half", + dtype=dtype, max_model_len=448, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, @@ -120,12 +121,28 @@ def run_test( @pytest.mark.core_model @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) +@pytest.mark.parametrize("dtype", ["half"]) @create_new_process_for_each_test() -def test_models(vllm_runner, model) -> None: +def test_models(vllm_runner, model, dtype) -> None: run_test( vllm_runner, model, tensor_parallel_size=1, + dtype=dtype, + ) + + +@pytest.mark.cpu_model +@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_cpu(vllm_runner, model, dtype) -> None: + # @create_new_process_for_each_test() does not work for some runners + # TODO: to fix cpu privilege issues in run-cpu-test-arm.sh + run_test( + vllm_runner, + model, + tensor_parallel_size=1, + dtype=dtype, ) diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index fed7dcdf293bd..394d0c2f67136 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -21,7 +21,7 @@ from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, split_decodes_and_prefills, ) -from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec logger = init_logger(__name__) @@ -50,11 +50,13 @@ class CPUAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: - """CPU attention supports decoder and encoder-only attention.""" + """CPU attention supports decoder, + encoder-only and encoder-decoder attention.""" return attn_type in ( AttentionType.DECODER, AttentionType.ENCODER, AttentionType.ENCODER_ONLY, + AttentionType.ENCODER_DECODER, ) @staticmethod @@ -136,6 +138,7 @@ class 
CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] self.window_size = -1 self.block_size = vllm_config.cache_config.block_size self.isa = _get_attn_isa(self.dtype, self.block_size) + self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec) def build( self, @@ -151,7 +154,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping - causal = common_attn_metadata.causal + causal = False if self.is_cross_attention else common_attn_metadata.causal sdpa_start_loc = query_start_loc num_decode_tokens = 0 @@ -171,22 +174,19 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] query_start_loc = query_start_loc[: num_decodes + 1] block_table_tensor = block_table_tensor[:num_decodes] - sheduler_metadata = None - if causal: - # for decode batch, use the custom kernel - sheduler_metadata = ops.cpu_attn_get_scheduler_metadata( - num_reqs=num_reqs, - num_heads=self.num_heads, - num_kv_heads=self.num_kv_heads, - head_dim=self.head_dim, - seq_lens=seq_lens, - dtype=self.dtype, - query_start_loc=query_start_loc, - causal=causal, - sliding_window_size=self.window_size, - isa=self.isa, - enable_kv_split=True, - ) + sheduler_metadata = ops.cpu_attn_get_scheduler_metadata( + num_reqs=num_reqs, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, + seq_lens=seq_lens, + dtype=self.dtype, + query_start_loc=query_start_loc, + causal=causal, + sliding_window_size=self.window_size, + isa=self.isa, + enable_kv_split=True, + ) attn_metadata = CPUAttentionMetadata( isa=self.isa, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 4dd9463ee6285..e9c48223d58b9 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -313,8 +313,12 @@ def bind_kv_cache( # TODO - analyze where runner_kv_caches 
is used and the right # way to ensure it properly reflects multiple attention layers # in the same decoder block. - if current_platform.is_cuda_alike() or current_platform.is_xpu(): - # We know that the GPU runner is not impacted by this + if ( + current_platform.is_cuda_alike() + or current_platform.is_xpu() + or current_platform.is_cpu() + ): + # We know that the GPU / CPU runner is not impacted by this # case. Some test code depends on runner_kv_caches, but # not in a way that's impacted by ignoring this. pass From d017bceb08eaac7bae2c499124ece737fb4fb22b Mon Sep 17 00:00:00 2001 From: Roger Young <42564206+rogeryoungh@users.noreply.github.com> Date: Wed, 10 Dec 2025 20:58:50 +0800 Subject: [PATCH 03/67] [BugFix] Fix minimax m2 model rotary_dim (#30384) Signed-off-by: xuebi Co-authored-by: xuebi --- vllm/model_executor/models/minimax_m2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index dd98e36ec0851..3e6a9add9ec49 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -201,7 +201,7 @@ class MiniMaxM2Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=rotary_dim, + rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) From c756fb678184b867ed94e5613a529198f1aee423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 10 Dec 2025 15:14:24 +0100 Subject: [PATCH 04/67] [Core] Whisper enable `FULL_DECODE_ONLY` CudaGraph (#30072) Signed-off-by: NickLucche --- .../multimodal/generation/test_whisper.py | 2 ++ vllm/config/vllm.py | 30 ++++++++++++------- vllm/v1/worker/gpu_model_runner.py | 11 ++++++- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 8c99b6b4690a9..592862c2a0bb0 100644 --- 
a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -103,6 +103,8 @@ def run_test( max_model_len=448, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, + # TODO (NickLucche) figure out output differences with non-eager and re-enable + enforce_eager=True, ) as vllm_model: llm = vllm_model.llm diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 8f27db0013305..607bb44cddd26 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -666,8 +666,9 @@ class VllmConfig: default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level] self._apply_optimization_level_defaults(default_config) + if ( - self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + self.compilation_config.cudagraph_mode.requires_piecewise_compilation() and self.compilation_config.mode != CompilationMode.VLLM_COMPILE ): logger.info( @@ -692,22 +693,29 @@ class VllmConfig: if current_platform.support_static_graph_mode(): # if cudagraph_mode has full cudagraphs, we need to check support - if ( - self.compilation_config.cudagraph_mode.has_full_cudagraphs() - and self.model_config is not None - ): - if self.model_config.pooler_config is not None: + if model_config := self.model_config: + if ( + self.compilation_config.cudagraph_mode.has_full_cudagraphs() + and model_config.pooler_config is not None + ): logger.warning_once( "Pooling models do not support full cudagraphs. " "Overriding cudagraph_mode to PIECEWISE." ) self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - elif self.model_config.is_encoder_decoder: - logger.warning_once( - "Encoder-decoder models do not support full cudagraphs. " - "Overriding cudagraph_mode to PIECEWISE." + elif ( + model_config.is_encoder_decoder + and self.compilation_config.cudagraph_mode + not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY) + ): + logger.info_once( + "Encoder-decoder models do not support %s. 
" + "Overriding cudagraph_mode to FULL_DECODE_ONLY.", + self.compilation_config.cudagraph_mode.name, + ) + self.compilation_config.cudagraph_mode = ( + CUDAGraphMode.FULL_DECODE_ONLY ) - self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE # disable cudagraph when enforce eager execution if self.model_config is not None and self.model_config.enforce_eager: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 39456d2e80ed0..ca06f048f290b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1267,6 +1267,8 @@ class GPUModelRunner( if not isinstance(kv_cache_spec, CrossAttentionSpec): return None, None + # Zero out buffer for padding requests that are not actually scheduled (CGs) + self.encoder_seq_lens.np[:num_reqs] = 0 # Build encoder_seq_lens array mapping request indices to # encoder lengths for inputs scheduled in this batch for req_id in num_scheduled_tokens: @@ -2764,6 +2766,7 @@ class GPUModelRunner( # be improved in model runner v2) force_uniform_decode: bool | None = None, force_has_lora: bool | None = None, + num_encoder_reqs: int = 0, ) -> tuple[ CUDAGraphMode, BatchDescriptor, @@ -2780,6 +2783,11 @@ class GPUModelRunner( if force_uniform_decode is None else force_uniform_decode ) + # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output + # is present). Also, chunked-prefill is disabled, so batch are uniform. 
+ has_encoder_output = ( + self.model_config.is_encoder_decoder and num_encoder_reqs > 0 + ) has_lora = ( len(self.input_batch.lora_id_to_lora_request) > 0 @@ -2799,7 +2807,7 @@ class GPUModelRunner( ) cudagraph_mode, batch_descriptor = dispatch_cudagraph( - num_tokens_padded, use_cascade_attn + num_tokens_padded, use_cascade_attn or has_encoder_output ) num_tokens_padded = batch_descriptor.num_tokens @@ -2997,6 +3005,7 @@ class GPUModelRunner( num_scheduled_tokens_np=num_scheduled_tokens_np, max_num_scheduled_tokens=max_num_scheduled_tokens, use_cascade_attn=cascade_attn_prefix_lens is not None, + num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs), ) logger.debug( From aacf0abf8bc219211b888a82f11f028e67b59531 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 10 Dec 2025 10:59:23 -0500 Subject: [PATCH 05/67] [BugFix] Fix `AttributeError: 'MergedColumnParallelLinear' object has no attribute 'weight_scale'` (#30399) Signed-off-by: Lucas Wilkinson --- vllm/model_executor/warmup/deep_gemm_warmup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index e0c584df8760b..936f6b1e28ce1 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -89,7 +89,7 @@ def _extract_data_from_linear_base_module( assert m.quant_method.quant_config is not None w = m.weight - ws = m.weight_scale + ws = m.weight_scale_inv if hasattr(m, "weight_scale_inv") else m.weight_scale quant_block_size = m.quant_method.quant_config.weight_block_size assert isinstance(w, torch.Tensor) From 2dcbac9077ecadff0aa78b7c282f9e147a260e86 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 10 Dec 2025 16:09:34 +0000 Subject: [PATCH 06/67] [Docs] Generate full list of metrics in user docs (#30388) Signed-off-by: Mark McLoughlin Co-authored-by: Claude Co-authored-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> --- docs/design/metrics.md | 12 +-- docs/mkdocs/hooks/generate_metrics.py | 149 ++++++++++++++++++++++++++ docs/usage/metrics.md | 16 ++- mkdocs.yaml | 1 + 4 files changed, 163 insertions(+), 15 deletions(-) create mode 100644 docs/mkdocs/hooks/generate_metrics.py diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 28b5405871ac2..2722e12fdaeaf 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request ### v1 Metrics -In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix: +In v1, an extensive set of metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example: - `vllm:num_requests_running` (Gauge) - Number of requests currently running. -- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting. - `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1). - `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries. - `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits. -- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries. -- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits. -- `vllm:num_preemptions_total` (Counter) - Number of preemptions. - `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed. - `vllm:generation_tokens_total` (Counter) - Total number of generated tokens. -- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step. -- `vllm:cache_config_info` (Gauge) - Information about the cache configuration. - `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason). - `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts. 
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts. -- `vllm:request_params_n` (Histogram) - Histogram of request parameter n. -- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests. - `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT). - `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency. - `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency. -- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue. -- `vllm:request_inference_time_seconds` (Histogram) - Request inference time. - `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time. - `vllm:request_decode_time_seconds` (Histogram) - Request decode time. diff --git a/docs/mkdocs/hooks/generate_metrics.py b/docs/mkdocs/hooks/generate_metrics.py new file mode 100644 index 0000000000000..b20d43c4b2e92 --- /dev/null +++ b/docs/mkdocs/hooks/generate_metrics.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import logging +from pathlib import Path +from typing import Literal + +logger = logging.getLogger("mkdocs") + +ROOT_DIR = Path(__file__).parent.parent.parent.parent +DOCS_DIR = ROOT_DIR / "docs" +GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics" + +# Files to scan for metric definitions - each will generate a separate table +METRIC_SOURCE_FILES = [ + {"path": "vllm/v1/metrics/loggers.py", "output": "general.md"}, + { + "path": "vllm/v1/spec_decode/metrics.py", + "output": "spec_decode.md", + }, + { + "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py", + "output": "nixl_connector.md", + }, +] + + +class MetricExtractor(ast.NodeVisitor): + """AST visitor to extract metric definitions.""" + + def __init__(self): + self.metrics: list[dict[str, str]] = [] + + def visit_Call(self, node: ast.Call) -> None: + """Visit function 
calls to find metric class instantiations.""" + metric_type = self._get_metric_type(node) + if metric_type: + name = self._extract_kwarg(node, "name") + documentation = self._extract_kwarg(node, "documentation") + + if name: + self.metrics.append( + { + "name": name, + "type": metric_type, + "documentation": documentation or "", + } + ) + + self.generic_visit(node) + + def _get_metric_type(self, node: ast.Call) -> str | None: + """Determine if this call creates a metric and return its type.""" + metric_type_map = { + "_gauge_cls": "gauge", + "_counter_cls": "counter", + "_histogram_cls": "histogram", + } + if isinstance(node.func, ast.Attribute): + return metric_type_map.get(node.func.attr) + return None + + def _extract_kwarg(self, node: ast.Call, key: str) -> str | None: + """Extract a keyword argument value from a function call.""" + for keyword in node.keywords: + if keyword.arg == key: + return self._get_string_value(keyword.value) + return None + + def _get_string_value(self, node: ast.AST) -> str | None: + """Extract string value from an AST node.""" + if isinstance(node, ast.Constant): + return str(node.value) if node.value is not None else None + return None + + +def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]: + """Parse a Python file and extract all metric definitions.""" + try: + with open(filepath, encoding="utf-8") as f: + source = f.read() + + tree = ast.parse(source, filename=str(filepath)) + extractor = MetricExtractor() + extractor.visit(tree) + return extractor.metrics + except Exception as e: + raise RuntimeError(f"Failed to parse {filepath}: {e}") from e + + +def generate_markdown_table(metrics: list[dict[str, str]]) -> str: + """Generate a markdown table from extracted metrics.""" + if not metrics: + return "No metrics found.\n" + + # Sort by type, then by name + metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"])) + + lines = [] + lines.append("| Metric Name | Type | Description |") + 
lines.append("|-------------|------|-------------|") + + for metric in metrics_sorted: + name = metric["name"] + metric_type = metric["type"].capitalize() + doc = metric["documentation"].replace("\n", " ").strip() + lines.append(f"| `{name}` | {metric_type} | {doc} |") + + return "\n".join(lines) + "\n" + + +def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): + """Generate metrics documentation tables from source files.""" + logger.info("Generating metrics documentation") + + # Create generated directory if it doesn't exist + GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True) + + total_metrics = 0 + for source_config in METRIC_SOURCE_FILES: + source_path = source_config["path"] + output_file = source_config["output"] + + filepath = ROOT_DIR / source_path + if not filepath.exists(): + raise FileNotFoundError(f"Metrics source file not found: {filepath}") + + logger.debug("Extracting metrics from: %s", source_path) + metrics = extract_metrics_from_file(filepath) + logger.debug("Found %d metrics in %s", len(metrics), source_path) + + # Generate and write the markdown table for this source + table_content = generate_markdown_table(metrics) + output_path = GENERATED_METRICS_DIR / output_file + with open(output_path, "w", encoding="utf-8") as f: + f.write(table_content) + + total_metrics += len(metrics) + logger.info( + "Generated metrics table: %s (%d metrics)", + output_path.relative_to(ROOT_DIR), + len(metrics), + ) + + logger.info( + "Total metrics generated: %d across %d files", + total_metrics, + len(METRIC_SOURCE_FILES), + ) diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md index d756e32476f0a..829533b84328f 100644 --- a/docs/usage/metrics.md +++ b/docs/usage/metrics.md @@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server: The following metrics are exposed: -??? 
code +## General Metrics - ```python - --8<-- "vllm/engine/metrics.py:metrics-definitions" - ``` +--8<-- "docs/generated/metrics/general.md" + +## Speculative Decoding Metrics + +--8<-- "docs/generated/metrics/spec_decode.md" + +## NIXL KV Connector Metrics + +--8<-- "docs/generated/metrics/nixl_connector.md" + +## Deprecation Policy Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, diff --git a/mkdocs.yaml b/mkdocs.yaml index bf97093dafb11..8fb8f0568c6ef 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -51,6 +51,7 @@ hooks: - docs/mkdocs/hooks/remove_announcement.py - docs/mkdocs/hooks/generate_examples.py - docs/mkdocs/hooks/generate_argparse.py + - docs/mkdocs/hooks/generate_metrics.py - docs/mkdocs/hooks/url_schemes.py plugins: From 794a7875ee0df7d2c12ff0ba83b76438ca68bf26 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 10 Dec 2025 12:44:02 -0500 Subject: [PATCH 07/67] [Misc] Consistent case for `vllm bench serve` results (#30403) Signed-off-by: Matthew Bonanni --- benchmarks/benchmark_serving_structured_output.py | 2 +- docs/benchmarking/cli.md | 2 +- vllm/benchmarks/serve.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index a4e1b163dcca9..33aca831883aa 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -574,7 +574,7 @@ async def benchmark( ) print( "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput + "Total token throughput (tok/s):", metrics.total_token_throughput ) ) diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md index 1ce6b611745b1..dd5a12e408b02 100644 --- a/docs/benchmarking/cli.md +++ b/docs/benchmarking/cli.md @@ -84,7 +84,7 @@ Total input tokens: 1369 Total generated 
tokens: 2212 Request throughput (req/s): 1.73 Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 +Total token throughput (tok/s): 619.85 ---------------Time to First Token---------------- Mean TTFT (ms): 71.54 Median TTFT (ms): 73.88 diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 2e2054a8a4b13..254e4d35e5350 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -788,7 +788,7 @@ async def benchmark( ) print( "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput + "Total token throughput (tok/s):", metrics.total_token_throughput ) ) From 253305d5b22bb0795bb8fd8469053e1df67a9be6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 01:48:38 +0800 Subject: [PATCH 08/67] [Chore] Delay recent deprecations (#30398) Signed-off-by: DarkLight1337 --- vllm/multimodal/inputs.py | 6 +++--- vllm/multimodal/utils.py | 4 ++-- vllm/transformers_utils/tokenizer.py | 14 +++++++------- vllm/transformers_utils/tokenizer_base.py | 4 ++-- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- vllm/v1/engine/processor.py | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 2ed66554e358e..6b1cbbe24e2e7 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -954,7 +954,7 @@ MultiModalKwargsOptionalItems: TypeAlias = ( ) -@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.") +@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.14.") class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to @@ -964,7 +964,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod @deprecated( "`MultiModalKwargs.from_hf_inputs` is deprecated and " - "will be removed in v0.13. " + "will be removed in v0.14. 
" "Please use `MultiModalKwargsItems.from_hf_inputs` and " "access the tensor data using `.get_data()`." ) @@ -977,7 +977,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod @deprecated( "`MultiModalKwargs.from_items` is deprecated and " - "will be removed in v0.13. " + "will be removed in v0.14. " "Please use `MultiModalKwargsItems.from_seq` and " "access the tensor data using `.get_data()`." ) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index d4bdc55e569b2..7fd05af583b0a 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -429,12 +429,12 @@ def group_mm_kwargs_by_modality( if merge_by_field_config is not None: logger.warning_once( "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` " - "is deprecated and will be removed in v0.13." + "is deprecated and will be removed in v0.14." ) if multimodal_cpu_fields is not None: logger.warning_once( "The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` " - "is deprecated and will be removed in v0.13." + "is deprecated and will be removed in v0.14." ) from vllm.multimodal.inputs import MultiModalKwargsItems diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 32999903b3480..8745e1d9dbbbc 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -17,7 +17,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to " "`vllm.tokenizers.TokenizerLike`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -29,7 +29,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.get_tokenizer` " "has been moved to `vllm.tokenizers.get_tokenizer`. 
" - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -41,7 +41,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.cached_get_tokenizer` " "has been moved to `vllm.tokenizers.cached_get_tokenizer`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -53,7 +53,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` " "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -65,7 +65,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` " "has been moved to `vllm.tokenizers.init_tokenizer_from_config`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -75,7 +75,7 @@ def __getattr__(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.") +@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.") def decode_tokens( tokenizer: TokenizerLike, token_ids: list[int], @@ -97,7 +97,7 @@ def decode_tokens( return tokenizer.decode(token_ids, **kw_args) -@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.") +@deprecated("Will be removed in v0.14. 
Please use `tokenizer.encode()` instead.") def encode_tokens( tokenizer: TokenizerLike, text: str, diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index 78fb6edc8b9ed..3dfd4b4f2f6c1 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -11,7 +11,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been " "moved to `vllm.tokenizers.TokenizerLike`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -23,7 +23,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been " "moved to `vllm.tokenizers.TokenizerRegistry`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 931d13be3d9b6..fa3fb7a18895a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -192,7 +192,7 @@ class AsyncLLM(EngineClient): @property @deprecated( "`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. " - "The old name will be removed in v0.13." + "The old name will be removed in v0.14." ) def processor(self): return self.input_processor diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4c31291005477..1cb206c4e004c 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -139,7 +139,7 @@ class LLMEngine: @property @deprecated( "`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. " - "The old name will be removed in v0.13." + "The old name will be removed in v0.14." 
) def processor(self): return self.input_processor diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index bc5c7fc400fde..a8c93499299d3 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -10,7 +10,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.v1.engine.processor.Processor` has been moved to " "`vllm.v1.engine.input_processor.InputProcessor`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) From e8e8cd73e5ddc4b56896e806066c37e9803e54b7 Mon Sep 17 00:00:00 2001 From: Anker <20343812+anker-c2@users.noreply.github.com> Date: Wed, 10 Dec 2025 19:09:31 +0100 Subject: [PATCH 09/67] [Bugfix] Fix HunyuanOCR cross-image contamination in batch processing (#30344) Signed-off-by: Lennart Brog Signed-off-by: Anker <20343812+anker-c2@users.noreply.github.com> --- vllm/model_executor/models/hunyuan_vision.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index e5c1be626be07..be084f4ee0f8e 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -502,6 +502,7 @@ class HunYuanVisionTransformer(nn.Module): cu_seqlens: list = [0] hidden_states = x.to(device=self.device, dtype=self.dtype) + # embeddings = patch_embeds + patch_pos_embed hidden_states = self.embeddings(hidden_states, grid_thw) for t, h, w in grid_thw: @@ -515,8 +516,14 @@ class HunYuanVisionTransformer(nn.Module): hidden_states = hidden_states.reshape(seq_len, -1) hidden_states = hidden_states.unsqueeze(0) - for layer_num, layer in enumerate(self.layers): - hidden_states = layer(hidden_states) + + # build per-image lengths once + split_lengths = [int(h) * int(w) for (_, h, w) in grid_thw] + for layer in self.layers: + # hidden_states: (1, T_total, D) + parts = hidden_states.split(split_lengths, dim=1) # list of (1, 
L_i, D) + parts = [layer(p) for p in parts] + hidden_states = torch.cat(parts, dim=1) # adapter split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() From a9e4106f28834315de4bfb1cb1186c9a2dc95856 Mon Sep 17 00:00:00 2001 From: Will Eaton Date: Wed, 10 Dec 2025 14:00:52 -0500 Subject: [PATCH 10/67] [P/D] KV Load Failure Recovery/Abort Configuration (#26813) Signed-off-by: Will Eaton Signed-off-by: Will Eaton Signed-off-by: Nick Hill Co-authored-by: Mark McLoughlin Co-authored-by: Nick Hill Co-authored-by: chaunceyjiang --- tests/entrypoints/openai/test_chat_error.py | 228 +++++++++ .../openai/test_completion_error.py | 216 +++++++++ .../openai/test_responses_error.py | 89 ++++ .../unit/test_cache_pollution_prevention.py | 163 +++++++ .../unit/test_error_propagation.py | 147 ++++++ .../unit/test_invalid_blocks_correctness.py | 454 ++++++++++++++++++ vllm/config/kv_transfer.py | 5 + vllm/entrypoints/openai/serving_chat.py | 17 +- vllm/entrypoints/openai/serving_completion.py | 15 +- vllm/entrypoints/openai/serving_engine.py | 61 +++ vllm/entrypoints/openai/serving_responses.py | 53 +- vllm/v1/core/block_pool.py | 19 + vllm/v1/core/kv_cache_manager.py | 8 + vllm/v1/core/sched/scheduler.py | 114 +++-- vllm/v1/engine/__init__.py | 9 +- vllm/v1/request.py | 2 + 16 files changed, 1552 insertions(+), 48 deletions(-) create mode 100644 tests/entrypoints/openai/test_chat_error.py create mode 100644 tests/entrypoints/openai/test_completion_error.py create mode 100644 tests/entrypoints/openai/test_responses_error.py create mode 100644 tests/v1/kv_connector/unit/test_cache_pollution_prevention.py create mode 100644 tests/v1/kv_connector/unit/test_error_propagation.py create mode 100644 tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py new file mode 100644 index 0000000000000..102eeaf614410 --- /dev/null +++ b/tests/entrypoints/openai/test_chat_error.py @@ -0,0 
+1,228 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm.config.multimodal import MultiModalConfig +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM + +MODEL_NAME = "openai-community/gpt2" +MODEL_NAME_SHORT = "gpt2" +BASE_MODEL_PATHS = [ + BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), + BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT), +] + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + task = "generate" + runner_type = "generate" + tokenizer = MODEL_NAME + trust_remote_code = False + tokenizer_mode = "auto" + max_model_len = 100 + tokenizer_revision = None + multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() + logits_processor_pattern = None + logits_processors: list[str] | None = None + diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + encoder_config = None + generation_config: str = "auto" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + skip_tokenizer_init = False + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: + models = OpenAIServingModels( + engine_client=engine, + base_model_paths=BASE_MODEL_PATHS, + ) + serving_chat = OpenAIServingChat( + engine, + models, + 
response_role="assistant", + request_logger=None, + chat_template=None, + chat_template_content_format="auto", + ) + + async def _fake_process_inputs( + request_id, + engine_prompt, + sampling_params, + *, + lora_request, + trace_headers, + priority, + ): + return dict(engine_prompt), {} + + async def _fake_preprocess_chat(*args, **kwargs): + # return conversation, request_prompts, engine_prompts + return ( + [{"role": "user", "content": "Test"}], + [[1, 2, 3]], + [{"prompt_token_ids": [1, 2, 3]}], + ) + + serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs) + serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat) + return serving_chat + + +@pytest.mark.asyncio +async def test_chat_error_non_stream(): + """test finish_reason='error' returns 500 InternalServerError (non-streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_chat = _build_serving_chat(mock_engine) + + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "Test prompt"}], + max_tokens=10, + stream=False, + ) + + response = await serving_chat.create_chat_completion(request) + + assert isinstance(response, ErrorResponse) + 
assert response.error.type == "InternalServerError" + assert response.error.message == "Internal server error" + assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_chat_error_stream(): + """test finish_reason='error' returns 500 InternalServerError (streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_chat = _build_serving_chat(mock_engine) + + completion_output_1 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ) + + request_output_1 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_1], + finished=False, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + completion_output_2 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output_2 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_2], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output_1 + yield request_output_2 + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "Test prompt"}], + max_tokens=10, + stream=True, + ) + + response = await serving_chat.create_chat_completion(request) + + chunks = [] + async for chunk in response: + chunks.append(chunk) + + assert 
len(chunks) >= 2 + assert any("Internal server error" in chunk for chunk in chunks), ( + f"Expected error message in chunks: {chunks}" + ) + assert chunks[-1] == "data: [DONE]\n\n" diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py new file mode 100644 index 0000000000000..ca56cc2ddb6a7 --- /dev/null +++ b/tests/entrypoints/openai/test_completion_error.py @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm.config.multimodal import MultiModalConfig +from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM + +MODEL_NAME = "openai-community/gpt2" +MODEL_NAME_SHORT = "gpt2" +BASE_MODEL_PATHS = [ + BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), + BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT), +] + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + task = "generate" + runner_type = "generate" + tokenizer = MODEL_NAME + trust_remote_code = False + tokenizer_mode = "auto" + max_model_len = 100 + tokenizer_revision = None + multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() + logits_processor_pattern = None + logits_processors: list[str] | None = None + diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + encoder_config = None + generation_config: 
str = "auto" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + skip_tokenizer_init = False + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: + models = OpenAIServingModels( + engine_client=engine, + base_model_paths=BASE_MODEL_PATHS, + ) + serving_completion = OpenAIServingCompletion( + engine, + models, + request_logger=None, + ) + + async def _fake_process_inputs( + request_id, + engine_prompt, + sampling_params, + *, + lora_request, + trace_headers, + priority, + ): + return dict(engine_prompt), {} + + serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs) + return serving_completion + + +@pytest.mark.asyncio +async def test_completion_error_non_stream(): + """test finish_reason='error' returns 500 InternalServerError (non-streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_completion = _build_serving_completion(mock_engine) + + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = CompletionRequest( + model=MODEL_NAME, + prompt="Test prompt", + max_tokens=10, + stream=False, + ) + + response = await serving_completion.create_completion(request) 
+ + assert isinstance(response, ErrorResponse) + assert response.error.type == "InternalServerError" + assert response.error.message == "Internal server error" + assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_completion_error_stream(): + """test finish_reason='error' returns 500 InternalServerError (streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_completion = _build_serving_completion(mock_engine) + + completion_output_1 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ) + + request_output_1 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_1], + finished=False, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + completion_output_2 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output_2 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_2], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output_1 + yield request_output_2 + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = CompletionRequest( + model=MODEL_NAME, + prompt="Test prompt", + max_tokens=10, + stream=True, + ) + + response = await serving_completion.create_completion(request) + + chunks = [] + async for chunk in response: + 
chunks.append(chunk) + + assert len(chunks) >= 2 + assert any("Internal server error" in chunk for chunk in chunks), ( + f"Expected error message in chunks: {chunks}" + ) + assert chunks[-1] == "data: [DONE]\n\n" diff --git a/tests/entrypoints/openai/test_responses_error.py b/tests/entrypoints/openai/test_responses_error.py new file mode 100644 index 0000000000000..f8ea178288835 --- /dev/null +++ b/tests/entrypoints/openai/test_responses_error.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from http import HTTPStatus +from unittest.mock import MagicMock + +import pytest + +from vllm.entrypoints.openai.protocol import ErrorResponse +from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing + + +@pytest.mark.asyncio +async def test_raise_if_error_raises_generation_error(): + """test _raise_if_error raises GenerationError""" + # create a minimal OpenAIServing instance + mock_engine = MagicMock() + mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # test that error finish_reason raises GenerationError + with pytest.raises(GenerationError) as exc_info: + serving._raise_if_error("error", "test-request-id") + + assert str(exc_info.value) == "Internal server error" + assert exc_info.value.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + + # test that other finish_reasons don't raise + serving._raise_if_error("stop", "test-request-id") # should not raise + serving._raise_if_error("length", "test-request-id") # should not raise + serving._raise_if_error(None, "test-request-id") # should not raise + + +@pytest.mark.asyncio +async def test_convert_generation_error_to_response(): + """test _convert_generation_error_to_response creates proper ErrorResponse""" + mock_engine = MagicMock() + 
mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # create a GenerationError + gen_error = GenerationError("Internal server error") + + # convert to ErrorResponse + error_response = serving._convert_generation_error_to_response(gen_error) + + assert isinstance(error_response, ErrorResponse) + assert error_response.error.type == "InternalServerError" + assert error_response.error.message == "Internal server error" + assert error_response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_convert_generation_error_to_streaming_response(): + """test _convert_generation_error_to_streaming_response output""" + mock_engine = MagicMock() + mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # create a GenerationError + gen_error = GenerationError("Internal server error") + + # convert to streaming error response + error_json = serving._convert_generation_error_to_streaming_response(gen_error) + + assert isinstance(error_json, str) + assert "Internal server error" in error_json + assert "InternalServerError" in error_json diff --git a/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py b/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py new file mode 100644 index 0000000000000..ec3fb8231e19e --- /dev/null +++ b/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +test that invalid blocks are evicted from prefix cache to prevent pollution. 
+ +verifies that when sync-loading fails, invalid blocks are removed from the +prefix cache hash table so future requests cannot match and reuse corrupted data. +""" + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +def test_invalid_blocks_evicted_prevents_cache_pollution( + fail_scheduler: Scheduler, +): + """ + verify invalid blocks are evicted to prevent future cache hits. + + scenario: + 1. request 1 loads externally-computed blocks (sync mode) + 2. some blocks fail to load and are marked invalid + 3. with fail policy, invalid blocks should be evicted from prefix cache + 4. 
request is marked as FINISHED_ERROR + """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + # request 1: will have invalid blocks + request1 = create_request(num_tokens=num_prompt_tokens, request_id=1) + fail_scheduler.add_request(request=request1) + + req_num_new_matched_tokens = { + request1.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + # request should be running with sync KV load + assert len(fail_scheduler.running) == 1 + assert request1.status == RequestStatus.RUNNING + + # get allocated block IDs + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # get the block object to verify eviction later + block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + + # cache the blocks to simulate they've been computed and cached + # (in real scenario blocks would be cached after compute) + fail_scheduler.kv_cache_manager.cache_blocks(request1, num_external_computed_tokens) + + # verify block has a hash (is cached) before reporting invalid blocks + assert block.block_hash is not None, ( + f"block {invalid_block_id} should be cached (have a hash) before " + f"eviction test, but hash is None" + ) + + # report invalid blocks + model_runner_output = create_model_runner_output( + [request1], + invalid_block_ids=invalid_block_ids, + 
use_eos=False, + ) + + fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify request finished with error (fail policy) + assert request1.status == RequestStatus.FINISHED_ERROR + + # critical assertion: invalid block and all subsequent blocks should be evicted + # all blocks from invalid_block_idx onwards become invalid since they were + # computed based on the failed block + for idx in range(invalid_block_idx, len(req_block_ids)): + block_id = req_block_ids[idx] + block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id] + assert block_obj.block_hash is None, ( + f"block {block_id} at index {idx} should have been evicted " + f"(hash reset to None), but hash is {block_obj.block_hash}. " + f"All blocks from index {invalid_block_idx} onwards should be evicted " + f"since they depend on the invalid block at index {invalid_block_idx}." + ) + + # verify cache contains exactly the valid blocks (before first affected block) + # and none of the invalid blocks (from first affected block onwards) + + # valid blocks: all blocks before invalid_block_idx should be cached + for idx in range(invalid_block_idx): + block_id = req_block_ids[idx] + block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id] + assert block_obj.block_hash is not None, ( + f"valid block {block_id} at index {idx} should still be cached " + f"(have a hash), but hash is None. Only blocks from index " + f"{invalid_block_idx} onwards should be evicted." 
+ ) + + # invalid blocks: verify they're not in the cached_block_hash_to_block map + cached_blocks = ( + fail_scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block + ) + cached_block_ids = { + b.block_id + for blocks_val in cached_blocks._cache.values() + for b in ( + [blocks_val] if not isinstance(blocks_val, dict) else blocks_val.values() + ) + } + + for idx in range(invalid_block_idx, len(req_block_ids)): + block_id = req_block_ids[idx] + assert block_id not in cached_block_ids, ( + f"invalid block {block_id} at index {idx} should not be in cache hash table" + ) diff --git a/tests/v1/kv_connector/unit/test_error_propagation.py b/tests/v1/kv_connector/unit/test_error_propagation.py new file mode 100644 index 0000000000000..20e181f379f5c --- /dev/null +++ b/tests/v1/kv_connector/unit/test_error_propagation.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import FinishReason, Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +def test_error_propagation_sync_load(fail_scheduler: Scheduler): 
+ """test invalid_block_ids with fail policy -> FINISHED_ERROR (sync load)""" + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + assert len(fail_scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + assert fail_scheduler.connector.get_num_new_matched_tokens.call_count == 1 + + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_ids = {req_block_ids[invalid_block_idx]} + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + assert len(outputs) == 1 + engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + assert len(fail_scheduler.running) == 0 + + +def test_error_propagation_async_load(fail_scheduler: Scheduler): + """test invalid_block_ids with fail policy -> FINISHED_ERROR (async load)""" + num_prompt_blocks = 
100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, True) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + assert len(fail_scheduler.waiting) == 1 + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.num_computed_tokens == 0 + + (req_block_ids,) = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id) + invalid_block_ids = {req_block_ids[invalid_block_idx]} + model_runner_output = create_model_runner_output( + reqs=[], + finished_recving=set(), + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + assert len(outputs) == 1 + engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + assert len(fail_scheduler.waiting) == 0 diff --git a/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py new file mode 100644 index 0000000000000..940f3a98308b6 --- /dev/null +++ 
b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py @@ -0,0 +1,454 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Tests for correctness in invalid block handling. + +These tests verify correct behavior in three scenarios: +1. Sync recompute case: Blocks should not be freed for running requests + that need to recompute invalid blocks +2. Sync fail case: Invalid blocks must be evicted from cache when request fails +3. Async recompute case: Invalid blocks should not be cached after transfer +""" + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import FinishReason, Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +@pytest.fixture +def recompute_scheduler(): + """scheduler with kv_load_failure_policy='recompute'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "recompute" + return create_scheduler(vllm_config) + + +def test_sync_recompute_blocks_not_freed_for_running_requests( + recompute_scheduler: Scheduler, +): + """ + Test sync recompute case - blocks must not be freed for running requests. 
+ + When a running request has invalid blocks and retry_policy is 'recompute': + 1. Request should remain in RUNNING state + 2. num_computed_tokens should be truncated to invalid block boundary + 3. Blocks should NOT be freed (request still needs them for recomputation) + 4. Request should remain in scheduler.requests and scheduler.running + """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * recompute_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + recompute_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + recompute_scheduler.connector = Mock() + recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + recompute_scheduler.connector.request_finished.return_value = (False, None) + recompute_scheduler.connector.take_events.return_value = () + + scheduler_output = recompute_scheduler.schedule() + + # request should be running with sync KV load + assert len(recompute_scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + assert request.status == RequestStatus.RUNNING + + # get the allocated block IDs before invalid blocks are reported + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_ids = {req_block_ids[invalid_block_idx]} + + # store original num_computed_tokens for comparison + original_num_computed_tokens = request.num_computed_tokens + + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=False, # not finished - should continue running + ) + + outputs = recompute_scheduler.update_from_output( + scheduler_output, 
model_runner_output + ) + + # critical assertions for recompute case: + + # 1. request should still be RUNNING (not finished, not aborted) + assert request.status == RequestStatus.RUNNING, ( + f"Request should remain RUNNING for recompute, got {request.status}" + ) + + # 2. num_computed_tokens should be truncated to first invalid block + expected_truncated_tokens = invalid_block_idx * recompute_scheduler.block_size + assert request.num_computed_tokens == expected_truncated_tokens, ( + f"num_computed_tokens should be truncated to {expected_truncated_tokens}, " + f"got {request.num_computed_tokens}" + ) + assert request.num_computed_tokens < original_num_computed_tokens, ( + "num_computed_tokens should be reduced after invalid block detection" + ) + + # 3. no output should be generated (request is still running) + # the request should be skipped in the output loop + assert len(outputs) == 0 or request.request_id not in [ + out.request_id for outs in outputs.values() for out in outs.outputs + ], "No output should be generated for recompute requests" + + # 4. request should still be in running queue + assert request in recompute_scheduler.running, ( + "Request should remain in running queue for recomputation" + ) + + # 5. request should still be in scheduler.requests (not deleted) + assert request.request_id in recompute_scheduler.requests, ( + "Request should not be deleted from scheduler.requests" + ) + + # 6. blocks should NOT be freed - verify blocks are still allocated + try: + allocated_blocks = recompute_scheduler.kv_cache_manager.get_block_ids( + request.request_id + ) + assert allocated_blocks is not None + assert len(allocated_blocks[0]) > 0, ( + "Blocks should still be allocated for recomputation" + ) + except KeyError: + pytest.fail( + "Blocks were freed incorrectly! Running requests need their blocks " + "to recompute invalid portions." + ) + + # 7. 
verify request can be rescheduled in next step + scheduler_output_2 = recompute_scheduler.schedule() + + # request should appear in the new schedule to recompute invalid blocks + scheduled_req_ids = [ + req.request_id for req in scheduler_output_2.scheduled_new_reqs + ] + if scheduler_output_2.num_scheduled_tokens: + scheduled_req_ids.extend(scheduler_output_2.num_scheduled_tokens.keys()) + + assert ( + request.request_id in scheduled_req_ids or len(recompute_scheduler.running) > 0 + ), "Request should be reschedulable for recomputation" + + +def test_sync_fail_invalid_blocks_evicted(fail_scheduler: Scheduler): + """ + Test sync fail case - invalid blocks must be evicted from cache. + + When a request fails with policy='fail' and has invalid blocks from sync loading: + 1. Request should be finished with FINISHED_ERROR + 2. Invalid blocks should be evicted from the KV cache + 3. Valid blocks (if shared) should remain in cache + 4. Future requests should not reuse the invalid blocks + + This test verifies that invalid blocks are properly evicted to prevent + cache corruption and reuse of invalid data. 
+ """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + # request should be running with sync KV load + assert len(fail_scheduler.running) == 1 + assert request.status == RequestStatus.RUNNING + + # get allocated block IDs + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # verify the block is in the block pool before we report it as invalid + block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + assert block is not None + + # report invalid blocks - request should fail + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify request is finished with error + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + # verify output is generated + assert len(outputs) == 1 + engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert 
output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + # verify the request was removed from scheduler + assert request.request_id not in fail_scheduler.requests + assert len(fail_scheduler.running) == 0 + + # critical: verify invalid block was actually freed from cache + # this is the key assertion - the invalid block should no longer be + # tracked by the KV cache manager for this request + # if it's still there, a future request could reuse the invalid data + try: + block_ids = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id) + # if we get here, check if blocks were actually freed + if block_ids is not None and len(block_ids[0]) > 0: + pytest.fail( + f"Invalid blocks still tracked for finished request! " + f"Request {request.request_id} should have been freed but " + f"still has {len(block_ids[0])} blocks allocated." + ) + # blocks list exists but is empty - this is fine, they were freed + except KeyError: + # expected - request completely removed from tracking + pass + + # critical: verify invalid block was evicted from prefix cache + # the block should no longer have a hash (hash is reset on eviction) + assert block.block_hash is None, ( + f"Invalid block {invalid_block_id} should have been evicted from cache " + f"(hash should be None), but hash is still {block.block_hash}" + ) + + +def test_async_recompute_blocks_not_cached_when_invalid( + recompute_scheduler: Scheduler, +): + """ + Test async recompute case - invalid blocks not cached after transfer. + + When async KV loading has invalid blocks and retry_policy is 'recompute': + 1. Blocks are allocated but not cached yet + 2. When async transfer completes, only valid blocks should be cached + 3. 
Invalid blocks should never enter the prefix cache + + This test verifies correctness, the failed_recving_kv_req_ids protection + ensures only valid blocks are cached when the transfer completes, and we + only evict blocks from cache that are already hashed in the block table. + """ + from unittest.mock import patch + + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * recompute_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + recompute_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating async load + recompute_scheduler.connector = Mock() + recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, True) + ) + recompute_scheduler.connector.request_finished.return_value = (False, None) + recompute_scheduler.connector.take_events.return_value = () + + scheduler_output = recompute_scheduler.schedule() + + # request should be waiting for remote KVs + assert len(recompute_scheduler.waiting) == 1 + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.num_computed_tokens == 0 + + # get the allocated block IDs + (req_block_ids,) = recompute_scheduler.kv_cache_manager.get_block_ids( + request.request_id + ) + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # get the block object to verify it's not cached yet and stays uncached + block = recompute_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + + # verify block has no hash before invalid blocks are reported + assert block.block_hash is None, ( + "Async loading blocks should not be cached yet (no hash)" + ) + + # report invalid blocks 
(transfer not finished yet) + model_runner_output = create_model_runner_output( + reqs=[], + finished_recving=None, # transfer NOT finished + invalid_block_ids=invalid_block_ids, + use_eos=False, + ) + + # critical: spy on evict_blocks to verify it's NOT called for async blocks + original_evict_blocks = recompute_scheduler.kv_cache_manager.evict_blocks + evict_blocks_calls = [] + + def evict_blocks_spy(block_ids): + evict_blocks_calls.append(set(block_ids)) + return original_evict_blocks(block_ids) + + with patch.object( + recompute_scheduler.kv_cache_manager, "evict_blocks", evict_blocks_spy + ): + recompute_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify evict_blocks was NOT called (async blocks excluded from eviction) + assert len(evict_blocks_calls) == 0, ( + f"evict_blocks should not be called for async-only invalid blocks, " + f"but was called {len(evict_blocks_calls)} time(s) with {evict_blocks_calls}" + ) + + # request should still be waiting (not finished with error due to recompute policy) + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids + + # verify num_computed_tokens was truncated to before invalid block + expected_valid_tokens = invalid_block_idx * recompute_scheduler.block_size + assert request.num_computed_tokens == expected_valid_tokens + + # verify invalid block still has no hash (was not evicted) + assert block.block_hash is None, ( + f"Async loading blocks shouldn't be cached or evicted. 
" + f"Block {invalid_block_id} hash should be None but is {block.block_hash}" + ) + + # now simulate async transfer completing + model_runner_output_2 = create_model_runner_output( + reqs=[], + finished_recving={request.request_id}, + invalid_block_ids=None, + use_eos=False, + ) + + recompute_scheduler.update_from_output(scheduler_output, model_runner_output_2) + + # verify request is now marked as finished receiving and ready to be processed + assert request.request_id in recompute_scheduler.finished_recving_kv_req_ids + assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids + + # critical: verify invalid block still has no hash before recompute + # the async transfer invalid data was never cached + assert block.block_hash is None, ( + f"Invalid block {invalid_block_id} should not be cached before recompute " + f"(hash should be None), but hash is {block.block_hash}" + ) + + # critical end-to-end test: spy on cache_blocks to verify it's called with + # the truncated num_computed_tokens value + original_cache_blocks = recompute_scheduler.kv_cache_manager.cache_blocks + cache_blocks_calls = [] + + def cache_blocks_spy(req, num_tokens): + cache_blocks_calls.append((req.request_id, num_tokens)) + return original_cache_blocks(req, num_tokens) + + with patch.object( + recompute_scheduler.kv_cache_manager, "cache_blocks", cache_blocks_spy + ): + # call schedule() again - this triggers _update_waiting_for_remote_kv() + # which should call cache_blocks with the truncated value + recompute_scheduler.schedule() + + # verify cache_blocks was called with the truncated value + assert len(cache_blocks_calls) == 1, ( + f"cache_blocks should be called exactly once, " + f"got {len(cache_blocks_calls)} calls" + ) + cached_req_id, cached_num_tokens = cache_blocks_calls[0] + assert cached_req_id == request.request_id + assert cached_num_tokens == expected_valid_tokens, ( + f"cache_blocks should be called with truncated value {expected_valid_tokens}, " + f"but was 
called with {cached_num_tokens}" + ) + + # request should now be RUNNING (scheduled immediately after transfer completes) + # the flow is: WAITING_FOR_REMOTE_KVS -> WAITING -> RUNNING in same schedule() call + assert request.status == RequestStatus.RUNNING + + # num_computed_tokens should be >= expected_valid_tokens because the scheduler + # will schedule additional new tokens (up to max_num_batched_tokens) for the request + assert request.num_computed_tokens >= expected_valid_tokens, ( + f"num_computed_tokens should be at least {expected_valid_tokens}, " + f"got {request.num_computed_tokens}" + ) + + # request should no longer be in the failed/finished receiving sets + assert request.request_id not in recompute_scheduler.failed_recving_kv_req_ids + assert request.request_id not in recompute_scheduler.finished_recving_kv_req_ids + + # request should be in the running queue + assert request in recompute_scheduler.running diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py index 88f8b91c292bb..98cea821c678e 100644 --- a/vllm/config/kv_transfer.py +++ b/vllm/config/kv_transfer.py @@ -64,6 +64,11 @@ class KVTransferConfig: enable_permute_local_kv: bool = False """Experiment feature flag to enable HND to NHD KV Transfer""" + kv_load_failure_policy: Literal["recompute", "fail"] = "recompute" + """Policy for handling KV cache load failures. 
+ 'recompute': reschedule the request to recompute failed blocks (default) + 'fail': immediately fail the request with an error finish reason""" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index c6333d170c663..2560a5b2cdf41 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -51,7 +51,11 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, + clamp_prompt_logprobs, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall @@ -380,6 +384,8 @@ class OpenAIServingChat(OpenAIServing): tokenizer, request_metadata, ) + except GenerationError as e: + return self._convert_generation_error_to_response(e) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -1120,6 +1126,10 @@ class OpenAIServingChat(OpenAIServing): # if the model is finished generating else: + # check for error finish reason and abort streaming + # finish_reason='error' indicates a retryable error + self._raise_if_error(output.finish_reason, request_id) + # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing @@ -1287,6 +1297,8 @@ class OpenAIServingChat(OpenAIServing): delta=False, ) + except GenerationError as e: + yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") @@ 
-1327,6 +1339,9 @@ class OpenAIServingChat(OpenAIServing): role = self.get_chat_request_role(request) for output in final_res.outputs: + # check for error finish reason and raise GenerationError + # finish_reason='error' indicates a retryable request-level internal error + self._raise_if_error(output.finish_reason, request_id) token_ids = output.token_ids out_logprobs = output.logprobs tool_call_info = None diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 3e421e21e3e80..1be0afc8c74e5 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -24,7 +24,11 @@ from vllm.entrypoints.openai.protocol import ( RequestResponseMetadata, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, + clamp_prompt_logprobs, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.utils import get_max_tokens, should_include_usage @@ -300,6 +304,8 @@ class OpenAIServingCompletion(OpenAIServing): ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") + except GenerationError as e: + return self._convert_generation_error_to_response(e) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -437,6 +443,8 @@ class OpenAIServingCompletion(OpenAIServing): finish_reason = output.finish_reason stop_reason = output.stop_reason + self._raise_if_error(finish_reason, request_id) + chunk = CompletionStreamResponse( id=request_id, created=created_time, @@ -498,8 +506,11 @@ class OpenAIServingCompletion(OpenAIServing): # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info + except GenerationError as e: + 
yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: # TODO: Use a vllm-specific Validation Error + logger.exception("Error in completion stream generator.") data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" yield "data: [DONE]\n\n" @@ -530,6 +541,8 @@ class OpenAIServingCompletion(OpenAIServing): out_logprobs: GenericSequence[dict[int, Logprob] | None] | None for output in final_res.outputs: + self._raise_if_error(output.finish_reason, request_id) + assert request.max_tokens is not None if request.echo: if request.return_token_ids: diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 44b0f1842a6c1..a799432baeb40 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -133,6 +133,15 @@ from vllm.utils.async_utils import ( from vllm.utils.collection_utils import is_list_of from vllm.v1.engine import EngineCoreRequest + +class GenerationError(Exception): + """raised when finish_reason indicates internal server error (500)""" + + def __init__(self, message: str = "Internal server error"): + super().__init__(message) + self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR + + logger = init_logger(__name__) CompletionLikeRequest: TypeAlias = ( @@ -456,6 +465,29 @@ class OpenAIServing: # Iterate through all beam inference results for i, result in enumerate(output): current_beam = all_beams[i] + + # check for error finish reason and abort beam search + if result.outputs[0].finish_reason == "error": + # yield error output and terminate beam search + yield RequestOutput( + request_id=request_id, + prompt=prompt_text, + outputs=[ + CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + ], + finished=True, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, + ) + return + if result.outputs[0].logprobs is not None: 
logprobs = result.outputs[0].logprobs[0] all_beams_token_id.extend(list(logprobs.keys())) @@ -780,6 +812,35 @@ class OpenAIServing: ) return json_str + def _raise_if_error(self, finish_reason: str | None, request_id: str) -> None: + """Raise GenerationError if finish_reason indicates an error.""" + if finish_reason == "error": + logger.error( + "Request %s failed with an internal error during generation", + request_id, + ) + raise GenerationError("Internal server error") + + def _convert_generation_error_to_response( + self, e: GenerationError + ) -> ErrorResponse: + """Convert GenerationError to ErrorResponse.""" + return self.create_error_response( + str(e), + err_type="InternalServerError", + status_code=e.status_code, + ) + + def _convert_generation_error_to_streaming_response( + self, e: GenerationError + ) -> str: + """Convert GenerationError to streaming error response.""" + return self.create_streaming_error_response( + str(e), + err_type="InternalServerError", + status_code=e.status_code, + ) + async def _check_model( self, request: AnyRequest, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 91616a78e11dc..60d14337dcaaf 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -50,6 +50,7 @@ from openai.types.responses.response_reasoning_item import ( ) from openai.types.responses.tool import Mcp, Tool from openai_harmony import Message as OpenAIHarmonyMessage +from pydantic import TypeAdapter from vllm import envs from vllm.engine.protocol import EngineClient @@ -94,7 +95,10 @@ from vllm.entrypoints.openai.protocol import ( ResponseUsage, StreamingResponsesResponse, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.responses_utils import ( 
construct_input_messages, @@ -541,6 +545,8 @@ class OpenAIServingResponses(OpenAIServing): tokenizer, request_metadata, ) + except GenerationError as e: + return self._convert_generation_error_to_response(e) except Exception as e: return self.create_error_response(str(e)) @@ -648,6 +654,8 @@ class OpenAIServingResponses(OpenAIServing): status = "incomplete" elif context.finish_reason == "abort": status = "cancelled" + else: + self._raise_if_error(context.finish_reason, request.request_id) else: status = "incomplete" elif isinstance(context, ParsableContext): @@ -673,6 +681,9 @@ class OpenAIServingResponses(OpenAIServing): assert len(final_res.outputs) == 1 final_output = final_res.outputs[0] + # finish_reason='error' indicates retryable internal error + self._raise_if_error(final_output.finish_reason, request.request_id) + output = self._make_response_output_items(request, final_output, tokenizer) if request.enable_response_messages: @@ -1066,6 +1077,8 @@ class OpenAIServingResponses(OpenAIServing): async for event in generator: event_deque.append(event) new_event_signal.set() # Signal new event available + except GenerationError as e: + response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) response = self.create_error_response(str(e)) @@ -1089,6 +1102,8 @@ class OpenAIServingResponses(OpenAIServing): ): try: response = await self.responses_full_generator(request, *args, **kwargs) + except GenerationError as e: + response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) response = self.create_error_response(str(e)) @@ -1227,6 +1242,8 @@ class OpenAIServingResponses(OpenAIServing): continue if ctx.last_output.outputs: output = ctx.last_output.outputs[0] + # finish_reason='error' indicates a retryable error + self._raise_if_error(output.finish_reason, request.request_id) if 
reasoning_parser: delta_message = reasoning_parser.extract_reasoning_streaming( previous_text=previous_text, @@ -1522,6 +1539,9 @@ class OpenAIServingResponses(OpenAIServing): async for ctx in result_generator: assert isinstance(ctx, StreamingHarmonyContext) + # finish_reason='error' indicates a retryable error + self._raise_if_error(ctx.finish_reason, request.request_id) + if ctx.is_expecting_start(): current_output_index += 1 sent_output_item_added = False @@ -2016,18 +2036,25 @@ class OpenAIServingResponses(OpenAIServing): ) ) - async for event_data in processer( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - created_time, - _increment_sequence_number_and_return, - ): - yield event_data + try: + async for event_data in processer( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + created_time, + _increment_sequence_number_and_return, + ): + yield event_data + except GenerationError as e: + error_json = self._convert_generation_error_to_streaming_response(e) + yield _increment_sequence_number_and_return( + TypeAdapter(StreamingResponsesResponse).validate_json(error_json) + ) + return async def empty_async_generator(): # A hack to trick Python to think this is a generator but diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index cfb2c02e00f1b..c779e3d34b3ed 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -397,6 +397,25 @@ class BlockPool: [block for block in blocks_list if block.ref_cnt == 0 and not block.is_null] ) + def evict_blocks(self, block_ids: set[int]) -> None: + """evict blocks from the prefix cache by their block IDs. + + only evicts blocks that are currently cached (have a hash). blocks + with ref_cnt > 0 are not freed from the block pool, only evicted + from the prefix cache hash table. + + Args: + block_ids: Set of block IDs to evict from cache. 
+ """ + for block_id in block_ids: + assert block_id < len(self.blocks), ( + f"Invalid block_id {block_id} >= {len(self.blocks)}. " + f"This indicates a bug in the KV connector - workers should " + f"only report block IDs that were allocated by the scheduler." + ) + block = self.blocks[block_id] + self._maybe_evict_cached_block(block) + def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF flows to invalid prefix caching after the weights are updated, diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 33e8c81514c5f..13086a66f6ea6 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -333,6 +333,14 @@ class KVCacheManager: """ self.coordinator.free(request.request_id) + def evict_blocks(self, block_ids: set[int]) -> None: + """evict blocks from the prefix cache by their block IDs. + + Args: + block_ids: Set of block IDs to evict from cache. + """ + self.block_pool.evict_blocks(block_ids) + def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF flows to invalidate prefix caching after the weights are updated, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d858e840039c4..c3d504f2e72c3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -106,6 +106,7 @@ class Scheduler(SchedulerInterface): # KV Connector pushes/pull of remote KVs for P/D and offloading. 
self.connector = None self.connector_prefix_cache_stats: PrefixCacheStats | None = None + self.recompute_kv_load_failures = True if self.vllm_config.kv_transfer_config is not None: assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" @@ -117,6 +118,10 @@ class Scheduler(SchedulerInterface): ) if self.log_stats: self.connector_prefix_cache_stats = PrefixCacheStats() + kv_load_failure_policy = ( + self.vllm_config.kv_transfer_config.kv_load_failure_policy + ) + self.recompute_kv_load_failures = kv_load_failure_policy == "recompute" self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, @@ -1066,7 +1071,7 @@ class Scheduler(SchedulerInterface): for req_id, num_tokens_scheduled in num_scheduled_tokens.items(): assert num_tokens_scheduled > 0 if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids: - # Skip requests that were recovered from KV load failure + # skip failed or rescheduled requests from KV load failure continue request = self.requests.get(req_id) if request is None: @@ -1177,6 +1182,21 @@ class Scheduler(SchedulerInterface): # This is a rare case and unlikely to impact performance. self.waiting.remove_requests(stopped_preempted_reqs) + if failed_kv_load_req_ids and not self.recompute_kv_load_failures: + requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids] + self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR) + for request in requests: + outputs[request.client_index].append( + EngineCoreOutput( + request_id=request.request_id, + new_token_ids=[], + finish_reason=request.get_finished_reason(), + events=request.take_events(), + trace_headers=request.trace_headers, + num_cached_tokens=request.num_cached_tokens, + ) + ) + # KV Connector: update state for finished KV Transfers. 
if kv_connector_output: self._update_from_kv_xfer_finished(kv_connector_output) @@ -1610,8 +1630,11 @@ class Scheduler(SchedulerInterface): self._free_blocks(self.requests[req_id]) def _update_requests_with_invalid_blocks( - self, requests: Iterable[Request], invalid_block_ids: set[int] - ) -> tuple[set[str], int]: + self, + requests: Iterable[Request], + invalid_block_ids: set[int], + evict_blocks: bool = True, + ) -> tuple[set[str], int, set[int]]: """ Identify and update requests affected by invalid KV cache blocks. @@ -1623,16 +1646,21 @@ class Scheduler(SchedulerInterface): Args: requests: The set of requests to scan for invalid blocks. invalid_block_ids: IDs of invalid blocks. + evict_blocks: Whether to collect blocks for eviction (False for + async requests which aren't cached yet). Returns: tuple: - affected_req_ids (set[str]): IDs of requests impacted by invalid blocks. - total_affected_tokens (int): Total number of tokens that must - be recomputed across all affected requests (for observability). + be recomputed across all affected requests. + - blocks_to_evict (set[int]): Block IDs to evict from cache, + including invalid blocks and downstream dependent blocks. """ affected_req_ids: set[str] = set() total_affected_tokens = 0 + blocks_to_evict: set[int] = set() # If a block is invalid and shared by multiple requests in the batch, # these requests must be rescheduled, but only the first will recompute # it. This set tracks blocks already marked for recomputation. 
@@ -1690,6 +1718,9 @@ class Scheduler(SchedulerInterface): ) total_affected_tokens += num_affected_tokens request.num_external_computed_tokens -= num_affected_tokens + # collect invalid block and all downstream dependent blocks + if evict_blocks: + blocks_to_evict.update(req_block_ids[idx:]) if is_affected: if not marked_invalid_block: @@ -1705,47 +1736,70 @@ class Scheduler(SchedulerInterface): affected_req_ids.add(request.request_id) - return affected_req_ids, total_affected_tokens + return affected_req_ids, total_affected_tokens, blocks_to_evict def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]: - total_requests_to_reschedule = 0 - total_tokens_to_reschedule = 0 + """ + Handle requests affected by invalid KV cache blocks. - # --- Handle async KV loads (WAITING_FOR_REMOTE_KVS) --- + Returns: + Set of affected request IDs to skip in update_from_output main loop. + """ + should_fail = not self.recompute_kv_load_failures + + # handle async KV loads (not cached yet, evict_blocks=False) async_load_reqs = ( req for req in self.waiting if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS ) - async_affected_req_ids, num_tokens_to_reschedule = ( + async_failed_req_ids, num_failed_tokens, _ = ( self._update_requests_with_invalid_blocks( - async_load_reqs, invalid_block_ids + async_load_reqs, invalid_block_ids, evict_blocks=False ) ) - total_requests_to_reschedule += len(async_affected_req_ids) - total_tokens_to_reschedule += num_tokens_to_reschedule + total_failed_requests = len(async_failed_req_ids) + total_failed_tokens = num_failed_tokens - # Mark requests with async KV load failures; they will be rescheduled - # once loading completes. 
- self.failed_recving_kv_req_ids |= async_affected_req_ids - - # --- Handle sync KV loads (running requests) --- - sync_affected_req_ids, num_tokens_to_reschedule = ( - self._update_requests_with_invalid_blocks(self.running, invalid_block_ids) + # handle sync loads (may be cached, collect blocks for eviction) + sync_failed_req_ids, num_failed_tokens, sync_blocks_to_evict = ( + self._update_requests_with_invalid_blocks( + self.running, invalid_block_ids, evict_blocks=True + ) ) - total_requests_to_reschedule += len(sync_affected_req_ids) - total_tokens_to_reschedule += num_tokens_to_reschedule + total_failed_requests += len(sync_failed_req_ids) + total_failed_tokens += num_failed_tokens - if total_requests_to_reschedule: - logger.warning( - "Recovered from KV load failure: " - "%d request(s) rescheduled (%d tokens affected).", - total_requests_to_reschedule, - total_tokens_to_reschedule, + if not total_failed_requests: + return set() + + # evict invalid blocks and downstream dependent blocks from cache + # only when not using recompute policy (where blocks will be recomputed + # and reused by other requests sharing them) + if sync_blocks_to_evict and not self.recompute_kv_load_failures: + self.kv_cache_manager.evict_blocks(sync_blocks_to_evict) + + if should_fail: + all_failed_req_ids = async_failed_req_ids | sync_failed_req_ids + logger.error( + "Failing %d request(s) due to KV load failure " + "(failure_policy=fail, %d tokens affected). Request IDs: %s", + total_failed_requests, + total_failed_tokens, + all_failed_req_ids, ) + return all_failed_req_ids - # Return the IDs of affected running requests to skip in - # update_from_output. 
- return sync_affected_req_ids + logger.warning( + "Recovered from KV load failure: " + "%d request(s) rescheduled (%d tokens affected).", + total_failed_requests, + total_failed_tokens, + ) + + # Mark async requests with KV load failures for retry once loading completes + self.failed_recving_kv_req_ids |= async_failed_req_ids + # Return sync affected IDs to skip in update_from_output + return sync_failed_req_ids diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index ce2aae77108da..4f54d12f4b8d0 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -19,24 +19,27 @@ from vllm.v1.serial_utils import UtilityResult # These are possible values of RequestOutput.finish_reason, # so form part of the external API. -FINISH_REASON_STRINGS = ("stop", "length", "abort") +FINISH_REASON_STRINGS = ("stop", "length", "abort", "error") class FinishReason(enum.IntEnum): """ - Reason a request finished - stop, length, or abort. + Reason a request finished - stop, length, abort, or error. Int rather than Str for more compact serialization. stop - a stop string was emitted length - max_tokens was consumed, or max_model_len was reached - abort - aborted for another reason + abort - aborted by client + error - retryable request-level internal error (e.g., KV load failure). + Invariant: always converted to 500 Internal Server Error. 
""" STOP = 0 LENGTH = 1 ABORT = 2 + ERROR = 3 def __str__(self): return FINISH_REASON_STRINGS[self.value] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 33762fe34e64f..a775e840e841c 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum): FINISHED_LENGTH_CAPPED = enum.auto() FINISHED_ABORTED = enum.auto() FINISHED_IGNORED = enum.auto() + FINISHED_ERROR = enum.auto() def __str__(self): return self.name @@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = { RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH, RequestStatus.FINISHED_ABORTED: FinishReason.ABORT, RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH, + RequestStatus.FINISHED_ERROR: FinishReason.ERROR, } From e72d65b959f759fcf56b329ecaaee7d166c012d2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 03:10:58 +0800 Subject: [PATCH 11/67] {Deprecation] Remove tokenizer setter (#30400) Signed-off-by: DarkLight1337 --- vllm/entrypoints/llm.py | 13 +------------ vllm/v1/engine/async_llm.py | 4 ---- vllm/v1/engine/input_processor.py | 4 ---- vllm/v1/engine/llm_engine.py | 4 ---- 4 files changed, 1 insertion(+), 24 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5d5c4a1cdb77b..3fce3338503ef 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,7 +9,7 @@ import cloudpickle import torch.nn as nn from pydantic import ValidationError from tqdm.auto import tqdm -from typing_extensions import TypeVar, deprecated +from typing_extensions import TypeVar from vllm.beam_search import ( BeamSearchInstance, @@ -73,7 +73,6 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams from vllm.tasks import PoolingTask from vllm.tokenizers import MistralTokenizer, TokenizerLike -from vllm.tokenizers.hf import get_cached_tokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils.collection_utils import as_iter, 
is_list_of from vllm.utils.counter import Counter @@ -367,16 +366,6 @@ class LLM: def get_tokenizer(self) -> TokenizerLike: return self.llm_engine.get_tokenizer() - @deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.") - def set_tokenizer(self, tokenizer: TokenizerLike) -> None: - # While CachedTokenizer is dynamic, have no choice but - # compare class name. Misjudgment will arise from - # user-defined tokenizer started with 'Cached' - if tokenizer.__class__.__name__.startswith("Cached"): - self.llm_engine.tokenizer = tokenizer - else: - self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer) - def reset_mm_cache(self) -> None: self.input_processor.clear_mm_cache() self.llm_engine.reset_mm_cache() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fa3fb7a18895a..8eff61563ccea 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -701,10 +701,6 @@ class AsyncLLM(EngineClient): def tokenizer(self) -> TokenizerLike | None: return self.input_processor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_processor.tokenizer = tokenizer - async def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index e6a94f4e3de5d..a3c18464d3f52 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -64,10 +64,6 @@ class InputProcessor: def tokenizer(self) -> TokenizerLike | None: return self.input_preprocessor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_preprocessor.tokenizer = tokenizer - def _validate_logprobs( self, params: SamplingParams, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1cb206c4e004c..4422eced82fea 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -358,10 +358,6 @@ 
class LLMEngine: def tokenizer(self) -> TokenizerLike | None: return self.input_processor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_processor.tokenizer = tokenizer - def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( From 9f042ba26b59e1bfc9bef031165033fa931f3457 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 10 Dec 2025 11:13:01 -0800 Subject: [PATCH 12/67] [Perf] Enable environment cache in EngineCore to enable the feature for UniProcExecutor as well (#29289) Signed-off-by: Jialin Ouyang --- tests/test_envs.py | 38 ++++++++++++++++++++++++++++++ vllm/distributed/parallel_state.py | 2 ++ vllm/envs.py | 20 ++++++++++++++++ vllm/v1/engine/core.py | 7 +++--- 4 files changed, 63 insertions(+), 4 deletions(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index 11bbec38202bf..b6b7cf38d4abc 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -8,6 +8,7 @@ import pytest import vllm.envs as envs from vllm.envs import ( + disable_envs_cache, enable_envs_cache, env_list_with_choices, env_set_with_choices, @@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch): envs.__getattr__ = envs.__getattr__.__wrapped__ +def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1") + # __getattr__ is not decorated with functools.cache + assert not hasattr(envs.__getattr__, "cache_info") + + # Enable envs cache and ignore ongoing environment changes + enable_envs_cache() + assert envs.VLLM_HOST_IP == "1.1.1.1" + # With cache enabled, the environment variable value is cached and unchanged + monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2") + assert envs.VLLM_HOST_IP == "1.1.1.1" + + disable_envs_cache() + assert envs.VLLM_HOST_IP == "2.2.2.2" + # After cache disabled, the environment variable value would be synced + # with os.environ + monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3") + assert 
envs.VLLM_HOST_IP == "3.3.3.3" + + +def test_is_envs_cache_enabled() -> None: + assert not envs._is_envs_cache_enabled() + enable_envs_cache() + assert envs._is_envs_cache_enabled() + + # Only wrap one-layer of cache, so we only need to + # call disable once to reset. + enable_envs_cache() + enable_envs_cache() + enable_envs_cache() + disable_envs_cache() + assert not envs._is_envs_cache_enabled() + + disable_envs_cache() + assert not envs._is_envs_cache_enabled() + + class TestEnvWithChoices: """Test cases for env_with_choices function.""" diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f910f10407d44..338cb1f1814b5 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1586,6 +1586,8 @@ def destroy_distributed_environment(): def cleanup_dist_env_and_memory(shutdown_ray: bool = False): + # Reset environment variable cache + envs.disable_envs_cache() # Ensure all objects are not frozen before cleanup gc.unfreeze() diff --git a/vllm/envs.py b/vllm/envs.py index 8246109eb73af..230f2cf3450a9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1580,6 +1580,12 @@ def __getattr__(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +def _is_envs_cache_enabled() -> bool: + """Check whether __getattr__ is wrapped with functools.cache""" + global __getattr__ + return hasattr(__getattr__, "cache_clear") + + def enable_envs_cache() -> None: """ Enables caching of environment variables. This is useful for performance @@ -1590,6 +1596,9 @@ def enable_envs_cache() -> None: runtime overhead. This also means that environment variables should NOT be updated after the service is initialized. 
""" + if _is_envs_cache_enabled(): + # Avoid wrapping functools.cache multiple times + return # Tag __getattr__ with functools.cache global __getattr__ __getattr__ = functools.cache(__getattr__) @@ -1599,6 +1608,17 @@ def enable_envs_cache() -> None: __getattr__(key) +def disable_envs_cache() -> None: + """ + Resets the environment variables cache. It could be used to isolate environments + between unit tests. + """ + global __getattr__ + # If __getattr__ is wrapped by functools.cache, unwrap the caching layer. + if _is_envs_cache_enabled(): + __getattr__ = __getattr__.__wrapped__ + + def __dir__(): return list(environment_variables.keys()) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 3d3a1e138ddef..0045b8c1dd3e7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -211,6 +211,9 @@ class EngineCore: freeze_gc_heap() # If enable, attach GC debugger after static variable freeze. maybe_attach_gc_debug_callback() + # Enable environment variable cache (e.g. assume no more + # environment variable overrides after this point) + enable_envs_cache() def _initialize_kv_caches( self, vllm_config: VllmConfig @@ -672,10 +675,6 @@ class EngineCoreProc(EngineCore): assert addresses.coordinator_input is not None logger.info("Waiting for READY message from DP Coordinator...") - # Enable environment variable cache (e.g. assume no more - # environment variable overrides after this point) - enable_envs_cache() - @contextmanager def _perform_handshakes( self, From eea41804a4b4f84a80f63375ce2e77668d70bda5 Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Thu, 11 Dec 2025 03:18:51 +0800 Subject: [PATCH 13/67] [bug] Fix "Current vLLM config is not set." 
warnings when FlashInfer attention is used (#30241) Signed-off-by: Po-Han Huang --- vllm/utils/flashinfer.py | 5 ++++- vllm/v1/attention/backends/flashinfer.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 7aaf690cbaa13..9a66049350cd8 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool: def force_use_trtllm_attention() -> bool | None: """ + This function should only be called during initialization stage when vllm config + is set. Return `None` if --attention-config.use_trtllm_attention is not set, return `True` if TRTLLM attention is forced to be used, return `False` if TRTLLM attention is forced to be not used. @@ -296,11 +298,12 @@ def use_trtllm_attention( kv_cache_dtype: str, q_dtype: torch.dtype, is_prefill: bool, + # None means auto-detection, True means force on, False means force off + force_use_trtllm: bool | None = None, has_sinks: bool = False, has_spec: bool = False, ) -> bool: """Return `True` if TRTLLM attention is used.""" - force_use_trtllm = force_use_trtllm_attention() # CLI argument is set to 0 - respect it if force_use_trtllm is not None and not force_use_trtllm: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 8e9d764e4a123..4174b80ee312e 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.cache_config = vllm_config.cache_config self.model_config = vllm_config.model_config + self.attention_config = vllm_config.attention_config self._workspace_buffer = None self._prefill_wrapper: ( BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None @@ -779,6 +780,7 @@ class 
FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.cache_dtype, self.q_data_type, is_prefill=True, + force_use_trtllm=self.attention_config.use_trtllm_attention, has_sinks=self.has_sinks, has_spec=uses_spec_reorder, ) From 6ccb7baeb1a124ad9b6e87fe9bbd48ae40830869 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 10 Dec 2025 11:52:01 -0800 Subject: [PATCH 14/67] [LMCache] Fix breakage due to new LMCache version (#30216) Signed-off-by: Nick Hill --- requirements/kv_connectors.txt | 2 +- .../kv_connector/v1/lmcache_integration/vllm_v1_adapter.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt index 083230c171096..f60a01a55d07c 100644 --- a/requirements/kv_connectors.txt +++ b/requirements/kv_connectors.txt @@ -1,2 +1,2 @@ -lmcache +lmcache >= 0.3.10.post1 nixl >= 0.7.1 # Required for disaggregated prefill diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index 15ac5b049fce9..cdc2969a7735e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -27,7 +27,7 @@ from lmcache.v1.lookup_client.lmcache_async_lookup_client import ( LMCacheAsyncLookupServer, ) from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer -from lmcache.v1.plugin.plugin_launcher import PluginLauncher +from lmcache.v1.plugin.runtime_plugin_launcher import RuntimePluginLauncher from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig @@ -683,7 +683,7 @@ class LMCacheConnectorV1Impl: self.api_server = InternalAPIServer(self) self.api_server.start() # Launch plugins - self.plugin_launcher = PluginLauncher( + self.plugin_launcher = RuntimePluginLauncher( self.config, role, 
self.worker_count, From fcb894222f2b8a353072e1aea33b38f4403bbd7a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 10 Dec 2025 15:56:51 -0500 Subject: [PATCH 15/67] [Docs] Update EPLB docs (#30426) Signed-off-by: mgoin --- docs/serving/expert_parallel_deployment.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index ec07896592ba3..98f242ab8b892 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -40,10 +40,12 @@ EP_SIZE = TP_SIZE × DP_SIZE Where: -- `TP_SIZE`: Tensor parallel size (always 1 for now) +- `TP_SIZE`: Tensor parallel size - `DP_SIZE`: Data parallel size - `EP_SIZE`: Expert parallel size (computed automatically) +When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`. + ### Example Command The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section. @@ -119,9 +121,6 @@ While MoE models are typically trained so that each expert receives a similar nu Enable EPLB with the `--enable-eplb` flag. -!!! note "Model Support" - Currently only DeepSeek V3 architecture is supported. - When enabled, vLLM collects load statistics with every forward pass and periodically rebalances expert distribution. ### EPLB Parameters @@ -134,6 +133,8 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. 
T | `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 | | `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` | | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` | +| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` | +| `policy` | The policy type for expert parallel load balancing | `"default"` | For example: From b9e0951f964e1b8adfebb973c30462c0e0417c1f Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Wed, 10 Dec 2025 17:15:54 -0500 Subject: [PATCH 16/67] [docs] Improve wide-EP performance + benchmarking documentation (#27933) Signed-off-by: Seiji Eicher --- docs/serving/data_parallel_deployment.md | 14 ++++++++++- docs/serving/expert_parallel_deployment.md | 28 +++++++++++++++++++++- tools/ep_kernels/README.md | 4 ++-- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/docs/serving/data_parallel_deployment.md b/docs/serving/data_parallel_deployment.md index eff9c5d5e4efa..e5954917cd790 100644 --- a/docs/serving/data_parallel_deployment.md +++ b/docs/serving/data_parallel_deployment.md @@ -24,7 +24,7 @@ There are two distinct modes supported for online deployments - self-contained w vLLM supports "self-contained" data parallel deployments that expose a single API endpoint. -It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. +It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. 
When sizing DP deployments, remember that `--max-num-seqs` applies per DP rank. Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks. @@ -80,6 +80,18 @@ When deploying large DP sizes using this method, the API server process can beco ![DP Internal LB Diagram](../assets/deployment/dp_internal_lb.png) +## Hybrid Load Balancing + +Hybrid load balancing sits between the internal and external approaches. Each node runs its own API server(s) that only queue requests to the data-parallel engines colocated on that node. An upstream load balancer (for example, an ingress controller or traffic router) spreads user requests across those per-node endpoints. + +Enable this mode with `--data-parallel-hybrid-lb` while still launching every node with the global data-parallel size. The key differences from internal load balancing are: + +- You must provide `--data-parallel-size-local` and `--data-parallel-start-rank` so each node knows which ranks it owns. +- Not compatible with `--headless` since every node exposes an API endpoint. +- Scale `--api-server-count` per node based on the number of local ranks + +In this configuration, each node keeps scheduling decisions local, which reduces cross-node traffic and avoids single node bottlenecks at larger DP sizes. + ## External Load Balancing For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally. 
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index 98f242ab8b892..923020dc88c91 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -83,7 +83,7 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \ --data-parallel-size-local 8 \ # Local DP size on this node (8 GPUs per node) --data-parallel-address 192.168.1.100 \ # Replace with actual IP of Node 1 --data-parallel-rpc-port 13345 \ # RPC communication port, can be any port as long as reachable by all nodes - --api-server-count=8 # Number of API servers for load handling (scaling this out to total ranks are recommended) + --api-server-count=8 # Number of API servers for load handling (scaling this out to # local ranks is recommended) # Node 2 (Secondary - headless mode, no API server) vllm serve deepseek-ai/DeepSeek-V3-0324 \ @@ -184,6 +184,26 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \ For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available. +## Advanced Configuration + +### Performance Optimization + +- **DeepEP kernels**: The `high_throughput` and `low_latency` kernels are optimized for disaggregated serving and may show poor performance for mixed workloads +- **Dual Batch Overlap**: Use `--enable-dbo` to overlap all-to-all communication with compute. See [Dual Batch Overlap](../design/dbo.md) for more details. +- **Async scheduling (experimental)**: Try `--async-scheduling` to overlap scheduling with model execution. + +### Troubleshooting + +- **`non-zero status: 7 cannot register cq buf`**: When using Infiniband/RoCE, make sure host VM and pods show `ulimit -l` "unlimited". +- **`init failed for transport: IBGDA`**: The InfiniBand GDA kernel modules are missing. Run `tools/ep_kernels/configure_system_drivers.sh` on each GPU node and reboot. 
Also fixes error `NVSHMEM API called before NVSHMEM initialization has completed`. +- **NVSHMEM peer disconnect**: Usually a networking misconfiguration. If deploying via Kubernetes, verify that every pod runs with `hostNetwork: true`, `securityContext.privileged: true` to access Infiniband. + +### Benchmarking + +- Use simulator flags `VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random` and `VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1` so token routing is balanced across EP ranks. + +- Increasing `VLLM_MOE_DP_CHUNK_SIZE` may increase throughput by increasing the maximum batch size for inter-rank token transfers. This may cause DeepEP to throw `assert self.nvshmem_qp_depth >= (num_max_dispatch_tokens_per_rank + 1) * 2`, which can be fixed by increasing environment variable `NVSHMEM_QP_DEPTH`. + ## Disaggregated Serving (Prefill/Decode Split) For production deployments requiring strict SLA guarantees for time-to-first-token and inter-token latency, disaggregated serving allows independent scaling of prefill and decode operations. @@ -274,3 +294,9 @@ except Exception as e: print(f"❌ Error during disaggregated serving: {e}") print("Check that both prefill and decode instances are running and accessible") ``` + +### Benchmarking + +- To simulate the decode deployment of disaggregated serving, pass `--kv-transfer-config '{"kv_connector":"DecodeBenchConnector","kv_role":"kv_both"}'` to the `vllm serve` invocation. The connector populates KV cache with random values so decode can be profiled in isolation. + +- **CUDAGraph capture**: Use `--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'` to enable CUDA graph capture for decode only and save KV cache. diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md index 85e9d2a4f8129..ab0e358802bf8 100644 --- a/tools/ep_kernels/README.md +++ b/tools/ep_kernels/README.md @@ -7,7 +7,7 @@ Here we break down the requirements in 2 steps: 1. 
Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this. 2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine. -2 is necessary for multi-node deployment. +Step 2 is necessary for multi-node deployment. All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`. @@ -23,6 +23,6 @@ TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh Additional step for multi-node deployment: ```bash -sudo bash configure_system_drivers.sh +sudo bash configure_system_drivers.sh # update-initramfs can take several minutes sudo reboot # Reboot is required to load the new driver ``` From 166ac3c94d6ee845d4d8dc1a6dced4d9033fa4e3 Mon Sep 17 00:00:00 2001 From: Christina Norman Date: Wed, 10 Dec 2025 17:01:19 -0600 Subject: [PATCH 17/67] fix(shm): Add memory barriers for cross-process shared memory visibility (#30407) Signed-off-by: Christina Holland Signed-off-by: Christina --- .../device_communicators/shm_broadcast.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 114516ff07a1f..31c6084c9b507 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import pickle +import threading import time from contextlib import contextmanager from dataclasses import dataclass, field @@ -43,6 +44,33 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL from_bytes_big = functools.partial(int.from_bytes, byteorder="big") +# Memory 
fence for cross-process shared memory visibility. +# Required for correct producer-consumer synchronization when using +# shared memory without locks. +_memory_fence_lock = threading.Lock() + + +def memory_fence(): + """ + Full memory barrier for shared memory synchronization. + + Ensures all prior memory writes are visible to other processes before + any subsequent reads. This is critical for lock-free producer-consumer + patterns using shared memory. + + Implementation acquires and immediately releases a lock. Python's + threading.Lock provides sequentially consistent memory barrier semantics + across all major platforms (POSIX, Windows). This is a lightweight + operation (~20ns) that guarantees: + - All stores before the barrier are visible to other threads/processes + - All loads after the barrier see the latest values + """ + # Lock acquire/release provides full memory barrier semantics. + # Using context manager ensures lock release even on exceptions. + with _memory_fence_lock: + pass + + def to_bytes_big(value: int, size: int) -> bytes: return value.to_bytes(size, byteorder="big") @@ -414,6 +442,10 @@ class MessageQueue: n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + # Memory fence ensures we see the latest read flags from readers. + # Without this, we may read stale flags from our CPU cache and + # spin indefinitely even though readers have completed. + memory_fence() read_count = sum(metadata_buffer[1:]) written_flag = metadata_buffer[0] if written_flag and read_count != self.buffer.n_reader: @@ -458,6 +490,10 @@ class MessageQueue: metadata_buffer[i] = 0 # mark the block as written metadata_buffer[0] = 1 + # Memory fence ensures the write is visible to readers on other cores + # before we proceed. Without this, readers may spin indefinitely + # waiting for a write that's stuck in our CPU's store buffer. 
+ memory_fence() self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks break @@ -473,6 +509,10 @@ class MessageQueue: n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + # Memory fence ensures we see the latest writes from the writer. + # Without this, we may read stale flags from our CPU cache + # and spin indefinitely even though writer has updated them. + memory_fence() read_flag = metadata_buffer[self.local_reader_rank + 1] written_flag = metadata_buffer[0] if not written_flag or read_flag: @@ -513,6 +553,10 @@ class MessageQueue: # caller has read from the buffer # set the read flag metadata_buffer[self.local_reader_rank + 1] = 1 + # Memory fence ensures the read flag is visible to the writer. + # Without this, writer may not see our read completion and + # could wait indefinitely for all readers to finish. + memory_fence() self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks self._read_spin_timer.record_activity() From 8580919ac36b9ada425668264437c70935943e05 Mon Sep 17 00:00:00 2001 From: shivampr Date: Wed, 10 Dec 2025 15:17:41 -0800 Subject: [PATCH 18/67] [Bugfix] fix confusing OOM errors during v1 init (#28051) Signed-off-by: Shivam Signed-off-by: shivampr Co-authored-by: Chen Zhang --- tests/v1/engine/test_init_error_messaging.py | 54 +++++++ vllm/v1/core/kv_cache_utils.py | 10 +- vllm/v1/worker/gpu_model_runner.py | 139 ++++++++++--------- 3 files changed, 138 insertions(+), 65 deletions(-) create mode 100644 tests/v1/engine/test_init_error_messaging.py diff --git a/tests/v1/engine/test_init_error_messaging.py b/tests/v1/engine/test_init_error_messaging.py new file mode 100644 index 0000000000000..bc23a68f9deb1 --- /dev/null +++ b/tests/v1/engine/test_init_error_messaging.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.v1.core.kv_cache_utils import 
check_enough_kv_cache_memory +from vllm.v1.kv_cache_interface import FullAttentionSpec + + +def test_kv_cache_oom_no_memory(): + from unittest.mock import MagicMock + + config = MagicMock() + config.model_config.max_model_len = 2048 + + spec = { + "layer_0": FullAttentionSpec( + block_size=16, + num_kv_heads=8, + head_size=128, + dtype="float16", + ) + } + + with pytest.raises(ValueError): + check_enough_kv_cache_memory(config, spec, 0) + + +def test_kv_cache_oom_insufficient_memory(monkeypatch): + from unittest.mock import MagicMock + + config = MagicMock() + config.model_config.max_model_len = 2048 + config.cache_config.block_size = 16 + config.parallel_config.tensor_parallel_size = 1 + config.parallel_config.pipeline_parallel_size = 1 + config.parallel_config.decode_context_parallel_size = 1 + + monkeypatch.setattr( + "vllm.v1.core.kv_cache_utils.max_memory_usage_bytes", + lambda c, s: 100 * 1024**3, # 100 GiB + ) + + spec = { + "layer_0": FullAttentionSpec( + block_size=16, + num_kv_heads=8, + head_size=128, + dtype="float16", + ) + } + + with pytest.raises(ValueError): + check_enough_kv_cache_memory(config, spec, 1024**3) # 1 GiB diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 774200deed158..e4360de3717d1 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -687,7 +687,9 @@ def check_enough_kv_cache_memory( raise ValueError( "No available memory for the cache blocks. " "Try increasing `gpu_memory_utilization` when " - "initializing the engine." + "initializing the engine. " + "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + "for more details." ) max_model_len = vllm_config.model_config.max_model_len @@ -711,8 +713,10 @@ def check_enough_kv_cache_memory( f"cache is needed, which is larger than the available KV cache " f"memory ({available_memory / GiB_bytes:.2f} GiB). 
" f"{estimated_msg} " - f"Try increasing `gpu_memory_utilization` or decreasing " - f"`max_model_len` when initializing the engine." + f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` " + f"when initializing the engine. " + f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + f"for more details." ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ca06f048f290b..7dc86f1ee4815 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3571,74 +3571,89 @@ class GPUModelRunner( if self.parallel_config.enable_eplb: self.eplb_state = EplbState(self.parallel_config, self.device) eplb_models = 0 - with DeviceMemoryProfiler() as m: - time_before_load = time.perf_counter() - model_loader = get_model_loader(self.load_config) - self.model = model_loader.load_model( - vllm_config=self.vllm_config, model_config=self.model_config - ) - if self.lora_config: - self.model = self.load_lora_model( - self.model, self.vllm_config, self.device + + try: + with DeviceMemoryProfiler() as m: + time_before_load = time.perf_counter() + model_loader = get_model_loader(self.load_config) + self.model = model_loader.load_model( + vllm_config=self.vllm_config, model_config=self.model_config ) - if hasattr(self, "drafter"): - logger.info_once("Loading drafter model...") - self.drafter.load_model(self.model) - if ( - hasattr(self.drafter, "model") - and is_mixture_of_experts(self.drafter.model) - and self.parallel_config.enable_eplb - ): - spec_config = self.vllm_config.speculative_config - assert spec_config is not None - assert spec_config.draft_model_config is not None - logger.info_once( - "EPLB is enabled for drafter model %s.", - spec_config.draft_model_config.model, + if self.lora_config: + self.model = self.load_lora_model( + self.model, self.vllm_config, self.device ) + if hasattr(self, "drafter"): + logger.info_once("Loading drafter model...") + 
self.drafter.load_model(self.model) + if ( + hasattr(self.drafter, "model") + and is_mixture_of_experts(self.drafter.model) + and self.parallel_config.enable_eplb + ): + spec_config = self.vllm_config.speculative_config + assert spec_config is not None + assert spec_config.draft_model_config is not None + logger.info_once( + "EPLB is enabled for drafter model %s.", + spec_config.draft_model_config.model, + ) - global_expert_load = ( - global_expert_loads[eplb_models] - if global_expert_loads - else None - ) - old_global_expert_indices = ( - old_global_expert_indices_per_model[eplb_models] - if old_global_expert_indices_per_model - else None - ) - if self.eplb_state is None: - self.eplb_state = EplbState(self.parallel_config, self.device) - self.eplb_state.add_model( - self.drafter.model, - spec_config.draft_model_config, - global_expert_load, - old_global_expert_indices, - rank_mapping, - ) - eplb_models += 1 + global_expert_load = ( + global_expert_loads[eplb_models] + if global_expert_loads + else None + ) + old_global_expert_indices = ( + old_global_expert_indices_per_model[eplb_models] + if old_global_expert_indices_per_model + else None + ) + if self.eplb_state is None: + self.eplb_state = EplbState( + self.parallel_config, self.device + ) + self.eplb_state.add_model( + self.drafter.model, + spec_config.draft_model_config, + global_expert_load, + old_global_expert_indices, + rank_mapping, + ) + eplb_models += 1 - if self.use_aux_hidden_state_outputs: - if not supports_eagle3(self.get_model()): - raise RuntimeError( - "Model does not support EAGLE3 interface but " - "aux_hidden_state_outputs was requested" - ) + if self.use_aux_hidden_state_outputs: + if not supports_eagle3(self.get_model()): + raise RuntimeError( + "Model does not support EAGLE3 interface but " + "aux_hidden_state_outputs was requested" + ) - # Try to get auxiliary layers from speculative config, - # otherwise use model's default layers - aux_layers = self._get_eagle3_aux_layers_from_config() 
- if aux_layers: - logger.info( - "Using auxiliary layers from speculative config: %s", - aux_layers, - ) - else: - aux_layers = self.model.get_eagle3_aux_hidden_state_layers() + # Try to get auxiliary layers from speculative config, + # otherwise use model's default layers + aux_layers = self._get_eagle3_aux_layers_from_config() + if aux_layers: + logger.info( + "Using auxiliary layers from speculative config: %s", + aux_layers, + ) + else: + aux_layers = self.model.get_eagle3_aux_hidden_state_layers() - self.model.set_aux_hidden_state_layers(aux_layers) - time_after_load = time.perf_counter() - self.model_memory_usage = m.consumed_memory + self.model.set_aux_hidden_state_layers(aux_layers) + time_after_load = time.perf_counter() + self.model_memory_usage = m.consumed_memory + except torch.cuda.OutOfMemoryError as e: + msg = ( + "Failed to load model - not enough GPU memory. " + "Try lowering --gpu-memory-utilization to free memory for weights, " + "increasing --tensor-parallel-size, or using --quantization. " + "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + "for more tips." 
+ ) + combined_msg = f"{msg} (original error: {e})" + logger.error(combined_msg) + raise e logger.info_once( "Model loading took %.4f GiB memory and %.6f seconds", self.model_memory_usage / GiB_bytes, From 25221b44bbb6856c25d7a3c01bb6f79e999927b0 Mon Sep 17 00:00:00 2001 From: Xu Song Date: Thu, 11 Dec 2025 08:12:21 +0800 Subject: [PATCH 19/67] Add more docs for regex (#30106) Signed-off-by: Xu Song Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/features/structured_outputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 7d52891bea7b9..3ac987559e622 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -61,7 +61,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as print(completion.choices[0].message.content) ``` -The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template: +The next example shows how to use the `regex`. The supported regex syntax depends on the structured output backend. For example, `xgrammar`, `guidance`, and `outlines` use Rust-style regex, while `lm-format-enforcer` uses Python's `re` module. The idea is to generate an email address, given a simple regex template: ??? 
code From b4054c8ab469a9c3c3c77a1c2f22f54a69b87145 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 10 Dec 2025 16:48:35 -0800 Subject: [PATCH 20/67] Revert "[CI] Add Async Eplb nightly CI tests (#29385)" (#30431) --- .../deepseek_v2_lite_ep_async_eplb.sh | 73 ------------------ .../deepseek_v2_lite_ep_eplb.sh | 1 - .../qwen3_next_mtp_async_eplb.sh | 74 ------------------- .buildkite/test-pipeline.yaml | 20 +---- vllm/distributed/eplb/rebalance_execute.py | 3 + 5 files changed, 4 insertions(+), 167 deletions(-) delete mode 100644 .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh delete mode 100644 .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh deleted file mode 100644 index d7167161b0059..0000000000000 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.25} -NUM_Q=${2:-1319} -PORT=${3:-8030} -OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} -mkdir -p "${OUT_DIR}" - -wait_for_server() { - local port=$1 - timeout 600 bash -c ' - until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do - sleep 1 - done' -} - -MODEL="deepseek-ai/DeepSeek-V2-lite" - -# Set BACKENDS based on platform -if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then - # ROCm platform - BACKENDS=("allgather_reducescatter") - # Disable MOE padding for ROCm since it is causing eplb to fail - export VLLM_ROCM_MOE_PADDING=0 -else - # Non-ROCm platform (CUDA/other) - BACKENDS=("deepep_high_throughput" "deepep_low_latency") -fi - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then - kill "${SERVER_PID}" 2>/dev/null || true - 
for _ in {1..20}; do - kill -0 "${SERVER_PID}" 2>/dev/null || break - sleep 0.5 - done - kill -9 "${SERVER_PID}" 2>/dev/null || true - fi -} -trap cleanup EXIT - -for BACK in "${BACKENDS[@]}"; do - VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ - vllm serve "$MODEL" \ - --enforce-eager \ - --tensor-parallel-size 2 \ - --data-parallel-size 2 \ - --enable-expert-parallel \ - --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ - --trust-remote-code \ - --max-model-len 2048 \ - --port $PORT & - SERVER_PID=$! - wait_for_server $PORT - - TAG=$(echo "$MODEL" | tr '/: \\n' '_____') - OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} - python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" -PY - - cleanup - SERVER_PID= - sleep 1 - PORT=$((PORT+1)) -done diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 693418da6093e..8106f50f18f66 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do --data-parallel-size 2 \ --enable-expert-parallel \ --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600}' \ --trust-remote-code \ --max-model-len 2048 \ --port $PORT & diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh deleted file mode 100644 index 937a43d1a3221..0000000000000 --- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.25} 
-NUM_Q=${2:-1319} -PORT=${3:-8040} -OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} -mkdir -p "${OUT_DIR}" - -wait_for_server() { - local port=$1 - timeout 600 bash -c ' - until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do - sleep 1 - done' -} - -MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" - -# Set BACKENDS based on platform -if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then - # ROCm platform - BACKENDS=("allgather_reducescatter") - # Disable MOE padding for ROCm since it is causing eplb to fail - export VLLM_ROCM_MOE_PADDING=0 -else - # Non-ROCm platform (CUDA/other) - BACKENDS=("deepep_high_throughput" "deepep_low_latency") -fi - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then - kill "${SERVER_PID}" 2>/dev/null || true - for _ in {1..20}; do - kill -0 "${SERVER_PID}" 2>/dev/null || break - sleep 0.5 - done - kill -9 "${SERVER_PID}" 2>/dev/null || true - fi -} -trap cleanup EXIT - -for BACK in "${BACKENDS[@]}"; do - VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ - vllm serve "$MODEL" \ - --enforce-eager \ - --tensor-parallel-size 4 \ - --enable-expert-parallel \ - --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ - --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ - --trust-remote-code \ - --max-model-len 2048 \ - --gpu-memory-utilization 0.9 \ - --port $PORT & - SERVER_PID=$! 
- wait_for_server $PORT - - TAG=$(echo "$MODEL" | tr '/: \\n' '_____') - OUT="${OUT_DIR}/${TAG}_${BACK}.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} - python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" -PY - - cleanup - SERVER_PID= - sleep 1 - PORT=$((PORT+1)) -done diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8fc3587f7813c..750e7c038351c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1379,22 +1379,4 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - -- label: DeepSeek V2-Lite Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index 55856d940f001..376dad8a72ef1 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -322,6 +322,9 @@ async def transfer_layer( num_local_physical_experts = next(iter(expert_weights[0])).shape[0] assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts) assert num_physical_experts == ep_size * num_local_physical_experts + # A buffer to hold the expert weights in one layer during the exchange. 
+ # NOTE: Currently we assume the same weights across different layers + # have the same shape. is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer( num_local_experts=num_local_physical_experts, From b51255f369cf45456e3062e32ecbfebd03a9f169 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Wed, 10 Dec 2025 19:12:58 -0600 Subject: [PATCH 21/67] [ROCm] Fix broken import in platform attention backend dispatching (#30432) Signed-off-by: Andreas Karatzas --- vllm/platforms/rocm.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index f7adecbd88746..876114c2d33a4 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -403,7 +403,21 @@ class RocmPlatform(Platform): compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE if cache_config and cache_config.block_size is None: - cache_config.block_size = 16 + if ( + envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER + # NOTE: This block has been deprecated + # or get_env_variable_attn_backend() + # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN + # TODO: monitor https://github.com/vllm-project/vllm/pull/30396 + # to see how we can transition to the new way of selecting + # attention backends + ): + cache_config.block_size = 64 + logger.warning( + "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64." 
+ ) + else: + cache_config.block_size = 16 if parallel_config.worker_cls == "auto": parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" From d1e1fb4363c61080b7cd20469d5a751e88a1cdb3 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Wed, 10 Dec 2025 21:47:18 -0600 Subject: [PATCH 22/67] [Bugfix] Fix grouped_topk pytorch impl when num_experts can't be grouped properly (#29439) Signed-off-by: Divakar Verma Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: TJian --- vllm/model_executor/layers/fused_moe/layer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 61dd1892d67ea..7f803720d4770 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1556,6 +1556,14 @@ class FusedMoE(CustomOp): f"EPLB is not supported for {self.quant_method.method_name}." 
) + def valid_grouping() -> bool: + # Check if num_experts is greater than num_expert_group + # and is divisible by num_expert_group + num_experts = router_logits.shape[-1] + if num_experts <= self.num_expert_group: + return False + return num_experts % self.num_expert_group == 0 + indices_type = self.quant_method.topk_indices_dtype # Check if we should use a routing simulation strategy @@ -1570,7 +1578,7 @@ class FusedMoE(CustomOp): ) # DeepSeekv2 uses grouped_top_k - elif self.use_grouped_topk: + elif self.use_grouped_topk and valid_grouping(): assert self.topk_group is not None assert self.num_expert_group is not None if rocm_aiter_ops.is_fused_moe_enabled(): From 5a87d8b9b1f357a65a9b73773178ae17fd7cd9c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 11:59:35 +0800 Subject: [PATCH 23/67] [Deprecation] Remove deprecated plugin and compilation fields for v0.13 release (#30396) Signed-off-by: DarkLight1337 --- docs/design/plugin_system.md | 4 +- tests/compile/test_config.py | 63 +--------------------- tests/kernels/moe/test_ocp_mx_moe.py | 4 +- tests/quantization/test_quark.py | 4 +- tests/test_config.py | 2 +- vllm/attention/backends/registry.py | 32 ----------- vllm/attention/selector.py | 46 +++++----------- vllm/config/compilation.py | 81 +--------------------------- vllm/config/vllm.py | 2 +- vllm/engine/arg_utils.py | 22 -------- 10 files changed, 22 insertions(+), 238 deletions(-) diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 3485c40c36811..b0ca2dad23d5b 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -152,5 +152,5 @@ The interface for the model/module may change during vLLM's development. If you ## Deprecation announcement !!! warning "Deprecations" - - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It will be removed in v0.13.0 or v1.0.0. - - `_Backend` in `vllm.attention` is deprecated. It will be removed in v0.13.0 or v1.0.0. 
Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. + - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0. + - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 0e91cf525411e..04bb56ecb6470 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -import logging from contextlib import nullcontext from unittest.mock import patch @@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig from vllm.config.compilation import CompilationMode, PassConfig from vllm.engine.arg_utils import EngineArgs -from vllm.logger import _print_warning_once from vllm.platforms import current_platform from vllm.utils.torch_utils import _is_torch_equal_or_newer @@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor(): ), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - pass_config={"enable_attn_fusion": True, "enable_noop": True}, + pass_config={"fuse_attn_quant": True, "eliminate_noops": True}, custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, ), @@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init( vllm_config.compilation_config.max_cudagraph_capture_size == expected_max_size ) - - -def test_pass_config_deprecation(caplog_vllm): - caplog_vllm.set_level(logging.WARNING) - - # Clear cache to ensure warnings are re-issued - _print_warning_once.cache_clear() - - # Test enable_fusion -> 
fuse_norm_quant, fuse_act_quant - caplog_vllm.clear() - config = PassConfig(enable_fusion=True) - assert "enable_fusion is deprecated" in caplog_vllm.text - assert config.fuse_norm_quant is True - assert config.fuse_act_quant is True - assert config.enable_fusion is True - - # Test enable_attn_fusion -> fuse_attn_quant - caplog_vllm.clear() - config = PassConfig(enable_attn_fusion=True) - assert "enable_attn_fusion is deprecated" in caplog_vllm.text - assert config.fuse_attn_quant is True - assert config.enable_attn_fusion is True - - # Test enable_noop -> eliminate_noops - caplog_vllm.clear() - config = PassConfig(enable_noop=True) - assert "enable_noop is deprecated" in caplog_vllm.text - assert config.eliminate_noops is True - assert config.enable_noop is True - - # Test enable_sequence_parallelism -> enable_sp - caplog_vllm.clear() - config = PassConfig(enable_sequence_parallelism=True) - assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text - assert config.enable_sp is True - assert config.enable_sequence_parallelism is True - - # Test enable_async_tp -> fuse_gemm_comms - caplog_vllm.clear() - config = PassConfig(enable_async_tp=True) - assert "enable_async_tp is deprecated" in caplog_vllm.text - assert config.fuse_gemm_comms is True - assert config.enable_async_tp is True - - # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms - caplog_vllm.clear() - config = PassConfig(enable_fi_allreduce_fusion=True) - assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text - assert config.fuse_allreduce_rms is True - assert config.enable_fi_allreduce_fusion is True - - # Test hash consistency - config_old = PassConfig(enable_fusion=True) - config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True) - assert config_old.compute_hash() == config_new.compute_hash() - - config_old = PassConfig(enable_async_tp=True) - config_new = PassConfig(fuse_gemm_comms=True) - assert config_old.compute_hash() == config_new.compute_hash() diff --git 
a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index 91b508d4163cc..5a850dda4f6fd 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -70,12 +70,12 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): f"{torch.cuda.device_count()}" ) - # `cuda_graph_sizes=[16]` to reduce load time. + # `cudagraph_capture_sizes=[16]` to reduce load time. with vllm_runner( model_case.model_id, tensor_parallel_size=model_case.tp, load_format="dummy", - cuda_graph_sizes=[16], + cudagraph_capture_sizes=[16], ) as llm: # Disabled as check_model is broken: https://github.com/vllm-project/vllm/pull/18465#issuecomment-3329880562 # def check_model(model): diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 334f9a65e4c03..0ff6e8407ce67 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int): task = "wikitext" rtol = 0.1 - # Smaller cuda_graph_sizes to speed up the test. + # Smaller cudagraph_capture_sizes to speed up the test. 
results = lm_eval.simple_evaluate( model="vllm", model_args=config.get_model_args( - tp_size=tp_size, kwargs={"cuda_graph_sizes": [16]} + tp_size=tp_size, kwargs={"cudagraph_capture_sizes": [16]} ), tasks=task, batch_size=64, diff --git a/tests/test_config.py b/tests/test_config.py index 77d3a7115978e..0768c6d2cddf5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1085,7 +1085,7 @@ def test_vllm_config_explicit_overrides(): ) # Override one field but not others - pass_config = PassConfig(enable_noop=False) + pass_config = PassConfig(eliminate_noops=False) compilation_config = CompilationConfig(pass_config=pass_config) config = VllmConfig( model_config=regular_model, diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index 125e4e3827747..eaa0fa1d5db39 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -252,35 +252,3 @@ def register_backend( return lambda x: x return decorator - - -# Backwards compatibility alias for plugins -class _BackendMeta(type): - """Metaclass to provide deprecation warnings when accessing _Backend.""" - - def __getattribute__(cls, name: str): - if name not in ("__class__", "__mro__", "__name__"): - logger.warning( - "_Backend has been renamed to AttentionBackendEnum. " - "Please update your code to use AttentionBackendEnum instead. " - "_Backend will be removed in a future release." - ) - return getattr(AttentionBackendEnum, name) - - def __getitem__(cls, name: str): - logger.warning( - "_Backend has been renamed to AttentionBackendEnum. " - "Please update your code to use AttentionBackendEnum instead. " - "_Backend will be removed in a future release." - ) - return AttentionBackendEnum[name] - - -class _Backend(metaclass=_BackendMeta): - """Deprecated: Use AttentionBackendEnum instead. - - This class is provided for backwards compatibility with plugins - and will be removed in a future release. 
- """ - - pass diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index f6aba271d2e96..bbf95ff009001 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import inspect from functools import cache from typing import cast, get_args @@ -73,39 +72,18 @@ def _cached_get_attn_backend( ) -> type[AttentionBackend]: from vllm.platforms import current_platform - sig = inspect.signature(current_platform.get_attn_backend_cls) - if "use_v1" in sig.parameters: - logger.warning_once( - "use_v1 parameter for get_attn_backend_cls is deprecated and will " - "be removed in v0.13.0 or v1.0.0, whichever is soonest. Please " - "remove it from your plugin code." - ) - attention_cls = current_platform.get_attn_backend_cls( - backend, - head_size, - dtype, - kv_cache_dtype, - block_size, - True, # use_v1 - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - attn_type, - ) - else: - attention_cls = current_platform.get_attn_backend_cls( - backend, - head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - attn_type, - ) + attention_cls = current_platform.get_attn_backend_cls( + backend, + head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla, + has_sink, + use_sparse, + use_mm_prefix, + attn_type, + ) if not attention_cls: raise ValueError( f"Invalid attention backend for {current_platform.device_name}" diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 51e4912aad9db..3b6cb8a343608 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -17,7 +17,6 @@ from vllm.config.utils import ( Range, config, get_hash_factors, - handle_deprecated, hash_factors, ) from vllm.logger import init_logger @@ -127,27 +126,6 @@ class PassConfig: fuse_allreduce_rms: bool = Field(default=None) """Enable flashinfer allreduce fusion.""" - # Deprecated 
flags - enable_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_norm_quant and fuse_act_quant - instead. Will be removed in v0.13.0 or v1.0.0, whichever is sooner. - """ - enable_attn_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_attn_quant instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_noop: bool = Field(default=None) - """Deprecated in: v0.12.0. Use eliminate_noops instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_sequence_parallelism: bool = Field(default=None) - """Deprecated in: v0.12.0. Use enable_sp instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_async_tp: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_gemm_comms instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_fi_allreduce_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_allreduce_rms instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - fi_allreduce_fusion_max_size_mb: float | None = None """The threshold of the communicated tensor sizes under which vllm should use flashinfer fused allreduce. Specified as a @@ -206,15 +184,7 @@ class PassConfig: Any future fields that don't affect compilation should be excluded. 
""" - ignored_fields = [ - "enable_fusion", - "enable_attn_fusion", - "enable_noop", - "enable_sequence_parallelism", - "enable_async_tp", - "enable_fi_allreduce_fusion", - ] - return hash_factors(get_hash_factors(self, ignored_factors=ignored_fields)) + return hash_factors(get_hash_factors(self, set())) @field_validator( "fuse_norm_quant", @@ -224,12 +194,6 @@ class PassConfig: "enable_sp", "fuse_gemm_comms", "fuse_allreduce_rms", - "enable_fusion", - "enable_attn_fusion", - "enable_noop", - "enable_sequence_parallelism", - "enable_async_tp", - "enable_fi_allreduce_fusion", mode="wrap", ) @classmethod @@ -242,49 +206,6 @@ class PassConfig: def __post_init__(self) -> None: # Handle deprecation and defaults - # Map old flags to new flags and issue warnings - handle_deprecated( - self, - "enable_fusion", - ["fuse_norm_quant", "fuse_act_quant"], - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_attn_fusion", - "fuse_attn_quant", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_sequence_parallelism", - "enable_sp", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_async_tp", - "fuse_gemm_comms", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_fi_allreduce_fusion", - "fuse_allreduce_rms", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_noop", - "eliminate_noops", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - if not self.eliminate_noops: if self.fuse_norm_quant or self.fuse_act_quant: logger.warning_once( diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 607bb44cddd26..a3a9eec9b3203 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1014,7 +1014,7 @@ class VllmConfig: max_graph_size = min(max_num_seqs * 2, 512) # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16 # up to max_graph_size - cuda_graph_sizes = [1, 2, 4] + list(range(8, 
256, 8)) + list( + cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list( range(256, max_graph_size + 1, 16)) In the end, `vllm_config.compilation_config.cudagraph_capture_sizes` diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2f307a7ccf16d..cbb4862434a98 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -375,7 +375,6 @@ class EngineArgs: kv_cache_dtype: CacheDType = CacheConfig.cache_dtype seed: int | None = 0 max_model_len: int | None = ModelConfig.max_model_len - cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes cudagraph_capture_sizes: list[int] | None = ( CompilationConfig.cudagraph_capture_sizes ) @@ -1121,15 +1120,6 @@ class EngineArgs: compilation_group.add_argument( "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"] ) - compilation_kwargs["cudagraph_capture_sizes"]["help"] = ( - "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or v1.0.0," - " whichever is soonest. Please use --cudagraph-capture-sizes instead." - ) - compilation_group.add_argument( - "--cuda-graph-sizes", - **compilation_kwargs["cudagraph_capture_sizes"], - deprecated=True, - ) compilation_group.add_argument( "--max-cudagraph-capture-size", **compilation_kwargs["max_cudagraph_capture_size"], @@ -1741,18 +1731,6 @@ class EngineArgs: # Compilation config overrides compilation_config = copy.deepcopy(self.compilation_config) - if self.cuda_graph_sizes is not None: - logger.warning( - "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or " - "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes " - "instead." - ) - if compilation_config.cudagraph_capture_sizes is not None: - raise ValueError( - "cuda_graph_sizes and compilation_config." 
- "cudagraph_capture_sizes are mutually exclusive" - ) - compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes if self.cudagraph_capture_sizes is not None: if compilation_config.cudagraph_capture_sizes is not None: raise ValueError( From 7e24e5d4d65abbe5ffc7e653fdfd670c7e300944 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 11:59:39 +0800 Subject: [PATCH 24/67] [Deprecation] Remove deprecated task, seed and MM settings (#30397) Signed-off-by: DarkLight1337 --- benchmarks/benchmark_ngram_proposer.py | 2 +- examples/offline_inference/audio_language.py | 2 +- .../encoder_decoder_multimodal.py | 2 +- .../qwen2_5_omni/only_thinker.py | 2 +- .../qwen3_omni/only_thinker.py | 2 +- examples/offline_inference/vision_language.py | 2 +- .../vision_language_multi_image.py | 6 +- .../plugin/prithvi_geospatial_mae_client.py | 2 +- .../pooling/vision_language_pooling.py | 6 +- tests/conftest.py | 2 +- tests/test_config.py | 58 -------- tests/utils.py | 4 +- vllm/config/model.py | 131 ------------------ vllm/engine/arg_utils.py | 73 ++-------- vllm/entrypoints/llm.py | 2 +- vllm/envs.py | 5 - 16 files changed, 25 insertions(+), 276 deletions(-) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index cac401456b62a..872a263318ff7 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -37,7 +37,7 @@ def benchmark_propose(args): tokenizer="facebook/opt-125m", tokenizer_mode="auto", dtype="auto", - seed=None, + seed=0, trust_remote_code=False, ) proposer = NgramProposer( diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index df6e96ca375fc..40462c78ae8c2 100755 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -422,7 +422,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing 
`vllm.LLM`.", ) parser.add_argument( diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index c1d6c6db53dfb..857767ac3c628 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -77,7 +77,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) return parser.parse_args() diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index ed005e6a69b80..cee83519fadcc 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -158,7 +158,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/qwen3_omni/only_thinker.py b/examples/offline_inference/qwen3_omni/only_thinker.py index 88a61ed694c2e..62131633da8aa 100644 --- a/examples/offline_inference/qwen3_omni/only_thinker.py +++ b/examples/offline_inference/qwen3_omni/only_thinker.py @@ -158,7 +158,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 22802dddf7893..9142279140e56 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -2031,7 +2031,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 
28c466c03dfa5..3c01806baa203 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1382,7 +1382,7 @@ def run_generate( model, question: str, image_urls: list[str], - seed: int | None, + seed: int, tensor_parallel_size: int | None, ): req_data = model_example_map[model](question, image_urls) @@ -1416,7 +1416,7 @@ def run_chat( model: str, question: str, image_urls: list[str], - seed: int | None, + seed: int, tensor_parallel_size: int | None, ): req_data = model_example_map[model](question, image_urls) @@ -1494,7 +1494,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) parser.add_argument( diff --git a/examples/pooling/plugin/prithvi_geospatial_mae_client.py b/examples/pooling/plugin/prithvi_geospatial_mae_client.py index a6246999c14d6..1ba1fd6a92ca4 100644 --- a/examples/pooling/plugin/prithvi_geospatial_mae_client.py +++ b/examples/pooling/plugin/prithvi_geospatial_mae_client.py @@ -16,7 +16,7 @@ import requests # - start vllm in serving mode with the below args # --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM' # --model-impl terratorch -# --task embed --trust-remote-code +# --trust-remote-code # --skip-tokenizer-init --enforce-eager # --io-processor-plugin terratorch_segmentation # --enable-mm-embeds diff --git a/examples/pooling/pooling/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py index 530aad4bc031c..dda56bc34df2e 100644 --- a/examples/pooling/pooling/vision_language_pooling.py +++ b/examples/pooling/pooling/vision_language_pooling.py @@ -305,7 +305,7 @@ def get_query(modality: QueryModality): raise ValueError(msg) -def run_encode(model: str, modality: QueryModality, seed: int | None): +def run_encode(model: str, modality: QueryModality, seed: int): query = get_query(modality) req_data = model_example_map[model](query) @@ -335,7 +335,7 @@ def 
run_encode(model: str, modality: QueryModality, seed: int | None): print("-" * 50) -def run_score(model: str, modality: QueryModality, seed: int | None): +def run_score(model: str, modality: QueryModality, seed: int): query = get_query(modality) req_data = model_example_map[model](query) @@ -390,7 +390,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) return parser.parse_args() diff --git a/tests/conftest.py b/tests/conftest.py index 9f811d5d8db2a..5b26a02823c56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -741,7 +741,7 @@ class VllmRunner: tokenizer_name: str | None = None, tokenizer_mode: str = "auto", trust_remote_code: bool = True, - seed: int | None = 0, + seed: int = 0, max_model_len: int | None = 1024, dtype: str = "auto", disable_log_stats: bool = True, diff --git a/tests/test_config.py b/tests/test_config.py index 0768c6d2cddf5..ee706ab3d9c87 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -89,64 +89,6 @@ def test_update_config(): new_config3 = update_config(config3, {"a": "new_value"}) -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("distilbert/distilgpt2", "generate", "none", "generate"), - ("intfloat/multilingual-e5-small", "pooling", "none", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"), - ("openai/whisper-small", "generate", "none", "transcription"), - ], -) -def test_auto_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="auto") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - -# Can remove once --task option is 
fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("distilbert/distilgpt2", "pooling", "embed", "embed"), - ("intfloat/multilingual-e5-small", "pooling", "embed", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"), - ("openai/whisper-small", "pooling", "embed", "embed"), - ], -) -def test_score_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="score") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("openai/whisper-small", "generate", "none", "transcription"), - ], -) -def test_transcription_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="transcription") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - @pytest.mark.parametrize( ("model_id", "expected_runner_type", "expected_convert_type"), [ diff --git a/tests/utils.py b/tests/utils.py index ea3675b1461b8..d8102331b3612 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -119,7 +119,7 @@ class RemoteOpenAIServer: vllm_serve_args: list[str], *, env_dict: dict[str, str] | None = None, - seed: int | None = 0, + seed: int = 0, auto_port: bool = True, max_wait_seconds: float | None = None, override_hf_configs: dict[str, Any] | None = None, @@ -283,7 +283,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer): child_process_fxn: Callable[[dict[str, str] | None, str, list[str]], None], *, env_dict: dict[str, str] | None = None, - seed: int | 
None = 0, + seed: int = 0, auto_port: bool = True, max_wait_seconds: float | None = None, ) -> None: diff --git a/vllm/config/model.py b/vllm/config/model.py index 764bdf7000561..bd98111ffb5db 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -73,17 +73,6 @@ logger = init_logger(__name__) RunnerOption = Literal["auto", RunnerType] ConvertType = Literal["none", "embed", "classify", "reward"] ConvertOption = Literal["auto", ConvertType] -TaskOption = Literal[ - "auto", - "generate", - "embedding", - "embed", - "classify", - "score", - "reward", - "transcription", - "draft", -] TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ @@ -93,12 +82,6 @@ HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig] ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"] LayerBlockType = Literal["attention", "linear_attention", "mamba"] -_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { - "generate": ["generate", "transcription"], - "pooling": ["embedding", "embed", "classify", "score", "reward"], - "draft": ["draft"], -} - _RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { "generate": [], "pooling": ["embed", "classify", "reward"], @@ -126,12 +109,6 @@ class ModelConfig: """Convert the model using adapters defined in [vllm.model_executor.models.adapters][]. The most common use case is to adapt a text generation model to be used for pooling tasks.""" - task: TaskOption | None = None - """[DEPRECATED] The task to use the model for. If the model supports more - than one model runner, this is used to select which model runner to run. - - Note that the model may support other tasks using the same model runner. - """ tokenizer: SkipValidation[str] = None # type: ignore """Name or path of the Hugging Face tokenizer to use. 
If unspecified, model name or path will be used.""" @@ -335,7 +312,6 @@ class ModelConfig: ignored_factors = { "runner", "convert", - "task", "tokenizer", "tokenizer_mode", "seed", @@ -510,97 +486,6 @@ class ModelConfig: is_generative_model = registry.is_text_generation_model(architectures, self) is_pooling_model = registry.is_pooling_model(architectures, self) - def _task_to_convert(task: TaskOption) -> ConvertType: - if task == "embedding" or task == "embed": - return "embed" - if task == "classify": - return "classify" - if task == "reward": - logger.warning( - "Pooling models now default support all pooling; " - "you can use it without any settings." - ) - return "embed" - if task == "score": - new_task = self._get_default_pooling_task(architectures) - return "classify" if new_task == "classify" else "embed" - - return "none" - - if self.task is not None: - runner: RunnerOption = "auto" - convert: ConvertOption = "auto" - msg_prefix = ( - "The 'task' option has been deprecated and will be " - "removed in v0.13.0 or v1.0, whichever comes first." - ) - msg_hint = "Please remove this option." - - is_generative_task = self.task in _RUNNER_TASKS["generate"] - is_pooling_task = self.task in _RUNNER_TASKS["pooling"] - - if is_generative_model and is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "generate` to continue using this model " - "as a generative model." - ) - elif is_pooling_task: - runner = "pooling" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "pooling` to continue using this model " - "as a pooling model." 
- ) - else: # task == "auto" - pass - elif is_generative_model or is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = "Please remove this option" - elif is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--convert " - f"{convert}` to continue using this model " - "as a pooling model." - ) - else: # task == "auto" - pass - else: - # Neither generative nor pooling model - try to convert if possible - if is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--runner pooling " - f"--convert {convert}` to continue using this model " - "as a pooling model." - ) - else: - debug_info = { - "architectures": architectures, - "is_generative_model": is_generative_model, - "is_pooling_model": is_pooling_model, - } - raise AssertionError( - "The model should be a generative or " - "pooling model when task is set to " - f"{self.task!r}. 
Found: {debug_info}" - ) - - self.runner = runner - self.convert = convert - - msg = f"{msg_prefix} {msg_hint}" - warnings.warn(msg, DeprecationWarning, stacklevel=2) - self.runner_type = self._get_runner_type(architectures, self.runner) self.convert_type = self._get_convert_type( architectures, self.runner_type, self.convert @@ -918,22 +803,6 @@ class ModelConfig: return convert_type - def _get_default_pooling_task( - self, - architectures: list[str], - ) -> Literal["embed", "classify", "reward"]: - if self.registry.is_cross_encoder_model(architectures, self): - return "classify" - - for arch in architectures: - match = try_match_architecture_defaults(arch, runner_type="pooling") - if match: - _, (_, convert_type) = match - assert convert_type != "none" - return convert_type - - return "embed" - def _parse_quant_hf_config(self, hf_config: PretrainedConfig): quant_cfg = getattr(hf_config, "quantization_config", None) if quant_cfg is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cbb4862434a98..f303bef17b6a9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -71,7 +71,6 @@ from vllm.config.model import ( LogprobsMode, ModelDType, RunnerOption, - TaskOption, TokenizerMode, ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode @@ -360,7 +359,6 @@ class EngineArgs: hf_config_path: str | None = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert - task: TaskOption | None = ModelConfig.task skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode @@ -373,7 +371,7 @@ class EngineArgs: config_format: str = ModelConfig.config_format dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype - seed: int | None = 0 + seed: int = ModelConfig.seed max_model_len: int | None = 
ModelConfig.max_model_len cudagraph_capture_sizes: list[int] | None = ( CompilationConfig.cudagraph_capture_sizes @@ -462,7 +460,6 @@ class EngineArgs: MultiModalConfig, "media_io_kwargs" ) mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs - disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb mm_processor_cache_type: MMCacheType | None = ( MultiModalConfig.mm_processor_cache_type @@ -558,9 +555,6 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location - # DEPRECATED - enable_multimodal_encoder_data_parallel: bool = False - logits_processors: list[str | type[LogitsProcessor]] | None = ( ModelConfig.logits_processors ) @@ -628,7 +622,6 @@ class EngineArgs: model_group.add_argument("--model", **model_kwargs["model"]) model_group.add_argument("--runner", **model_kwargs["runner"]) model_group.add_argument("--convert", **model_kwargs["convert"]) - model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) model_group.add_argument( @@ -882,11 +875,6 @@ class EngineArgs: parallel_group.add_argument( "--worker-extension-cls", **parallel_kwargs["worker_extension_cls"] ) - parallel_group.add_argument( - "--enable-multimodal-encoder-data-parallel", - action="store_true", - deprecated=True, - ) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -960,9 +948,6 @@ class EngineArgs: multimodal_group.add_argument( "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"] ) - multimodal_group.add_argument( - "--disable-mm-preprocessor-cache", action="store_true", deprecated=True - ) multimodal_group.add_argument( "--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"] ) @@ -1192,62 +1177,20 @@ 
class EngineArgs: if is_gguf(self.model): self.quantization = self.load_format = "gguf" - # NOTE(woosuk): In V1, we use separate processes for workers (unless - # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here - # doesn't affect the user process. - if self.seed is None: - logger.warning_once( - "`seed=None` is equivalent to `seed=0` in V1 Engine. " - "You will no longer be allowed to pass `None` in v0.13.", - scope="local", + if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: + logger.warning( + "The global random seed is set to %d. Since " + "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " + "affect the random state of the Python process that " + "launched vLLM.", + self.seed, ) - self.seed = 0 - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: - logger.warning( - "The global random seed is set to %d. Since " - "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " - "affect the random state of the Python process that " - "launched vLLM.", - self.seed, - ) - - if self.disable_mm_preprocessor_cache: - logger.warning_once( - "`--disable-mm-preprocessor-cache` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-processor-cache-gb 0` instead.", - scope="local", - ) - - self.mm_processor_cache_gb = 0 - elif envs.VLLM_MM_INPUT_CACHE_GIB != 4: - logger.warning_once( - "VLLM_MM_INPUT_CACHE_GIB` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-processor-cache-gb %d` instead.", - envs.VLLM_MM_INPUT_CACHE_GIB, - scope="local", - ) - - self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB - - if self.enable_multimodal_encoder_data_parallel: - logger.warning_once( - "--enable-multimodal-encoder-data-parallel` is deprecated " - "and will be removed in v0.13. 
" - "Please use `--mm-encoder-tp-mode data` instead.", - scope="local", - ) - - self.mm_encoder_tp_mode = "data" - return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, runner=self.runner, convert=self.convert, - task=self.task, tokenizer=self.tokenizer, tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3fce3338503ef..6440b702f4fa6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -198,7 +198,7 @@ class LLM: quantization: QuantizationMethods | None = None, revision: str | None = None, tokenizer_revision: str | None = None, - seed: int | None = None, + seed: int = 0, gpu_memory_utilization: float = 0.9, swap_space: float = 4, cpu_offload_gb: float = 0, diff --git a/vllm/envs.py b/vllm/envs.py index 230f2cf3450a9..0cf0408054063 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -72,7 +72,6 @@ if TYPE_CHECKING: VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MEDIA_CONNECTOR: str = "http" - VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" @@ -786,9 +785,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # imported at runtime. # If a non-existing backend is used, an AssertionError will be thrown. "VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"), - # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache - # Default is 4 GiB per API process + 4 GiB per engine core process - "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), # Path to the XLA persistent cache directory. # Only used for XLA devices such as TPUs. 
"VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser( @@ -1681,7 +1677,6 @@ def compile_factors() -> dict[str, object]: "VLLM_MEDIA_CONNECTOR", "VLLM_ASSETS_CACHE", "VLLM_ASSETS_CACHE_MODEL_CLEAN", - "VLLM_MM_INPUT_CACHE_GIB", "VLLM_WORKER_MULTIPROC_METHOD", "VLLM_ENABLE_V1_MULTIPROCESSING", "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", From d6464f267979946a1c2d9c6029ef2007be73ca09 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 10 Dec 2025 23:05:56 -0500 Subject: [PATCH 25/67] [Chore] Fix torch precision warning (#30428) Signed-off-by: yewentao256 --- tests/v1/e2e/test_async_scheduling.py | 4 ++-- vllm/envs.py | 10 ++++++---- vllm/v1/worker/gpu_worker.py | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 13b36c54123ce..5cef9b33c9984 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -152,8 +152,8 @@ def run_tests( m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA") else: m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") - # lock matmul precision to full FP32 - m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest") + # lock matmul precision to full FP32 (IEEE) + m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee") # m.setenv("VLLM_BATCH_INVARIANT", "1") outputs: list[tuple[str, list, list]] = [] for n, ( diff --git a/vllm/envs.py b/vllm/envs.py index 0cf0408054063..cb75ba1a62de9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,7 +74,7 @@ if TYPE_CHECKING: VLLM_MEDIA_CONNECTOR: str = "http" VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" - VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" + VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -456,11 +456,13 @@ environment_variables: dict[str, Callable[[], Any]] = { 
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.9", # Controls PyTorch float32 matmul precision mode within vLLM workers. - # Valid options mirror torch.set_float32_matmul_precision + # Accepted values: + # - "ieee" (default): force full IEEE FP32 matmul precision. + # - "tf32": enable TensorFloat32-based fast matmul. "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices( "VLLM_FLOAT32_MATMUL_PRECISION", - "highest", - ["highest", "high", "medium"], + "ieee", + ["ieee", "tf32"], case_sensitive=False, ), # Maximum number of compilation jobs to run in parallel. diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f2b6a1f76b0b9..25ac5aaf99818 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -81,7 +81,7 @@ class Worker(WorkerBase): # configure float32 matmul precision according to vLLM env. precision = envs.VLLM_FLOAT32_MATMUL_PRECISION - torch.set_float32_matmul_precision(precision) + torch.backends.cuda.matmul.fp32_precision = precision if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing From 1a516557e11809cd7ab01c8cc399333ea02f7ac6 Mon Sep 17 00:00:00 2001 From: xyDong0223 Date: Thu, 11 Dec 2025 12:52:17 +0800 Subject: [PATCH 26/67] [Doc] Add Baidu Kunlun XPU support (#30455) Signed-off-by: xyDong0223 --- docs/getting_started/installation/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index d5082bc7dd3a9..cff7ce1a882a1 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -26,3 +26,4 @@ The backends below live **outside** the main `vllm` repository and follow the | Rebellions ATOM / REBEL NPU | `vllm-rbln` | | | IBM Spyre AIU | `vllm-spyre` | | | Cambricon MLU | `vllm-mlu` | | +| Baidu Kunlun XPU | N/A, install from source | | From 
36c9ce25543b4f48194d7adc4ba3d17f5b6102be Mon Sep 17 00:00:00 2001 From: gh-wf <111619017+gh-wf@users.noreply.github.com> Date: Thu, 11 Dec 2025 00:26:49 -0500 Subject: [PATCH 27/67] Ensure minimum frames for GLM 4.6V compatibility (#30285) Signed-off-by: Wayne Ferguson --- vllm/model_executor/models/glm4_1v.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 741edfdda3e2c..de091f03e881c 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1257,6 +1257,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): ) height = min(height, overrides.height) + num_frames = max(num_frames, 2) # GLM 4.6V requires 2 frames video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8) video_items = [] for i in range(num_videos): From 979f50efd04552654eca57c7e71e38160a7cbb5c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 14:58:23 +0800 Subject: [PATCH 28/67] [Deprecation] Remove fallbacks for `embed_input_ids` and `embed_multimodal` (#30458) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces.py | 15 +------ vllm/model_executor/models/interfaces_base.py | 9 ----- .../models/mistral_large_3_eagle.py | 39 +++---------------- vllm/model_executor/models/phi3v.py | 7 +--- vllm/model_executor/models/qwen3_vl.py | 7 +--- 5 files changed, 9 insertions(+), 68 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 607ff55835f1d..1e5d80dd2f313 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -111,13 +111,7 @@ class SupportsMultiModal(Protocol): the appearances of their corresponding multimodal data item in the input prompt. 
""" - if hasattr(self, "get_multimodal_embeddings"): - logger.warning_once( - "`get_multimodal_embeddings` for vLLM models is deprecated and will be " - "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename " - "this method to `embed_multimodal`." - ) - return self.get_multimodal_embeddings(**kwargs) + ... def get_language_model(self) -> VllmModel: """ @@ -196,12 +190,7 @@ class SupportsMultiModal(Protocol): if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - if is_multimodal is None: - raise ValueError( - "`embed_input_ids` now requires `is_multimodal` arg, " - "please update your model runner according to " - "https://github.com/vllm-project/vllm/pull/16229." - ) + assert is_multimodal is not None return _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index e8d521ec2e8aa..f988873c9c77c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -68,15 +68,6 @@ def _check_vllm_model_init(model: type[object] | object) -> bool: def _check_vllm_model_embed_input_ids(model: type[object] | object) -> bool: model_embed_input_ids = getattr(model, "embed_input_ids", None) if not callable(model_embed_input_ids): - model_get_input_embeddings = getattr(model, "get_input_embeddings", None) - if callable(model_get_input_embeddings): - logger.warning( - "`get_input_embeddings` for vLLM models is deprecated and will be " - "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename " - "this method to `embed_input_ids`." 
- ) - model.embed_input_ids = model_get_input_embeddings - return True logger.warning( "The model (%s) is missing the `embed_input_ids` method.", model, diff --git a/vllm/model_executor/models/mistral_large_3_eagle.py b/vllm/model_executor/models/mistral_large_3_eagle.py index e3ca9e4ca82d0..37cd4324e53d9 100644 --- a/vllm/model_executor/models/mistral_large_3_eagle.py +++ b/vllm/model_executor/models/mistral_large_3_eagle.py @@ -18,15 +18,10 @@ from vllm.model_executor.models.deepseek_v2 import ( DeepseekV2DecoderLayer, DeepseekV2Model, ) -from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.mistral_large_3 import MistralLarge3ForCausalLM -from vllm.multimodal.inputs import NestedTensors -from .utils import ( - _merge_multimodal_embeddings, - make_empty_intermediate_tensors_factory, - maybe_prefix, -) +from .interfaces import SupportsMultiModal +from .utils import make_empty_intermediate_tensors_factory, maybe_prefix logger = init_logger(__name__) @@ -117,26 +112,10 @@ class EagleMistralLarge3ForCausalLM(MistralLarge3ForCausalLM): ) super().__init__(vllm_config=vllm_config, prefix=prefix) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: MultiModalEmbeddings | None = None, - *, - is_multimodal: torch.Tensor | None = None, - handle_oov_mm_token: bool = False, - ) -> torch.Tensor: - inputs_embeds = super().embed_input_ids(input_ids) + def get_language_model(self) -> torch.nn.Module: + return self.model - if multimodal_embeddings is None or len(multimodal_embeddings) == 0: - return inputs_embeds - - assert is_multimodal is not None - - return _merge_multimodal_embeddings( - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, - ) + embed_input_ids = SupportsMultiModal.embed_input_ids # type: ignore def forward( self, @@ -155,11 +134,3 @@ class EagleMistralLarge3ForCausalLM(MistralLarge3ForCausalLM): "model.embed_tokens.weight", 
"lm_head.weight", } - - def embed_input_ids( - self, - input_ids: torch.Tensor, - multimodal_embeddings: NestedTensors | None = None, - is_multimodal: torch.Tensor | None = None, - ) -> torch.Tensor: - return self.model.embed_input_ids(input_ids) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index b7ae548069f25..0d39e29dcc97b 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -687,12 +687,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant) if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - if is_multimodal is None: - raise ValueError( - "`embed_input_ids` now requires `is_multimodal` arg, " - "please update your model runner according to " - "https://github.com/vllm-project/vllm/pull/16229." - ) + assert is_multimodal is not None return _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 1add39d6b0a84..eac3774196a0a 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1572,12 +1572,7 @@ class Qwen3VLForConditionalGeneration( if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - if is_multimodal is None: - raise ValueError( - "`embed_input_ids` now requires `is_multimodal` arg, " - "please update your model runner according to " - "https://github.com/vllm-project/vllm/pull/16229." 
- ) + assert is_multimodal is not None if self.use_deepstack: ( From d02d1043dea56e4d2b1149a311079d82ff251d9d Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Thu, 11 Dec 2025 15:30:33 +0800 Subject: [PATCH 29/67] fix: enhance human_readable_int function (#30337) Signed-off-by: Andy Xie --- tests/engine/test_arg_utils.py | 22 ++++++++++++++++++---- vllm/engine/arg_utils.py | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index e46f118f8e846..c2cf77ffa12b6 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -350,21 +350,35 @@ def test_human_readable_model_len(): assert args.max_model_len == 1_000_000 args = parser.parse_args(["--max-model-len", "10k"]) assert args.max_model_len == 10_000 + args = parser.parse_args(["--max-model-len", "2g"]) + assert args.max_model_len == 2_000_000_000 + args = parser.parse_args(["--max-model-len", "2t"]) + assert args.max_model_len == 2_000_000_000_000 # Capital args = parser.parse_args(["--max-model-len", "3K"]) - assert args.max_model_len == 1024 * 3 + assert args.max_model_len == 2**10 * 3 args = parser.parse_args(["--max-model-len", "10M"]) assert args.max_model_len == 2**20 * 10 + args = parser.parse_args(["--max-model-len", "4G"]) + assert args.max_model_len == 2**30 * 4 + args = parser.parse_args(["--max-model-len", "4T"]) + assert args.max_model_len == 2**40 * 4 # Decimal values args = parser.parse_args(["--max-model-len", "10.2k"]) assert args.max_model_len == 10200 # ..truncated to the nearest int - args = parser.parse_args(["--max-model-len", "10.212345k"]) + args = parser.parse_args(["--max-model-len", "10.2123451234567k"]) assert args.max_model_len == 10212 + args = parser.parse_args(["--max-model-len", "10.2123451234567m"]) + assert args.max_model_len == 10212345 + args = parser.parse_args(["--max-model-len", "10.2123451234567g"]) + assert args.max_model_len == 10212345123 + args = 
parser.parse_args(["--max-model-len", "10.2123451234567t"]) + assert args.max_model_len == 10212345123456 # Invalid (do not allow decimals with binary multipliers) - for invalid in ["1a", "pwd", "10.24", "1.23M"]: + for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]: with pytest.raises(ArgumentError): - args = parser.parse_args(["--max-model-len", invalid]) + parser.parse_args(["--max-model-len", invalid]) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f303bef17b6a9..3f23b95641d61 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1783,6 +1783,7 @@ class EngineArgs: except Exception: # This is only used to set default_max_num_batched_tokens device_memory = 0 + device_name = "" # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces # throughput, see PR #17885 for more details. @@ -2042,11 +2043,13 @@ def human_readable_int(value): "k": 10**3, "m": 10**6, "g": 10**9, + "t": 10**12, } binary_multiplier = { "K": 2**10, "M": 2**20, "G": 2**30, + "T": 2**40, } number, suffix = match.groups() From fba89069302e9b4d0457bc8eeddeeec76f27f0b1 Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Thu, 11 Dec 2025 00:20:45 -0800 Subject: [PATCH 30/67] [perf] Use direct copy (broadcast) instead of cat for k_nope/k_pe in MLA prefill (#29710) Signed-off-by: Ming Yang --- benchmarks/kernels/benchmark_mla_k_concat.py | 150 +++++++++++++++++++ vllm/v1/attention/backends/mla/common.py | 33 +++- 2 files changed, 180 insertions(+), 3 deletions(-) create mode 100644 benchmarks/kernels/benchmark_mla_k_concat.py diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py new file mode 100644 index 0000000000000..fb3b6c8f12003 --- /dev/null +++ b/benchmarks/kernels/benchmark_mla_k_concat.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe 
concatenation +in MLA (Multi-head Latent Attention) prefill. + +This validates that the optimization from commit 8d4142bd is beneficial across +various batch sizes, not just the originally tested batch size of 32768. +""" + +import time +from collections.abc import Callable + +import torch + +# DeepSeek-V3 MLA dimensions +NUM_HEADS = 128 +QK_NOPE_HEAD_DIM = 128 +PE_DIM = 64 + + +def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor: + """Original torch.cat approach with expand.""" + return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + + +def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor: + """Optimized direct copy approach (avoids expand + cat overhead).""" + k = torch.empty( + (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]), + dtype=k_nope.dtype, + device=k_nope.device, + ) + k[..., : k_nope.shape[-1]] = k_nope + k[..., k_nope.shape[-1] :] = k_pe + return k + + +def benchmark_method( + method: Callable, + k_nope: torch.Tensor, + k_pe: torch.Tensor, + num_warmup: int = 10, + num_iters: int = 100, +) -> float: + """Benchmark a concatenation method and return mean latency in ms.""" + # Warmup + for _ in range(num_warmup): + _ = method(k_nope, k_pe) + torch.cuda.synchronize() + + # Benchmark + start = time.perf_counter() + for _ in range(num_iters): + _ = method(k_nope, k_pe) + torch.cuda.synchronize() + end = time.perf_counter() + + return (end - start) / num_iters * 1000 # Convert to ms + + +@torch.inference_mode() +def run_benchmark(dtype: torch.dtype, dtype_name: str): + """Run benchmark for a specific dtype.""" + torch.set_default_device("cuda") + + # Batch sizes to test (powers of 2 from 32 to 65536) + batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536] + + print("=" * 80) + print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation") + print("=" * 80) + print( + f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], " + 
f"k_pe=[B, 1, {PE_DIM}]" + ) + print(f"dtype: {dtype_name}") + print() + print( + f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | " + f"{'Speedup':>8} | {'Reduction':>10}" + ) + print("-" * 70) + + results = [] + for batch_size in batch_sizes: + # Create input tensors (generate in float32 then convert for FP8 compatibility) + k_nope = torch.randn( + batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda" + ).to(dtype) + k_pe = torch.randn( + batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda" + ).to(dtype) + + # Benchmark both methods + cat_time = benchmark_method(cat_method, k_nope, k_pe) + direct_time = benchmark_method(direct_copy_method, k_nope, k_pe) + + speedup = cat_time / direct_time + reduction = (1 - direct_time / cat_time) * 100 + + results.append((batch_size, cat_time, direct_time, speedup, reduction)) + + print( + f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | " + f"{speedup:>7.2f}x | {reduction:>9.1f}%" + ) + + print("=" * 80) + + # Summary statistics + speedups = [r[3] for r in results] + print("\nSpeedup summary:") + print(f" Min: {min(speedups):.2f}x") + print(f" Max: {max(speedups):.2f}x") + print(f" Mean: {sum(speedups) / len(speedups):.2f}x") + + # Find crossover point + crossover_batch = None + for batch_size, _, _, speedup, _ in results: + if speedup >= 1.0: + crossover_batch = batch_size + break + + print("\nConclusion:") + if crossover_batch: + print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}") + # Filter for large batches (>= 512 which is typical for prefill) + large_batch_speedups = [r[3] for r in results if r[0] >= 512] + if large_batch_speedups: + avg_large = sum(large_batch_speedups) / len(large_batch_speedups) + print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x") + print(" - MLA prefill typically uses large batches, so optimization is effective") + + return results + + +@torch.inference_mode() +def main(): + # Test bfloat16 + 
print("\n") + run_benchmark(torch.bfloat16, "bfloat16") + + # Test float8_e4m3fn + print("\n") + run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn") + + +if __name__ == "__main__": + main() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 0a5257a1d87d8..8265503c28c35 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1654,6 +1654,33 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): # Convert from (L, N, P) to (N, P, L) self.W_UK_T = W_UK.permute(1, 2, 0) + def _concat_k_nope_k_pe( + self, k_nope: torch.Tensor, k_pe: torch.Tensor + ) -> torch.Tensor: + """ + Efficiently concatenate k_nope and k_pe tensors along the last dimension. + + This function avoids the performance penalty of torch.cat with expanded + non-contiguous tensors by pre-allocating the output and using direct copies. + + Args: + k_nope: Tensor of shape [..., nope_dim] + k_pe: Tensor to broadcast and concatenate, typically shape [..., 1, pe_dim] + or [..., pe_dim] + + Returns: + Tensor of shape [..., nope_dim + pe_dim] + """ + k = torch.empty( + (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]), + dtype=k_nope.dtype, + device=k_nope.device, + ) + # Direct copies with efficient broadcasting + k[..., : k_nope.shape[-1]] = k_nope + k[..., k_nope.shape[-1] :] = k_pe + return k + def _compute_prefill_context( self, q: torch.Tensor, @@ -1690,7 +1717,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + k = self._concat_k_nope_k_pe(k_nope, k_pe) attn_output, attn_softmax_lse = self._run_prefill_context_chunk( prefill=prefill_metadata, @@ -1794,7 +1821,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim ) k_nope, v = kv_nope.split([self.qk_nope_head_dim, 
self.v_head_dim], dim=-1) - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + k = self._concat_k_nope_k_pe(k_nope, k_pe) attn_output, attn_softmax_lse = self._run_prefill_context_chunk( prefill=prefill_metadata, @@ -1843,7 +1870,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + k = self._concat_k_nope_k_pe(k_nope, k_pe) output_prefill = self._run_prefill_new_tokens( prefill=attn_metadata.prefill, From 6299628d326f429eba78736acb44e76749b281f5 Mon Sep 17 00:00:00 2001 From: "Rei." <56646027+JaviS-Rei@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:05:08 +0800 Subject: [PATCH 31/67] [bugfix] fix MiniMaxM2ReasoningParser streaming output not separating reasoning_content. (#29882) Signed-off-by: Rei <1477174254@qq.com> --- ...test_minimax_m2_append_reasoning_parser.py | 195 +++++++++++++++ .../test_minimax_m2_reasoning_parser.py | 230 ++++++++++++++++++ vllm/reasoning/minimax_m2_reasoning_parser.py | 43 ++++ 3 files changed, 468 insertions(+) create mode 100644 tests/reasoning/test_minimax_m2_append_reasoning_parser.py create mode 100644 tests/reasoning/test_minimax_m2_reasoning_parser.py diff --git a/tests/reasoning/test_minimax_m2_append_reasoning_parser.py b/tests/reasoning/test_minimax_m2_append_reasoning_parser.py new file mode 100644 index 0000000000000..eefe5e3eff74c --- /dev/null +++ b/tests/reasoning/test_minimax_m2_append_reasoning_parser.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from tests.reasoning.utils import run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +parser_name = "minimax_m2_append_think" +end_token = "" + +# MiniMax M2 model path +REASONING_MODEL_NAME = 
"MiniMaxAI/MiniMax-M2" + + +@pytest.fixture(scope="module") +def minimax_m2_tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +# ============================================================================= +# MiniMaxM2AppendThinkReasoningParser behavior: +# - Prepends to the beginning of the output +# - Does NOT separate reasoning and content +# - Returns everything as content (with prepended) +# - reasoning is always None +# +# This parser is used when you want to keep the raw output with added +# ============================================================================= + +# Case: simple output with end token +SIMPLE_OUTPUT = { + "output": "This is reasoningThis is response", + "reasoning": None, + "content": "This is reasoningThis is response", + "is_reasoning_end": True, +} + +# Case: output without end token (reasoning in progress) +NO_END_TOKEN = { + "output": "This is reasoning in progress", + "reasoning": None, + "content": "This is reasoning in progress", + "is_reasoning_end": False, +} + +# Case: only end token +ONLY_END_TOKEN = { + "output": "This is response", + "reasoning": None, + "content": "This is response", + "is_reasoning_end": True, +} + +# Case: multiple lines +MULTIPLE_LINES = { + "output": "Line 1\nLine 2Response 1\nResponse 2", + "reasoning": None, + "content": "Line 1\nLine 2Response 1\nResponse 2", + "is_reasoning_end": True, +} + +# Case: empty output (non-streaming prepends ) +EMPTY = { + "output": "", + "reasoning": None, + "content": "", + "is_reasoning_end": False, +} + +# Case: empty output streaming (no tokens = no output) +EMPTY_STREAMING = { + "output": "", + "reasoning": None, + "content": None, + "is_reasoning_end": False, +} + +# Case: special characters +SPECIAL_CHARS = { + "output": "Let me think... 1+1=2Yes!", + "reasoning": None, + "content": "Let me think... 
1+1=2Yes!", + "is_reasoning_end": True, +} + +# Case: code in output +CODE_OUTPUT = { + "output": "```python\nprint('hi')\n```Here's the code.", + "reasoning": None, + "content": "```python\nprint('hi')\n```Here's the code.", + "is_reasoning_end": True, +} + +TEST_CASES = [ + pytest.param( + False, + SIMPLE_OUTPUT, + id="simple_output", + ), + pytest.param( + True, + SIMPLE_OUTPUT, + id="simple_output_streaming", + ), + pytest.param( + False, + NO_END_TOKEN, + id="no_end_token", + ), + pytest.param( + True, + NO_END_TOKEN, + id="no_end_token_streaming", + ), + pytest.param( + False, + ONLY_END_TOKEN, + id="only_end_token", + ), + pytest.param( + True, + ONLY_END_TOKEN, + id="only_end_token_streaming", + ), + pytest.param( + False, + MULTIPLE_LINES, + id="multiple_lines", + ), + pytest.param( + True, + MULTIPLE_LINES, + id="multiple_lines_streaming", + ), + pytest.param( + False, + EMPTY, + id="empty", + ), + pytest.param( + True, + EMPTY_STREAMING, + id="empty_streaming", + ), + pytest.param( + False, + SPECIAL_CHARS, + id="special_chars", + ), + pytest.param( + True, + SPECIAL_CHARS, + id="special_chars_streaming", + ), + pytest.param( + False, + CODE_OUTPUT, + id="code_output", + ), + pytest.param( + True, + CODE_OUTPUT, + id="code_output_streaming", + ), +] + + +@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) +def test_reasoning( + streaming: bool, + param_dict: dict, + minimax_m2_tokenizer, +): + output = minimax_m2_tokenizer.tokenize(param_dict["output"]) + # decode everything to tokens + output_tokens: list[str] = [ + minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output + ] + parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)( + minimax_m2_tokenizer + ) + + reasoning, content = run_reasoning_extraction( + parser, output_tokens, streaming=streaming + ) + + assert reasoning == param_dict["reasoning"] + assert content == param_dict["content"] + + # Test is_reasoning_end + output_ids = 
minimax_m2_tokenizer.convert_tokens_to_ids(output) + is_reasoning_end = parser.is_reasoning_end(output_ids) + assert is_reasoning_end == param_dict["is_reasoning_end"] diff --git a/tests/reasoning/test_minimax_m2_reasoning_parser.py b/tests/reasoning/test_minimax_m2_reasoning_parser.py new file mode 100644 index 0000000000000..0d1056894c6ae --- /dev/null +++ b/tests/reasoning/test_minimax_m2_reasoning_parser.py @@ -0,0 +1,230 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from tests.reasoning.utils import run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +parser_name = "minimax_m2" +end_token = "" + +# MiniMax M2 model path +REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2" + + +@pytest.fixture(scope="module") +def minimax_m2_tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +# ============================================================================= +# MiniMax M2 specific behavior: +# - Model does NOT generate start token +# - Model only generates end token +# - All content before is reasoning +# - All content after is the actual response (content) +# ============================================================================= + +# Case: reasoning + end token + content (typical case) +SIMPLE_REASONING = { + "output": "This is a reasoning sectionThis is the rest", + "reasoning": "This is a reasoning section", + "content": "This is the rest", + "is_reasoning_end": True, +} + +# Case: reasoning + end token only (no content after) +COMPLETE_REASONING = { + "output": "This is a reasoning section", + "reasoning": "This is a reasoning section", + "content": None, + "is_reasoning_end": True, +} + +# Case: no end token yet (streaming in progress, all is reasoning) +NO_END_TOKEN = { + "output": "This is reasoning in progress", + "reasoning": "This is reasoning in progress", + 
"content": None, + "is_reasoning_end": False, +} + +# Case: multiple lines of reasoning +MULTIPLE_LINES = { + "output": "First line\nSecond lineResponse first line\nResponse second", + "reasoning": "First line\nSecond line", + "content": "Response first line\nResponse second", + "is_reasoning_end": True, +} + +# Case: only end token (empty reasoning, immediate response) +SHORTEST_REASONING_NO_STREAMING = { + "output": "This is the response", + "reasoning": "", + "content": "This is the response", + "is_reasoning_end": True, +} + +# Case: only end token streaming (reasoning is None because it's just the token) +SHORTEST_REASONING_STREAMING = { + "output": "This is the response", + "reasoning": None, + "content": "This is the response", + "is_reasoning_end": True, +} + +# Case: empty output +EMPTY = { + "output": "", + "reasoning": "", + "content": None, + "is_reasoning_end": False, +} + +# Case: empty streaming +EMPTY_STREAMING = { + "output": "", + "reasoning": None, + "content": None, + "is_reasoning_end": False, +} + +# Case: long reasoning with special characters +SPECIAL_CHARS = { + "output": "Let me think... 1+1=2, right?Yes, 1+1=2.", + "reasoning": "Let me think... 
1+1=2, right?", + "content": "Yes, 1+1=2.", + "is_reasoning_end": True, +} + +# Case: reasoning with code blocks +CODE_IN_REASONING = { + "output": "```python\nprint('hello')\n```Here is the code.", + "reasoning": "```python\nprint('hello')\n```", + "content": "Here is the code.", + "is_reasoning_end": True, +} + +TEST_CASES = [ + # Core cases: no start token (MiniMax M2 actual behavior) + pytest.param( + False, + SIMPLE_REASONING, + id="simple_reasoning", + ), + pytest.param( + True, + SIMPLE_REASONING, + id="simple_reasoning_streaming", + ), + pytest.param( + False, + COMPLETE_REASONING, + id="complete_reasoning", + ), + pytest.param( + True, + COMPLETE_REASONING, + id="complete_reasoning_streaming", + ), + pytest.param( + False, + NO_END_TOKEN, + id="no_end_token", + ), + pytest.param( + True, + NO_END_TOKEN, + id="no_end_token_streaming", + ), + pytest.param( + False, + MULTIPLE_LINES, + id="multiple_lines", + ), + pytest.param( + True, + MULTIPLE_LINES, + id="multiple_lines_streaming", + ), + pytest.param( + False, + SHORTEST_REASONING_NO_STREAMING, + id="shortest_reasoning", + ), + pytest.param( + True, + SHORTEST_REASONING_STREAMING, + id="shortest_reasoning_streaming", + ), + pytest.param( + False, + EMPTY, + id="empty", + ), + pytest.param( + True, + EMPTY_STREAMING, + id="empty_streaming", + ), + pytest.param( + False, + SPECIAL_CHARS, + id="special_chars", + ), + pytest.param( + True, + SPECIAL_CHARS, + id="special_chars_streaming", + ), + pytest.param( + False, + CODE_IN_REASONING, + id="code_in_reasoning", + ), + pytest.param( + True, + CODE_IN_REASONING, + id="code_in_reasoning_streaming", + ), +] + + +@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) +def test_reasoning( + streaming: bool, + param_dict: dict, + minimax_m2_tokenizer, +): + output = minimax_m2_tokenizer.tokenize(param_dict["output"]) + # decode everything to tokens + output_tokens: list[str] = [ + minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output + 
] + parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)( + minimax_m2_tokenizer + ) + + reasoning, content = run_reasoning_extraction( + parser, output_tokens, streaming=streaming + ) + + assert reasoning == param_dict["reasoning"] + assert content == param_dict["content"] + + # Test is_reasoning_end + output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output) + is_reasoning_end = parser.is_reasoning_end(output_ids) + assert is_reasoning_end == param_dict["is_reasoning_end"] + + # Test extract_content + if param_dict["content"] is not None: + content = parser.extract_content_ids(output_ids) + assert content == minimax_m2_tokenizer.convert_tokens_to_ids( + minimax_m2_tokenizer.tokenize(param_dict["content"]) + ) + else: + content = parser.extract_content_ids(output) + assert content == [] diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py index 138d1b4e6dacf..a2b9224cb3bff 100644 --- a/vllm/reasoning/minimax_m2_reasoning_parser.py +++ b/vllm/reasoning/minimax_m2_reasoning_parser.py @@ -19,6 +19,10 @@ logger = init_logger(__name__) class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser): """ Reasoning parser for MiniMax M2 model. + + MiniMax M2 models don't generate start token, only end + token. All content before is reasoning, content after is the + actual response. """ @property @@ -31,6 +35,45 @@ class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser): """The token that ends reasoning content.""" return "" + def extract_reasoning_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> DeltaMessage | None: + """ + Extract reasoning content from a delta message for streaming. + + MiniMax M2 models don't generate start token, so we assume + all content is reasoning until we encounter the end token. 
+ """ + # Skip single end token + if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id: + return None + + # Check if end token has already appeared in previous tokens + # meaning we're past the reasoning phase + if self.end_token_id in previous_token_ids: + # We're past the reasoning phase, this is content + return DeltaMessage(content=delta_text) + + # Check if end token is in delta tokens + if self.end_token_id in delta_token_ids: + # End token in delta, split reasoning and content + end_index = delta_text.find(self.end_token) + reasoning = delta_text[:end_index] + content = delta_text[end_index + len(self.end_token) :] + return DeltaMessage( + reasoning=reasoning if reasoning else None, + content=content if content else None, + ) + + # No end token yet, all content is reasoning + return DeltaMessage(reasoning=delta_text) + class MiniMaxM2AppendThinkReasoningParser(ReasoningParser): """ From b4e8b91278e6cb8547b5545eba28626a3d5ac052 Mon Sep 17 00:00:00 2001 From: wz1qqx <55830058+wz1qqx@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:23:52 +0800 Subject: [PATCH 32/67] [Fix]fix import error from lmcache (#30376) Signed-off-by: wz1qqx Co-authored-by: wz1qqx From 13d63b65e0604db23c1485d370dbf9adc4e651c7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 18:06:36 +0800 Subject: [PATCH 33/67] [Deprecation] Remove missed fallback for `embed_input_ids` (#30469) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces_base.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index f988873c9c77c..134a1d9483804 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -49,13 +49,7 @@ class VllmModel(Protocol[T_co]): def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: """Apply token embeddings to `input_ids`.""" - if hasattr(self, 
"get_input_embeddings"): - logger.warning_once( - "`get_input_embeddings` for vLLM models is deprecated and will be " - "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename " - "this method to `embed_input_ids`." - ) - return self.get_input_embeddings(input_ids) + ... def forward(self, input_ids: torch.Tensor, positions: torch.Tensor) -> T_co: ... From 4515eb1a0b71fbdca68c95eb261b046bbd830d15 Mon Sep 17 00:00:00 2001 From: jeremyteboul <80506730+jeremyteboul@users.noreply.github.com> Date: Thu, 11 Dec 2025 02:14:57 -0800 Subject: [PATCH 34/67] [Fix] Update lazing loading of video loader backend (#30444) Signed-off-by: Jeremy Teboul Co-authored-by: Jeremy Teboul --- tests/multimodal/test_video.py | 124 ++++++++++++++++++++++++++++++++- vllm/multimodal/video.py | 9 ++- 2 files changed, 131 insertions(+), 2 deletions(-) diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index 6ed21de368ac3..eccaa53ea1004 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): """ Regression test for handling videos with broken frames. This test uses a pre-corrupted video file (assets/corrupted.mp4) that - contains broken/unreadable frames to verify the video loader handles + contains broken frames to verify the video loader handles them gracefully without crashing and returns accurate metadata. 
""" with monkeypatch.context() as m: @@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): f"Expected fewer than {metadata['total_num_frames']} frames, " f"but loaded {frames.shape[0]} frames" ) + + +@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1") +class TestVideoBackendOverride1(VideoLoader): + """Test loader that returns FAKE_OUTPUT_1 to verify backend selection.""" + + @classmethod + def load_bytes( + cls, data: bytes, num_frames: int = -1, **kwargs + ) -> tuple[npt.NDArray, dict]: + return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"} + + +@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2") +class TestVideoBackendOverride2(VideoLoader): + """Test loader that returns FAKE_OUTPUT_2 to verify backend selection.""" + + @classmethod + def load_bytes( + cls, data: bytes, num_frames: int = -1, **kwargs + ) -> tuple[npt.NDArray, dict]: + return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"} + + +def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch): + """ + Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND + environment variable. + + This allows users to dynamically select a different video backend + via --media-io-kwargs without changing the global env var, which is + useful when plugins set a default backend but a specific request + needs a different one. 
+ """ + with monkeypatch.context() as m: + # Set the env var to one backend + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1") + + imageio = ImageMediaIO() + + # Without video_backend kwarg, should use env var backend + videoio_default = VideoMediaIO(imageio, num_frames=10) + frames_default, metadata_default = videoio_default.load_bytes(b"test") + np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1) + assert metadata_default["video_backend"] == "test_video_backend_override_1" + + # With video_backend kwarg, should override env var + videoio_override = VideoMediaIO( + imageio, num_frames=10, video_backend="test_video_backend_override_2" + ) + frames_override, metadata_override = videoio_override.load_bytes(b"test") + np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2) + assert metadata_override["video_backend"] == "test_video_backend_override_2" + + +def test_video_media_io_backend_kwarg_not_passed_to_loader( + monkeypatch: pytest.MonkeyPatch, +): + """ + Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed + through to the underlying video loader's load_bytes method. + + This ensures the kwarg is properly popped from kwargs before forwarding. 
+ """ + + @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg") + class RejectVideoBackendKwargLoader(VideoLoader): + """Test loader that fails if video_backend is passed through.""" + + @classmethod + def load_bytes( + cls, data: bytes, num_frames: int = -1, **kwargs + ) -> tuple[npt.NDArray, dict]: + # This should never receive video_backend in kwargs + if "video_backend" in kwargs: + raise AssertionError( + "video_backend should be consumed by VideoMediaIO, " + "not passed to loader" + ) + return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs.keys())} + + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg") + + imageio = ImageMediaIO() + + # Even when video_backend is provided, it should NOT be passed to loader + videoio = VideoMediaIO( + imageio, + num_frames=10, + video_backend="test_reject_video_backend_kwarg", + other_kwarg="should_pass_through", + ) + + # This should NOT raise AssertionError + frames, metadata = videoio.load_bytes(b"test") + np.testing.assert_array_equal(frames, FAKE_OUTPUT_1) + # Verify other kwargs are still passed through + assert "other_kwarg" in metadata["received_kwargs"] + + +def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch): + """ + Test that when video_backend kwarg is None or not provided, + VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var. 
+ """ + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2") + + imageio = ImageMediaIO() + + # Explicit None should fall back to env var + videoio_none = VideoMediaIO(imageio, num_frames=10, video_backend=None) + frames_none, metadata_none = videoio_none.load_bytes(b"test") + np.testing.assert_array_equal(frames_none, FAKE_OUTPUT_2) + assert metadata_none["video_backend"] == "test_video_backend_override_2" + + # Not providing video_backend should also fall back to env var + videoio_missing = VideoMediaIO(imageio, num_frames=10) + frames_missing, metadata_missing = videoio_missing.load_bytes(b"test") + np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2) + assert metadata_missing["video_backend"] == "test_video_backend_override_2" diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index abfc226a689c2..024252799cf74 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -283,8 +283,15 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]): # They can be passed to the underlying # media loaders (e.g. custom implementations) # for flexible control. + + # Allow per-request override of video backend via kwargs. + # This enables users to specify a different backend than the + # global VLLM_VIDEO_LOADER_BACKEND env var, e.g.: + # --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}' + video_loader_backend = ( + kwargs.pop("video_backend", None) or envs.VLLM_VIDEO_LOADER_BACKEND + ) self.kwargs = kwargs - video_loader_backend = envs.VLLM_VIDEO_LOADER_BACKEND self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend) def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]: From a5f9fb59604f3a84e8be1317e33b2d368c9fc6f9 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 11 Dec 2025 18:18:25 +0800 Subject: [PATCH 35/67] [Deprecation] Deprecation `--convert reward`, use `--convert embed` instead. 
(#30463) Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 5 ++++- vllm/config/model.py | 7 +++++++ vllm/config/pooler.py | 6 ++++-- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 32ffcf96fabef..b4b0150faf841 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -316,10 +316,13 @@ We have split the `encode` task into two more specific token-wise tasks: `token_ ### Remove softmax from PoolingParams -We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function. +We are going to remove `softmax` and `activation` from `PoolingParams` in v0.15. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function. ### as_reward_model +!!! warning + We are going to remove `--convert reward` in v0.15, use `--convert embed` instead. + Pooling models now default support all pooling, you can use it without any settings. - Extracting hidden states prefers using `token_embed` task. diff --git a/vllm/config/model.py b/vllm/config/model.py index bd98111ffb5db..03140c17fb50e 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -788,6 +788,13 @@ class ModelConfig: runner_type: RunnerType, convert: ConvertOption, ) -> ConvertType: + if convert == "reward": + logger.warning( + "`--convert reward` is deprecated and will be removed in v0.15. " + "Please use `--convert embed` instead." 
+ ) + return "embed" + if convert != "auto": return convert diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index aa4e7006d0247..976ae8c063eb7 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -111,13 +111,15 @@ class PoolerConfig: def get_use_activation(o: object): if softmax := getattr(o, "softmax", None) is not None: logger.warning_once( - "softmax will be deprecated, please use use_activation instead." + "softmax will be deprecated and will be removed in v0.15. " + "Please use use_activation instead." ) return softmax if activation := getattr(o, "activation", None) is not None: logger.warning_once( - "activation will be deprecated, please use use_activation instead." + "activation will be deprecated and will be removed in v0.15. " + "Please use use_activation instead." ) return activation From d917747c95b212f9b7e85c100bc572e3e5d33360 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 18:33:55 +0800 Subject: [PATCH 36/67] [Bugfix] Fix `task` still being passed in tests/benchmarks (#30476) Signed-off-by: DarkLight1337 --- benchmarks/benchmark_ngram_proposer.py | 1 - tests/models/language/pooling/test_mm_classifier_conversion.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index 872a263318ff7..b5373d383b548 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -32,7 +32,6 @@ def benchmark_propose(args): model_config = ModelConfig( model="facebook/opt-125m", - task="generate", max_model_len=args.num_token + args.num_spec_token, tokenizer="facebook/opt-125m", tokenizer_mode="auto", diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py index a31a771238e26..d50ee85b9fd2b 100644 --- a/tests/models/language/pooling/test_mm_classifier_conversion.py +++ 
b/tests/models/language/pooling/test_mm_classifier_conversion.py @@ -17,7 +17,6 @@ def test_idefics_multimodal( with vllm_runner( model_name="HuggingFaceM4/Idefics3-8B-Llama3", runner="pooling", - task="classify", convert="classify", load_format="dummy", max_model_len=512, @@ -86,7 +85,6 @@ def test_gemma_multimodal( with vllm_runner( model_name="google/gemma-3-4b-it", runner="pooling", - task="classify", convert="classify", load_format="auto", hf_overrides=update_config, From 853611bb181290787d05502568fe76837507fdd9 Mon Sep 17 00:00:00 2001 From: Kenichi Maehashi <939877+kmaehashi@users.noreply.github.com> Date: Thu, 11 Dec 2025 20:07:56 +0900 Subject: [PATCH 37/67] Fix typo of endpoint name in CLI args docs (#30473) Signed-off-by: Kenichi Maehashi --- vllm/entrypoints/openai/cli_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 946362ce2ef0a..b798b05dcfcbf 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -176,7 +176,7 @@ class FrontendArgs: enable_force_include_usage: bool = False """If set to True, including usage on every request.""" enable_tokenizer_info_endpoint: bool = False - """Enable the /get_tokenizer_info endpoint. May expose chat + """Enable the `/tokenizer_info` endpoint. May expose chat templates and other tokenizer configuration.""" enable_log_outputs: bool = False """If True, log model outputs (generations). 
From a11f4a81e027efd9ef783b943489c222950ac989 Mon Sep 17 00:00:00 2001 From: Qiu Date: Thu, 11 Dec 2025 19:36:18 +0800 Subject: [PATCH 38/67] [Misc][PCP&DCP] relocate PCP feature check (#30050) Signed-off-by: QiuChunshuo Co-authored-by: Cyrus Leung --- vllm/attention/backends/abstract.py | 6 +++++ vllm/config/parallel.py | 5 ---- vllm/config/vllm.py | 5 ---- vllm/engine/arg_utils.py | 10 ------- vllm/v1/worker/cp_utils.py | 42 +++++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 18 +++---------- 6 files changed, 52 insertions(+), 34 deletions(-) create mode 100644 vllm/v1/worker/cp_utils.py diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 03f4c40302eb8..025ede1eb0a4e 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -294,6 +294,12 @@ class AttentionImpl(ABC, Generic[T]): # Some features like decode context parallelism require the softmax lse. can_return_lse_for_decode: bool = False + # Whether the attention impl supports Prefill Context Parallelism. + supports_pcp: bool = False + # Whether the attention impl(or ops) supports MTP + # when cp_kv_cache_interleave_size > 1 + supports_mtp_with_cp_non_trivial_interleave_size: bool = False + # some attention backends might not always want to return lse # even if they can return lse (for efficiency reasons) need_to_return_lse_for_decode: bool = False diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 0327832c4fb8c..1f9dd38ac9114 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -317,11 +317,6 @@ class ParallelConfig: "num_redundant_experts." ) - if self.prefill_context_parallel_size > 1: - raise ValueError( - "Prefill context parallelism is not fully supported. " - "Please set prefill_context_parallel_size to 1." 
- ) return self @property diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index a3a9eec9b3203..0e75daf0d722c 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -820,11 +820,6 @@ class VllmConfig: f"({self.parallel_config.cp_kv_cache_interleave_size})." ) - assert ( - self.parallel_config.cp_kv_cache_interleave_size == 1 - or self.speculative_config is None - ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now." - # Do this after all the updates to compilation_config.mode self.compilation_config.set_splitting_ops_for_v1( all2all_backend=self.parallel_config.all2all_backend, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3f23b95641d61..757023e12d439 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1848,16 +1848,6 @@ class EngineArgs: default_chunked_prefill = model_config.is_chunked_prefill_supported default_prefix_caching = model_config.is_prefix_caching_supported - if self.prefill_context_parallel_size > 1: - default_chunked_prefill = False - default_prefix_caching = False - logger.warning_once( - "--prefill-context-parallel-size > 1 is not compatible with " - "chunked prefill and prefix caching now. 
Chunked prefill " - "and prefix caching have been disabled by default.", - scope="local", - ) - if self.enable_chunked_prefill is None: self.enable_chunked_prefill = default_chunked_prefill diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py new file mode 100644 index 0000000000000..f666c739b0be7 --- /dev/null +++ b/vllm/v1/worker/cp_utils.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import TYPE_CHECKING, Any, cast + +from vllm.config import VllmConfig, get_layers_from_vllm_config + +if TYPE_CHECKING: + from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +else: + AttentionLayerBase = object + + +def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: + pcp_size = vllm_config.parallel_config.prefill_context_parallel_size + dcp_size = vllm_config.parallel_config.decode_context_parallel_size + interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size + if pcp_size * dcp_size > 1: + layer_type = cast(type[Any], AttentionLayerBase) + layers = get_layers_from_vllm_config(vllm_config, layer_type) + for layer in layers.values(): + layer_impl = getattr(layer, "impl", None) + if layer_impl is None: + continue + if vllm_config.speculative_config is not None and interleave_size > 1: + assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, ( + "MTP with cp_kv_cache_interleave_size > 1 is not " + f"supported in {layer_impl.__class__.__name__}." + ) + if dcp_size > 1: + assert layer_impl.need_to_return_lse_for_decode, ( + "DCP requires attention impls to return" + " the softmax lse for decode, but the impl " + f"{layer_impl.__class__.__name__} " + "does not return the softmax lse for decode." + ) + + if pcp_size > 1: + assert layer_impl.supports_pcp, ( + "PCP requires attention impls' support, " + f"but the impl {layer_impl.__class__.__name__} " + "does not support PCP." 
+ ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7dc86f1ee4815..0e2bf9df9a18f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -148,6 +148,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext +from vllm.v1.worker.cp_utils import check_attention_cp_compatibility from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -4736,6 +4737,9 @@ class GPUModelRunner( attention_backend_list, kv_cache_config.kv_cache_groups ) + # Check if attention backend supports PCP&DCP and related features. + check_attention_cp_compatibility(self.vllm_config) + for i, attn_backend_map in enumerate(attention_backend_maps): self.attn_groups.append(create_attn_groups(attn_backend_map, i)) @@ -5394,20 +5398,6 @@ class GPUModelRunner( kv_transfer_group.register_kv_caches(kv_caches) kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) - if self.dcp_world_size > 1: - layer_type = cast(type[Any], AttentionLayerBase) - layers = get_layers_from_vllm_config(self.vllm_config, layer_type) - for layer in layers.values(): - layer_impl = getattr(layer, "impl", None) - if layer_impl is None: - continue - assert layer_impl.need_to_return_lse_for_decode, ( - "DCP requires attention impls to return" - " the softmax lse for decode, but the impl " - f"{layer_impl.__class__.__name__} " - "does not return the softmax lse for decode." - ) - def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: """ Add encoder-only layers to the KV cache config. 
From f4417f8449dc7a2cb890dbef659c0d1ce93432da Mon Sep 17 00:00:00 2001 From: Martin Hickey Date: Thu, 11 Dec 2025 14:30:29 +0000 Subject: [PATCH 39/67] [KVConnector] Add KV events to KV Connectors (#28309) Signed-off-by: Martin Hickey --- .../unit/test_lmcache_connector.py | 756 ++++++++++++++++++ vllm/distributed/kv_events.py | 130 ++- .../kv_transfer/kv_connector/utils.py | 15 + .../kv_transfer/kv_connector/v1/base.py | 10 +- .../kv_connector/v1/lmcache_connector.py | 117 ++- .../kv_connector/v1/multi_connector.py | 6 + vllm/v1/outputs.py | 4 + .../worker/kv_connector_model_runner_mixin.py | 13 +- 8 files changed, 1036 insertions(+), 15 deletions(-) create mode 100644 tests/v1/kv_connector/unit/test_lmcache_connector.py diff --git a/tests/v1/kv_connector/unit/test_lmcache_connector.py b/tests/v1/kv_connector/unit/test_lmcache_connector.py new file mode 100644 index 0000000000000..6a8cfc71a67a6 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_lmcache_connector.py @@ -0,0 +1,756 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from unittest.mock import MagicMock + +import pytest + +from vllm.distributed.kv_events import BlockStored +from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector import ( + LMCacheConnectorV1, + LMCacheKVEvents, +) +from vllm.v1.outputs import KVConnectorOutput + + +@pytest.fixture +def mock_lmcache_engine_event(): + """Create a mock event object that mimics what the lmcache engine returns.""" + + class MockEvent: + def __init__( + self, + block_hashes, + parent_block_hash, + token_ids, + lora_id, + block_size, + medium, + ): + self.block_hashes = block_hashes + self.parent_block_hash = parent_block_hash + self.token_ids = token_ids + self.lora_id = lora_id + self.block_size = block_size + self.medium = medium + + return MockEvent( + block_hashes=["hash1", "hash2"], + parent_block_hash="parent_hash", + token_ids=[1, 2, 3, 4], + lora_id=None, + block_size=16, + 
medium="GPU", + ) + + +@pytest.fixture +def mock_connector(): + """Create a mock LMCacheConnectorV1 instance with mocked dependencies.""" + connector = MagicMock(spec=LMCacheConnectorV1) + connector._kv_cache_events = None + connector._lmcache_engine = MagicMock() + + # Make the methods use the real implementation + connector.get_kv_connector_kv_cache_events = ( + LMCacheConnectorV1.get_kv_connector_kv_cache_events.__get__( + connector, LMCacheConnectorV1 + ) + ) + connector.update_connector_output = ( + LMCacheConnectorV1.update_connector_output.__get__( + connector, LMCacheConnectorV1 + ) + ) + connector.take_events = LMCacheConnectorV1.take_events.__get__( + connector, LMCacheConnectorV1 + ) + + return connector + + +class TestGetKVConnectorKVCacheEvents: + """Test get_kv_connector_kv_cache_events method.""" + + def test_returns_none_when_no_events(self, mock_connector): + """Test that None is returned when lmcache engine has no events.""" + mock_connector._lmcache_engine.get_kv_events.return_value = None + + result = mock_connector.get_kv_connector_kv_cache_events() + + assert result is None + mock_connector._lmcache_engine.get_kv_events.assert_called_once() + + def test_returns_none_when_empty_list(self, mock_connector): + """Test that None is returned when lmcache engine returns empty list.""" + mock_connector._lmcache_engine.get_kv_events.return_value = [] + + result = mock_connector.get_kv_connector_kv_cache_events() + + assert result is None + + def test_converts_single_event(self, mock_connector, mock_lmcache_engine_event): + """Test conversion of a single event from lmcache engine format.""" + mock_connector._lmcache_engine.get_kv_events.return_value = [ + mock_lmcache_engine_event + ] + + result = mock_connector.get_kv_connector_kv_cache_events() + + assert result is not None + assert isinstance(result, LMCacheKVEvents) + assert result.get_number_of_workers() == 1 + + events = result.get_all_events() + assert len(events) == 1 + assert 
isinstance(events[0], BlockStored) + assert events[0].block_hashes == ["hash1", "hash2"] + assert events[0].parent_block_hash == "parent_hash" + assert events[0].token_ids == [1, 2, 3, 4] + assert events[0].lora_id is None + assert events[0].block_size == 16 + assert events[0].medium == "GPU" + + def test_converts_multiple_events(self, mock_connector): + """Test conversion of multiple events from lmcache engine format.""" + + class MockEvent: + def __init__(self, i): + self.block_hashes = [f"hash{i}"] + self.parent_block_hash = f"parent{i}" + self.token_ids = [i] + self.lora_id = None + self.block_size = 16 + self.medium = "GPU" + + events = [MockEvent(i) for i in range(5)] + mock_connector._lmcache_engine.get_kv_events.return_value = events + + result = mock_connector.get_kv_connector_kv_cache_events() + + assert result is not None + assert isinstance(result, LMCacheKVEvents) + + converted_events = result.get_all_events() + assert len(converted_events) == 5 + + for i, event in enumerate(converted_events): + assert isinstance(event, BlockStored) + assert event.block_hashes == [f"hash{i}"] + assert event.parent_block_hash == f"parent{i}" + assert event.token_ids == [i] + + def test_preserves_event_attributes(self, mock_connector): + """Test that all event attributes are correctly preserved.""" + + class MockEventWithLora: + def __init__(self): + self.block_hashes = ["hash_a", "hash_b", "hash_c"] + self.parent_block_hash = "parent_xyz" + self.token_ids = [100, 200, 300] + self.lora_id = 42 + self.block_size = 32 + self.medium = "DISK" + + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEventWithLora() + ] + + result = mock_connector.get_kv_connector_kv_cache_events() + + events = result.get_all_events() + event = events[0] + + assert event.block_hashes == ["hash_a", "hash_b", "hash_c"] + assert event.parent_block_hash == "parent_xyz" + assert event.token_ids == [100, 200, 300] + assert event.lora_id == 42 + assert event.block_size == 32 + assert 
event.medium == "DISK" + + def test_handles_none_parent_block_hash(self, mock_connector): + """Test handling of events with None parent_block_hash.""" + + class MockEventNoParent: + def __init__(self): + self.block_hashes = ["hash1"] + self.parent_block_hash = None + self.token_ids = [1, 2] + self.lora_id = None + self.block_size = 16 + self.medium = "GPU" + + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEventNoParent() + ] + + result = mock_connector.get_kv_connector_kv_cache_events() + + events = result.get_all_events() + assert events[0].parent_block_hash is None + + +class TestUpdateConnectorOutput: + """Test update_connector_output method.""" + + def test_does_nothing_when_kv_cache_events_is_none(self, mock_connector): + """Test that method returns early when kv_cache_events is None.""" + connector_output = KVConnectorOutput(kv_cache_events=None) + + mock_connector.update_connector_output(connector_output) + + assert mock_connector._kv_cache_events is None + + def test_does_nothing_when_kv_cache_events_is_not_lmcache_kv_events( + self, mock_connector + ): + """Test that method returns early when kv_cache_events is not + LMCacheKVEvents.""" + # Create a mock object that is not LMCacheKVEvents + fake_events = MagicMock() + connector_output = KVConnectorOutput(kv_cache_events=fake_events) + + mock_connector.update_connector_output(connector_output) + + assert mock_connector._kv_cache_events is None + + def test_sets_kv_cache_events_when_none(self, mock_connector): + """Test that _kv_cache_events is set when it was None.""" + kv_events = LMCacheKVEvents(num_workers=1) + event = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1, 2], + block_size=16, + lora_id=None, + medium="GPU", + ) + kv_events.add_events([event]) + + connector_output = KVConnectorOutput(kv_cache_events=kv_events) + + mock_connector.update_connector_output(connector_output) + + assert mock_connector._kv_cache_events is kv_events + + def 
test_adds_events_when_kv_cache_events_already_exists(self, mock_connector): + """Test that events are added when _kv_cache_events already exists.""" + # Set up existing events + existing_events = LMCacheKVEvents(num_workers=2) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + existing_events.add_events([event1]) + existing_events.add_events([event1]) # Simulate 2 workers reporting + + mock_connector._kv_cache_events = existing_events + + # Create new events to add + new_events = LMCacheKVEvents(num_workers=1) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + new_events.add_events([event2]) + + connector_output = KVConnectorOutput(kv_cache_events=new_events) + + mock_connector.update_connector_output(connector_output) + + # Check that events were added + all_events = mock_connector._kv_cache_events.get_all_events() + assert len(all_events) == 3 # 2 from existing + 1 from new + assert event1 in all_events + assert event2 in all_events + + def test_increments_workers_when_kv_cache_events_already_exists( + self, mock_connector + ): + """Test that worker count is incremented correctly.""" + # Set up existing events with 2 workers + existing_events = LMCacheKVEvents(num_workers=2) + mock_connector._kv_cache_events = existing_events + + # Create new events from 3 workers + new_events = LMCacheKVEvents(num_workers=3) + event = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + new_events.add_events([event]) + + connector_output = KVConnectorOutput(kv_cache_events=new_events) + + mock_connector.update_connector_output(connector_output) + + # Worker count should be 2 + 3 = 5 + assert mock_connector._kv_cache_events.get_number_of_workers() == 5 + + def test_multiple_updates(self, mock_connector): + 
"""Test multiple consecutive updates.""" + # First update + events1 = LMCacheKVEvents(num_workers=1) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + events1.add_events([event1]) + output1 = KVConnectorOutput(kv_cache_events=events1) + mock_connector.update_connector_output(output1) + + # Second update + events2 = LMCacheKVEvents(num_workers=2) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + events2.add_events([event2]) + output2 = KVConnectorOutput(kv_cache_events=events2) + mock_connector.update_connector_output(output2) + + # Third update + events3 = LMCacheKVEvents(num_workers=1) + event3 = BlockStored( + block_hashes=["hash3"], + parent_block_hash=None, + token_ids=[3], + block_size=16, + lora_id=None, + medium="GPU", + ) + events3.add_events([event3]) + output3 = KVConnectorOutput(kv_cache_events=events3) + mock_connector.update_connector_output(output3) + + # Check final state + all_events = mock_connector._kv_cache_events.get_all_events() + assert len(all_events) == 3 + assert mock_connector._kv_cache_events.get_number_of_workers() == 4 # 1+2+1 + + def test_updates_with_empty_events(self, mock_connector): + """Test updating with empty event lists.""" + # First update with actual events + events1 = LMCacheKVEvents(num_workers=1) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + events1.add_events([event1]) + output1 = KVConnectorOutput(kv_cache_events=events1) + mock_connector.update_connector_output(output1) + + # Second update with empty events + events2 = LMCacheKVEvents(num_workers=2) + # No events added + output2 = KVConnectorOutput(kv_cache_events=events2) + mock_connector.update_connector_output(output2) + + # Should still have the original event + all_events 
= mock_connector._kv_cache_events.get_all_events() + assert len(all_events) == 1 + assert mock_connector._kv_cache_events.get_number_of_workers() == 3 + + +class TestTakeEvents: + """Test take_events method.""" + + def test_yields_nothing_when_kv_cache_events_is_none(self, mock_connector): + """Test that nothing is yielded when _kv_cache_events is None.""" + mock_connector._kv_cache_events = None + + events = list(mock_connector.take_events()) + + assert events == [] + + def test_yields_events_and_clears(self, mock_connector): + """Test that events are yielded and then cleared.""" + # Set up events + kv_events = LMCacheKVEvents(num_workers=1) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + kv_events.add_events([event1, event2]) + mock_connector._kv_cache_events = kv_events + + # Take events + events = list(mock_connector.take_events()) + + # Check that events were yielded + assert len(events) == 2 + assert event1 in events + assert event2 in events + + # Check that _kv_cache_events was cleared + assert mock_connector._kv_cache_events is None + + def test_aggregates_before_yielding(self, mock_connector): + """Test that events are aggregated before yielding.""" + # Set up events from multiple workers + kv_events = LMCacheKVEvents(num_workers=3) + common_event = BlockStored( + block_hashes=["hash_common"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + uncommon_event = BlockStored( + block_hashes=["hash_uncommon"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + + # All 3 workers report common_event + kv_events.add_events([common_event]) + kv_events.add_events([common_event]) + kv_events.add_events([common_event]) + + # Only 1 
worker reports uncommon_event + kv_events.add_events([uncommon_event]) + + mock_connector._kv_cache_events = kv_events + + # Take events + events = list(mock_connector.take_events()) + + # Only the common event should be yielded + assert len(events) == 1 + assert events[0] == common_event + + def test_multiple_take_events_calls(self, mock_connector): + """Test calling take_events multiple times.""" + # First call with events + kv_events1 = LMCacheKVEvents(num_workers=1) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + kv_events1.add_events([event1]) + mock_connector._kv_cache_events = kv_events1 + + events1 = list(mock_connector.take_events()) + assert len(events1) == 1 + assert events1[0] == event1 + assert mock_connector._kv_cache_events is None + + # Second call with no events + events2 = list(mock_connector.take_events()) + assert events2 == [] + + # Third call after adding new events + kv_events2 = LMCacheKVEvents(num_workers=1) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + kv_events2.add_events([event2]) + mock_connector._kv_cache_events = kv_events2 + + events3 = list(mock_connector.take_events()) + assert len(events3) == 1 + assert events3[0] == event2 + + def test_yields_empty_after_aggregation_removes_all(self, mock_connector): + """Test that nothing is yielded if aggregation removes all events.""" + # Set up events from 2 workers with no common events + kv_events = LMCacheKVEvents(num_workers=2) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + + # Worker 1 reports event1 + kv_events.add_events([event1]) + # Worker 2 
reports event2 + kv_events.add_events([event2]) + + mock_connector._kv_cache_events = kv_events + + # Take events + events = list(mock_connector.take_events()) + + # No common events, so nothing should be yielded + assert events == [] + assert mock_connector._kv_cache_events is None + + +class TestIntegrationScenarios: + """Test integration scenarios.""" + + def test_full_workflow(self, mock_connector, mock_lmcache_engine_event): + """Test a complete workflow from getting events to taking them.""" + # Step 1: Get events from lmcache engine + mock_connector._lmcache_engine.get_kv_events.return_value = [ + mock_lmcache_engine_event + ] + kv_events = mock_connector.get_kv_connector_kv_cache_events() + + assert kv_events is not None + assert len(kv_events.get_all_events()) == 1 + + # Step 2: Update connector output (simulate receiving from worker) + output1 = KVConnectorOutput(kv_cache_events=kv_events) + mock_connector.update_connector_output(output1) + + assert mock_connector._kv_cache_events is not None + + # Step 3: Take events + taken_events = list(mock_connector.take_events()) + + assert len(taken_events) == 1 + assert mock_connector._kv_cache_events is None + + def test_multiple_workers_workflow(self, mock_connector): + """Test workflow with multiple workers.""" + + class MockEvent: + def __init__(self, hash_val): + self.block_hashes = [hash_val] + self.parent_block_hash = None + self.token_ids = [1] + self.lora_id = None + self.block_size = 16 + self.medium = "GPU" + + # Worker 1 + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEvent("hash_common"), + MockEvent("hash_worker1"), + ] + kv_events1 = mock_connector.get_kv_connector_kv_cache_events() + output1 = KVConnectorOutput(kv_cache_events=kv_events1) + mock_connector.update_connector_output(output1) + + # Worker 2 + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEvent("hash_common"), + MockEvent("hash_worker2"), + ] + kv_events2 = 
mock_connector.get_kv_connector_kv_cache_events() + output2 = KVConnectorOutput(kv_cache_events=kv_events2) + mock_connector.update_connector_output(output2) + + # Take events (should only get common events) + taken_events = list(mock_connector.take_events()) + + # With aggregation, only events reported by both workers should be present + # In this case, hash_common was reported by both + event_hashes = [e.block_hashes[0] for e in taken_events] + assert "hash_common" in event_hashes + + def test_empty_workflow(self, mock_connector): + """Test workflow when there are no events at any stage.""" + # Get events returns None + mock_connector._lmcache_engine.get_kv_events.return_value = None + kv_events = mock_connector.get_kv_connector_kv_cache_events() + + assert kv_events is None + + # Update with None + output = KVConnectorOutput(kv_cache_events=None) + mock_connector.update_connector_output(output) + + # Take events + taken_events = list(mock_connector.take_events()) + + assert taken_events == [] + assert mock_connector._kv_cache_events is None + + def test_repeated_cycles(self, mock_connector): + """Test multiple cycles of the complete workflow.""" + + class MockEvent: + def __init__(self, cycle_num): + self.block_hashes = [f"hash_cycle_{cycle_num}"] + self.parent_block_hash = None + self.token_ids = [cycle_num] + self.lora_id = None + self.block_size = 16 + self.medium = "GPU" + + for cycle in range(3): + # Get events + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEvent(cycle) + ] + kv_events = mock_connector.get_kv_connector_kv_cache_events() + + # Update + output = KVConnectorOutput(kv_cache_events=kv_events) + mock_connector.update_connector_output(output) + + # Take + taken_events = list(mock_connector.take_events()) + + # Verify + assert len(taken_events) == 1 + assert taken_events[0].block_hashes[0] == f"hash_cycle_{cycle}" + assert mock_connector._kv_cache_events is None + + def test_lmcache_kv_events_aggregation(self): + """ + Test 
LMCacheKVEvents aggregation across TP ranks using + KVOutputAggregator (used by MultiprocExecutor). + """ + from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator + from vllm.v1.outputs import ModelRunnerOutput + + # Create KVOutputAggregator for 3 workers (simulating TP=3) + aggregator = KVOutputAggregator(expected_finished_count=3) + + # Define common and unique events + common_event = BlockStored( + block_hashes=["hash_common"], + parent_block_hash="parent_common", + token_ids=[1, 2, 3], + block_size=16, + lora_id=None, + medium="GPU", + ) + + worker1_unique_event = BlockStored( + block_hashes=["hash_worker1"], + parent_block_hash="parent_w1", + token_ids=[4, 5], + block_size=16, + lora_id=None, + medium="GPU", + ) + + worker2_unique_event = BlockStored( + block_hashes=["hash_worker2"], + parent_block_hash="parent_w2", + token_ids=[6, 7], + block_size=16, + lora_id=None, + medium="GPU", + ) + + worker3_unique_event = BlockStored( + block_hashes=["hash_worker3"], + parent_block_hash="parent_w3", + token_ids=[8, 9], + block_size=16, + lora_id=None, + medium="GPU", + ) + + # Create events for each worker + # Worker 0: reports common event and its unique event + worker0_events = LMCacheKVEvents(num_workers=1) + worker0_events.add_events([common_event, worker1_unique_event]) + + # Worker 1: reports common event and its unique event + worker1_events = LMCacheKVEvents(num_workers=1) + worker1_events.add_events([common_event, worker2_unique_event]) + + # Worker 2: reports common event and its unique event + worker2_events = LMCacheKVEvents(num_workers=1) + worker2_events.add_events([common_event, worker3_unique_event]) + + # Create ModelRunnerOutput instances for each worker + worker_outputs = [] + for i, worker_events in enumerate( + [worker0_events, worker1_events, worker2_events] + ): + output = ModelRunnerOutput( + req_ids=[f"req_{i}"], + req_id_to_index={f"req_{i}": 0}, + sampled_token_ids=[[123]], # dummy token + logprobs=None, + 
prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + finished_sending=set([f"req_{i}_send"]) + if i < 2 + else None, # Workers 0,1 finished sending + finished_recving=set([f"req_{i}_recv"]) + if i > 0 + else None, # Workers 1,2 finished receiving + kv_cache_events=worker_events, + ), + ) + worker_outputs.append(output) + + # Use the real aggregation mechanism (like MultiprocExecutor.execute_model) + aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0) + kv_cache_events = aggregated_output.kv_connector_output.kv_cache_events + + assert isinstance(kv_cache_events, LMCacheKVEvents) + + # After aggregation, events should be combined from all workers + # The aggregator doesn't automatically aggregate events, so we need to call + # aggregate() to get only common events + kv_cache_events.aggregate() + aggregated_events = kv_cache_events.get_all_events() + + # Only the common event should remain after aggregation + # because it's the only event reported by all 3 workers + assert len(aggregated_events) == 1 + assert aggregated_events[0] == common_event + + # Verify the common event properties + assert aggregated_events[0].block_hashes == ["hash_common"] + assert aggregated_events[0].parent_block_hash == "parent_common" + assert aggregated_events[0].token_ids == [1, 2, 3] diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 7b5cb94cf13ea..3b76af75504de 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -5,7 +5,7 @@ import queue import threading import time from abc import ABC, abstractmethod -from collections import deque +from collections import Counter, deque from collections.abc import Callable from dataclasses import asdict from itertools import count @@ -54,11 +54,26 @@ class BlockStored(KVCacheEvent): lora_id: int | None medium: str | None + def __hash__(self) -> int: + return hash( + ( + tuple(self.block_hashes), + self.parent_block_hash, + 
tuple(self.token_ids), + self.block_size, + self.lora_id, + self.medium, + ) + ) + class BlockRemoved(KVCacheEvent): block_hashes: list[ExternalBlockHash] medium: str | None + def __hash__(self) -> int: + return hash((tuple(self.block_hashes), self.medium)) + class AllBlocksCleared(KVCacheEvent): pass @@ -68,6 +83,119 @@ class KVEventBatch(EventBatch): events: list[BlockStored | BlockRemoved | AllBlocksCleared] +class KVEventAggregator: + """ + Aggregates KV events across multiple workers. + Tracks how many times each event appears and returns only those + that were emitted by all workers. + """ + + __slots__ = ("_event_counter", "_num_workers") + + def __init__(self, num_workers: int) -> None: + if num_workers <= 0: + raise ValueError("num_workers must be greater than zero.") + self._event_counter: Counter[KVCacheEvent] = Counter() + self._num_workers: int = num_workers + + def add_events(self, events: list[KVCacheEvent]) -> None: + """ + Add events from a worker batch. + + :param events: List of KVCacheEvent objects. + """ + if not isinstance(events, list): + raise TypeError("events must be a list of KVCacheEvent.") + self._event_counter.update(events) + + def get_common_events(self) -> list[KVCacheEvent]: + """ + Return events that appeared in all workers. + + :return: List of events present in all workers. + """ + return [ + event + for event, count in self._event_counter.items() + if count == self._num_workers + ] + + def get_all_events(self) -> list[KVCacheEvent]: + """ + Return all events for all workers. + + :return: List of events for all workers. + """ + return list(self._event_counter.elements()) + + def clear_events(self) -> None: + """ + Clear all tracked events. + """ + self._event_counter.clear() + + def increment_workers(self, count: int = 1) -> None: + """ + Increment the number of workers contributing events. + + :param count: Number to increment the workers by. 
+ """ + if count <= 0: + raise ValueError("count must be positive.") + self._num_workers += count + + def reset_workers(self) -> None: + """ + Reset the number of workers to 1. + """ + self._num_workers = 1 + + def get_number_of_workers(self) -> int: + """ + Return the number of workers. + + :return: int number of workers. + """ + return self._num_workers + + def __repr__(self) -> str: + return ( + f"" + ) + + +class KVConnectorKVEvents(ABC): + """ + Abstract base class for KV events. + Acts as a container for KV events from the connector. + """ + + @abstractmethod + def add_events(self, events: list[KVCacheEvent]) -> None: + raise NotImplementedError + + @abstractmethod + def aggregate(self) -> "KVConnectorKVEvents": + raise NotImplementedError + + @abstractmethod + def increment_workers(self, count: int = 1) -> None: + raise NotImplementedError + + @abstractmethod + def get_all_events(self) -> list[KVCacheEvent]: + raise NotImplementedError + + @abstractmethod + def get_number_of_workers(self) -> int: + raise NotImplementedError + + @abstractmethod + def clear_events(self) -> None: + raise NotImplementedError + + class EventPublisher(ABC): """Lightweight publisher for EventBatch batches with data parallelism support. diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 99d3be57c1381..117d159e25e71 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -78,6 +78,7 @@ class KVOutputAggregator: finished_sending = set[str]() finished_recving = set[str]() aggregated_kv_connector_stats = None + combined_kv_cache_events = None invalid_block_ids = set[int]() for model_runner_output in outputs: assert model_runner_output is not None @@ -119,6 +120,19 @@ class KVOutputAggregator: aggregated_kv_connector_stats.aggregate(kv_connector_stats) ) + # Combine kv_cache_events from all workers. 
+ if combined_kv_cache_events is None: + # Use the first worker's kv_cache events as start event list. + combined_kv_cache_events = kv_output.kv_cache_events + elif kv_cache_events := kv_output.kv_cache_events: + assert isinstance( + combined_kv_cache_events, + type(kv_cache_events), + ) + worker_kv_cache_events = kv_cache_events.get_all_events() + combined_kv_cache_events.add_events(worker_kv_cache_events) + combined_kv_cache_events.increment_workers(1) + invalid_block_ids |= kv_output.invalid_block_ids # select output of the worker specified by output_rank @@ -129,6 +143,7 @@ class KVOutputAggregator: finished_sending=finished_sending or None, finished_recving=finished_recving or None, kv_connector_stats=aggregated_kv_connector_stats or None, + kv_cache_events=combined_kv_cache_events or None, invalid_block_ids=invalid_block_ids, expected_finished_count=self._expected_finished_count, ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 91f6443f92cbe..c05e5485a835e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -49,7 +49,7 @@ from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.config import VllmConfig - from vllm.distributed.kv_events import KVCacheEvent + from vllm.distributed.kv_events import KVCacheEvent, KVConnectorKVEvents from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorPromMetrics, KVConnectorStats, @@ -379,6 +379,14 @@ class KVConnectorBase_V1(ABC): """ return None + def get_kv_connector_kv_cache_events(self) -> Optional["KVConnectorKVEvents"]: + """ + Get the KV connector kv cache events collected during the last interval. + This function should be called by the model runner every time after the + model execution and before cleanup. 
+ """ + return None + def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: """ Get the KVConnector handshake metadata for this connector. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 30da424ddcca0..17d468fe6c305 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -1,14 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable from typing import TYPE_CHECKING, Any import torch -from lmcache.integration.vllm.vllm_v1_adapter import ( - LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl, -) from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig +from vllm.distributed.kv_events import ( + BlockStored, + KVCacheEvent, + KVConnectorKVEvents, + KVEventAggregator, +) from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, @@ -16,6 +20,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( ) from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.forward_context import ForwardContext @@ -26,6 +31,44 @@ if TYPE_CHECKING: logger = init_logger(__name__) +class LMCacheKVEvents(KVConnectorKVEvents): + """ + Concrete implementation of KVConnectorKVEvents using KVEventAggregator. + """ + + def __init__(self, num_workers: int) -> None: + self._aggregator = KVEventAggregator(num_workers) + + def add_events(self, events: list[KVCacheEvent]) -> None: + self._aggregator.add_events(events) + + def aggregate(self) -> "LMCacheKVEvents": + """ + Aggregate KV events and retain only common events. 
+ """ + common_events = self._aggregator.get_common_events() + self._aggregator.clear_events() + self._aggregator.add_events(common_events) + self._aggregator.reset_workers() + return self + + def increment_workers(self, count: int = 1) -> None: + self._aggregator.increment_workers(count) + + def get_all_events(self) -> list[KVCacheEvent]: + return self._aggregator.get_all_events() + + def get_number_of_workers(self) -> int: + return self._aggregator.get_number_of_workers() + + def clear_events(self) -> None: + self._aggregator.clear_events() + self._aggregator.reset_workers() + + def __repr__(self) -> str: + return f"" + + class LMCacheConnectorV1(KVConnectorBase_V1): def __init__( self, @@ -50,10 +93,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1): cls = _adapter.LMCacheConnectorV1Impl else: logger.info("Initializing latest dev LMCache connector") + # lazy import + from lmcache.integration.vllm.vllm_v1_adapter import ( + LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl, + ) + cls = LMCacheConnectorLatestImpl self._lmcache_engine = cls(vllm_config, role, self) + self._kv_cache_events: LMCacheKVEvents | None = None + # ============================== # Worker-side methods # ============================== @@ -151,6 +201,31 @@ class LMCacheConnectorV1(KVConnectorBase_V1): # Fallback for older versions that don't support this method return set() + def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None: + """ + Get the KV connector kv cache events collected during the last interval. 
+ """ + + events = self._lmcache_engine.get_kv_events() # type: ignore [attr-defined] + if not events: + return None + + blocks: list[BlockStored] = [ + BlockStored( + block_hashes=e.block_hashes, + parent_block_hash=e.parent_block_hash, + token_ids=e.token_ids, + lora_id=e.lora_id, + block_size=e.block_size, + medium=e.medium, + ) + for e in events + ] + + lmcache_kv_events = LMCacheKVEvents(num_workers=1) + lmcache_kv_events.add_events(blocks) + return lmcache_kv_events + # ============================== # Scheduler-side methods # ============================== @@ -198,6 +273,28 @@ class LMCacheConnectorV1(KVConnectorBase_V1): """ return self._lmcache_engine.build_connector_meta(scheduler_output) + def update_connector_output(self, connector_output: KVConnectorOutput): + """ + Update KVConnector state from worker-side connectors output. + + Args: + connector_output (KVConnectorOutput): the worker-side + connectors output. + """ + # Get the KV events + kv_cache_events = connector_output.kv_cache_events + if not kv_cache_events or not isinstance(kv_cache_events, LMCacheKVEvents): + return + + if self._kv_cache_events is None: + self._kv_cache_events = kv_cache_events + else: + self._kv_cache_events.add_events(kv_cache_events.get_all_events()) + self._kv_cache_events.increment_workers( + kv_cache_events.get_number_of_workers() + ) + return + def request_finished( self, request: "Request", @@ -214,3 +311,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1): returned by the engine. """ return self._lmcache_engine.request_finished(request, block_ids) + + def take_events(self) -> Iterable["KVCacheEvent"]: + """ + Take the KV cache events from the connector. + + Yields: + New KV cache events since the last call. 
+ """ + if self._kv_cache_events is not None: + self._kv_cache_events.aggregate() + kv_cache_events = self._kv_cache_events.get_all_events() + yield from kv_cache_events + self._kv_cache_events.clear_events() + self._kv_cache_events = None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index c80dc1a567fdb..6825745374959 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -259,6 +259,12 @@ class MultiConnector(KVConnectorBase_V1): agg_block_ids |= c.get_block_ids_with_load_errors() return agg_block_ids + # TODO: Add a generic implementation of 'get_kv_connector_kv_cache_events' method + # for the MultiConnector. It should be able to get events from multiple + # connectors, handling the case where only a subset of the requested connectors + # implements the 'get_kv_connector_kv_cache_events' + # Follow on PR from https://github.com/vllm-project/vllm/pull/28309#pullrequestreview-3566351082 + # ============================== # Scheduler-side methods # ============================== diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 546eacebf83e5..bea9e5846de13 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: + from vllm.distributed.kv_events import KVConnectorKVEvents from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats else: KVConnectorStats = object + KVConnectorKVEvents = object class LogprobsLists(NamedTuple): @@ -108,6 +110,7 @@ class KVConnectorOutput: finished_sending: set[str] | None = None finished_recving: set[str] | None = None kv_connector_stats: KVConnectorStats | None = None + kv_cache_events: KVConnectorKVEvents | None = None # IDs of externally computed KV blocks that failed 
to load. # Requests referencing these blocks should be rescheduled to recompute them invalid_block_ids: set[int] = field(default_factory=set) @@ -123,6 +126,7 @@ class KVConnectorOutput: not self.finished_sending and not self.finished_recving and not self.kv_connector_stats + and not self.kv_cache_events and not self.invalid_block_ids ) diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index b799f1be73d9c..2bcc87b63bcdf 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import ( has_kv_transfer_group, ) from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig @@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin: ) output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors() - output.kv_connector_stats = ( - KVConnectorModelRunnerMixin.get_kv_connector_stats() - ) - kv_connector.clear_connector_metadata() + output.kv_connector_stats = kv_connector.get_kv_connector_stats() + output.kv_cache_events = kv_connector.get_kv_connector_kv_cache_events() - @staticmethod - def get_kv_connector_stats() -> KVConnectorStats | None: - if has_kv_transfer_group(): - return get_kv_transfer_group().get_kv_connector_stats() - return None + kv_connector.clear_connector_metadata() @staticmethod def use_uniform_kv_cache( From 3a3b06ee706e6ff99b711b20a6c431b43e490dbc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 11 Dec 2025 22:39:51 +0800 Subject: [PATCH 40/67] [Misc] Improve error message for `is_multimodal` (#30483) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces.py | 20 
+++++++++++++++++--- vllm/model_executor/models/phi3v.py | 5 ++--- vllm/model_executor/models/qwen3_vl.py | 3 ++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 1e5d80dd2f313..cb99d57e8b8c7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -53,6 +53,22 @@ The output embeddings must be one of the following formats: """ +def _require_is_multimodal(is_multimodal: Tensor | None) -> Tensor: + """ + A helper function to be used in the context of + [vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids][] + to provide a better error message. + """ + if is_multimodal is None: + raise ValueError( + "`embed_input_ids` now requires `is_multimodal` arg, " + "please update your model runner according to " + "https://github.com/vllm-project/vllm/pull/16229." + ) + + return is_multimodal + + @runtime_checkable class SupportsMultiModal(Protocol): """The interface required for all multi-modal models.""" @@ -190,12 +206,10 @@ class SupportsMultiModal(Protocol): if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - assert is_multimodal is not None - return _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, + is_multimodal=_require_is_multimodal(is_multimodal), ) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 0d39e29dcc97b..900b0eade308c 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -64,6 +64,7 @@ from .interfaces import ( SupportsMultiModal, SupportsPP, SupportsQuant, + _require_is_multimodal, ) from .utils import ( AutoWeightsLoader, @@ -687,12 +688,10 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant) if multimodal_embeddings is None or 
len(multimodal_embeddings) == 0: return inputs_embeds - assert is_multimodal is not None - return _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, + is_multimodal=_require_is_multimodal(is_multimodal), ) def forward( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index eac3774196a0a..f8e0ea6284994 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -93,6 +93,7 @@ from .interfaces import ( SupportsMRoPE, SupportsMultiModal, SupportsPP, + _require_is_multimodal, ) from .qwen2_5_vl import ( Qwen2_5_VisionAttention, @@ -1572,7 +1573,7 @@ class Qwen3VLForConditionalGeneration( if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - assert is_multimodal is not None + is_multimodal = _require_is_multimodal(is_multimodal) if self.use_deepstack: ( From 97a042f3bca53417de6405a248e3d11fca568e2c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Dec 2025 15:44:56 +0000 Subject: [PATCH 41/67] Make the `httpx` logger less annoying when Transformers v5 is installed (#30480) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/logger.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/logger.py b/vllm/logger.py index 3b7bb1f22ec96..5506e09b8a65b 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -229,6 +229,11 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]: # guaranteed by the Python GIL. _configure_vllm_root_logger() +# Transformers uses httpx to access the Hugging Face Hub. httpx is quite verbose, +# so we set its logging level to WARNING when vLLM's logging level is INFO. 
+if envs.VLLM_LOGGING_LEVEL == "INFO": + logging.getLogger("httpx").setLevel(logging.WARNING) + logger = init_logger(__name__) From 17cb540248359afe3c93eb54dad03ce9e8d7f140 Mon Sep 17 00:00:00 2001 From: ioana ghiban Date: Thu, 11 Dec 2025 16:57:10 +0100 Subject: [PATCH 42/67] [Docs][CPU Backend] Add nightly and per revision pre-built Arm CPU wheels (#30402) Signed-off-by: Ioana Ghiban Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../installation/cpu.arm.inc.md | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md index 156f31f633d57..8ec18bcb826ec 100644 --- a/docs/getting_started/installation/cpu.arm.inc.md +++ b/docs/getting_started/installation/cpu.arm.inc.md @@ -29,8 +29,27 @@ uv pip install --pre vllm==+cpu --extra-index-url https://wheels.vllm.a The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. -!!! note - Nightly wheels are currently unsupported for this architecture. (e.g. to bisect the behavior change, performance regression). +**Install the latest code** + +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. 
To allow users to try the latest code without waiting for the next release, vLLM provides working pre-built Arm CPU wheels for every commit since `v0.11.2` on . For native CPU wheels, this index should be used: + +* `https://wheels.vllm.ai/nightly/cpu/vllm` + +To install from nightly index, copy the link address of the `*.whl` under this index to run, for example: + +```bash +uv pip install -U https://wheels.vllm.ai/c756fb678184b867ed94e5613a529198f1aee423/vllm-0.13.0rc2.dev11%2Bgc756fb678.cpu-cp38-abi3-manylinux_2_31_aarch64.whl # current nightly build (the filename will change!) +``` + +**Install specific revisions** + +If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), specify the full commit hash in the index: +https://wheels.vllm.ai/${VLLM_COMMIT}/cpu/vllm . +Then, copy the link address of the `*.whl` under this index to run: + +```bash +uv pip install -U +``` # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] From 93db3256a4c56cbf8647b6c0caca78abdf926130 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Dec 2025 16:22:58 +0000 Subject: [PATCH 43/67] Give pooling examples better names (#30488) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 2 +- docs/serving/openai_compatible_server.md | 2 +- .../pooling/score/{qwen3_reranker.py => offline_reranker.py} | 0 .../score/{jinaai_rerank_client.py => openai_reranker.py} | 0 vllm/model_executor/models/config.py | 2 +- 5 files changed, 3 insertions(+), 3 deletions(-) rename examples/pooling/score/{qwen3_reranker.py => offline_reranker.py} (100%) rename examples/pooling/score/{jinaai_rerank_client.py => openai_reranker.py} (100%) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index d0166060c267a..586d5d91634dc 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md 
@@ -568,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A ``` !!! note - Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py). + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/offline_reranker.py](../../examples/pooling/score/offline_reranker.py). ```bash vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 01453483a8d60..0e29204f8947c 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with popular open-source tools. 
-Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py) +Code example: [examples/pooling/score/openai_reranker.py](../../examples/pooling/score/openai_reranker.py) #### Example Request diff --git a/examples/pooling/score/qwen3_reranker.py b/examples/pooling/score/offline_reranker.py similarity index 100% rename from examples/pooling/score/qwen3_reranker.py rename to examples/pooling/score/offline_reranker.py diff --git a/examples/pooling/score/jinaai_rerank_client.py b/examples/pooling/score/openai_reranker.py similarity index 100% rename from examples/pooling/score/jinaai_rerank_client.py rename to examples/pooling/score/openai_reranker.py diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 55dd6e50ad249..8de793941b8c3 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -214,7 +214,7 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): tokens = getattr(config, "classifier_from_token", None) assert tokens is not None and len(tokens) == 2, ( "Try loading the original Qwen3 Reranker?, see: " - "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py" + "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py" ) vllm_config.model_config.hf_config.method = "from_2_way_softmax" From 305b168a9fc50f322e9c5a07f4fc8c7bbda5f844 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Fri, 12 Dec 2025 00:42:30 +0800 Subject: [PATCH 44/67] [CI] refine more logic when generating and using nightly wheels & indices, add cuda130 build for aarch64, specify correct manylinux version (#30341) Signed-off-by: Shengqi Chen --- .buildkite/release-pipeline.yaml | 21 ++++++++-- .buildkite/scripts/generate-nightly-index.py | 11 ++++++ .buildkite/scripts/upload-wheels.sh | 12 ++++-- tests/standalone_tests/python_only_compile.sh | 39 +++++++++++++++++-- 4 files 
changed, 73 insertions(+), 10 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index fbfc923998f89..151bb6abb0905 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -15,6 +15,21 @@ steps: env: DOCKER_BUILDKIT: "1" + - label: "Build arm64 wheel - CUDA 13.0" + depends_on: ~ + id: build-wheel-arm64-cuda-13-0 + agents: + queue: arm64_cpu_queue_postmerge + commands: + # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: + # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" + env: + DOCKER_BUILDKIT: "1" + # aarch64 build - label: "Build arm64 CPU wheel" depends_on: ~ @@ -25,7 +40,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" env: DOCKER_BUILDKIT: "1" @@ -39,7 +54,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31" env: DOCKER_BUILDKIT: "1" @@ -52,7 +67,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index f10cb2f0b6e21..d0965fbd56405 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -372,6 +372,17 @@ if __name__ == "__main__": print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") + # keep only "official" files for a non-nightly version (specified by cli args) + PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") + if PY_VERSION_RE.match(version): + # upload-wheels.sh ensures no "dev" is in args.version + wheel_files = list( + filter(lambda x: version in x and "dev" not in x, wheel_files) + ) + print(f"Non-nightly version detected, wheel files used: {wheel_files}") + else: + print("Nightly version detected, keeping all wheel files.") + # Generate index and metadata, assuming wheels and indices are stored as: # s3://vllm-wheels/{version}/ # s3://vllm-wheels// diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 8e38ace0bfbc2..3a218a4bb2e6d 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -34,9 +34,10 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then fi wheel="${wheel_files[0]}" -# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31 +# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31 +# we also accept params as manylinux tag # refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels -manylinux_version="manylinux_2_31" +manylinux_version="${1:-manylinux_2_31}" # Rename 'linux' to the
appropriate manylinux version in the wheel filename if [[ "$wheel" != *"linux"* ]]; then @@ -96,8 +97,11 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/" fi -# copy to // only if it does not have "dev" in the version +# re-generate and copy to // only if it does not have "dev" in the version if [[ "$version" != *"dev"* ]]; then - echo "Uploading indices to overwrite /$pure_version/" + echo "Re-generating indices for /$pure_version/" + rm -rf "${INDICES_OUTPUT_DIR:?}"/* + mkdir -p "$INDICES_OUTPUT_DIR" + $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" fi diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index d29b9afcc6fbf..2017e34030d60 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -3,12 +3,45 @@ # for users who do not have any compilers installed on their system set -e -set -x merge_base_commit=$(git merge-base HEAD origin/main) -echo "Current merge base commit with main: $merge_base_commit" +echo "INFO: current merge base commit with main: $merge_base_commit" git show --oneline -s $merge_base_commit +# test whether the metadata.json url is valid, retry every 3 minutes up to 5 times +# this avoids cumbersome error messages & manual retries in case the precompiled wheel +# for the given commit is still being built in the release pipeline +meta_json_url="https://wheels.vllm.ai/$merge_base_commit/vllm/metadata.json" +echo "INFO: will use metadata.json from $meta_json_url" + +for i in {1..5}; do + echo "Checking metadata.json URL (attempt $i)..."
+ if curl --fail "$meta_json_url" > metadata.json; then + echo "INFO: metadata.json URL is valid." + # check whether it is valid json by python + if python3 -m json.tool metadata.json; then + echo "INFO: metadata.json is valid JSON. Proceeding with the test." + else + echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!" + exit 1 + fi + break + fi + # failure handling + if [ $i -eq 5 ]; then + echo "ERROR: metadata.json URL is still not valid after 5 attempts." + echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists." + echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes." + echo " NOTE: If it fails, please report in #sig-ci channel." + exit 1 + else + echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..." + sleep 180 + fi +done + +set -x + cd /vllm-workspace/ # uninstall vllm @@ -29,6 +62,6 @@ python3 -c 'import vllm' # Check if the clangd log file was created if [ ! 
-f /tmp/changed.file ]; then - echo "changed.file was not created, python only compilation failed" + echo "ERROR: changed.file was not created, python only compilation failed" exit 1 fi From aa3c250c487e843b229a58d9978b02707b71109c Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:53:26 +0100 Subject: [PATCH 45/67] [IMPROVEMENT] Change MistralReasoningParser behavior (#30391) Signed-off-by: juliendenize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Co-authored-by: Patrick von Platen --- .../test_mistral_reasoning_parser.py | 157 ++++++++++-------- vllm/reasoning/mistral_reasoning_parser.py | 105 +++++++++++- 2 files changed, 192 insertions(+), 70 deletions(-) diff --git a/tests/reasoning/test_mistral_reasoning_parser.py b/tests/reasoning/test_mistral_reasoning_parser.py index 0fe315c2567f9..01592fd0782a9 100644 --- a/tests/reasoning/test_mistral_reasoning_parser.py +++ b/tests/reasoning/test_mistral_reasoning_parser.py @@ -18,47 +18,53 @@ def mistral_tokenizer(): return mistral_tokenizer -SIMPLE_REASONING = { +INVALID_SIMPLE_REASONING = { "output": "This is a reasoning section[/THINK]This is the rest", - "reasoning": "This is a reasoning section", - "content": "This is the rest", - "is_reasoning_end": True, + "reasoning": None, + "content": "This is a reasoning sectionThis is the rest", + "is_reasoning_end": False, } -COMPLETE_REASONING = { +INVALID_COMPLETE_REASONING = { "output": "This is a reasoning section[/THINK]", - "reasoning": "This is a reasoning section", - "content": None, - "is_reasoning_end": True, + "reasoning": None, + "content": "This is a reasoning section", + "is_reasoning_end": False, } NO_CONTENT = { - "output": "This is content", - "reasoning": "This is content", + "output": "[THINK]This is reasoning", + "reasoning": "This is reasoning", "content": None, "is_reasoning_end": False, } +NO_REASONING = { + "output": "This is content", + 
"reasoning": None, + "content": "This is content", + "is_reasoning_end": False, +} NO_REASONING_STREAMING = { "output": "This is a reasoning section", - "reasoning": "This is a reasoning section", - "content": None, + "reasoning": None, + "content": "This is a reasoning section", "is_reasoning_end": False, } -MULTIPLE_LINES = { +INVALID_MULTIPLE_LINES = { "output": "This\nThat[/THINK]This is the rest\nThat", - "reasoning": "This\nThat", - "content": "This is the rest\nThat", - "is_reasoning_end": True, + "reasoning": None, + "content": "This\nThatThis is the rest\nThat", + "is_reasoning_end": False, } -SHORTEST_REASONING_NO_STREAMING = { - "output": "[/THINK]This is the rest", - "reasoning": "", - "content": "This is the rest", - "is_reasoning_end": True, -} -SHORTEST_REASONING = { +INVALID_SHORTEST_REASONING_NO_STREAMING = { "output": "[/THINK]This is the rest", "reasoning": None, "content": "This is the rest", - "is_reasoning_end": True, + "is_reasoning_end": False, +} +INVALID_SHORTEST_REASONING = { + "output": "[/THINK]This is the rest", + "reasoning": None, + "content": "This is the rest", + "is_reasoning_end": False, } REASONING_WITH_THINK = { "output": "[THINK]This is a reasoning section[/THINK]This is the rest", @@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = { "content": "This is the rest\nThat", "is_reasoning_end": True, } -SHORTEST_REASONING_NO_STREAMING_WITH_THINK = { - "output": "[/THINK]This is the rest", - "reasoning": "", - "content": "This is the rest", - "is_reasoning_end": True, -} -SHORTEST_REASONING_WITH_THINK = { +INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK = { "output": "[/THINK]This is the rest", "reasoning": None, "content": "This is the rest", - "is_reasoning_end": True, + "is_reasoning_end": False, +} +INVALID_SHORTEST_REASONING_WITH_THINK = { + "output": "[/THINK]This is the rest", + "reasoning": None, + "content": "This is the rest", + "is_reasoning_end": False, } THINK_NO_END = { "output": "[THINK]This is a reasoning section", 
@@ -98,8 +104,8 @@ THINK_NO_END = { } EMPTY = { "output": "", - "reasoning": "", - "content": None, + "reasoning": None, + "content": "", "is_reasoning_end": False, } EMPTY_STREAMING = { @@ -109,47 +115,48 @@ EMPTY_STREAMING = { "is_reasoning_end": False, } NEW_LINE = { - "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", + "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", "reasoning": "This is a reasoning section", - "content": "\nThis is the rest", + "content": "Before\n\nThis is the rest", "is_reasoning_end": True, } -# Streaming cannot handle new lines at the beginning of the output -# because we need to support [THINK]...[/THINK] and [/THINK]... -# We cannot know if the text before [THINK] is reasoning content -# or not. NEW_LINE_STREAMING = { - "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", - "reasoning": "\nThis is a reasoning section", - "content": "\nThis is the rest", + "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", + "reasoning": "This is a reasoning section", + "content": "Before\n\nThis is the rest", "is_reasoning_end": True, } TEST_CASES = [ pytest.param( False, - SIMPLE_REASONING, - id="simple_reasoning", + INVALID_SIMPLE_REASONING, + id="invalid_simple_reasoning", ), pytest.param( True, - SIMPLE_REASONING, - id="simple_reasoning_streaming", + INVALID_SIMPLE_REASONING, + id="invalid_simple_reasoning_streaming", ), pytest.param( False, - COMPLETE_REASONING, - id="complete_reasoning", + INVALID_COMPLETE_REASONING, + id="invalid_complete_reasoning", ), pytest.param( True, - COMPLETE_REASONING, - id="complete_reasoning_streaming", + INVALID_COMPLETE_REASONING, + id="invalid_complete_reasoning_streaming", ), pytest.param( False, NO_CONTENT, - id="no_content_token", + id="no_content", + ), + pytest.param( + False, + NO_REASONING, + id="no_reasoning", ), pytest.param( True, @@ -158,23 +165,23 @@ TEST_CASES = [ ), pytest.param( False, - 
MULTIPLE_LINES, - id="multiple_lines", + INVALID_MULTIPLE_LINES, + id="invalid_multiple_lines", ), pytest.param( True, - MULTIPLE_LINES, - id="multiple_lines_streaming", + INVALID_MULTIPLE_LINES, + id="invalid_multiple_lines_streaming", ), pytest.param( True, - SHORTEST_REASONING, - id="shortest", + INVALID_SHORTEST_REASONING, + id="invalid_shortest", ), pytest.param( False, - SHORTEST_REASONING_NO_STREAMING, - id="shortest_streaming", + INVALID_SHORTEST_REASONING_NO_STREAMING, + id="invalid_shortest_streaming", ), pytest.param( False, @@ -208,13 +215,13 @@ TEST_CASES = [ ), pytest.param( False, - SHORTEST_REASONING_NO_STREAMING_WITH_THINK, - id="shortest_with_think", + INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK, + id="invalid_shortest_with_think", ), pytest.param( True, - SHORTEST_REASONING_WITH_THINK, - id="shortest_with_think_streaming", + INVALID_SHORTEST_REASONING_WITH_THINK, + id="invalid_shortest_with_think_streaming", ), pytest.param( False, @@ -316,10 +323,26 @@ def test_mistral_reasoning( # Test extract_content if param_dict["content"] is not None: - content = parser.extract_content_ids(output_tokens) - assert content == mistral_tokenizer.tokenizer.encode( - param_dict["content"], bos=False, eos=False + # Handle the case where there are tokens outputted before Thinking. + # This should not occur if the model is well trained and prompted. + if "[THINK]" in param_dict["output"] and not param_dict["output"].startswith( + "[THINK]" + ): + before_content = param_dict["output"].split("[THINK]")[0] + before_token_ids = mistral_tokenizer.tokenizer.encode( + before_content, bos=False, eos=False + ) + left_to_encode = param_dict["content"][len(before_content) :] + # Normal situation. 
+ else: + before_token_ids = [] + left_to_encode = param_dict["content"] + + content_tokens = parser.extract_content_ids(output_tokens) + expected_token_ids = before_token_ids + mistral_tokenizer.tokenizer.encode( + left_to_encode, bos=False, eos=False ) + assert content_tokens == expected_token_ids else: content = parser.extract_content_ids(output_tokens) assert content == [] diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index b61e50c188f8c..3206dbb29fe2e 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -3,20 +3,29 @@ from functools import cached_property +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ResponsesRequest, +) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser -from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser +from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser from vllm.tokenizers import MistralTokenizer logger = init_logger(__name__) -class MistralReasoningParser(DeepSeekR1ReasoningParser): +class MistralReasoningParser(BaseThinkingReasoningParser): """ Reasoning parser for Mistral models. - The Mistral models uses [THINK]...[/THINK] tokens to denote reasoning + The Mistral models uses `[THINK]`...`[/THINK]` tokens to denote reasoning text. This parser extracts the reasoning content from the model output. + + A valid reasoning trace should always start with a `[THINK]` token and end with + a `[/THINK]` token. + + If `[THINK]` token is not generated, then this parser only returns content. 
""" def __init__(self, tokenizer: MistralTokenizer, *args, **kwargs): @@ -53,3 +62,93 @@ class MistralReasoningParser(DeepSeekR1ReasoningParser): from mistral_common.tokens.tokenizers.base import SpecialTokens return SpecialTokens.end_think + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + has_eot_token = False + + for id in input_ids[::-1]: + if id == self.start_token_id: + # Reasoning ends only if a BOT token is found before a EOT token. + return has_eot_token + elif id == self.end_token_id: + has_eot_token = True + return False + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + """ + Extract the content + """ + has_bot_token = False + has_eot_token = False + bot_token_index = -1 + eot_token_index = -1 + # One for loop instead of multiple lookups + for i, token_id in enumerate(input_ids): + # We filter that we have multiple BOT tokens which should not + # happen for a well prompted trained model + if token_id == self.start_token_id and not has_bot_token: + has_bot_token = True + bot_token_index = i + elif token_id == self.end_token_id: + has_eot_token = True + eot_token_index = i + break + + # 1. Only BOT has been outputted + if has_bot_token and not has_eot_token: + # Should be = [] if model is well prompted and trained. + return input_ids[:bot_token_index] + # 2. Neither BOT or EOT have been outputted + elif not has_bot_token and not has_eot_token: + return input_ids + # 3. Both BOT and EOT have been outputted. + elif has_bot_token and has_eot_token: + return input_ids[:bot_token_index] + input_ids[eot_token_index + 1 :] + # 4. Only EOT has been outputted => this should not have occured for a model + # well prompted and trained. + else: + return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :] + + def extract_reasoning( + self, model_output: str, request: ChatCompletionRequest | ResponsesRequest + ) -> tuple[str | None, str | None]: + """ + Extract reasoning content from the model output. 
+ """ + if not model_output: + return (None, "") + + # Check if the start token is present in the model output, remove it + # if it is present. + prev_bot_token, bot_token, post_bot_token = model_output.partition( + self.start_token + ) + + has_bot_token = bool(bot_token) + # Valid EOT tokens should follow BOT token + has_valid_eot_token = has_bot_token and self.end_token in post_bot_token + + # 1. If there is BOT token followed by EOT token + if has_bot_token and has_valid_eot_token: + prev_eot_token, _, post_eot_token = post_bot_token.partition(self.end_token) + # If model is well prompted and trained prev_bot_token should be "" + content = prev_bot_token + post_eot_token + return prev_eot_token, content if content else None + # 2. Only BOT token + elif has_bot_token: + # If model is well prompted and trained prev_bot_token should be "" + return post_bot_token, prev_bot_token if prev_bot_token else None + # 3. EOT token has been outputted without BOT or neither has been outputted + else: + has_non_valid_eot_token = self.end_token in prev_bot_token + # 3.a EOT token has been outputted without BOT + # If model is well prompted and trained `has_non_valid_eot_token` should + # be `False` and the parser outputs all tokens as 'content' + if has_non_valid_eot_token: + prev_eot_token, _, post_eot_token = prev_bot_token.partition( + self.end_token + ) + return None, prev_eot_token + post_eot_token + # 3.b neither BOT or EOT have been outputted + else: + return None, prev_bot_token From 8781cd6b88ad264a01886a05e698b5e036fb4eb9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:02:10 +0000 Subject: [PATCH 46/67] Add Eagle and Eagle3 support to Transformers modeling backend (#30340) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/e2e/test_spec_decode.py | 36 +++++++++- .../models/transformers/base.py | 66 +++++++++++++++++-- 2 files changed, 94 insertions(+), 8 deletions(-) 
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 8c904a8cddac4..c8587659d6580 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -280,9 +280,20 @@ def test_speculators_model_integration( @pytest.mark.parametrize( - ["model_setup", "mm_enabled", "enable_chunked_prefill"], + ["model_setup", "mm_enabled", "enable_chunked_prefill", "model_impl"], [ - (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False), + ( + ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), + False, + False, + "auto", + ), + ( + ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), + False, + False, + "transformers", + ), pytest.param( ( "eagle3", @@ -292,6 +303,7 @@ def test_speculators_model_integration( ), False, False, + "auto", marks=pytest.mark.skip( reason="architecture of its eagle3 is LlamaForCausalLMEagle3" ), @@ -305,6 +317,7 @@ def test_speculators_model_integration( ), False, False, + "auto", marks=pytest.mark.skip( reason="Skipping due to its head_dim not being a a multiple of 32" ), @@ -318,6 +331,7 @@ def test_speculators_model_integration( ), False, True, + "auto", marks=large_gpu_mark(min_gb=40), ), # works on 4x H100 ( @@ -329,6 +343,7 @@ def test_speculators_model_integration( ), False, False, + "auto", ), pytest.param( ( @@ -339,6 +354,7 @@ def test_speculators_model_integration( ), False, False, + "auto", marks=large_gpu_mark(min_gb=80), ), # works on 4x H100 pytest.param( @@ -350,6 +366,7 @@ def test_speculators_model_integration( ), True, True, + "auto", marks=large_gpu_mark(min_gb=80), ), # works on 4x H100 ( @@ -361,10 +378,12 @@ def test_speculators_model_integration( ), False, False, + "auto", ), ], ids=[ "qwen3_eagle3", + "qwen3_eagle3-transformers", "qwen3_vl_eagle3", "qwen2_5_vl_eagle3", "llama3_eagle", @@ -381,6 +400,7 @@ def test_eagle_correctness( model_setup: tuple[str, str, str, int], mm_enabled: bool, enable_chunked_prefill: bool, + model_impl: str, 
attn_backend: str, ): if attn_backend == "TREE_ATTN": @@ -389,6 +409,17 @@ def test_eagle_correctness( "TREE_ATTN is flaky in the test disable for now until it can be " "resolved (see https://github.com/vllm-project/vllm/issues/22922)" ) + if model_impl == "transformers": + import transformers + from packaging.version import Version + + installed = Version(transformers.__version__) + required = Version("5.0.0.dev") + if installed < required: + pytest.skip( + "Eagle3 with the Transformers modeling backend requires " + f"transformers>={required}, but got {installed}" + ) # Generate test prompts inside the function instead of using fixture test_prompts = get_test_prompts(mm_enabled) @@ -448,6 +479,7 @@ def test_eagle_correctness( max_model_len=max_model_len, max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, + model_impl=model_impl, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index f3ebc6da8e302..45e746ac2d356 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -36,6 +36,8 @@ from vllm.distributed.utils import get_pp_indices from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.models.interfaces import ( + SupportsEagle, + SupportsEagle3, SupportsLoRA, SupportsPP, SupportsQuant, @@ -92,7 +94,15 @@ def vllm_flash_attention_forward( ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward -class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): +class Base( + nn.Module, + VllmModel, + SupportsQuant, + SupportsLoRA, + SupportsPP, + SupportsEagle, + SupportsEagle3, +): embedding_modules = ["embed_tokens"] # TODO transformers will have a util to get it hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ 
-131,17 +141,24 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): self.pp_group = get_pp_group() self.tp_group = get_tp_group() - # Weights to skip in `self.load_weights` + # Attrs for weight loading (see self.load_weights) self.skip_prefixes: list[str] = [] """Skip loading weights whose qualname starts with these prefixes.""" self.skip_substrs: list[str] = [] """Skip loading weights whose qualname contains these substrings.""" self.ignore_unexpected_prefixes: list[str] = [] - """Ignore unexpected weights whose qualname starts with these prefixes. - """ + """Ignore unexpected weights whose qualname starts with these prefixes.""" self.ignore_unexpected_suffixes: list[str] = [] """Ignore unexpected weights whose qualname ends with these suffixes.""" + # Attrs for Eagle3 (see self.set_aux_hidden_state_layers) + self._target_class: type[nn.Module] = nn.Module + """Target class for Eagle3 aux hidden state recording.""" + self._layer_names: dict[int, str] = {} + """Mapping from layer index to layer name for Eagle3.""" + self._output_aux_hidden_states_kwargs: dict[str, bool] = {} + """Kwargs to pass to model forward for Eagle3 aux hidden states.""" + if self.quant_config: quant_method_name = self.quant_config.get_name() # Check for unsupported quantization methods. 
@@ -278,6 +295,15 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): for child_name, child_module in module.named_children(): new_module = child_module qual_name = maybe_prefix(prefix, child_name) + # Populate Eagle3 attrs + if ( + isinstance(module, nn.ModuleList) + and len(module) == self.text_config.num_hidden_layers + ): + self._target_class = type(child_module) + layer_name = qual_name.removeprefix("model.") + self._layer_names[int(child_name)] = layer_name + # Replace modules as needed if isinstance(child_module, nn.Linear): generator = (p for p in tp_plan if re.match(p, qual_name)) pattern = next(generator, None) @@ -425,19 +451,26 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): else: position_ids = positions[None, ...] - hidden_states = self.model( + outputs = self.model( input_ids=input_ids, inputs_embeds=inputs_embeds, use_cache=False, position_ids=position_ids, attention_instances=self.attention_instances, return_dict=False, + **self._output_aux_hidden_states_kwargs, **kwargs, - )[0][0, ...] # we remove batch dimension for now + ) + # We must remove the batch dimension from these outputs + hidden_states = outputs[0][0, ...] + if self._output_aux_hidden_states_kwargs: + aux_hidden_states = [x[0][0, ...] 
for x in outputs[1:]] if not self.pp_group.is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) + if self._output_aux_hidden_states_kwargs and len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states return hidden_states def load_weights( @@ -462,3 +495,24 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): f"Transformers modeling backend requires transformers>={required} " f"for {feature}, but got {installed}" ) + + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.check_version("5.0.0.dev0", "Eagle3 support") + from transformers.utils.generic import OutputRecorder + + # The default value in PreTrainedModel is None + if self.model._can_record_outputs is None: + self.model._can_record_outputs = {} + + target_class = self._target_class + for layer in layers: + # layer - 1 because we want the input to the layer + layer_name = self._layer_names[layer - 1] + layer_key = f"aux_hidden_state_{layer}" + aux_hidden_state_i = OutputRecorder(target_class, layer_name=layer_name) + self.model._can_record_outputs[layer_key] = aux_hidden_state_i + self._output_aux_hidden_states_kwargs[f"output_{layer_key}"] = True + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = self.text_config.num_hidden_layers + return (2, num_layers // 2, num_layers - 3) From 0e71eaa6447d99e76de8e03213ec22bc1d3b07df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Fri, 12 Dec 2025 02:03:32 +0800 Subject: [PATCH 47/67] [Feature] AWQ marlin quantization support for fused moe with lora (#30442) Signed-off-by: princepride --- .../model_executor/layers/fused_moe/config.py | 36 +++++++ .../layers/quantization/awq_marlin.py | 95 ++++++++++++++++++- 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index f35cafa0f77dc..5eb6bc4829adf 100644 --- 
a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -700,6 +700,42 @@ def int4_w4afp8_moe_quant_config( ) +def awq_marlin_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_zp: torch.Tensor | None, + w2_zp: torch.Tensor | None, + weight_bits: int, + group_size: int, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for awq marlin quantization. + """ + from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape + + w_shape = None if group_size == -1 else GroupShape(row=1, col=group_size) + + # Activations are NOT quantized for AWQ (fp16/bf16) + a_shape = w_shape # Same as weight shape for alignment + + # Determine weight dtype + if weight_bits == 4: + weight_dtype = "int4" + elif weight_bits == 8: + weight_dtype = torch.int8 + else: + raise ValueError(f"Unsupported weight_bits: {weight_bits}") + + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(dtype=None, shape=a_shape), + _a2=FusedMoEQuantDesc(dtype=None, shape=a_shape), + _w1=FusedMoEQuantDesc(weight_dtype, w_shape, w1_scale, None, w1_zp, w1_bias), + _w2=FusedMoEQuantDesc(weight_dtype, w_shape, w2_scale, None, w2_zp, w2_bias), + ) + + def biased_moe_quant_config( w1_bias: torch.Tensor | None, w2_bias: torch.Tensor | None, diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 16aa4f1e22698..3ed15ed7dd422 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -470,6 +470,11 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): } ) + intermediate_size_full = extra_weight_attrs.pop( + "intermediate_size_full", intermediate_size_per_partition + ) + self.is_k_full = intermediate_size_per_partition == intermediate_size_full + w13_qweight = Parameter( torch.empty( num_experts, @@ -597,6 
+602,13 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): ) replace_parameter(layer, "w2_qweight", marlin_w2_qweight) + # The modular kernel expects w13_weight and w2_weight, + # but AWQ uses w13_qweight and w2_qweight + # Alias for modular kernel + layer.w13_weight = layer.w13_qweight + # Alias for modular kernel + layer.w2_weight = layer.w2_qweight + # Why does this take the intermediate size for size_k? marlin_w13_scales = marlin_moe_permute_scales( s=layer.w13_scales, @@ -661,7 +673,88 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - return None + from vllm.model_executor.layers.fused_moe.config import ( + awq_marlin_moe_quant_config, + ) + + return awq_marlin_moe_quant_config( + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + weight_bits=self.quant_config.weight_bits, + group_size=self.quant_config.group_size, + w1_zp=getattr(layer, "w13_qzeros", None) + if self.quant_config.zero_point + else None, + w2_zp=getattr(layer, "w2_qzeros", None) + if self.quant_config.zero_point + else None, + w1_bias=getattr(layer, "w13_bias", None), + w2_bias=getattr(layer, "w2_bias", None), + ) + + def select_gemm_impl( + self, + prepare_finalize, + layer: torch.nn.Module, + ): + """ + Select the GEMM implementation for AWQ-Marlin MoE. + Returns MarlinExperts configured for AWQ quantization. + This is ONLY used when LoRA is enabled. + Without LoRA, AWQ uses its own apply() method. + """ + # Only use modular kernels when LoRA is enabled + # Without LoRA, AWQ's own apply() method works fine and is more efficient + if not self.moe.is_lora_enabled: + raise NotImplementedError( + "AWQ-Marlin uses its own apply() method when LoRA is not enabled. " + "Modular kernels are only used for LoRA support." 
+ ) + + from vllm.model_executor.layers.fused_moe import modular_kernel as mk + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + BatchedMarlinExperts, + MarlinExperts, + ) + + # Ensure quant config is initialized + assert self.moe_quant_config is not None, ( + "moe_quant_config must be initialized before select_gemm_impl" + ) + + w13_g_idx = getattr(layer, "w13_g_idx", None) + w2_g_idx = getattr(layer, "w2_g_idx", None) + w13_g_idx_sort_indices = getattr(layer, "w13_g_idx_sort_indices", None) + w2_g_idx_sort_indices = getattr(layer, "w2_g_idx_sort_indices", None) + + # Check if using batched expert format (for Expert Parallelism) + if ( + prepare_finalize.activation_format + == mk.FusedMoEActivationFormat.BatchedExperts + ): + # For batched format, use BatchedMarlinExperts + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens_per_rank is not None + return BatchedMarlinExperts( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, + w13_g_idx=w13_g_idx, + w2_g_idx=w2_g_idx, + w13_g_idx_sort_indices=w13_g_idx_sort_indices, + w2_g_idx_sort_indices=w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) + else: + # Standard Marlin experts for AWQ + return MarlinExperts( + quant_config=self.moe_quant_config, + w13_g_idx=w13_g_idx, + w2_g_idx=w2_g_idx, + w13_g_idx_sort_indices=w13_g_idx_sort_indices, + w2_g_idx_sort_indices=w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) def apply( self, From 72aaac5b66f908008efed5ba6874c3ed60e6c90a Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 11 Dec 2025 13:25:01 -0600 Subject: [PATCH 48/67] [ROCm][Bugfix] Add MLACommonMetadata to allowed attention types for speculative decoding (#30430) Signed-off-by: Andreas Karatzas --- vllm/v1/spec_decode/eagle.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 
4cc78ae9d23ae..65a0a88ec0f5d 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -178,6 +178,12 @@ class EagleProposer: ) rocm_types.append(AiterFlashAttentionMetadata) + + # TRITON_MLA backend support for MLA models (e.g., DeepSeek) + from vllm.v1.attention.backends.mla.common import MLACommonMetadata + + rocm_types.append(MLACommonMetadata) + self.allowed_attn_types = tuple(rocm_types) # Parse the speculative token tree. From e458270a9537c5abc1d848f53f2d56fce92a6122 Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Thu, 11 Dec 2025 12:06:09 -0800 Subject: [PATCH 49/67] [Misc] Add mcp to requirements (#30474) Signed-off-by: Ye (Charlotte) Qi --- requirements/common.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index f18560b98d16c..31c8fb404f63a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -50,4 +50,5 @@ ijson # Required for mistral streaming tool parser setproctitle # Used to set process names for better debugging and monitoring openai-harmony >= 0.0.3 # Required for gpt-oss anthropic == 0.71.0 -model-hosting-container-standards >= 0.1.9, < 1.0.0 \ No newline at end of file +model-hosting-container-standards >= 0.1.9, < 1.0.0 +mcp \ No newline at end of file From 92fea56fd1e148a5650160427d6b5c733ff211b8 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Thu, 11 Dec 2025 15:28:03 -0500 Subject: [PATCH 50/67] [compile] Stop one-off setting enable_aot_compile and use context manager instead. 
(#30503) Signed-off-by: zhxchen17 --- vllm/compilation/wrapper.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index b59a4a9dd1527..02e974b0f9e8c 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -171,22 +171,24 @@ class TorchCompileWithNoGuardsWrapper: compiled_ptr = self.check_invariants_and_forward + aot_context = nullcontext() if envs.VLLM_USE_AOT_COMPILE: if hasattr(torch._dynamo.config, "enable_aot_compile"): - torch._dynamo.config.enable_aot_compile = True + aot_context = torch._dynamo.config.patch(enable_aot_compile=True) else: msg = "torch._dynamo.config.enable_aot_compile is not " msg += "available. AOT compile is disabled and please " msg += "upgrade PyTorch version to use AOT compile." logger.warning(msg) - self._compiled_callable = torch.compile( - compiled_ptr, - fullgraph=True, - dynamic=False, - backend=backend, - options=options, - ) + with aot_context: + self._compiled_callable = torch.compile( + compiled_ptr, + fullgraph=True, + dynamic=False, + backend=backend, + options=options, + ) if envs.VLLM_USE_BYTECODE_HOOK and mode != CompilationMode.STOCK_TORCH_COMPILE: torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) From cf3eacfe58fa9e745c2854782ada884a9f992cf7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Dec 2025 20:45:23 +0000 Subject: [PATCH 51/67] Standardise `get_rope` to use `rope_parameters["partial_rotary_factor"]`, not `rotary_dim` (#30389) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/kernels/benchmark_mrope.py | 1 - benchmarks/kernels/benchmark_rope.py | 4 +- tests/compile/test_functionalization.py | 5 +- tests/kernels/core/test_mrope.py | 2 - tests/kernels/core/test_pos_encoding.py | 12 +- vllm/config/utils.py | 18 +- .../layers/rotary_embedding/__init__.py | 370 +++++++++--------- 
vllm/model_executor/models/afmoe.py | 1 - vllm/model_executor/models/apertus.py | 1 - vllm/model_executor/models/arctic.py | 1 - vllm/model_executor/models/baichuan.py | 1 - vllm/model_executor/models/bailing_moe.py | 4 +- vllm/model_executor/models/bamba.py | 7 +- vllm/model_executor/models/chameleon.py | 1 - vllm/model_executor/models/chatglm.py | 7 +- vllm/model_executor/models/commandr.py | 1 - vllm/model_executor/models/config.py | 12 +- vllm/model_executor/models/dbrx.py | 1 - vllm/model_executor/models/deepseek_v2.py | 4 - vllm/model_executor/models/dots1.py | 1 - vllm/model_executor/models/ernie45_moe.py | 1 - vllm/model_executor/models/exaone.py | 1 - vllm/model_executor/models/exaone4.py | 1 - vllm/model_executor/models/falcon.py | 1 - vllm/model_executor/models/falcon_h1.py | 7 +- vllm/model_executor/models/gemma.py | 1 - vllm/model_executor/models/gemma2.py | 1 - vllm/model_executor/models/gemma3.py | 1 - vllm/model_executor/models/gemma3n.py | 1 - vllm/model_executor/models/glm4.py | 2 - vllm/model_executor/models/glm4_1v.py | 2 +- vllm/model_executor/models/glm4_moe.py | 1 - vllm/model_executor/models/gpt_j.py | 5 +- vllm/model_executor/models/gpt_neox.py | 1 - vllm/model_executor/models/gpt_oss.py | 1 - vllm/model_executor/models/granite.py | 1 - vllm/model_executor/models/granitemoe.py | 1 - .../model_executor/models/granitemoehybrid.py | 1 - vllm/model_executor/models/grok1.py | 1 - vllm/model_executor/models/hunyuan_v1.py | 2 - vllm/model_executor/models/internlm2.py | 1 - vllm/model_executor/models/lfm2.py | 1 - vllm/model_executor/models/lfm2_moe.py | 1 - vllm/model_executor/models/llama.py | 1 - vllm/model_executor/models/llama4.py | 1 - vllm/model_executor/models/minicpm.py | 1 - vllm/model_executor/models/minicpm3.py | 1 - vllm/model_executor/models/minimax_m2.py | 6 +- vllm/model_executor/models/minimax_text_01.py | 7 +- vllm/model_executor/models/mixtral.py | 1 - vllm/model_executor/models/mllama4.py | 2 +- 
vllm/model_executor/models/modernbert.py | 1 - vllm/model_executor/models/molmo.py | 1 - vllm/model_executor/models/nemotron.py | 1 - vllm/model_executor/models/nemotron_nas.py | 1 - vllm/model_executor/models/olmo.py | 1 - vllm/model_executor/models/olmo2.py | 1 - vllm/model_executor/models/olmoe.py | 1 - vllm/model_executor/models/openpangu.py | 2 - vllm/model_executor/models/orion.py | 1 - vllm/model_executor/models/ouro.py | 1 - vllm/model_executor/models/persimmon.py | 1 - vllm/model_executor/models/phi.py | 12 +- vllm/model_executor/models/phimoe.py | 1 - vllm/model_executor/models/plamo2.py | 1 - vllm/model_executor/models/plamo3.py | 1 - vllm/model_executor/models/qwen.py | 1 - vllm/model_executor/models/qwen2.py | 1 - vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 1 - vllm/model_executor/models/qwen2_vl.py | 2 +- vllm/model_executor/models/qwen3.py | 1 - vllm/model_executor/models/qwen3_moe.py | 1 - vllm/model_executor/models/qwen3_next.py | 1 - .../models/qwen3_omni_moe_thinker.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 2 +- vllm/model_executor/models/seed_oss.py | 1 - vllm/model_executor/models/solar.py | 1 - vllm/model_executor/models/stablelm.py | 1 - vllm/model_executor/models/starcoder2.py | 1 - vllm/model_executor/models/step3_text.py | 1 - vllm/model_executor/models/zamba2.py | 1 - vllm/transformers_utils/config.py | 17 +- 83 files changed, 260 insertions(+), 314 deletions(-) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 83bd91917508f..09de5fa822f86 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -99,7 +99,6 @@ def benchmark_mrope( # the parameters to compute the q k v size based on tp_size mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, max_position=max_position, is_neox_style=is_neox_style, rope_parameters=rope_parameters, diff --git a/benchmarks/kernels/benchmark_rope.py 
b/benchmarks/kernels/benchmark_rope.py index 074b7a440b612..7a1bc050bb33f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device): def benchmark(batch_size, seq_len, num_heads, provider): dtype = torch.bfloat16 max_position = 8192 - base = 10000 - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope_parameters = {"partial_rotary_factor": rotary_dim / head_size} + rope = get_rope(head_size, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=device) cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device) diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 7585915892700..ad5ead36e2310 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module): class TestRotaryEmbedding(torch.nn.Module): - def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000): + def __init__(self, head_dim=64, max_position=2048, base=10000): super().__init__() self.head_dim = head_dim - self.rotary_dim = rotary_dim or head_dim self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.rotary_dim, max_position=max_position, rope_parameters={"rope_type": "default", "rope_theta": base}, ) @@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters={"rope_type": "default", "rope_theta": base}, ) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 4e1559a049bf9..ba5d593b2d355 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -116,7 +116,6 @@ def test_mrope( mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, 
max_position=max_position, is_neox_style=is_neox_style, rope_parameters=config.rope_parameters, @@ -185,7 +184,6 @@ def test_mrope_torch_compile_tracing( mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, max_position=max_position, is_neox_style=is_neox_style, rope_parameters=config.rope_parameters, diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index a8ed3825689d3..d18f01314c8f5 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -83,8 +83,12 @@ def test_rotary_embedding( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters) + rope_parameters = { + "rope_type": "default", + "rope_theta": rope_theta, + "partial_rotary_factor": rotary_dim / head_size, + } + rope = get_rope(head_size, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -150,9 +154,9 @@ def test_rope_module_cache(): if rotary_dim is None: rotary_dim = head_size rope_parameters["rope_theta"] = rope_theta + rope_parameters["partial_rotary_factor"] = rotary_dim / head_size rope = get_rope( head_size, - rotary_dim, max_position, is_neox_style, rope_parameters, @@ -177,9 +181,9 @@ def test_rope_module_cache(): if rotary_dim is None: rotary_dim = head_size rope_parameters["rope_theta"] = rope_theta + rope_parameters["partial_rotary_factor"] = rotary_dim / head_size rope = get_rope( head_size, - rotary_dim, max_position, is_neox_style, rope_parameters, diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 93da3fd417ace..470296517deb1 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -73,14 +73,28 @@ def get_field(cls: ConfigType, name: str) -> Field: ) -def 
getattr_iter(object: object, names: Iterable[str], default: Any) -> Any: +def getattr_iter( + object: object, names: Iterable[str], default: Any, warn: bool = False +) -> Any: """ A helper function that retrieves an attribute from an object which may have multiple possible names. This is useful when fetching attributes from arbitrary `transformers.PretrainedConfig` instances. + + In the case where the first name in `names` is the preferred name, and + any other names are deprecated aliases, setting `warn=True` will log a + warning when a deprecated name is used. """ - for name in names: + for i, name in enumerate(names): if hasattr(object, name): + if warn and i > 0: + logger.warning_once( + "%s contains a deprecated attribute name '%s'. " + "Please use the preferred attribute name '%s' instead.", + type(object).__name__, + name, + names[0], + ) return getattr(object, name) return default diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 4dff984f92be6..452b87ea4e7a5 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -25,7 +25,6 @@ _ROPE_DICT: dict[tuple, RotaryEmbedding] = {} def get_rope( head_size: int, - rotary_dim: int, max_position: int, is_neox_style: bool = True, rope_parameters: dict[str, Any] | None = None, @@ -54,12 +53,15 @@ def get_rope( else: dual_chunk_attention_args = None - partial_rotary_factor = 1.0 - if rope_parameters is not None: - partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) + rope_parameters = rope_parameters or {} + base = rope_parameters.get("rope_theta", 10000) + scaling_type = rope_parameters.get("rope_type", "default") + partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) + + if partial_rotary_factor <= 0.0 or partial_rotary_factor > 1.0: + raise ValueError(f"{partial_rotary_factor=} must be between 0.0 and 1.0") + 
rotary_dim = int(head_size * partial_rotary_factor) - if partial_rotary_factor < 1.0: - rotary_dim = int(rotary_dim * partial_rotary_factor) key = ( head_size, rotary_dim, @@ -72,7 +74,6 @@ def get_rope( if key in _ROPE_DICT: return _ROPE_DICT[key] - base = rope_parameters["rope_theta"] if rope_parameters else 10000 if dual_chunk_attention_config is not None: extra_kwargs = { k: v @@ -88,109 +89,76 @@ def get_rope( dtype, **extra_kwargs, ) - elif not rope_parameters: - rotary_emb = RotaryEmbedding( + elif scaling_type == "default": + if "mrope_section" in rope_parameters: + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), + ) + else: + rotary_emb = RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + ) + elif scaling_type == "llama3": + scaling_factor = rope_parameters["factor"] + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] + rotary_emb = Llama3RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + scaling_factor, + low_freq_factor, + high_freq_factor, + original_max_position, + ) + elif scaling_type == "mllama4": + rotary_emb = Llama4VisionRotaryEmbedding( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) - else: - scaling_type = rope_parameters["rope_type"] - - if scaling_type == "llama3": - scaling_factor = rope_parameters["factor"] - low_freq_factor = rope_parameters["low_freq_factor"] - high_freq_factor = rope_parameters["high_freq_factor"] - original_max_position = rope_parameters["original_max_position_embeddings"] - rotary_emb = Llama3RotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - dtype, - 
scaling_factor, - low_freq_factor, - high_freq_factor, - original_max_position, - ) - elif scaling_type == "mllama4": - rotary_emb = Llama4VisionRotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, dtype - ) - elif scaling_type == "default": - if "mrope_section" in rope_parameters: - rotary_emb = MRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - dtype, - mrope_section=rope_parameters["mrope_section"], - mrope_interleaved=rope_parameters.get("mrope_interleaved", False), - ) - else: - rotary_emb = RotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - dtype, - ) - elif scaling_type == "linear": - scaling_factor = rope_parameters["factor"] - rotary_emb = LinearScalingRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_factor, - dtype, - ) - elif scaling_type == "ntk": - scaling_factor = rope_parameters["factor"] - mixed_b = rope_parameters.get("mixed_b") - rotary_emb = NTKScalingRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_factor, - dtype, - mixed_b, - ) - elif scaling_type == "dynamic": - if "alpha" in rope_parameters: - scaling_alpha = rope_parameters["alpha"] - rotary_emb = DynamicNTKAlphaRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_alpha, - dtype, - ) - elif "factor" in rope_parameters: - scaling_factor = rope_parameters["factor"] - rotary_emb = DynamicNTKScalingRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_factor, - dtype, - ) - else: - raise ValueError( - "Dynamic rope scaling must contain either 'alpha' or 'factor' field" - ) - elif scaling_type == "xdrope": + elif scaling_type == "linear": + scaling_factor = rope_parameters["factor"] + rotary_emb = LinearScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + ) + 
elif scaling_type == "ntk": + scaling_factor = rope_parameters["factor"] + mixed_b = rope_parameters.get("mixed_b") + rotary_emb = NTKScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + mixed_b, + ) + elif scaling_type == "dynamic": + if "alpha" in rope_parameters: scaling_alpha = rope_parameters["alpha"] - rotary_emb = XDRotaryEmbedding( + rotary_emb = DynamicNTKAlphaRotaryEmbedding( head_size, rotary_dim, max_position, @@ -198,67 +166,66 @@ def get_rope( is_neox_style, scaling_alpha, dtype, - xdrope_section=rope_parameters["xdrope_section"], ) - elif scaling_type == "yarn": + elif "factor" in rope_parameters: scaling_factor = rope_parameters["factor"] - original_max_position = rope_parameters["original_max_position_embeddings"] - extra_kwargs = { - k: v - for k, v in rope_parameters.items() - if k - in ( - "extrapolation_factor", - "attn_factor", - "beta_fast", - "beta_slow", - "apply_yarn_scaling", - "truncate", - ) - } - if "mrope_section" in rope_parameters: - extra_kwargs.pop("apply_yarn_scaling", None) - rotary_emb = MRotaryEmbedding( - head_size, - rotary_dim, - original_max_position, - base, - is_neox_style, - dtype, - mrope_section=rope_parameters["mrope_section"], - mrope_interleaved=rope_parameters.get("mrope_interleaved", False), - scaling_factor=scaling_factor, - **extra_kwargs, - ) - else: - rotary_emb = YaRNScalingRotaryEmbedding( - head_size, - rotary_dim, - original_max_position, - base, - is_neox_style, - scaling_factor, - dtype, - **extra_kwargs, - ) - elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]: - scaling_factor = rope_parameters["factor"] - original_max_position = rope_parameters["original_max_position_embeddings"] - # assert max_position == original_max_position * scaling_factor - extra_kwargs = { - k: v - for k, v in rope_parameters.items() - if k - in ( - "extrapolation_factor", - "attn_factor", - "beta_fast", - "beta_slow", - "mscale", - 
"mscale_all_dim", - ) - } - rotary_emb = DeepseekScalingRotaryEmbedding( + rotary_emb = DynamicNTKScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + ) + else: + raise ValueError( + "Dynamic rope scaling must contain either 'alpha' or 'factor' field" + ) + elif scaling_type == "xdrope": + scaling_alpha = rope_parameters["alpha"] + rotary_emb = XDRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_alpha, + dtype, + xdrope_section=rope_parameters["xdrope_section"], + ) + elif scaling_type == "yarn": + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_parameters.items() + if k + in ( + "extrapolation_factor", + "attn_factor", + "beta_fast", + "beta_slow", + "apply_yarn_scaling", + "truncate", + ) + } + if "mrope_section" in rope_parameters: + extra_kwargs.pop("apply_yarn_scaling", None) + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + dtype, + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), + scaling_factor=scaling_factor, + **extra_kwargs, + ) + else: + rotary_emb = YaRNScalingRotaryEmbedding( head_size, rotary_dim, original_max_position, @@ -268,28 +235,55 @@ def get_rope( dtype, **extra_kwargs, ) - elif scaling_type == "longrope": - short_factor = rope_parameters["short_factor"] - long_factor = rope_parameters["long_factor"] - original_max_position = rope_parameters["original_max_position_embeddings"] - extra_kwargs = { - k: v - for k, v in rope_parameters.items() - if k in ("short_mscale", "long_mscale") - } - rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( - head_size, - rotary_dim, - max_position, - original_max_position, - base, - is_neox_style, - dtype, - short_factor, - long_factor, - **extra_kwargs, + 
elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]: + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] + # assert max_position == original_max_position * scaling_factor + extra_kwargs = { + k: v + for k, v in rope_parameters.items() + if k + in ( + "extrapolation_factor", + "attn_factor", + "beta_fast", + "beta_slow", + "mscale", + "mscale_all_dim", ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + } + rotary_emb = DeepseekScalingRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + scaling_factor, + dtype, + **extra_kwargs, + ) + elif scaling_type == "longrope": + short_factor = rope_parameters["short_factor"] + long_factor = rope_parameters["long_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_parameters.items() + if k in ("short_mscale", "long_mscale") + } + rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( + head_size, + rotary_dim, + max_position, + original_max_position, + base, + is_neox_style, + dtype, + short_factor, + long_factor, + **extra_kwargs, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") _ROPE_DICT[key] = rotary_emb return rotary_emb diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 85827d54c911a..3ced52c2050d6 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -241,7 +241,6 @@ class AfmoeAttention(nn.Module): if self.is_local_attention: self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config["rope_parameters"], is_neox_style=True, diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 2a8be29d8d306..e3f97a718b0f4 100644 --- a/vllm/model_executor/models/apertus.py +++ 
b/vllm/model_executor/models/apertus.py @@ -226,7 +226,6 @@ class ApertusAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 266d29a8d9b2b..0200984c0ec85 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -314,7 +314,6 @@ class ArcticAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index beb22995a0719..ee4a1dbd6df94 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -189,7 +189,6 @@ class BaiChuanAttention(nn.Module): else: self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 0143e140af265..4bccee7521749 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -127,11 +127,11 @@ class BailingAttention(nn.Module): prefix=f"{prefix}.dense", ) - self.rotary_dim = getattr(config, "rotary_dim", self.head_dim) + rotary_dim = getattr(config, "rotary_dim", self.head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 00d742f84ef79..22631bbc5489b 100644 --- 
a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -178,14 +178,11 @@ class BambaAttentionDecoderLayer(nn.Module): self.scaling = self.head_dim**-0.5 self.max_position_embeddings = max_position_embeddings - if hasattr(config, "attn_rotary_emb"): - rotary_dim = config.attn_rotary_emb # for backward compatibility - else: - rotary_dim = self.head_dim # default + rotary_dim = getattr(config, "attn_rotary_emb", self.head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=rotary_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index dfc05a366b286..176c5cd14c6e2 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -314,7 +314,6 @@ class ChameleonAttention(nn.Module): self.k_norm = ChameleonLayerNorm((self.num_kv_heads, self.head_dim)) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 3d485fdd0a2e1..26181d1c9bae4 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -99,13 +99,16 @@ class GLMAttention(nn.Module): # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) - rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio} + rope_parameters = { + "rope_type": "default", + "rope_theta": 10000 * rope_ratio, + "partial_rotary_factor": 0.5, + } # NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False, # which is equivalent to 
is_neox_style=True is_neox_style = not config.original_rope self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim // 2, max_position=max_positions, rope_parameters=rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index f837502c468f1..63a93eaa2d4f3 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -175,7 +175,6 @@ class CohereAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=False, diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 8de793941b8c3..06cc92ee88180 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -42,9 +42,10 @@ class GteNewModelConfig(VerifyAndUpdateConfig): config.hidden_act = "geglu" head_dim = config.hidden_size // config.num_attention_heads + rotary_dim = getattr(config, "rotary_emb_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim config.rotary_kwargs = { "head_size": head_dim, - "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, "rope_parameters": config.rope_parameters, } @@ -77,9 +78,11 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig): if not model_config.enforce_eager: max_position = round_up(max_position, 8) + rotary_dim = getattr(config, "rotary_emb_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim + config.rotary_kwargs = { "head_size": head_dim, - "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": max_position, "rope_parameters": config.rope_parameters, } @@ -113,12 +116,10 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): config.num_hidden_layers = config.n_layer head_dim = 
config.hidden_size // config.num_attention_heads - rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) config.rotary_kwargs = { "head_size": head_dim, - "rotary_dim": rotary_emb_dim, "max_position": max_trained_positions, "rope_parameters": config.rope_parameters, } @@ -240,9 +241,10 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): config.hidden_act = "geglu" head_dim = config.hidden_size // config.num_attention_heads + rotary_dim = getattr(config, "rotary_emb_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim config.rotary_kwargs = { "head_size": head_dim, - "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, "rope_parameters": config.rope_parameters, } diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 946baffc8817a..db4fe61b0d85f 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -222,7 +222,6 @@ class DbrxAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0b6513789aea8..a9fa76deecbd2 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -156,7 +156,6 @@ class DeepseekAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) @@ -499,7 +498,6 @@ class DeepseekV2Attention(nn.Module): self.rotary_emb = get_rope( qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=False, @@ -1018,7 +1016,6 @@ class 
DeepseekV2MLAAttention(nn.Module): self.rotary_emb = get_rope( qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=False, @@ -1038,7 +1035,6 @@ class DeepseekV2MLAAttention(nn.Module): if self.is_v32: self.indexer_rope_emb = get_rope( qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 3beee9f864634..870a37039f151 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -250,7 +250,6 @@ class Dots1Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index 278ba45e9684c..fbbd31a485383 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -288,7 +288,6 @@ class Ernie4_5_MoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=False, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index acf651ed24988..039e7cf68e52b 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -167,7 +167,6 @@ class ExaoneAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index cb710a7ec5cf9..b4b7a798fd050 100644 --- a/vllm/model_executor/models/exaone4.py +++ 
b/vllm/model_executor/models/exaone4.py @@ -176,7 +176,6 @@ class Exaone4Attention(nn.Module): set_default_rope_theta(config, default_theta=1000000) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 32d9e7b925597..7cdfcae0e718d 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -167,7 +167,6 @@ class FalconAttention(nn.Module): max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index a1c1263f8d724..bfb6b1a1f160d 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -242,14 +242,11 @@ class FalconH1AttentionDecoderLayer(nn.Module): self.scaling = self.head_dim**-0.5 self.max_position_embeddings = max_position_embeddings - if hasattr(config, "attn_rotary_emb"): - rotary_dim = config.attn_rotary_emb # for backward compatibility - else: - rotary_dim = self.head_dim # default + rotary_dim = getattr(config, "attn_rotary_emb", self.head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=rotary_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index dd5a74c8ed005..7304a728067f4 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -174,7 +174,6 @@ class GemmaAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - 
rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index cb36e04824588..fe6ec5ff83dec 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -152,7 +152,6 @@ class Gemma2Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 73176eba95ed5..40f6d100c767e 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -176,7 +176,6 @@ class Gemma3Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index f4427c9fd1d10..4d446f51c2ecb 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -384,7 +384,6 @@ class Gemma3nAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 9adfa942b99fa..2cd11e66c752b 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -81,7 +81,6 @@ class Glm4Attention(nn.Module): config.rope_parameters.setdefault("partial_rotary_factor", 0.5) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.head_dim = head_dim or hidden_size // self.total_num_heads - self.rotary_dim = self.head_dim self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * 
self.head_dim self.scaling = self.head_dim**-0.5 @@ -103,7 +102,6 @@ class Glm4Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.rotary_dim, max_position=max_position, rope_parameters=config.rope_parameters, is_neox_style=False, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index de091f03e881c..786482d77a1d2 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -678,9 +678,9 @@ class Glm4vVisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.blocks = nn.ModuleList( [ diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 8cae5ee425e4d..541d3b2beff83 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -285,7 +285,6 @@ class Glm4MoeAttention(nn.Module): config.rope_parameters.setdefault("partial_rotary_factor", 0.5) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index f0a34c47da54c..f32ac2639435c 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,12 +95,13 @@ class GPTJAttention(nn.Module): scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 + rope_parameters = getattr(config, "rope_parameters", {}) + rope_parameters["partial_rotary_factor"] = config.rotary_dim / self.head_size max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, - rotary_dim=config.rotary_dim, max_position=max_position_embeddings, - 
rope_parameters=getattr(config, "rope_parameters", None), + rope_parameters=rope_parameters, is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 212d605c17285..c4d11b488f38b 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,7 +92,6 @@ class GPTNeoXAttention(nn.Module): max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, - rotary_dim=self.head_size, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index cff16b7a7a8cd..6a92cf1533213 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -67,7 +67,6 @@ class OAIAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=config.max_position_embeddings, dtype=torch.float32, rope_parameters={ diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 76519c4660f15..82c945f5ad5ec 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -160,7 +160,6 @@ class GraniteAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index b038400a1262a..0b1064b6343e3 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -190,7 +190,6 @@ class GraniteMoeAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, is_neox_style=True, diff --git 
a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 1d9c2f5df4a55..3434716b83789 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -271,7 +271,6 @@ class GraniteMoeHybridAttention(nn.Module): if config.position_embedding_type == "rope": self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 6f62a1d11e52e..0a2e5cf39ffd8 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -181,7 +181,6 @@ class Grok1Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index ccdfa3fe175f1..0e82e84c4edbe 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -199,7 +199,6 @@ class HunYuanAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, @@ -305,7 +304,6 @@ class HunYuanCrossAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index c79934e121447..3ca8864618628 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -140,7 +140,6 @@ class InternLM2Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, 
max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index a4a994f97a2f8..142ad3d6d1d1a 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -143,7 +143,6 @@ class Lfm2Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index c8669de72dd09..70804e0a843e8 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -236,7 +236,6 @@ class Lfm2MoeAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 167dfbca248ce..3507a2bc66c17 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -259,7 +259,6 @@ class LlamaAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=getattr(config, "rope_parameters", None), is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 423be45e80149..7b3da3e10ab8a 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -243,7 +243,6 @@ class Llama4Attention(nn.Module): self.rotary_emb = ( get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 
67c462f4b25c4..f104018d3aa6c 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -277,7 +277,6 @@ class MiniCPMAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 0a2bcbd7f6084..c7a54cea21544 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -120,7 +120,6 @@ class MiniCPM3Attention(nn.Module): self.rotary_emb = get_rope( self.qk_rope_head_dim, - rotary_dim=self.qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 3e6a9add9ec49..ee19288ae6852 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -199,9 +199,13 @@ class MiniMaxM2Attention(nn.Module): prefix=f"{prefix}.o_proj", ) + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 390de78cc27b4..4bfe3c391c26f 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -187,7 +187,6 @@ class MiniMaxText01Attention(nn.Module): num_heads: int, head_dim: int, num_kv_heads: int, - rotary_dim: int, max_position: int = 4096 * 32, rope_parameters: dict | None = None, sliding_window: int | None = None, @@ -245,7 +244,6 @@ class MiniMaxText01Attention(nn.Module): ) self.rotary_emb = get_rope( 
head_size=self.head_dim, - rotary_dim=rotary_dim, max_position=max_position, rope_parameters=rope_parameters, is_neox_style=True, @@ -290,6 +288,8 @@ class MiniMaxText01DecoderLayer(nn.Module): head_dim = getattr(config, "head_dim", None) if head_dim is None: head_dim = config.hidden_size // config.num_attention_heads + rotary_dim = getattr(config, "rotary_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): max_position_embeddings = min( config.max_position_embeddings, config.max_model_len @@ -321,9 +321,6 @@ class MiniMaxText01DecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=config.num_attention_heads, head_dim=head_dim, - rotary_dim=config.rotary_dim - if hasattr(config, "rotary_dim") - else head_dim, num_kv_heads=config.num_key_value_heads, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 50ec57e7a8053..e170c530ca29f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -206,7 +206,6 @@ class MixtralAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e944c0ee38aa1..fe963cc6644fb 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -295,11 +295,11 @@ class Llama4VisionAttention(nn.Module): rope_parameters = { "rope_type": "mllama4", "rope_theta": config.rope_parameters["rope_theta"], + "partial_rotary_factor": 0.5, } self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=config.hidden_size // config.num_attention_heads // 2, # number of image patches max_position=(config.image_size // 
config.patch_size) ** 2, rope_parameters=rope_parameters, diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index be36f761c63aa..4655ffa7b2f61 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -105,7 +105,6 @@ class ModernBertAttention(nn.Module): self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=self.head_dim, max_position=config.max_position_embeddings, rope_parameters=rope_parameters, dtype=torch.float16, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index a6cd9ad16c188..71c6b1aa2e814 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -433,7 +433,6 @@ class MolmoAttention(nn.Module): # Rotary embeddings. self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index bf83ee5e42a15..21605015c470b 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -199,7 +199,6 @@ class NemotronAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 734fbc60709fa..19a942a5277cc 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -118,7 +118,6 @@ class DeciLMAttention(LlamaAttention): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 
3bbb4dd242262..dd7c27f10c531 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -102,7 +102,6 @@ class OlmoAttention(nn.Module): # Rotary embeddings. self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 88e9c2d8541a1..b030c94b54cd5 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -146,7 +146,6 @@ class Olmo2Attention(nn.Module): rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 1376583a99725..a5a926151c5c9 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -171,7 +171,6 @@ class OlmoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index bddd9fa50957a..47abd7bf0b68a 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -352,7 +352,6 @@ class OpenPanguMLAAttention(nn.Module): } self.rotary_emb = get_rope( qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=False, @@ -525,7 +524,6 @@ class OpenPanguEmbeddedAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git 
a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 544a44ed54681..9d9066c4ba619 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -135,7 +135,6 @@ class OrionAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index dcae92ed20881..829148b4c1fb7 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -166,7 +166,6 @@ class OuroAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=config.rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 8f26c68720a5c..b644603c5baa1 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -134,7 +134,6 @@ class PersimmonAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 253fbbc41330c..e01e9d47c545c 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -84,19 +84,18 @@ class PhiAttention(nn.Module): prefix: str = "", ): super().__init__() - self.total_num_heads = config.num_attention_heads self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.total_num_heads + self.head_size = self.hidden_size // config.num_attention_heads tensor_model_parallel_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = self.total_num_heads // 
tensor_model_parallel_world_size + assert config.num_attention_heads % tensor_model_parallel_world_size == 0 + self.num_heads = config.num_attention_heads // tensor_model_parallel_world_size # pylint: disable=C0103 self.qkv_proj = QKVParallelLinear( self.hidden_size, self.head_size, - self.total_num_heads, + config.num_attention_heads, bias=True, quant_config=quant_config, prefix=f"{prefix}.qkv_proj", @@ -109,13 +108,10 @@ class PhiAttention(nn.Module): ) scaling = self.head_size**-0.5 - rotary_dim = config.hidden_size // config.num_attention_heads - assert rotary_dim % 2 == 0 max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, - rotary_dim=rotary_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 49530776f8903..14f73d0c64586 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -352,7 +352,6 @@ class PhiMoEAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 472de5590dcf8..6765ee0c5779c 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -574,7 +574,6 @@ class Plamo2AttentionMixer(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 4aeb9d432dcc6..3557104d905cb 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -179,7 +179,6 @@ class Plamo3AttentionMixer(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, 
max_position=max_position, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 12285cf9c1968..492ba2fb12145 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -114,7 +114,6 @@ class QWenAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index f5501bae78418..3af4a49cd77cc 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -164,7 +164,6 @@ class Qwen2Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 3cc3a3a7873c6..fba06e34f6227 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -624,9 +624,9 @@ class Qwen2_5_VisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.attn_backend = get_vit_attn_backend( diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index cbc618f1abd08..2750f1864b81a 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -244,7 +244,6 @@ class Qwen2MoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git 
a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 608e90337f452..2c4ac2f8efff1 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -621,9 +621,9 @@ class Qwen2VisionTransformer(nn.Module): head_dim = embed_dim // num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 7d2b3e5f9bc79..0d0da52ed7382 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -111,7 +111,6 @@ class Qwen3Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index c6984dc37c51c..0be81ecc7dd3a 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -269,7 +269,6 @@ class Qwen3MoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index dd64e3983e381..6a5447ad0fed4 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -747,7 +747,6 @@ class Qwen3NextAttention(nn.Module): self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=self.head_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, dual_chunk_attention_config=self.dual_chunk_attention_config, diff --git 
a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index dbe7bcd07576b..635c3bfdc65c7 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -333,9 +333,9 @@ class Qwen3Omni_VisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index f8e0ea6284994..fcd58c4d33cd7 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -340,9 +340,9 @@ class Qwen3_VisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.merger = Qwen3_VisionPatchMerger( diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 267c60157506d..f25223c782552 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -161,7 +161,6 @@ class SeedOssAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 7bef56110cab7..964aa902704b3 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -160,7 +160,6 @@ class SolarAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git 
a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index e879599ad3ead..ea4342882feb4 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -148,7 +148,6 @@ class StablelmAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, rope_parameters=self.config.rope_parameters, ) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 46422f303ff43..569ca9b082cfa 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -112,7 +112,6 @@ class Starcoder2Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 077cce84a98dd..7077f1a22e8d7 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -196,7 +196,6 @@ class Step3TextAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embedding, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 653b5b9beef7b..fe157887eea91 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -230,7 +230,6 @@ class Zamba2Attention(nn.Module): if config.use_mem_rope: self.rotary_emb = get_rope( head_size=self.attention_head_dim, - rotary_dim=self.attention_head_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index d761802da9403..fb88c62dc5b23 100644 --- 
a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -306,8 +306,13 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" from vllm.config.utils import getattr_iter - rope_theta_names = ("rope_theta", "rotary_emb_base") - rope_theta = getattr_iter(config, rope_theta_names, None) + # Older custom models may use non-standard field names + # which need patching for both Transformers v4 and v5. + names = ["rope_theta", "rotary_emb_base"] + rope_theta = getattr_iter(config, names, None, warn=True) + names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"] + partial_rotary_factor = getattr_iter(config, names, None, warn=True) + if Version(version("transformers")) < Version("5.0.0.dev0"): # Transformers v4 installed, legacy config fields may be present if (rope_scaling := getattr(config, "rope_scaling", None)) is not None: @@ -316,14 +321,18 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: if not hasattr(config, "rope_parameters"): config.rope_parameters = {"rope_type": "default"} config.rope_parameters["rope_theta"] = rope_theta - partial_rotary_factor_names = ("partial_rotary_factor", "rotary_pct") - partial_rotary_factor = getattr_iter(config, partial_rotary_factor_names, None) if partial_rotary_factor is not None: if not hasattr(config, "rope_parameters"): config.rope_parameters = {"rope_type": "default"} config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor elif rope_theta is not None or hasattr(config, "rope_parameters"): # Transformers v5 installed + # Patch these fields in case they used non-standard names + if rope_theta is not None: + config.rope_theta = rope_theta + if partial_rotary_factor is not None: + config.partial_rotary_factor = partial_rotary_factor + # Standardize and validate RoPE parameters config.standardize_rope_params() config.validate_rope() From 90d6cf921fe623524f618740616a6cf494d4a8df Mon Sep 17 00:00:00 2001 From: 
Xingyu Liu <38244988+charlotte12l@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:00:15 -0800 Subject: [PATCH 52/67] [BugFix][MM]support VLLM_RANDOMIZE_DP_DUMMY_INPUTS (#30472) Signed-off-by: Xingyu Liu Co-authored-by: Cyrus Leung --- vllm/v1/worker/gpu_model_runner.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0e2bf9df9a18f..40c8059f90d34 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools import gc import itertools import time @@ -3892,19 +3893,21 @@ class GPUModelRunner( return {} @contextmanager - def maybe_randomize_inputs(self, input_ids: torch.Tensor): + def maybe_randomize_inputs( + self, input_ids: torch.Tensor | None, inputs_embeds: torch.Tensor | None + ): """ Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set. 
This is to help balance expert-selection - during profile_run - during DP rank dummy run """ + dp_size = self.vllm_config.parallel_config.data_parallel_size randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1 if not randomize_inputs: yield - else: - import functools + elif input_ids is not None: @functools.cache def rand_input_ids() -> torch.Tensor: @@ -3912,13 +3915,27 @@ class GPUModelRunner( self.input_ids.gpu, low=0, high=self.model_config.get_vocab_size(), - dtype=input_ids.dtype, ) - logger.debug_once("Randomizing dummy data for DP Rank") + logger.debug_once("Randomizing dummy input_ids for DP Rank") input_ids.copy_(rand_input_ids()[: input_ids.size(0)], non_blocking=True) yield input_ids.fill_(0) + else: + + @functools.cache + def rand_inputs_embeds() -> torch.Tensor: + return torch.randn_like( + self.inputs_embeds.gpu, + ) + + assert inputs_embeds is not None + logger.debug_once("Randomizing dummy inputs_embeds for DP Rank") + inputs_embeds.copy_( + rand_inputs_embeds()[: inputs_embeds.size(0)], non_blocking=True + ) + yield + inputs_embeds.fill_(0) def _get_mm_dummy_batch( self, @@ -4167,7 +4184,7 @@ class GPUModelRunner( num_tokens_across_dp[:] = num_tokens_padded with ( - self.maybe_randomize_inputs(input_ids), + self.maybe_randomize_inputs(input_ids, inputs_embeds), set_forward_context( attn_metadata, self.vllm_config, From 0efd9f867c6a7617fbcb8a335677bb8295f1bcb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 11 Dec 2025 22:06:51 +0100 Subject: [PATCH 53/67] [Core] Whisper Enable Encoder Batching (#29421) Signed-off-by: NickLucche --- vllm/config/model.py | 5 +++ vllm/config/vllm.py | 30 +++++---------- vllm/model_executor/models/whisper.py | 17 +++++++-- vllm/v1/core/encoder_cache_manager.py | 53 +++++++++++++++++++++++++++ vllm/v1/core/sched/scheduler.py | 7 +++- 5 files changed, 87 insertions(+), 25 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 
03140c17fb50e..59e9689567bd2 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -539,6 +539,11 @@ class ModelConfig: self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + + if self.is_encoder_decoder: + self.mm_processor_cache_gb = 0 + logger.info("Encoder-decoder model detected, disabling mm processor cache.") + # Init multimodal config if needed if self._model_info.supports_multimodal: if ( diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 0e75daf0d722c..b5f8f916de438 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -750,27 +750,17 @@ class VllmConfig: # TODO: Move after https://github.com/vllm-project/vllm/pull/26847 lands self._set_compile_ranges() - if self.model_config and self.model_config.is_encoder_decoder: - from vllm.multimodal import MULTIMODAL_REGISTRY - - self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) + if ( + self.model_config + and self.model_config.architecture == "WhisperForConditionalGeneration" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" + ): + logger.warning( + "Whisper is known to have issues with " + "forked workers. If startup is hanging, " + "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " + "to 'spawn'." ) - logger.debug( - "Encoder-decoder model detected: setting " - "`max_num_encoder_input_tokens` to encoder length (%s)", - self.scheduler_config.max_num_encoder_input_tokens, - ) - if ( - self.model_config.architecture == "WhisperForConditionalGeneration" - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" - ): - logger.warning( - "Whisper is known to have issues with " - "forked workers. If startup is hanging, " - "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " - "to 'spawn'." 
- ) if ( self.kv_events_config is not None diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index b2feff1335151..b513e3513b2e2 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -522,6 +522,7 @@ class WhisperEncoder(nn.Module): def forward(self, input_features: torch.Tensor | list[torch.Tensor]): hidden_states = [] + input_is_batched = False for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) embeds = nn.functional.gelu(self.conv2(embeds)) @@ -530,7 +531,13 @@ class WhisperEncoder(nn.Module): embeds.dtype ) hidden_states.append(embeds) - hidden_states = torch.cat(hidden_states) + input_is_batched = embeds.ndim > 2 + # Input to MHA must be B x T x D + if input_is_batched: + # Models using WhisperEncoder may handle batching internally. + hidden_states = torch.cat(hidden_states) + else: + hidden_states = torch.stack(hidden_states, dim=0) for encoder_layer in self.layers: hidden_states = encoder_layer(hidden_states) @@ -603,8 +610,7 @@ class WhisperModel(nn.Module): positions: torch.Tensor, encoder_outputs: list[torch.Tensor], ) -> torch.Tensor: - assert len(encoder_outputs) in (0, 1) - enc_states = encoder_outputs[0] if len(encoder_outputs) == 1 else None + enc_states = torch.cat(encoder_outputs, dim=0) if len(encoder_outputs) else None decoder_outputs = self.decoder( input_ids=input_ids, positions=positions, @@ -913,7 +919,10 @@ class WhisperForConditionalGeneration( def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: # Required as part of SupportsMultiModal interface. 
audio_input = self._parse_and_validate_audio_input(**kwargs) - return [self.model.get_encoder_outputs(audio_input["input_features"])] + # Split concatenated encoder outputs into one tensor per audio input + enc_output = self.model.get_encoder_outputs(audio_input["input_features"]) + # The assumption is we can only process whole mm items (audios) + return enc_output.unbind(dim=0) def embed_input_ids( self, diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 3959e9a59a53b..50f738713590b 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -341,3 +341,56 @@ def compute_mm_encoder_budget( ) return encoder_compute_budget, encoder_cache_size + + +# NOTE (NickLucche): Temporary implementation for encoder-decoder models that only +# use the manager for scheduling purposes. Encoder-decoder models will eventually +# utilize the cache and this class will fold into EncoderCacheManager, as +# differences with MM models shrink. 
+class EncoderDecoderCacheManager(EncoderCacheManager): + def __init__(self, cache_size: int): + self.cache_size = cache_size + self.num_free_slots = cache_size + self.freed: list[str] = [] + + def check_and_update_cache(self, request: Request, input_id: int) -> bool: + return False + + def can_allocate( + self, + request: Request, + input_id: int, + encoder_compute_budget: int, + num_tokens_to_schedule: int, + ) -> bool: + num_tokens = request.get_num_encoder_tokens(input_id) + # Not enough compute budget + if num_tokens > encoder_compute_budget: + return False + + num_tokens += num_tokens_to_schedule + # Enough free slots + return num_tokens <= self.num_free_slots + + def allocate(self, request: Request, input_id: int) -> None: + num_encoder_tokens = request.get_num_encoder_tokens(input_id) + self.num_free_slots -= num_encoder_tokens + + mm_hash = request.mm_features[input_id].identifier + self.freed.append(mm_hash) + + def free(self, request: Request) -> None: + for input_id in range(len(request.mm_features)): + self.free_encoder_input(request, input_id) + + def get_cached_input_ids(self, request: Request) -> set[int]: + return set(range(len(request.mm_features))) + + def get_freed_mm_hashes(self) -> list[str]: + freed = self.freed + self.freed = [] + return freed + + def free_encoder_input(self, request: Request, input_id: int) -> None: + num_tokens = request.get_num_encoder_tokens(input_id) + self.num_free_slots += num_tokens diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c3d504f2e72c3..a9ce6e63cc775 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -27,6 +27,7 @@ from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import ( EncoderCacheManager, + EncoderDecoderCacheManager, compute_encoder_budget, ) from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager @@ -181,7 +182,11 @@ class 
Scheduler(SchedulerInterface): # NOTE: For the models without encoder (e.g., text-only models), # the encoder cache will not be initialized because cache size is 0 # for these models. - self.encoder_cache_manager = EncoderCacheManager(cache_size=encoder_cache_size) + self.encoder_cache_manager = ( + EncoderDecoderCacheManager(cache_size=encoder_cache_size) + if self.is_encoder_decoder + else EncoderCacheManager(cache_size=encoder_cache_size) + ) speculative_config = vllm_config.speculative_config self.use_eagle = False From 3efdc3feaef01d45fb54650163da480bdf2f0ce4 Mon Sep 17 00:00:00 2001 From: ioana ghiban Date: Thu, 11 Dec 2025 23:03:29 +0100 Subject: [PATCH 54/67] [Docs][CPU backend] Add pre-built Arm CPU Docker images (#30491) Signed-off-by: Ioana Ghiban --- .../installation/cpu.arm.inc.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md index 8ec18bcb826ec..ad9c7d9ef21be 100644 --- a/docs/getting_started/installation/cpu.arm.inc.md +++ b/docs/getting_started/installation/cpu.arm.inc.md @@ -100,7 +100,23 @@ Testing has been conducted on AWS Graviton3 instances for compatibility. # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] -Currently, there are no pre-built Arm CPU images. +See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image. + +Stable vLLM Docker images are being pre-built for Arm from version 0.12.0. Available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo). +Please replace `` in the command below with a specific version string (e.g., `0.12.0`). + +```bash +docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v +``` + +You can also access the latest code with Docker images. 
These are not intended for production use and are meant for CI and testing only. They will expire after several days. + +The latest code can contain bugs and may not be stable. Please use it with caution. + +```bash +export VLLM_COMMIT=6299628d326f429eba78736acb44e76749b281f5 # use full commit hash from the main branch +docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}-arm64-cpu +``` # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] From c817b1415121cf88178af1e4e78f651d802df4da Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:28:34 -0500 Subject: [PATCH 55/67] [Perf] Optimize deepgemm experts initialization, 3.9% TTFT improvement (#30494) Signed-off-by: yewentao256 Co-authored-by: li-jinpeng <3332126450@qq.com> Co-authored-by: youkaichao --- .../layers/fused_moe/deep_gemm_utils.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py index 6cca954123274..57d303cd53fef 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py @@ -84,10 +84,16 @@ def _fwd_kernel_ep_scatter_1( m_indices_start_ptr = m_indices + cur_expert_start off_expert = tl.arange(0, BLOCK_E) + # any rows in the per-expert aligned region that do not correspond to + # real tokens are left untouched here and should remain initialized to + # -1 so DeepGEMM can skip them for start_m in tl.range(0, cur_expert_token_num, BLOCK_E, num_stages=4): + offs = start_m + off_expert + mask = offs < cur_expert_token_num tl.store( - m_indices_start_ptr + start_m + off_expert, + m_indices_start_ptr + offs, cur_expert, + mask=mask, ) @@ -366,12 +372,17 @@ def deepgemm_moe_permute( (M_sum, H // block_k), device=device, dtype=torch.float32 ) - maybe_has_empty_blocks = (expert_tokens_meta is 
None) or ( - expert_tokens_meta.expert_num_tokens_cpu is None + # DeepGEMM uses negative values in m_indices (here expert_ids) to mark + # completely invalid / padded blocks that should be skipped. We always + # initialize expert_ids to -1 so any row that is not explicitly written + # by the scatter kernel will be treated as invalid and skipped by + # DeepGEMM's scheduler. + expert_ids = torch.full( + (M_sum,), + fill_value=-1, + device=device, + dtype=torch.int32, ) - expert_ids_init = torch.zeros if maybe_has_empty_blocks else torch.empty - - expert_ids = expert_ids_init((M_sum), device=device, dtype=torch.int32) inv_perm = torch.empty(topk_ids.shape, device=device, dtype=torch.int32) expert_num_tokens = None From 61249b177de1566027fc74e9b51b45a4c973eb47 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:43:41 -0500 Subject: [PATCH 56/67] [Refactor] Remove useless syncwarp (#30510) Signed-off-by: yewentao256 --- csrc/moe/grouped_topk_kernels.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index 47ee5f021eb4a..5fa367abd96f5 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -481,8 +481,6 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias, largest = value; } } - - __syncwarp(); // Ensure all threads have valid data before reduction // Get the top2 warpwise T max1 = cg::reduce(tile, largest, cg::greater()); @@ -589,7 +587,6 @@ __global__ void group_idx_and_topk_idx_kernel( int pre_count_equal_to_top_value = 0; // Use loop to find the largset top_group while (count_equal_to_top_value < target_num_min) { - __syncwarp(); // Ensure all threads have valid data before reduction topk_group_value = cg::reduce(tile, value, cg::greater()); if (value == topk_group_value) { value = neg_inf(); @@ -644,10 +641,8 @@ __global__ void group_idx_and_topk_idx_kernel( } } queue.done(); - 
__syncwarp(); // Get the topk_idx queue.dumpIdx(s_topk_idx); - __syncwarp(); } // Load the valid score value From a00d88973daf9a151ecbd4c740ca99645715b9df Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Thu, 11 Dec 2025 16:59:40 -0600 Subject: [PATCH 57/67] [EPLB] Support EPLB w/ NVFP4 (#29804) Signed-off-by: Andrew Briand Co-authored-by: Andrew Briand --- .../test_eplb_fused_moe_layer_dep_nvfp4.py | 276 ++++++++++++++++++ .../layers/quantization/modelopt.py | 26 +- .../quantization/utils/flashinfer_fp4_moe.py | 79 +++++ 3 files changed, 376 insertions(+), 5 deletions(-) create mode 100644 tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py new file mode 100644 index 0000000000000..951b692e1edaf --- /dev/null +++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py @@ -0,0 +1,276 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Test that the interaction between EPLB and FusedMoE Layer is okay for DP w/ NVFP4 + +from dataclasses import dataclass + +import pytest +import torch + +from tests.kernels.moe.utils import make_test_quant_config +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace +from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, + get_dp_group, +) +from vllm.forward_context import set_forward_context +from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptNvFp4Config, + ModelOptNvFp4FusedMoE, +) + +from .eplb_utils import distributed_run, set_env_vars_and_device + + +@dataclass +class TestConfig: + num_layers: int + num_experts: int + num_local_experts: int + num_topk: int + hidden_size: int + intermediate_size: int + num_tokens: int + + +def 
make_fused_moe_layer( + rank: int, + layer_idx: int, + test_config: TestConfig, +) -> FusedMoE: + quant_config = None + + device = torch.device(f"cuda:{rank}") + + quant_config = ModelOptNvFp4Config( + is_checkpoint_nvfp4_serialized=True, + kv_cache_quant_algo=None, + exclude_modules=[], + ) + + fml = FusedMoE( + num_experts=test_config.num_experts, + top_k=test_config.num_topk, + hidden_size=test_config.hidden_size, + intermediate_size=test_config.intermediate_size, + prefix=f"dummy_layer_{layer_idx}", + activation="silu", + is_act_and_mul=True, + params_dtype=torch.bfloat16, + quant_config=quant_config, + ) + + nvfp4_fused_moe = ModelOptNvFp4FusedMoE(quant_config, fml) + nvfp4_fused_moe.create_weights( + fml, + test_config.num_local_experts, + test_config.hidden_size, + test_config.intermediate_size, + params_dtype=torch.uint8, + global_num_experts=test_config.num_experts, + ) + + fml = fml.to(device) + w1_q, w2_q, quant_config = make_test_quant_config( + test_config.num_local_experts, + test_config.intermediate_size, + test_config.hidden_size, + in_dtype=torch.bfloat16, + quant_dtype="nvfp4", + block_shape=None, + per_act_token_quant=False, + ) + + fml.w13_weight.data = w1_q + fml.w2_weight.data = w2_q + + fml.w2_input_scale.data = torch.randn_like(fml.w2_input_scale.data) / 5 + fml.w13_input_scale.data = torch.randn_like(fml.w13_input_scale.data) / 5 + fml.w2_weight_scale_2.data = torch.randn_like(fml.w2_weight_scale_2.data) / 5 + fml.w13_weight_scale_2.data = torch.randn_like(fml.w13_weight_scale_2.data) / 5 + fml.w2_weight_scale.data = ( + torch.randn(fml.w2_weight_scale.data.shape, device=device) / 5 + ).to(fml.w2_weight_scale.data.dtype) + fml.w13_weight_scale.data = ( + torch.randn(fml.w13_weight_scale.data.shape, device=device) / 5 + ).to(fml.w13_weight_scale.data.dtype) + + nvfp4_fused_moe.process_weights_after_loading(fml) + + fml.maybe_init_modular_kernel() + + return fml + + +def _test_eplb_fml(env, world_size: int, test_config: TestConfig): + 
set_env_vars_and_device(env) + + vllm_config = VllmConfig() + vllm_config.parallel_config.data_parallel_size = world_size + vllm_config.parallel_config.enable_expert_parallel = True + + with set_current_vllm_config(vllm_config): + ensure_model_parallel_initialized( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + + ep_group = get_dp_group().cpu_group + ep_rank = torch.distributed.get_rank() + + device = torch.device(f"cuda:{ep_rank}") + + fml_layers = [ + make_fused_moe_layer(ep_rank, layer_idx, test_config).to(device) + for layer_idx in range(test_config.num_layers) + ] + rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers] + + hidden_states = [] + router_logits = [] + for layer_idx in range(test_config.num_layers): + hidden_states.append( + torch.randn( + (test_config.num_tokens, test_config.hidden_size), + dtype=torch.bfloat16, + device=device, + ) + ) + router_logits.append( + torch.randn( + (test_config.num_tokens, test_config.num_experts), + dtype=torch.bfloat16, + device=device, + ) + ) + + out_before_shuffle = [] + with set_forward_context( + {}, + num_tokens=test_config.num_tokens, + num_tokens_across_dp=torch.tensor( + [test_config.num_tokens] * world_size, device="cpu", dtype=torch.int + ), + vllm_config=vllm_config, + ): + for lidx, fml in enumerate(fml_layers): + out_before_shuffle.append( + fml(hidden_states[lidx].clone(), router_logits[lidx].clone()) + ) + + indices = torch.zeros( + test_config.num_layers, test_config.num_experts, dtype=torch.long + ) + for lidx in range(test_config.num_layers): + indices[lidx] = torch.Tensor(range(test_config.num_experts)) + + shuffled_indices = torch.zeros_like(indices) + for lidx in range(test_config.num_layers): + shuffled_indices[lidx] = torch.randperm(test_config.num_experts) + + rearrange_expert_weights_inplace( + indices, + shuffled_indices, + rank_expert_weights, + ep_group, + is_profile=False, + ) + + num_global_experts = test_config.num_experts + + 
logical_to_physical_map_list = [] + for lidx, fml in enumerate(fml_layers): + physical_to_logical_map = shuffled_indices[lidx].to(device) + logical_to_physical_map = torch.empty( + (num_global_experts,), dtype=torch.int32, device=device + ) + logical_to_physical_map[physical_to_logical_map] = torch.arange( + 0, num_global_experts, dtype=torch.int32, device=device + ) + logical_to_physical_map_list.append( + logical_to_physical_map.reshape(num_global_experts, 1) + ) + + logical_to_physical_map = torch.stack(logical_to_physical_map_list) + + for lidx, fml in enumerate(fml_layers): + logical_replica_count = torch.ones( + (test_config.num_layers, num_global_experts), + dtype=torch.int32, + device=device, + ) + fml.enable_eplb = True + fml.set_eplb_state( + lidx, + torch.zeros( + (test_config.num_layers, num_global_experts), + dtype=torch.int32, + device=device, + ), + logical_to_physical_map, + logical_replica_count, + ) + + out_after_shuffle = [] + with set_forward_context( + {}, + num_tokens=test_config.num_tokens, + num_tokens_across_dp=torch.tensor( + [test_config.num_tokens] * world_size, device="cpu", dtype=torch.int + ), + vllm_config=vllm_config, + ): + for lidx, fml in enumerate(fml_layers): + out_after_shuffle.append( + fml(hidden_states[lidx].clone(), router_logits[lidx].clone()) + ) + + for lidx in range(test_config.num_layers): + torch.testing.assert_close( + out_before_shuffle[lidx], out_after_shuffle[lidx], atol=1e-1, rtol=1e-1 + ) + + +@pytest.mark.parametrize("world_size", [2, 4]) +@pytest.mark.parametrize("num_layers", [8]) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("hidden_size", [256]) +@pytest.mark.parametrize("intermediate_size", [256]) +@pytest.mark.parametrize("num_tokens", [256]) +@pytest.mark.parametrize("backend", ["latency", "throughput"]) +def test_eplb_fml( + world_size: int, + num_layers: int, + num_experts: int, + hidden_size: int, + intermediate_size: int, + num_tokens: int, + backend: str, + monkeypatch, 
+): + monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1") + monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", backend) + + if torch.cuda.device_count() < world_size: + pytest.skip(f"Need at least {world_size} GPUs to run the test") + + num_local_experts = num_experts // world_size + num_topk = 4 + + test_config = TestConfig( + num_layers=num_layers, + num_experts=num_experts, + num_local_experts=num_local_experts, + num_topk=num_topk, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_tokens=num_tokens, + ) + + distributed_run( + _test_eplb_fml, + world_size, + test_config, + ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index e825cb33c3580..18a0fe6fbbb44 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -38,6 +38,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, flashinfer_trtllm_fp4_moe, + flashinfer_trtllm_fp4_routed_moe, prepare_static_weights_for_trtllm_fp4_moe, reorder_w1w3_to_w3w1, select_nvfp4_gemm_impl, @@ -1325,7 +1326,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): "Accuracy may be affected." 
) - w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0] + w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0].contiguous() layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False) # Common processing for input scales and alphas @@ -1482,6 +1483,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): a2_gscale=layer.w2_input_scale_quant, ) + @property + def supports_eplb(self) -> bool: + return True + def apply( self, layer: FusedMoE, @@ -1500,11 +1505,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): if ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + and not layer.enable_eplb ): - if layer.enable_eplb: - raise NotImplementedError( - "EPLB not supported for `ModelOptNvFp4FusedMoE` yet." - ) return flashinfer_trtllm_fp4_moe( layer=layer, x=x, @@ -1522,6 +1524,20 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): router_logits=router_logits, ) + # EPLB path + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + return flashinfer_trtllm_fp4_routed_moe( + layer=layer, + x=x, + topk_ids=topk_ids, + topk_weights=topk_weights, + top_k=layer.top_k, + global_num_experts=layer.global_num_experts, + ) + if self.use_marlin: return fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index eda40657b1e39..8f96222f19f20 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -331,3 +331,82 @@ def flashinfer_trtllm_fp4_moe( )[0] return out + + +def flashinfer_trtllm_fp4_routed_moe( + layer: torch.nn.Module, + x: torch.Tensor, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + top_k: int, + global_num_experts: int, +) -> torch.Tensor: + """ + Apply FlashInfer TensorRT-LLM FP4 MoE kernel. 
Uses packed + input top k expert indices and scores rather than computing + top k expert indices from scores. + + Args: + layer: The MoE layer with weights and scales + x: Input tensor + topk_ids: Ids of selected experts + top_k: Number of experts to select per token + global_num_experts: Total number of experts across all ranks + + Returns: + Output tensor from the MoE layer + """ + import flashinfer + + # Pack top k ids and expert weights into a single int32 tensor, as + # required by TRT-LLM + packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to( + torch.bfloat16 + ).view(torch.int16) + + # Quantize input to FP4 + a1_gscale = layer.w13_input_scale_quant + (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( + x, + a1_gscale, + is_sf_swizzled_layout=False, + ) + + # Call TRT-LLM FP4 block-scale MoE kernel + out = flashinfer.fused_moe.trtllm_fp4_block_scale_routed_moe( + topk_ids=packed_tensor, + routing_bias=None, + hidden_states=hidden_states_fp4, + hidden_states_scale=hidden_states_scale_linear_fp4.view( + torch.float8_e4m3fn + ).flatten(), + gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, + gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn + ), + gemm1_bias=None, + gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, + gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, + gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn + ), + gemm2_bias=None, + output1_scale_scalar=layer.g1_scale_c.data, + output1_scale_gate_scalar=layer.g1_alphas.data, + output2_scale_scalar=layer.g2_alphas.data, + num_experts=global_num_experts, + top_k=top_k, + n_group=0, + topk_group=0, + intermediate_size=layer.intermediate_size_per_partition, + local_expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + routed_scaling_factor=None, + tile_tokens_dim=None, + routing_method_type=1, + do_finalize=True, + )[0] + + 
return out From 2cc5affc388d3d134bacc14f042405ead925531b Mon Sep 17 00:00:00 2001 From: Concurrensee Date: Thu, 11 Dec 2025 17:03:54 -0600 Subject: [PATCH 58/67] [ROCM][CI] Fix AMD Examples Test Group (#30276) Signed-off-by: Yida Wu Signed-off-by: Yida --- .buildkite/test-amd.yaml | 3 +-- examples/offline_inference/basic/embed.py | 8 ++++++++ examples/offline_inference/basic/score.py | 8 ++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 4038d32834e68..4e957634e7b47 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -435,7 +435,7 @@ steps: - label: Examples Test # 30min timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking working_dir: "/vllm-workspace/examples" @@ -455,7 +455,6 @@ steps: # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 # for pooling models diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index eeb7137ff7bae..17f727b33d321 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -4,6 +4,9 @@ from argparse import Namespace from vllm import LLM, EngineArgs +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.config import AttentionConfig +from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -20,6 +23,11 @@ def parse_args(): def main(args: Namespace): + if current_platform.is_rocm(): + args.attention_config = AttentionConfig( + backend=AttentionBackendEnum.FLEX_ATTENTION + ) + # 
Sample prompts. prompts = [ "Hello, my name is", diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index cbca50eb5efa8..b2dadffd249f5 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -4,6 +4,9 @@ from argparse import Namespace from vllm import LLM, EngineArgs +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.config import AttentionConfig +from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -20,6 +23,11 @@ def parse_args(): def main(args: Namespace): + if current_platform.is_rocm(): + args.attention_config = AttentionConfig( + backend=AttentionBackendEnum.FLEX_ATTENTION + ) + # Sample prompts. text_1 = "What is the capital of France?" texts_2 = [ From d527cf0b3d4210c4277f258c9d26286cec726a6f Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Thu, 11 Dec 2025 15:36:31 -0800 Subject: [PATCH 59/67] [FIX]Patch run-cluster.sh (fix for #28328) (#30002) Signed-off-by: elacey Signed-off-by: Ev Lacey --- examples/online_serving/run_cluster.sh | 60 +++++++++++++++----------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/examples/online_serving/run_cluster.sh b/examples/online_serving/run_cluster.sh index 0756d4b0ae556..5996098eb25aa 100644 --- a/examples/online_serving/run_cluster.sh +++ b/examples/online_serving/run_cluster.sh @@ -21,7 +21,7 @@ # --worker \ # /abs/path/to/huggingface/cache \ # -e VLLM_HOST_IP= -# +# # Each worker requires a unique VLLM_HOST_IP value. # Keep each terminal session open. Closing a session stops the associated Ray # node and thereby shuts down the entire cluster. @@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then exit 1 fi +# Extract VLLM_HOST_IP from ADDITIONAL_ARGS (e.g. "-e VLLM_HOST_IP=..."). 
+VLLM_HOST_IP="" +for ((i = 0; i < ${#ADDITIONAL_ARGS[@]}; i++)); do + arg="${ADDITIONAL_ARGS[$i]}" + case "${arg}" in + -e) + next="${ADDITIONAL_ARGS[$((i + 1))]:-}" + if [[ "${next}" == VLLM_HOST_IP=* ]]; then + VLLM_HOST_IP="${next#VLLM_HOST_IP=}" + break + fi + ;; + -eVLLM_HOST_IP=* | VLLM_HOST_IP=*) + VLLM_HOST_IP="${arg#*=}" + break + ;; + esac +done + +# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent. +if [[ "${NODE_TYPE}" == "--head" && -n "${VLLM_HOST_IP}" ]]; then + if [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then + echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})." + echo "Using VLLM_HOST_IP as the head node address." + HEAD_NODE_ADDRESS="${VLLM_HOST_IP}" + fi +fi + # Generate a unique container name with random suffix. # Docker container names must be unique on each host. # The random suffix allows multiple Ray containers to run simultaneously on the same machine, @@ -74,36 +102,17 @@ cleanup() { trap cleanup EXIT # Build the Ray start command based on the node role. -# The head node manages the cluster and accepts connections on port 6379, +# The head node manages the cluster and accepts connections on port 6379, # while workers connect to the head's address. RAY_START_CMD="ray start --block" if [ "${NODE_TYPE}" == "--head" ]; then - RAY_START_CMD+=" --head --port=6379" + RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379" else + RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379" -fi - -# Parse VLLM_HOST_IP from additional args if present. -# This is needed for multi-NIC configurations where Ray needs explicit IP bindings. 
-VLLM_HOST_IP="" -for arg in "${ADDITIONAL_ARGS[@]}"; do - if [[ $arg == "-e" ]]; then - continue + if [ -n "${VLLM_HOST_IP}" ]; then + RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}" fi - if [[ $arg == VLLM_HOST_IP=* ]]; then - VLLM_HOST_IP="${arg#VLLM_HOST_IP=}" - break - fi -done - -# Build Ray IP environment variables if VLLM_HOST_IP is set. -# These variables ensure Ray binds to the correct network interface on multi-NIC systems. -RAY_IP_VARS=() -if [ -n "${VLLM_HOST_IP}" ]; then - RAY_IP_VARS=( - -e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}" - -e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}" - ) fi # Launch the container with the assembled parameters. @@ -118,6 +127,5 @@ docker run \ --shm-size 10.24g \ --gpus all \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ - "${RAY_IP_VARS[@]}" \ "${ADDITIONAL_ARGS[@]}" \ "${DOCKER_IMAGE}" -c "${RAY_START_CMD}" From 48661d275fb44b969112a7bd8586dfd9f498e2e3 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 11 Dec 2025 18:24:20 -0600 Subject: [PATCH 60/67] [CI/Build][AMD] Skip tests in test_fusions_e2e and test_dbo_dp_ep_gsm8k that require non-existing imports for ROCm (#30417) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/compile/distributed/test_fusions_e2e.py | 26 ++++++++++++++++++- tests/v1/distributed/test_dbo.py | 2 ++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 75a81efedea3b..5379b5157b811 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -138,6 +138,17 @@ elif current_platform.is_rocm(): CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] +def has_cuda_graph_wrapper_metadata() -> bool: + from importlib import import_module + + try: + module = import_module("torch._inductor.utils") + module.CUDAGraphWrapperMetadata # noqa B018 + except AttributeError: + return False + return True + + @pytest.mark.parametrize( 
"model_name, model_kwargs, backend, matches, custom_ops", # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 @@ -145,7 +156,20 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] # quant_fp4 only has the custom impl + list(flat_product(MODELS_FP4, [""])), ) -@pytest.mark.parametrize("inductor_graph_partition", [True, False]) +@pytest.mark.parametrize( + "inductor_graph_partition", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + not has_cuda_graph_wrapper_metadata(), + reason="This test requires" + "torch._inductor.utils.CUDAGraphWrapperMetadata to run", + ), + ), + False, + ], +) def test_attn_quant( model_name: str, model_kwargs: dict[str, Any], diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py index f3a159762ea54..e5cbe1ce85e96 100644 --- a/tests/v1/distributed/test_dbo.py +++ b/tests/v1/distributed/test_dbo.py @@ -13,6 +13,7 @@ import torch from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k from tests.utils import RemoteOpenAIServer +from vllm.utils.import_utils import has_deep_ep # Detect Blackwell / B200 (compute capability 10.x) try: @@ -44,6 +45,7 @@ DEEPEP_BACKENDS = [ ] +@pytest.mark.skipif(not has_deep_ep(), reason="These tests require deep_ep to run") @pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS) @pytest.mark.xfail( IS_BLACKWELL, From 0ab23c2b2be1cdbde41b824186f57343f102e306 Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:00:58 -0800 Subject: [PATCH 61/67] [fix] fix SM check for Flashinfer TRTLLM MOE (#30314) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> --- .../layers/quantization/utils/flashinfer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 00c2720a34875..ba3653e4b5ea7 100644 --- 
a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -290,7 +290,7 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: if flashinfer_moe_backend in backend_map: if ( flashinfer_moe_backend == "latency" - and not current_platform.is_device_capability(100) + and not current_platform.has_device_capability(100) ): logger.info_once( "Flashinfer TRTLLM MOE backend is only supported on " From ba809266818cfb9e63bcb34d79fdd77af6e308fe Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 11 Dec 2025 19:02:19 -0600 Subject: [PATCH 62/67] [CI/Build][AMD] Skip test_cutlass_w4a8_moe tests on ROCm sine they require cutlass_pack_scale_fp8 (#30508) Signed-off-by: Randall Smith Signed-off-by: Michael Goin Signed-off-by: mgoin Co-authored-by: Randall Smith Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/quantization/test_cutlass_w4a8_moe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py index 3560402a29e90..a855f7333b617 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py +++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py @@ -18,7 +18,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types -IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9 +IS_SUPPORTED_BY_GPU = ( + current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9 +) def to_fp8(tensor: torch.Tensor) -> torch.Tensor: From b5945d49c08b66658110fa1c63e55fde66fcfad7 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 11 Dec 2025 19:37:24 -0600 Subject: [PATCH 63/67] [ROCm][CI] Use mi325_4 agent pool for V1 e2e tests (#30526) 
Signed-off-by: Andreas Karatzas --- .buildkite/test-amd.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 4e957634e7b47..c7d460be6e2b5 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -326,10 +326,10 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py -- label: V1 Test e2e + engine # 30min - timeout_in_minutes: 45 +- label: V1 Test e2e + engine # 65min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 + agent_pool: mi325_4 # grade: Blocking source_file_dependencies: - vllm/ From 042da732445f5cef93cb83e1045333544e61a0a1 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 11 Dec 2025 20:54:12 -0500 Subject: [PATCH 64/67] [Core] Refactor `_build_attention_metadata` (#29628) Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 248 ++++++++++++++--------------- 1 file changed, 123 insertions(+), 125 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 40c8059f90d34..3f20296c27ba7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1534,28 +1534,13 @@ class GPUModelRunner( """ :return: tuple[attn_metadata, spec_decode_common_attn_metadata] """ + # Attention metadata is not needed for attention free models + if len(self.kv_cache_config.kv_cache_groups) == 0: + return {}, None + num_tokens_padded = num_tokens_padded or num_tokens num_reqs_padded = num_reqs_padded or num_reqs - - logits_indices_padded = None - num_logits_indices = None - if logits_indices is not None: - num_logits_indices = logits_indices.size(0) - if self.cache_config.kv_sharing_fast_prefill: - logits_indices_padded = self._prepare_kv_sharing_fast_prefill( - logits_indices - ) - - # update seq_lens of decode reqs under DCP. 
- if self.dcp_world_size > 1: - self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens( - self.seq_lens.cpu[:num_reqs], - self.dcp_world_size, - self.dcp_rank, - self.parallel_config.cp_kv_cache_interleave_size, - ) - self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0) - self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded) + assert num_reqs_padded is not None and num_tokens_padded is not None attn_metadata: PerLayerAttnMetadata = {} if ubatch_slices is not None: @@ -1576,36 +1561,12 @@ class GPUModelRunner( self.num_accepted_tokens.np[num_reqs:].fill(1) self.num_accepted_tokens.copy_to_gpu() - # Used in the below loop, uses padded shapes - query_start_loc = self.query_start_loc.gpu[: num_reqs_padded + 1] - query_start_loc_cpu = self.query_start_loc.cpu[: num_reqs_padded + 1] - seq_lens = self.seq_lens.gpu[:num_reqs_padded] - seq_lens_cpu = self.seq_lens.cpu[:num_reqs_padded] - num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[ - :num_reqs_padded - ] + kv_cache_groups = self.kv_cache_config.kv_cache_groups - dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None - if self.dcp_world_size > 1: - dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded] - dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs_padded] - - spec_decode_common_attn_metadata = None - - # Prepare the attention metadata for each KV cache group and make layers - # in the same group share the same metadata. - for kv_cache_gid, kv_cache_group in enumerate( - self.kv_cache_config.kv_cache_groups - ): - encoder_seq_lens, encoder_seq_lens_cpu = self._get_encoder_seq_lens( - num_scheduled_tokens or {}, - kv_cache_group.kv_cache_spec, - num_reqs_padded, - ) - - if isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec): - # Encoder-only layers do not have KV cache, so we need to - # create a dummy block table and slot mapping for them. 
+ def _get_block_table_and_slot_mapping(kv_cache_gid: int): + assert num_reqs_padded is not None and num_tokens_padded is not None + kv_cache_spec = kv_cache_groups[kv_cache_gid].kv_cache_spec + if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec): blk_table_tensor = torch.zeros( (num_reqs_padded, 1), dtype=torch.int32, @@ -1621,92 +1582,129 @@ class GPUModelRunner( blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded) slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded] - # Fill unused with -1. Needed for reshape_and_cache in full cuda - # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID - slot_mapping[num_tokens:num_tokens_padded].fill_(-1) - blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1) + # Fill unused with -1. Needed for reshape_and_cache in full cuda + # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID + slot_mapping[num_tokens:num_tokens_padded].fill_(-1) + blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1) - common_attn_metadata = CommonAttentionMetadata( - query_start_loc=query_start_loc, - query_start_loc_cpu=query_start_loc_cpu, - seq_lens=seq_lens, - _seq_lens_cpu=seq_lens_cpu, - _num_computed_tokens_cpu=num_computed_tokens_cpu, - num_actual_tokens=num_tokens_padded, - num_reqs=num_reqs_padded, - max_query_len=max_query_len, - max_seq_len=max_seq_len, - block_table_tensor=blk_table_tensor, - slot_mapping=slot_mapping, - logits_indices_padded=logits_indices_padded, - num_logits_indices=num_logits_indices, - causal=True, - encoder_seq_lens=encoder_seq_lens, - encoder_seq_lens_cpu=encoder_seq_lens_cpu, - dcp_local_seq_lens=dcp_local_seq_lens, - dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu, + return blk_table_tensor, slot_mapping + + block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0) + cm_base = CommonAttentionMetadata( + query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1], + query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1], + 
seq_lens=self.seq_lens.gpu[:num_reqs_padded], + _seq_lens_cpu=self.seq_lens.cpu[:num_reqs_padded], + _num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[ + :num_reqs_padded + ], + num_reqs=num_reqs_padded, + num_actual_tokens=num_tokens_padded, + max_query_len=max_query_len, + max_seq_len=max_seq_len, + block_table_tensor=block_table_gid_0, + slot_mapping=slot_mapping_gid_0, + causal=True, + ) + + if self.dcp_world_size > 1: + self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens( + self.seq_lens.cpu[:num_reqs], + self.dcp_world_size, + self.dcp_rank, + self.parallel_config.cp_kv_cache_interleave_size, ) + self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0) + self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded) + + cm_base.dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded] + cm_base.dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[ + :num_reqs_padded + ] + + if logits_indices is not None and self.cache_config.kv_sharing_fast_prefill: + cm_base.num_logits_indices = logits_indices.size(0) + cm_base.logits_indices_padded = self._prepare_kv_sharing_fast_prefill( + logits_indices + ) + + def _build_attn_group_metadata( + kv_cache_gid: int, + attn_gid: int, + common_attn_metadata: CommonAttentionMetadata, + ubid: int | None = None, + ) -> None: + attn_group = self.attn_groups[kv_cache_gid][attn_gid] + cascade_attn_prefix_len = ( + cascade_attn_prefix_lens[kv_cache_gid][attn_gid] + if cascade_attn_prefix_lens + else 0 + ) + + builder = attn_group.get_metadata_builder(ubid or 0) + extra_attn_metadata_args = {} + if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder): + assert ubid is None, "UBatching not supported with GDN yet" + extra_attn_metadata_args = dict( + num_accepted_tokens=self.num_accepted_tokens.gpu[:num_reqs_padded], + num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[ + :num_reqs_padded + ], + ) + + if for_cudagraph_capture: + attn_metadata_i = builder.build_for_cudagraph_capture( + 
common_attn_metadata + ) + else: + attn_metadata_i = builder.build( + common_prefix_len=cascade_attn_prefix_len, + common_attn_metadata=common_attn_metadata, + **extra_attn_metadata_args, + ) + + if ubid is None: + assert isinstance(attn_metadata, dict) + attn_metadata_dict = attn_metadata + else: + assert isinstance(attn_metadata, list) + attn_metadata_dict = attn_metadata[ubid] + + for layer_name in attn_group.layer_names: + attn_metadata_dict[layer_name] = attn_metadata_i + + # Prepare the attention metadata for each KV cache group and make layers + # in the same group share the same metadata. + spec_decode_common_attn_metadata = None + for kv_cache_gid, kv_cache_group in enumerate(kv_cache_groups): + cm = copy(cm_base) # shallow copy + + # Basically only the encoder seq_lens, block_table and slot_mapping change + # for each kv_cache_group. + cm.encoder_seq_lens, cm.encoder_seq_lens_cpu = self._get_encoder_seq_lens( + num_scheduled_tokens or {}, + kv_cache_group.kv_cache_spec, + num_reqs_padded, + ) + if kv_cache_gid > 0: + cm.block_table_tensor, cm.slot_mapping = ( + _get_block_table_and_slot_mapping(kv_cache_gid) + ) if self.speculative_config and spec_decode_common_attn_metadata is None: if isinstance(self.drafter, EagleProposer): if self.drafter.attn_layer_names[0] in kv_cache_group.layer_names: - spec_decode_common_attn_metadata = common_attn_metadata + spec_decode_common_attn_metadata = cm else: - spec_decode_common_attn_metadata = common_attn_metadata - - for attn_gid, attn_group in enumerate(self.attn_groups[kv_cache_gid]): - cascade_attn_prefix_len = ( - cascade_attn_prefix_lens[kv_cache_gid][attn_gid] - if cascade_attn_prefix_lens - else 0 - ) - builder = attn_group.get_metadata_builder() - - extra_attn_metadata_args = {} - if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder): - extra_attn_metadata_args = dict( - num_accepted_tokens=self.num_accepted_tokens.gpu[ - :num_reqs_padded - ], - 
num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[ - :num_reqs_padded - ], - ) + spec_decode_common_attn_metadata = cm + for attn_gid in range(len(self.attn_groups[kv_cache_gid])): if ubatch_slices is not None: - common_attn_metadata_list = split_attn_metadata( - ubatch_slices, common_attn_metadata - ) - for ubid, common_attn_metadata in enumerate( - common_attn_metadata_list - ): - builder = attn_group.get_metadata_builder(ubatch_id=ubid) - if for_cudagraph_capture: - attn_metadata_i = builder.build_for_cudagraph_capture( - common_attn_metadata - ) - else: - attn_metadata_i = builder.build( - common_prefix_len=cascade_attn_prefix_len, - common_attn_metadata=common_attn_metadata, - ) - for layer_name in kv_cache_group.layer_names: - assert type(attn_metadata) is list - attn_metadata[ubid][layer_name] = attn_metadata_i + for ubid, _cm in enumerate(split_attn_metadata(ubatch_slices, cm)): + _build_attn_group_metadata(kv_cache_gid, attn_gid, _cm, ubid) + else: - assert isinstance(attn_metadata, dict) - if for_cudagraph_capture: - attn_metadata_i = builder.build_for_cudagraph_capture( - common_attn_metadata - ) - else: - attn_metadata_i = builder.build( - common_prefix_len=cascade_attn_prefix_len, - common_attn_metadata=common_attn_metadata, - **extra_attn_metadata_args, - ) - for layer_name in attn_group.layer_names: - attn_metadata[layer_name] = attn_metadata_i + _build_attn_group_metadata(kv_cache_gid, attn_gid, cm) if self.is_mm_prefix_lm: req_doc_ranges = {} From f355ad5412bc414a2a55f55481cb4aa1d909b4a3 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Fri, 12 Dec 2025 02:09:25 +0000 Subject: [PATCH 65/67] [CPU][FIX] Fix build failures on Arm CPUs with torch nightly (#30481) Signed-off-by: Fadi Arafeh --- cmake/utils.cmake | 23 ++++++++++++++--------- vllm/platforms/cpu.py | 14 ++++++++++---- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 
5047c354ff7d2..bdb2ba74d944d 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR) run_python(_VLLM_TORCH_GOMP_PATH " import os, glob -try: - import torch - torch_pkg = os.path.dirname(torch.__file__) - site_root = os.path.dirname(torch_pkg) - torch_libs = os.path.join(site_root, 'torch.libs') - print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0]) -except: - print('') +import torch +torch_pkg = os.path.dirname(torch.__file__) +site_root = os.path.dirname(torch_pkg) + +# Search both torch.libs and torch/lib +roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')] +candidates = [] +for root in roots: + if not os.path.isdir(root): + continue + candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*'))) + +print(candidates[0] if candidates else '') " - "failed to probe torch.libs for libgomp") + "failed to probe for libgomp") if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}") return() diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index a49b6e92df00d..d961dcf13e53e 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -325,10 +325,16 @@ class CpuPlatform(Platform): # We need to find the location of PyTorch's libgomp torch_pkg = os.path.dirname(torch.__file__) site_root = os.path.dirname(torch_pkg) - torch_libs = os.path.join(site_root, "torch.libs") - pytorch_libgomp_so_candidates = glob.glob( - os.path.join(torch_libs, "libgomp-*.so*") - ) + # Search both torch.libs and torch/lib - See: https://github.com/vllm-project/vllm/issues/30470 + torch_libs_paths = [ + os.path.join(site_root, "torch.libs"), + os.path.join(torch_pkg, "lib"), + ] + pytorch_libgomp_so_candidates = [] + for torch_libs in torch_libs_paths: + pytorch_libgomp_so_candidates.extend( + glob.glob(os.path.join(torch_libs, "libgomp*.so*")) + ) if pytorch_libgomp_so_candidates: pytorch_libgomp_so = pytorch_libgomp_so_candidates[0] if 
ld_preload_str: From 6a6fc41c799916521b1fa2914f72e108352e1bf6 Mon Sep 17 00:00:00 2001 From: Bhanu Prakash Voutharoja <59905694+Bhanu068@users.noreply.github.com> Date: Fri, 12 Dec 2025 13:27:22 +1100 Subject: [PATCH 66/67] gptq marlin quantization support for fused moe with lora (#30254) Signed-off-by: Bhanu068 --- csrc/moe/marlin_moe_wna16/ops.cu | 2 +- .../model_executor/layers/fused_moe/config.py | 36 ++++++ .../layers/quantization/gptq_marlin.py | 110 +++++++++++++++++- 3 files changed, 146 insertions(+), 2 deletions(-) diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu index 27b6ffaa67176..4fd8fc5c54202 100644 --- a/csrc/moe/marlin_moe_wna16/ops.cu +++ b/csrc/moe/marlin_moe_wna16/ops.cu @@ -860,4 +860,4 @@ torch::Tensor moe_wna16_marlin_gemm( TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm); -} +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 5eb6bc4829adf..a9a2990ca2b53 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -543,6 +543,42 @@ def int8_w8a8_moe_quant_config( ) +def gptq_marlin_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + weight_bits: int, + group_size: int, + w1_zp: torch.Tensor | None = None, + w2_zp: torch.Tensor | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +): + """ + Construct a quant config for gptq marlin quantization. 
+ """ + from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape + + w_shape = None if group_size == -1 else GroupShape(row=1, col=group_size) + + # Activations are NOT quantized for GPTQ (fp16/bf16) + a_shape = w_shape # Same as weight shape for alignment + + # Determine weight dtype + if weight_bits == 4: + weight_dtype = "int4" + elif weight_bits == 8: + weight_dtype = torch.int8 + else: + raise ValueError(f"Unsupported weight_bits: {weight_bits}") + + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(dtype=None, shape=a_shape), + _a2=FusedMoEQuantDesc(dtype=None, shape=a_shape), + _w1=FusedMoEQuantDesc(weight_dtype, w_shape, w1_scale, None, w1_zp, w1_bias), + _w2=FusedMoEQuantDesc(weight_dtype, w_shape, w2_scale, None, w2_zp, w2_bias), + ) + + def mxfp4_w4a16_moe_quant_config( w1_scale: Union[torch.Tensor, "PrecisionConfig"], w2_scale: Union[torch.Tensor, "PrecisionConfig"], diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 8d1715f52f097..6e5dcfe59b2f9 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -732,6 +732,14 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): is_a_8bit=is_a_8bit, ) replace_parameter(layer, "w2_qweight", marlin_w2_qweight) + + # The modular kernel expects w13_weight and w2_weight, + # but GPTQ uses w13_qweight and w2_qweight + # Alias for modular kernel + layer.w13_weight = layer.w13_qweight + # Alias for modular kernel + layer.w2_weight = layer.w2_qweight + # Repack scales marlin_w13_scales = marlin_moe_permute_scales( s=layer.w13_scales, @@ -782,7 +790,107 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - return None + from vllm.model_executor.layers.fused_moe.config import ( + gptq_marlin_moe_quant_config, + ) + + return gptq_marlin_moe_quant_config( + 
w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + weight_bits=self.quant_config.weight_bits, + group_size=self.quant_config.group_size, + w1_zp=getattr(layer, "w13_qzeros", None) + if not self.quant_config.is_sym + else None, + w2_zp=getattr(layer, "w2_qzeros", None) + if not self.quant_config.is_sym + else None, + w1_bias=getattr(layer, "w13_bias", None), + w2_bias=getattr(layer, "w2_bias", None), + ) + + def select_gemm_impl( + self, + prepare_finalize, + layer: torch.nn.Module, + ): + """ + Select the GEMM implementation for GPTQ-Marlin MoE. + + Returns MarlinExperts configured for GPTQ quantization. + This is ONLY used when LoRA is enabled. + Without LoRA, GPTQ uses its own apply() method. + """ + # Only use modular kernels when LoRA is enabled + # Without LoRA, GPTQ's own apply() method works fine and is more efficient + if not self.moe.is_lora_enabled: + raise NotImplementedError( + "GPTQ-Marlin uses its own apply() method when LoRA is not enabled. " + "Modular kernels are only used for LoRA support." + ) + + # The modular marlin kernels do not support 8-bit weights. + if self.quant_config.weight_bits == 8: + raise NotImplementedError( + "GPTQ-Marlin kernel does not support 8-bit weights." 
+ ) + + from vllm.model_executor.layers.fused_moe import modular_kernel as mk + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + BatchedMarlinExperts, + MarlinExperts, + ) + + # Ensure quant config is initialized + assert self.moe_quant_config is not None, ( + "moe_quant_config must be initialized before select_gemm_impl" + ) + + w13_g_idx = ( + getattr(layer, "w13_g_idx", None) if self.quant_config.desc_act else None + ) + w2_g_idx = ( + getattr(layer, "w2_g_idx", None) if self.quant_config.desc_act else None + ) + w13_g_idx_sort_indices = ( + getattr(layer, "w13_g_idx_sort_indices", None) + if self.quant_config.desc_act + else None + ) + w2_g_idx_sort_indices = ( + getattr(layer, "w2_g_idx_sort_indices", None) + if self.quant_config.desc_act + else None + ) + + # Check if using batched expert format (for Expert Parallelism) + if ( + prepare_finalize.activation_format + == mk.FusedMoEActivationFormat.BatchedExperts + ): + # For batched format, use BatchedMarlinExperts + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens_per_rank is not None + return BatchedMarlinExperts( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, + w13_g_idx=w13_g_idx, + w2_g_idx=w2_g_idx, + w13_g_idx_sort_indices=w13_g_idx_sort_indices, + w2_g_idx_sort_indices=w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) + else: + # Standard Marlin experts for GPTQ + return MarlinExperts( + quant_config=self.moe_quant_config, + w13_g_idx=w13_g_idx, + w2_g_idx=w2_g_idx, + w13_g_idx_sort_indices=w13_g_idx_sort_indices, + w2_g_idx_sort_indices=w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) def apply( self, From 9f2fc16a6903f8988515ce2560d3ef0850809c42 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 11 Dec 2025 21:53:57 -0500 Subject: [PATCH 67/67] [Bugfix][Model] Fix Afmoe rope_parameters issue (#30505) Signed-off-by: mgoin Signed-off-by: 
Michael Goin Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 5 +---- vllm/model_executor/models/afmoe.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 020cb749341a6..18056a9657e82 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -173,10 +173,7 @@ class _HfExamplesInfo: _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] - "AfmoeForCausalLM": _HfExamplesInfo( - "arcee-ai/Trinity-Nano", - is_available_online=False, - ), + "AfmoeForCausalLM": _HfExamplesInfo("arcee-ai/Trinity-Nano-Preview"), "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True), diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 3ced52c2050d6..f5dfe43067414 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -242,7 +242,7 @@ class AfmoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, max_position=max_position_embeddings, - rope_parameters=config["rope_parameters"], + rope_parameters=config.rope_parameters, is_neox_style=True, ) else: