From b7d85cf25c0a6699eb493e595ec44923ffce21b1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 5 Dec 2025 13:03:45 +0000 Subject: [PATCH 001/133] [CI] Have pre-commit comment on a PR if pre-commit was not used (#30077) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/mergify.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 997a40e18e588..5cb9fcdf9f86a 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -14,6 +14,38 @@ pull_request_rules: comment: message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/" +- name: comment-pre-commit-failure + description: Comment on PR when pre-commit check fails + conditions: + - status-failure=pre-commit + - -closed + - -draft + actions: + comment: + message: | + Hi @{{author}}, the pre-commit checks have failed. Please run: + + ```bash + uv pip install pre-commit + pre-commit install + pre-commit run --all-files + ``` + + Then, commit the changes and push to your branch. + + For future commits, `pre-commit` will run automatically on changed files before each commit. + +- name: comment-dco-failure + description: Comment on PR when DCO check fails + conditions: + - status-failure=dco + - -closed + - -draft + actions: + comment: + message: | + Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this. + - name: label-ci-build description: Automatically apply ci/build label conditions: From 9843e332da56307597deb2739bb83b85c18c5dde Mon Sep 17 00:00:00 2001 From: Elham Date: Fri, 5 Dec 2025 08:09:20 -0500 Subject: [PATCH 002/133] [CPU][Perf] Add fast vectorized exp impl from Arm Optimized Routines (#30068) Signed-off-by: Ubuntu Signed-off-by: Elham Harirpoush Co-authored-by: Ubuntu --- csrc/cpu/cpu_attn_impl.hpp | 13 ---------- csrc/cpu/cpu_attn_macros.h | 50 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 98f55d7c014be..02164ed3666e3 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -1246,14 +1246,8 @@ class AttentionMainLoop { // rescale sum and partial outputs if (need_rescale) { // compute rescale factor -#ifdef DEFINE_FAST_EXP - vec_op::FP32Vec16 rescale_factor_vec(rescale_factor); - rescale_factor_vec = fast_exp(rescale_factor_vec); - rescale_factor = rescale_factor_vec.get_last_elem(); -#else rescale_factor = std::exp(rescale_factor); vec_op::FP32Vec16 rescale_factor_vec(rescale_factor); -#endif // rescale sum new_sum_val += rescale_factor * init_sum_val; @@ -1889,15 +1883,8 @@ class AttentionMainLoop { : curr_output_buffer; float rescale_factor = final_max > curr_max ? curr_max - final_max : final_max - curr_max; - -#ifdef DEFINE_FAST_EXP - vec_op::FP32Vec16 rescale_factor_vec(rescale_factor); - rescale_factor_vec = fast_exp(rescale_factor_vec); - rescale_factor = rescale_factor_vec.get_last_elem(); -#else rescale_factor = std::exp(rescale_factor); vec_op::FP32Vec16 rescale_factor_vec(rescale_factor); -#endif local_sum[head_idx] = final_max > curr_max ? final_sum + rescale_factor * curr_sum diff --git a/csrc/cpu/cpu_attn_macros.h b/csrc/cpu/cpu_attn_macros.h index 6458e43419370..35716a0790ab3 100644 --- a/csrc/cpu/cpu_attn_macros.h +++ b/csrc/cpu/cpu_attn_macros.h @@ -60,4 +60,54 @@ #endif +#ifdef __aarch64__ + // Implementation copied from Arm Optimized Routines (expf AdvSIMD) + // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c + #include + #define DEFINE_FAST_EXP \ + const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); \ + const float ln2_hi = 0x1.62e4p-1f; \ + const float ln2_lo = 0x1.7f7d1cp-20f; \ + const float c0 = 0x1.0e4020p-7f; \ + const float c2 = 0x1.555e66p-3f; \ + const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2}; \ + const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000); \ + const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f); \ + const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f); \ + const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f); \ + const float32x4_t pos_special_bound = vdupq_n_f32(0x1.5d5e2ap+6f); \ + const float32x4_t neg_special_bound = vnegq_f32(pos_special_bound); \ + const float32x4_t inf = \ + vdupq_n_f32(std::numeric_limits::infinity()); \ + const float32x4_t zero = vdupq_n_f32(0.0f); \ + auto neon_expf = [&](float32x4_t values) __attribute__((always_inline)) { \ + float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); \ + float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0); \ + r = vfmsq_laneq_f32(r, n, ln2_c02, 1); \ + uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \ + float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias)); \ + float32x4_t r2 = vmulq_f32(r, r); \ + float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2); \ + float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3); \ + q = vfmaq_f32(q, p, r2); \ + p = vmulq_f32(c4, r); \ + float32x4_t poly = vfmaq_f32(p, q, r2); \ + poly = vfmaq_f32(scale, poly, scale); \ + const uint32x4_t hi_mask = vcgeq_f32(values, pos_special_bound); \ + const uint32x4_t lo_mask = vcleq_f32(values, neg_special_bound); \ + poly = vbslq_f32(hi_mask, inf, poly); \ + return vbslq_f32(lo_mask, zero, poly); \ + }; \ + auto fast_exp = [&](vec_op::FP32Vec16& vec) \ + __attribute__((always_inline)) { \ + float32x4x4_t result; \ + result.val[0] = neon_expf(vec.reg.val[0]); \ + result.val[1] = neon_expf(vec.reg.val[1]); \ + result.val[2] = neon_expf(vec.reg.val[2]); \ + result.val[3] = neon_expf(vec.reg.val[3]); \ + return vec_op::FP32Vec16(result); \ + }; + +#endif // __aarch64__ + #endif \ No newline at end of file From 0d8a7d8a264354ed53f822821b29cb0485bfa70f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 5 Dec 2025 22:02:09 +0800 Subject: [PATCH 003/133] [Compressed Tensors] Add XPU `wNa16` support (#29484) Signed-off-by: yiliu30 --- .../scripts/hardware_ci/run-xpu-test.sh | 1 + .../kernels/mixed_precision/__init__.py | 4 + .../kernels/mixed_precision/xpu.py | 97 +++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 4d163399cfc6c..1d5dba3f26f5a 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -38,6 +38,7 @@ docker run \ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager cd tests pytest -v -s v1/core diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index 0cf3f12af5522..c4160157cd628 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -30,6 +30,9 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKer MPLinearKernel, MPLinearLayerConfig, ) +from vllm.model_executor.layers.quantization.kernels.mixed_precision.xpu import ( # noqa: E501 + XPUwNa16LinearKernel, +) from vllm.platforms import current_platform # in priority/performance order (when available) @@ -42,6 +45,7 @@ _POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [ BitBLASLinearKernel, ConchLinearKernel, ExllamaLinearKernel, + XPUwNa16LinearKernel, ] diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py new file mode 100644 index 0000000000000..abd2e047aeed0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import torch + +from vllm.platforms import current_platform + +from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig + + +class XPUwNa16LinearKernel(MPLinearKernel): + @classmethod + def get_min_capability(cls) -> int: + return 0 + + @classmethod + def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]: + if not current_platform.is_xpu(): + return False, "IPEX wNa16 only supported on XPU/CPU devices" + + # TODO: (yiliu30) relax these restrictions in later PRs + if c.zero_points: + return False, "Zero points not supported for Now" + + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + from packaging import version + + MIN_IPEX_VERSION = "2.6.0" + bias = layer.bias if not layer.skip_bias_add else None + + try: + import intel_extension_for_pytorch as ipex + + if version.parse(ipex.__version__) < version.parse(MIN_IPEX_VERSION): + raise ImportError( + "intel_extension_for_pytorch version is " + "wrong. Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}." + ) + except ImportError as err: + raise ImportError( + "Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via " + f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`" + " to use IPEX-AWQ linear method." + ) from err + # Using the compute dtype (lowp_mode) as INT8 to leverage instructions + # with better performance. + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + # The weight will be de-packed from INT4 to INT8. + weight_dtype = ipex.quantization.WoqWeightDtype.INT4 + # The float activation will be quantized (dynamic, per-token) to INT8. + act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH + + qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + act_quant_mode=act_quant_mode, + group_size=self.config.group_size, + weight_qscheme=ipex.quantization.WoqWeightQScheme.SYMMETRIC, + ) + qweight = layer.weight_packed + g_idx = layer.weight_g_idx if self.config.has_g_idx else None + scales = layer.weight_scale + qzeros = None + if self.config.zero_points: + qzeros = layer.weight_zero_point.contiguous() + qweight = qweight.t().contiguous() + scales = scales.t().contiguous() + layer.ipex_output_size = self.config.partition_weight_shape[1] + layer.ipex_qlinear = ( + ipex.llm.quantization.woq_linear.IPEXWeightOnlyQuantizedLinear.from_weight( + qweight, + scales, + qzeros, + in_features=self.config.partition_weight_shape[0], + out_features=self.config.partition_weight_shape[1], + qconfig=qconfig, + g_idx=g_idx, + bias=bias, + group_size=self.config.group_size, + quant_method=0, # `0` stands for the IPEX GPTQ + ) + ) + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + reshaped_x = x.reshape(-1, x.shape[-1]) + out = layer.ipex_qlinear(reshaped_x) + return out.reshape(x.shape[:-1] + (layer.ipex_output_size,)) From 2c174420f5184256159e3d1acfe4184f3f70083e Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Fri, 5 Dec 2025 09:02:49 -0500 Subject: [PATCH 004/133] Reduce validation to a warning (#28749) Signed-off-by: Alec Solder Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Alec Solder Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/structured_outputs.py | 16 ---------------- vllm/reasoning/abs_reasoning_parsers.py | 5 ++++- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py index 1672c1d7c9a09..8c060c816fd15 100644 --- a/vllm/config/structured_outputs.py +++ b/vllm/config/structured_outputs.py @@ -65,22 +65,6 @@ class StructuredOutputsConfig: @model_validator(mode="after") def _validate_structured_output_config(self) -> Self: - # Import here to avoid circular import - from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager - - if self.reasoning_parser_plugin and len(self.reasoning_parser_plugin) > 3: - ReasoningParserManager.import_reasoning_parser(self.reasoning_parser_plugin) - - valid_reasoning_parsers = ReasoningParserManager.list_registered() - if ( - self.reasoning_parser != "" - and self.reasoning_parser not in valid_reasoning_parsers - ): - raise ValueError( - f"invalid reasoning parser: {self.reasoning_parser} " - f"(chose from {{ {','.join(valid_reasoning_parsers)} }})" - ) - if self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance"): raise ValueError( "disable_any_whitespace is only supported for " diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 5c6ac7dad9930..d0661d1f23b06 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -160,7 +160,10 @@ class ReasoningParserManager: if name in cls.lazy_parsers: return cls._load_lazy_parser(name) - raise KeyError(f"Reasoning parser '{name}' not found.") + registered = ", ".join(cls.list_registered()) + raise KeyError( + f"Reasoning parser '{name}' not found. Available parsers: {registered}" + ) @classmethod def list_registered(cls) -> list[str]: From 949a6a19d26f2179c0e5fc5d94f7d033e4c6a695 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 5 Dec 2025 14:52:45 +0000 Subject: [PATCH 005/133] [NIXL] Add compatibility checking to NIXL KV connector handshake (#29503) Signed-off-by: Mark McLoughlin --- .../kv_connector/unit/test_nixl_connector.py | 224 +++++++++++++++++- tests/v1/kv_connector/unit/utils.py | 10 +- .../kv_connector/v1/nixl_connector.py | 172 +++++++++++++- 3 files changed, 380 insertions(+), 26 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index b7d7a10057b8b..ae4125d54190c 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -9,8 +9,10 @@ import textwrap import time import uuid from collections import defaultdict -from unittest.mock import patch +from typing import Any +from unittest.mock import MagicMock, patch +import msgspec import pytest import ray import torch @@ -18,6 +20,7 @@ import torch from vllm import LLM from vllm.config import KVTransferConfig from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator +from vllm.distributed.kv_transfer.kv_connector.v1 import nixl_connector from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import ( MultiKVConnectorStats, @@ -29,7 +32,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( NixlConnectorMetadata, NixlConnectorScheduler, NixlConnectorWorker, + NixlHandshakePayload, NixlKVConnectorStats, + compute_nixl_compatibility_hash, ) from vllm.distributed.kv_transfer.kv_transfer_state import ( ensure_kv_transfer_shutdown, @@ -317,13 +322,19 @@ def test_kv_transfer_handshake(dist_init): } prefill_connector.register_kv_caches(kv_caches) - # Simulate EngineCore initialization that would - # gather connector metadata from all workers, the scheduler connector - # expects metadata to be in dict[int, KVConnectorHandshakeMetadata], - # where the first key is the dp_rank, the second key is the tp_rank. - metadata = {0: prefill_connector.get_handshake_metadata()} + # Simulate EngineCore initialization that would gather connector + # metadata from all workers + metadata = prefill_connector.get_handshake_metadata() + + # metadata is a NixlHandshakePayload, decode it to get NixlAgentMetadata + decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) + expected_agent_metadata = decoder.decode(metadata.agent_metadata_bytes) + + # The scheduler connector expects metadata to be in + # dict[int, KVConnectorHandshakeMetadata], where the first key is + # the dp_rank, the second key is the tp_rank. scheduler_connector = scheduler.get_kv_connector() - scheduler_connector.set_xfer_handshake_metadata(metadata) + scheduler_connector.set_xfer_handshake_metadata({0: metadata}) # Simulate a request that finishes prefill, which returns # corresponding NixlConnectorMetadata for decode instance. @@ -362,9 +373,9 @@ def test_kv_transfer_handshake(dist_init): ) received_metadata = mock_add_remote_agent.call_args.args + assert received_metadata[0] == expected_agent_metadata assert received_metadata[1] == 0 # remote_tp_rank assert received_metadata[2] == 1 # remote_tp_size - assert metadata[0] == received_metadata[0] # Need to shutdown the background thread to release NIXL side channel port scheduler_connector.shutdown() @@ -403,7 +414,6 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): device_id=0, num_blocks=1, block_lens=self.block_len_per_layer, - attn_backend_name=self.backend_name, # `self.kv_cache_layout` is only forced to HND when vllm engine # is started. We mock HND here. kv_cache_layout="HND", @@ -651,7 +661,6 @@ class TestNixlHandshake: device_id=0, num_blocks=1, block_lens=worker.block_len_per_layer, - attn_backend_name=worker.backend_name, kv_cache_layout=mismatched_layout, block_size=worker.block_size, ) @@ -706,7 +715,6 @@ class TestNixlHandshake: num_blocks=1, # prefill TP=1, decode TP=2, remote block_lens is double to local block_lens=[i * 2 for i in worker.block_len_per_layer], - attn_backend_name=worker.backend_name, kv_cache_layout="HND", block_size=worker.block_size, ) @@ -1168,6 +1176,9 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): mock_wrapper_instance = mock_nixl_wrapper.return_value connector.connector_worker.nixl_wrapper = mock_wrapper_instance + # Appease NixlHandshakePayload encoding with some bytes + mock_wrapper_instance.get_agent_metadata.return_value = b"fake_agent_metadata" + # Reassure the shutdown() check that the thread is terminated mock_thread.return_value.is_alive.return_value = False @@ -1534,3 +1545,194 @@ def test_transfer_setup_failure_returns_finished(dist_init): # ensure request appears in get_finished _, done_recving = connector.get_finished(finished_req_ids=set()) assert request_id in done_recving + + +@pytest.mark.parametrize( + "mismatch_type,config_overrides,version_override,should_fail,enforce_handshake_compat", + [ + ("vllm_version", {}, {"vllm_version": "0.6.1"}, True, True), + ("nixl_connector_version", {}, {"connector_version": 37}, True, True), + ("model_name", {"model": "facebook/opt-350m"}, {}, True, True), + ("dtype", {"dtype": "bfloat16"}, {}, True, True), + ("cache_dtype", {"cache_dtype": "fp8"}, {}, True, True), + ("num_kv_heads", {"hf_overrides": {"num_key_value_heads": 8}}, {}, True, True), + ( + "num_hidden_layers", + {"hf_overrides": {"num_hidden_layers": 24}}, + {}, + True, + True, + ), + ("hidden_size", {"hf_overrides": {"hidden_size": 1536}}, {}, True, True), + ("block_size", {"block_size": 8}, {}, False, True), + ("matching_config", {}, {}, False, True), + ("escape_hatch", {"model": "facebook/opt-350m"}, {}, False, False), + ], +) +@patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, +) +def test_compatibility_hash_validation( + dist_init, + mismatch_type, + config_overrides, + version_override, + should_fail, + enforce_handshake_compat, +): + """ + Test NIXL compatibility hash validation during handshake. + + Parameters: + mismatch_type: description of what is being tested + config_overrides: dict of config to override for the remote instance + version_override: version dict e.g. {"vllm_version": "0.6.1"} + should_fail: whether the handshake should fail + enforce_handshake_compat: whether to enforce compatibility checking + """ + local_vllm_config = create_vllm_config( + model="facebook/opt-125m", + block_size=16, + kv_connector_extra_config={ + "enforce_handshake_compat": enforce_handshake_compat + }, + ) + decode_connector = NixlConnector(local_vllm_config, KVConnectorRole.WORKER) + decode_worker = decode_connector.connector_worker + + remote_config_params: dict[str, Any] = { + "model": "facebook/opt-125m", + "block_size": 16, + **config_overrides, + } + remote_vllm_config = create_vllm_config(**remote_config_params) + + with contextlib.ExitStack() as stack: + if "vllm_version" in version_override: + stack.enter_context( + patch("vllm.__version__", version_override["vllm_version"]) + ) + elif "connector_version" in version_override: + stack.enter_context( + patch.object( + nixl_connector, + "NIXL_CONNECTOR_VERSION", + version_override["connector_version"], + ) + ) + remote_hash = compute_nixl_compatibility_hash( + remote_vllm_config, decode_worker.backend_name + ) + + prefill_block_size = config_overrides.get("block_size", 16) + prefill_metadata = NixlAgentMetadata( + engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + agent_metadata=FakeNixlWrapper.AGENT_METADATA, + kv_caches_base_addr=[0], + device_id=0, + num_blocks=1, + block_lens=[4096 * prefill_block_size], # slot_size * block_size + kv_cache_layout="HND", + block_size=prefill_block_size, + ) + handshake_payload = NixlHandshakePayload( + compatibility_hash=remote_hash, + agent_metadata_bytes=msgspec.msgpack.encode(prefill_metadata), + ) + + # Mock ZMQ socket to return our handshake payload + mock_socket = MagicMock() + mock_socket.recv.return_value = msgspec.msgpack.encode(handshake_payload) + + # Mock add_remote_agent to avoid actual NIXL operations + # Patch zmq_ctx to return our mock socket + with ( + patch.object(decode_worker, "add_remote_agent", return_value="fake_agent"), + patch.object(nixl_connector, "zmq_ctx") as mock_zmq_ctx, + ): + mock_zmq_ctx.return_value.__enter__.return_value = mock_socket + + if should_fail: + with pytest.raises(RuntimeError, match="compatibility hash mismatch"): + decode_worker._nixl_handshake( + host="localhost", + port=1234, + remote_tp_size=1, + expected_engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + ) + else: + result = decode_worker._nixl_handshake( + host="localhost", + port=1234, + remote_tp_size=1, + expected_engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + ) + # Verify handshake returned agent mapping + assert isinstance(result, dict) + assert len(result) == 1 + + +@pytest.mark.parametrize( + "error_scenario", + [ + "handshake_decode_error", + "handshake_validation_error", + "metadata_decode_error", + "metadata_validation_error", + ], +) +@patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, +) +def test_handshake_decode_errors(dist_init, error_scenario): + """ + Test that msgspec decode errors are properly handled during handshake. + + Tests both DecodeError and ValidationError for both decoders: + - NixlHandshakePayload decoder + - NixlAgentMetadata decoder + """ + local_vllm_config = create_vllm_config( + model="facebook/opt-125m", + block_size=16, + ) + decode_connector = NixlConnector(local_vllm_config, KVConnectorRole.WORKER) + decode_worker = decode_connector.connector_worker + + if error_scenario == "handshake_decode_error": + msg_bytes = b"this is not valid msgpack data" + elif error_scenario == "handshake_validation_error": + msg_bytes = msgspec.msgpack.encode({"wrong_field": "value"}) + elif error_scenario == "metadata_decode_error": + valid_handshake = NixlHandshakePayload( + compatibility_hash=decode_worker.compat_hash, + agent_metadata_bytes=b"invalid msgpack for metadata", + ) + msg_bytes = msgspec.msgpack.encode(valid_handshake) + + elif error_scenario == "metadata_validation_error": + valid_handshake = NixlHandshakePayload( + compatibility_hash=decode_worker.compat_hash, + agent_metadata_bytes=msgspec.msgpack.encode({"missing": "fields"}), + ) + msg_bytes = msgspec.msgpack.encode(valid_handshake) + else: + raise AssertionError(f"{error_scenario} not a valid scenario") + + mock_socket = MagicMock() + mock_socket.recv.return_value = msg_bytes + with ( + patch.object(decode_worker, "add_remote_agent", return_value="fake_agent"), + patch.object(nixl_connector, "zmq_ctx") as mock_zmq_ctx, + ): + mock_zmq_ctx.return_value.__enter__.return_value = mock_socket + + with pytest.raises(RuntimeError): + decode_worker._nixl_handshake( + host="localhost", + port=1234, + remote_tp_size=1, + expected_engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 98f1f44923b1c..cea41c3ab18a6 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -90,13 +90,18 @@ def create_vllm_config( max_model_len: int = 10000, enable_chunked_prefill: bool = True, enable_permute_local_kv: bool = False, + kv_connector_extra_config: dict[str, Any] | None = None, + dtype: str = "float16", + cache_dtype: str = "auto", + hf_overrides: dict[str, Any] | None = None, ) -> VllmConfig: """Initialize VllmConfig For Testing.""" model_config = ModelConfig( model=model, trust_remote_code=True, - dtype="float16", + dtype=dtype, seed=42, + hf_overrides=hf_overrides or {}, ) scheduler_config = SchedulerConfig( max_num_seqs=max_num_seqs, @@ -110,13 +115,14 @@ def create_vllm_config( block_size=block_size, gpu_memory_utilization=0.9, swap_space=0, - cache_dtype="auto", + cache_dtype=cache_dtype, enable_prefix_caching=True, ) kv_transfer_config = KVTransferConfig( kv_connector="NixlConnector", kv_role="kv_both", enable_permute_local_kv=enable_permute_local_kv, + kv_connector_extra_config=kv_connector_extra_config or {}, ) return VllmConfig( scheduler_config=scheduler_config, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 24b7599a4fe0c..49330abcec324 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -59,6 +59,21 @@ Transfer = tuple[int, float] # (xfer_handle, start_time) EngineId = str ReqId = str +# +# NIXL Connector Version +# +# Increment this version whenever there is an incompatible change to: +# - NixlAgentMetadata schema +# - kv_transfer_params schema or semantics +# - NIXL transfer protocol or wire format +# - KV cache memory layout or block organization +# - Any other change that breaks P/D interoperability +# +# Version History: +# 1: Initial version with compatibility checking +# +NIXL_CONNECTOR_VERSION: int = 1 + GET_META_MSG = b"get_meta_msg" logger = init_logger(__name__) @@ -97,18 +112,95 @@ _NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices()) @dataclass -class NixlAgentMetadata(KVConnectorHandshakeMetadata): +class NixlAgentMetadata: engine_id: str agent_metadata: bytes kv_caches_base_addr: list[int] device_id: int num_blocks: int block_lens: list[int] - attn_backend_name: str kv_cache_layout: str block_size: int +@dataclass +class NixlHandshakePayload(KVConnectorHandshakeMetadata): + """ + Wrapper for NIXL handshake sent over the wire. + + Enables two-phase decoding for graceful compatibility checking: + 1. Decode NixlHandshakePayload to get compatibility_hash + 2. Compute local hash and compare + 3. Only if hashes match, decode agent_metadata_bytes + + This prevents decoder errors when NixlAgentMetadata schema is + incompatible, allowing graceful failure with clear error message. + """ + + compatibility_hash: str + agent_metadata_bytes: bytes # NixlAgentMetadata encoded + + +def compute_nixl_compatibility_hash( + vllm_config: VllmConfig, attn_backend_name: str +) -> str: + """ + Compute compatibility hash for NIXL KV transfer. + + Hash only the factors that affect whether two NIXL instances can + successfully transfer KV cache data. + + Factors included: + - vLLM version and NIXL connector version + - Model architecture (name, dtype, KV heads, layers) + - KV cache format (dtype, sliding window) + - Attention backend + + Note: Factors like tensor_parallel_size, block_size, and kv_cache_layout + are validated at runtime in _validate_remote_agent_handshake and are not + included in this hash to support heterogeneous deployments. + + Note - the set of factors are likely to evolve significantly over + time to be more or less permissive. + + Returns: + SHA-256 hex digest + """ + from vllm import __version__ as vllm_version + from vllm.config.utils import hash_factors + + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + + factors = { + # Version compatibility + "vllm_version": vllm_version, + "nixl_connector_version": NIXL_CONNECTOR_VERSION, + # Model architecture - affects KV cache shape + "model": model_config.model, + "dtype": str(model_config.dtype), + "num_kv_heads": model_config.get_total_num_kv_heads(), + "head_size": model_config.get_head_size(), + "num_hidden_layers": model_config.get_total_num_hidden_layers(), + # Attention backend and KV cache dtype affect memory layout + "attn_backend_name": attn_backend_name, + "cache_dtype": str(cache_config.cache_dtype), + } + + compat_hash = hash_factors(factors) + logger.info( + "NIXL compatibility hash: %s (model=%s, dtype=%s, num_kv_heads=%d, " + "cache_dtype=%s, attn_backend=%s)", + compat_hash, + factors["model"], + factors["dtype"], + factors["num_kv_heads"], + factors["cache_dtype"], + attn_backend_name, + ) + return compat_hash + + @dataclass class ReqMeta: local_block_ids: list[int] @@ -396,14 +488,14 @@ class NixlConnectorScheduler: encoded_data: dict[int, bytes] = {} encoder = msgspec.msgpack.Encoder() for tp_rank, rank_metadata in metadata.items(): - if not isinstance(rank_metadata, NixlAgentMetadata): + if not isinstance(rank_metadata, NixlHandshakePayload): raise ValueError( - "NixlConnectorScheduler expects NixlAgentMetadata for " + "NixlConnectorScheduler expects NixlHandshakePayload for " "handshake metadata." ) encoded_data[tp_rank] = encoder.encode(rank_metadata) logger.debug( - "Tp rank %d: encoded NixlAgentMetadata size: %s bytes", + "Tp rank %d: encoded NixlHandshakePayload size: %s bytes", tp_rank, str(len(encoded_data[tp_rank])), ) @@ -794,7 +886,7 @@ class NixlConnectorWorker: self._failed_recv_reqs: set[ReqId] = set() # Handshake metadata of this worker for NIXL transfers. - self.xfer_handshake_metadata: NixlAgentMetadata | None = None + self.xfer_handshake_metadata: NixlHandshakePayload | None = None # Background thread for initializing new NIXL handshakes. self._handshake_initiation_executor = ThreadPoolExecutor( # NIXL is not guaranteed to be thread-safe, limit 1 worker. @@ -829,6 +921,13 @@ class NixlConnectorWorker: logger.debug("Detected attention backend %s", self.backend_name) logger.debug("Detected kv cache layout %s", self.kv_cache_layout) + self.compat_hash = compute_nixl_compatibility_hash( + self.vllm_config, self.backend_name + ) + self.enforce_compat_hash = self.kv_transfer_config.get_from_extra_config( + "enforce_handshake_compat", True + ) + self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size} self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size} # With heterogeneous TP, P must wait for all assigned D TP workers to @@ -877,14 +976,58 @@ class NixlConnectorWorker: # Set receive timeout to 5 seconds to avoid hanging on dead server sock.setsockopt(zmq.RCVTIMEO, 5000) # milliseconds sock.send(msg) - metadata_bytes = sock.recv() - decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) - metadata = decoder.decode(metadata_bytes) + handshake_bytes = sock.recv() + + # Decode handshake payload to get compatibility hash + handshake_decoder = msgspec.msgpack.Decoder(NixlHandshakePayload) + try: + handshake_payload = handshake_decoder.decode(handshake_bytes) + except (msgspec.DecodeError, msgspec.ValidationError) as e: + raise RuntimeError( + f"Failed to decode NixlHandshakePayload. This likely indicates " + f"an incompatibility between connector version. Error: {e}" + ) from e + got_metadata_time = time.perf_counter() logger.debug( "NIXL handshake: get metadata took: %s", got_metadata_time - start_time ) + # Check compatibility hash BEFORE decoding agent metadata + if ( + self.enforce_compat_hash + and handshake_payload.compatibility_hash != self.compat_hash + ): + raise RuntimeError( + f"NIXL compatibility hash mismatch. " + f"Local: {self.compat_hash}, " + f"Remote: {handshake_payload.compatibility_hash}. " + f"Prefill and decode instances have incompatible configurations. " + f"This may be due to: different vLLM versions, models, dtypes, " + f"KV cache layouts, attention backends, etc. " + f"Both instances must use identical configurations." + f"Disable this check using " + f'--kv-transfer-config \'{{"kv_connector_extra_config": ' + f'{{"enforce_handshake_compat": false}}}}\'' + ) + + logger.info( + "NIXL compatibility check passed (hash: %s)", + handshake_payload.compatibility_hash, + ) + + # Decode agent metadata + metadata_decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) + try: + metadata = metadata_decoder.decode( + handshake_payload.agent_metadata_bytes + ) + except (msgspec.DecodeError, msgspec.ValidationError) as e: + # This should not happen if hash matched + raise RuntimeError( + f"Failed to decode NixlAgentMetadata. Error: {e}" + ) from e + # Ensure engine id matches. if metadata.engine_id != expected_engine_id: raise RuntimeError( @@ -1175,19 +1318,24 @@ class NixlConnectorWorker: assert len(self.block_window_per_layer) == self.num_layers # After KV Caches registered, listen for new connections. - self.xfer_handshake_metadata = NixlAgentMetadata( + agent_metadata = NixlAgentMetadata( engine_id=self.engine_id, agent_metadata=self.nixl_wrapper.get_agent_metadata(), kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id], device_id=self.device_id, num_blocks=self.num_blocks, block_lens=self.block_len_per_layer, - attn_backend_name=self.backend_name, kv_cache_layout=self.kv_cache_layout if not self.use_host_buffer else self.host_buffer_kv_cache_layout, block_size=self.block_size, ) + # Wrap metadata in payload with hash for defensive decoding + encoder = msgspec.msgpack.Encoder() + self.xfer_handshake_metadata = NixlHandshakePayload( + compatibility_hash=self.compat_hash, + agent_metadata_bytes=encoder.encode(agent_metadata), + ) def register_local_xfer_handler( self, @@ -1402,8 +1550,6 @@ class NixlConnectorWorker: remote_engine_id = nixl_agent_meta.engine_id assert self._tp_size[remote_engine_id] == remote_tp_size - # TODO We may eventually want to skip enforcing the same attn backend. - assert nixl_agent_meta.attn_backend_name == self.backend_name tp_ratio = self.kv_topo.tp_ratio_from_engine_id(remote_engine_id) block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id( From da7bc54ea8f44a2dcacc4a9869721bd105006e10 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Fri, 5 Dec 2025 08:11:50 -0800 Subject: [PATCH 006/133] [responsesAPI][5] ResponsesParser with tools for full MCP python loop (#29798) Signed-off-by: Andrew Xia Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- .../openai_responses_client_with_tools.py | 2 +- .../test_response_api_parsable_context.py | 99 ++++++++++++++++++- vllm/entrypoints/context.py | 92 ++++++++++++++++- .../openai/parser/responses_parser.py | 34 +++++++ vllm/entrypoints/openai/serving_engine.py | 65 ++++++++++-- vllm/entrypoints/openai/serving_responses.py | 6 +- vllm/entrypoints/responses_utils.py | 21 +++- vllm/entrypoints/tool.py | 44 +++++++++ 8 files changed, 347 insertions(+), 16 deletions(-) diff --git a/examples/online_serving/openai_responses_client_with_tools.py b/examples/online_serving/openai_responses_client_with_tools.py index 276010197b5ab..c85c8cf807b49 100644 --- a/examples/online_serving/openai_responses_client_with_tools.py +++ b/examples/online_serving/openai_responses_client_with_tools.py @@ -3,7 +3,7 @@ """ Set up this example by starting a vLLM OpenAI-compatible server with tool call options enabled. -Reasoning models can be used through the Responses API as seen here +Reasoning models can be used through the Responses API as seen here https://platform.openai.com/docs/api-reference/responses For example: vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \ diff --git a/tests/entrypoints/openai/test_response_api_parsable_context.py b/tests/entrypoints/openai/test_response_api_parsable_context.py index 1b2795770d4c7..1899c5f04fe3f 100644 --- a/tests/entrypoints/openai/test_response_api_parsable_context.py +++ b/tests/entrypoints/openai/test_response_api_parsable_context.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +import json import pytest import pytest_asyncio @@ -13,12 +15,27 @@ MODEL_NAME = "Qwen/Qwen3-8B" @pytest.fixture(scope="module") def server(): - args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"] + assert importlib.util.find_spec("gpt_oss") is not None, ( + "Harmony tests require gpt_oss package to be installed" + ) + + args = [ + "--reasoning-parser", + "qwen3", + "--max_model_len", + "5000", + "--structured-outputs-config.backend", + "xgrammar", + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes", + "--tool-server", + "demo", + ] env_dict = dict( VLLM_ENABLE_RESPONSES_API_STORE="1", VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1", - # uncomment for tool calling - # PYTHON_EXECUTION_BACKEND="dangerously_use_uv", + PYTHON_EXECUTION_BACKEND="dangerously_use_uv", ) with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server: @@ -85,3 +102,79 @@ async def test_reasoning_and_function_items(client: OpenAI, model_name: str): assert response.output[0].type == "reasoning" assert response.output[1].type == "message" assert type(response.output[1].content[0].text) is str + + +def get_horoscope(sign): + return f"{sign}: Next Tuesday you will befriend a baby otter." + + +def call_function(name, args): + if name == "get_horoscope": + return get_horoscope(**args) + else: + raise ValueError(f"Unknown function: {name}") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_function_call_first_turn(client: OpenAI, model_name: str): + tools = [ + { + "type": "function", + "name": "get_horoscope", + "description": "Get today's horoscope for an astrological sign.", + "parameters": { + "type": "object", + "properties": { + "sign": {"type": "string"}, + }, + "required": ["sign"], + "additionalProperties": False, + }, + "strict": True, + } + ] + + response = await client.responses.create( + model=model_name, + input="What is the horoscope for Aquarius today?", + tools=tools, + temperature=0.0, + ) + assert response is not None + assert response.status == "completed" + assert len(response.output) == 2 + assert response.output[0].type == "reasoning" + assert response.output[1].type == "function_call" + + function_call = response.output[1] + assert function_call.name == "get_horoscope" + assert function_call.call_id is not None + + args = json.loads(function_call.arguments) + assert "sign" in args + + # the multi turn function call is tested above in + # test_reasoning_and_function_items + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_mcp_tool_call(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24? Use python to calculate the result.", + tools=[{"type": "code_interpreter", "container": {"type": "auto"}}], + temperature=0.0, + ) + + assert response is not None + assert response.status == "completed" + assert response.output[0].type == "reasoning" + assert response.output[1].type == "mcp_call" + assert type(response.output[1].arguments) is str + assert type(response.output[1].output) is str + assert response.output[2].type == "reasoning" + # make sure the correct math is in the final output + assert response.output[3].type == "message" + assert "312" in response.output[3].content[0].text diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 43783c92667af..f50c473d7a773 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -9,10 +9,16 @@ from collections.abc import Callable from contextlib import AsyncExitStack from typing import TYPE_CHECKING, Union +from openai.types.responses.response_function_tool_call_output_item import ( + ResponseFunctionToolCallOutputItem, +) from openai.types.responses.tool import Mcp from openai_harmony import Author, Message, Role, StreamState, TextContent from vllm import envs +from vllm.entrypoints.chat_utils import ( + ChatTemplateContentFormatOption, +) from vllm.entrypoints.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, @@ -22,16 +28,20 @@ from vllm.entrypoints.openai.parser.responses_parser import ( get_responses_parser_for_simple_context, ) from vllm.entrypoints.openai.protocol import ( + FunctionCall, ResponseInputOutputItem, ResponseRawMessageAndToken, ResponsesRequest, ) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser from vllm.entrypoints.responses_utils import construct_tool_dicts from vllm.entrypoints.tool import Tool from vllm.entrypoints.tool_server import ToolServer from vllm.outputs import RequestOutput from vllm.reasoning.abs_reasoning_parsers import ReasoningParser +from vllm.tokenizers.protocol import TokenizerLike from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import random_uuid if TYPE_CHECKING: from mcp.client import ClientSession @@ -221,6 +231,10 @@ class ParsableContext(ConversationContext): tokenizer: AnyTokenizer, reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser] | None, request: ResponsesRequest, + available_tools: list[str] | None, + tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None, + chat_template: str | None, + chat_template_content_format: ChatTemplateContentFormatOption, ): self.num_prompt_tokens = 0 self.num_output_tokens = 0 @@ -238,12 +252,19 @@ class ParsableContext(ConversationContext): reasoning_parser_cls=reasoning_parser_cls, response_messages=response_messages, request=request, + tool_parser_cls=tool_parser_cls, ) + self.tool_parser_cls = tool_parser_cls + self.request = request + self.tokenizer = tokenizer + self.available_tools = available_tools or [] self._tool_sessions: dict[str, ClientSession | Tool] = {} self.called_tools: set[str] = set() self.tool_dicts = construct_tool_dicts(request.tools, request.tool_choice) + self.chat_template = chat_template + self.chat_template_content_format = chat_template_content_format def append_output(self, output: RequestOutput) -> None: self.num_prompt_tokens = len(output.prompt_token_ids or []) @@ -252,14 +273,50 @@ class ParsableContext(ConversationContext): self.parser.process(output.outputs[0]) def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None: - raise NotImplementedError("Should not be called.") + self.parser.response_messages.extend(output) def need_builtin_tool_call(self) -> bool: """Return true if the last message is a MCP tool call""" + last_message = self.parser.response_messages[-1] + # TODO: figure out which tools are MCP tools + if ( # noqa: SIM103 + last_message.type == "function_call" + and last_message.name in ("code_interpreter", "python") + ): + return True + return False + async def call_python_tool( + self, tool_session: Union["ClientSession", Tool], last_msg: FunctionCall + ) -> list[ResponseInputOutputItem]: + self.called_tools.add("python") + if isinstance(tool_session, Tool): + return await tool_session.get_result_parsable_context(self) + args = json.loads(last_msg.arguments) + param = { + "code": args["code"], + } + result = await tool_session.call_tool("python", param) + result_str = result.content[0].text + + message = ResponseFunctionToolCallOutputItem( + id=f"fco_{random_uuid()}", + type="function_call_output", + call_id=f"call_{random_uuid()}", + output=result_str, + status="completed", + ) + + return [message] + async def call_tool(self) -> list[ResponseInputOutputItem]: - raise NotImplementedError("Should not be called.") + if not self.parser.response_messages: + return [] + last_msg = self.parser.response_messages[-1] + if last_msg.name == "code_interpreter": + return await self.call_python_tool(self._tool_sessions["python"], last_msg) + return [] def render_for_completion(self): raise NotImplementedError("Should not be called.") @@ -271,11 +328,38 @@ class ParsableContext(ConversationContext): request_id: str, mcp_tools: dict[str, Mcp], ): - pass + if tool_server: + for tool_name in self.available_tools: + if tool_name in self._tool_sessions: + continue + + tool_type = _map_tool_name_to_tool_type(tool_name) + headers = ( + mcp_tools[tool_type].headers if tool_type in mcp_tools else None + ) + tool_session = await exit_stack.enter_async_context( + tool_server.new_session(tool_name, request_id, headers) + ) + self._tool_sessions[tool_name] = tool_session + exit_stack.push_async_exit(self.cleanup_session) async def cleanup_session(self, *args, **kwargs) -> None: """Can be used as coro to used in __aexit__""" - raise NotImplementedError("Should not be called.") + + async def cleanup_tool_session(tool_session): + if not isinstance(tool_session, Tool): + logger.info( + "Cleaning up tool session for %s", tool_session._client_info + ) + with contextlib.suppress(Exception): + await tool_session.call_tool("cleanup_session", {}) + + await asyncio.gather( + *( + cleanup_tool_session(self._tool_sessions[tool]) + for tool in self.called_tools + ) + ) class HarmonyContext(ConversationContext): diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py index 1bc8e81bd9dfc..00045a7ccfd24 100644 --- a/vllm/entrypoints/openai/parser/responses_parser.py +++ b/vllm/entrypoints/openai/parser/responses_parser.py @@ -3,6 +3,7 @@ import logging from collections.abc import Callable +from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall from openai.types.responses.response_output_message import ResponseOutputMessage from openai.types.responses.response_output_text import ResponseOutputText from openai.types.responses.response_reasoning_item import ( @@ -11,8 +12,10 @@ from openai.types.responses.response_reasoning_item import ( ) from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser from vllm.outputs import CompletionOutput from vllm.reasoning.abs_reasoning_parsers import ReasoningParser +from vllm.tokenizers.protocol import TokenizerLike from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import random_uuid @@ -29,6 +32,7 @@ class ResponsesParser: reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser], response_messages: list[ResponseInputOutputItem], request: ResponsesRequest, + tool_parser_cls: Callable[[TokenizerLike], ToolParser] | None, ): self.response_messages: list[ResponseInputOutputItem] = ( # TODO: initial messages may not be properly typed @@ -39,6 +43,9 @@ class ResponsesParser: self.request = request self.reasoning_parser_instance = reasoning_parser_cls(tokenizer) + self.tool_parser_instance = None + if tool_parser_cls is not None: + self.tool_parser_instance = tool_parser_cls(tokenizer) def process(self, output: CompletionOutput) -> "ResponsesParser": reasoning_content, content = self.reasoning_parser_instance.extract_reasoning( @@ -59,6 +66,29 @@ class ResponsesParser: ) ) + function_calls: list[ResponseFunctionToolCall] = [] + if self.tool_parser_instance is not None: + tool_call_info = self.tool_parser_instance.extract_tool_calls( + content if content is not None else "", + request=self.request, # type: ignore + ) + if tool_call_info is not None and tool_call_info.tools_called: + # extract_tool_calls() returns a list of tool calls. + function_calls.extend( + ResponseFunctionToolCall( + id=f"fc_{random_uuid()}", + call_id=f"call_{random_uuid()}", + type="function_call", + status="completed", + name=tool_call.function.name, + arguments=tool_call.function.arguments, + ) + for tool_call in tool_call_info.tool_calls + ) + content = tool_call_info.content + if content and content.strip() == "": + content = None + if content: self.response_messages.append( ResponseOutputMessage( @@ -76,6 +106,8 @@ class ResponsesParser: ], ) ) + if len(function_calls) > 0: + self.response_messages.extend(function_calls) return self @@ -86,6 +118,7 @@ def get_responses_parser_for_simple_context( reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser], response_messages: list[ResponseInputOutputItem], request: ResponsesRequest, + tool_parser_cls, ) -> ResponsesParser: """Factory function to create a ResponsesParser with optional reasoning parser. @@ -98,4 +131,5 @@ def get_responses_parser_for_simple_context( reasoning_parser_cls=reasoning_parser_cls, response_messages=response_messages, request=request, + tool_parser_cls=tool_parser_cls, ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index bfa98f29a064b..99936f588f28b 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -18,6 +18,16 @@ from pydantic import ConfigDict, TypeAdapter from starlette.datastructures import Headers from typing_extensions import TypeIs +from vllm.entrypoints.context import ( + HarmonyContext, + ParsableContext, + StreamingHarmonyContext, +) +from vllm.entrypoints.openai.protocol import ( + FunctionCall, + ResponseInputOutputItem, + ResponsesRequest, +) from vllm.entrypoints.pooling.classify.protocol import ( ClassificationChatRequest, ClassificationCompletionRequest, @@ -39,6 +49,7 @@ from vllm.entrypoints.pooling.score.protocol import ( ScoreRequest, ScoreResponse, ) +from vllm.transformers_utils.tokenizer import AnyTokenizer if sys.version_info >= (3, 12): from typing import TypedDict @@ -72,9 +83,7 @@ from vllm.entrypoints.openai.protocol import ( DetokenizeRequest, ErrorInfo, ErrorResponse, - FunctionCall, FunctionDefinition, - ResponsesRequest, TokenizeChatRequest, TokenizeCompletionRequest, TokenizeResponse, @@ -85,6 +94,9 @@ from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig +from vllm.entrypoints.responses_utils import ( + construct_input_messages, +) from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs.data import PromptType @@ -1224,6 +1236,31 @@ class OpenAIServing: ) return engine_request, tokenization_kwargs + async def _render_next_turn( + self, + request: ResponsesRequest, + tokenizer: AnyTokenizer, + messages: list[ResponseInputOutputItem], + tool_dicts: list[dict[str, Any]] | None, + tool_parser, + chat_template: str | None, + chat_template_content_format: ChatTemplateContentFormatOption, + ): + new_messages = construct_input_messages( + request_input=messages, + ) + + _, request_prompts, engine_prompts = await self._preprocess_chat( + request, + tokenizer, + new_messages, + tool_dicts=tool_dicts, + tool_parser=tool_parser, + chat_template=chat_template, + chat_template_content_format=chat_template_content_format, + ) + return request_prompts, engine_prompts + async def _generate_with_builtin_tools( self, request_id: str, @@ -1286,11 +1323,27 @@ class OpenAIServing: # Create inputs for the next turn. # Render the next prompt token ids. - prompt_token_ids = context.render_for_completion() - engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids) - request_prompt = prompt_token_ids + if isinstance(context, (HarmonyContext, StreamingHarmonyContext)): + prompt_token_ids = context.render_for_completion() + engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids) + request_prompt = prompt_token_ids + elif isinstance(context, ParsableContext): + request_prompts, engine_prompts = await self._render_next_turn( + context.request, + context.tokenizer, + context.parser.response_messages, + context.tool_dicts, + context.tool_parser_cls, + context.chat_template, + context.chat_template_content_format, + ) + engine_prompt = engine_prompts[0] + request_prompt = request_prompts[0] + # Update the sampling params. - sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids) + sampling_params.max_tokens = self.max_model_len - len( + engine_prompt["prompt_token_ids"] + ) # OPTIMIZATION priority = orig_priority - 1 sub_request += 1 diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 3c9ae8e8c8087..1eb1243e7e5bc 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -375,7 +375,7 @@ class OpenAIServingResponses(OpenAIServing): generators: list[AsyncGenerator[ConversationContext, None]] = [] builtin_tool_list: list[str] = [] - if self.use_harmony and self.tool_server is not None: + if self.tool_server is not None: if self.tool_server.has_tool("browser"): builtin_tool_list.append("browser") if self.tool_server.has_tool("python"): @@ -423,6 +423,10 @@ class OpenAIServingResponses(OpenAIServing): tokenizer=tokenizer, reasoning_parser_cls=self.reasoning_parser, request=request, + tool_parser_cls=self.tool_parser, + available_tools=available_tools, + chat_template=self.chat_template, + chat_template_content_format=self.chat_template_content_format, ) else: context = SimpleContext() diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index 5f21e2c44450c..fbc137bac4543 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -16,6 +16,7 @@ from openai.types.responses.response import ToolChoice from openai.types.responses.response_function_tool_call_output_item import ( ResponseFunctionToolCallOutputItem, ) +from openai.types.responses.response_output_item import McpCall from openai.types.responses.response_output_message import ResponseOutputMessage from openai.types.responses.response_reasoning_item import ResponseReasoningItem from openai.types.responses.tool import Tool @@ -25,6 +26,7 @@ from vllm.entrypoints.openai.protocol import ( ChatCompletionMessageParam, ResponseInputOutputItem, ) +from vllm.utils import random_uuid def make_response_output_items_from_parsable_context( @@ -36,7 +38,24 @@ def make_response_output_items_from_parsable_context( if not isinstance(message, ResponseFunctionToolCallOutputItem): output_messages.append(message) else: - raise NotImplementedError("tool calls not supported for response context") + if len(output_messages) == 0: + raise ValueError( + "Cannot have a FunctionToolCallOutput before FunctionToolCall." + ) + if isinstance(output_messages[-1], ResponseFunctionToolCall): + mcp_message = McpCall( + id=f"mcp_{random_uuid()}", + arguments=output_messages[-1].arguments, + name=output_messages[-1].name, + server_label=output_messages[ + -1 + ].name, # TODO: store the server label + type="mcp_call", + status="completed", + output=message.output, + # TODO: support error output + ) + output_messages[-1] = mcp_message return output_messages diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py index c74ce1ee16de1..4feed827385d1 100644 --- a/vllm/entrypoints/tool.py +++ b/vllm/entrypoints/tool.py @@ -1,12 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json import os from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any +from openai.types.responses.response_function_tool_call_output_item import ( + ResponseFunctionToolCallOutputItem, +) from openai_harmony import Author, Message, Role, TextContent from vllm.logger import init_logger +from vllm.utils import random_uuid if TYPE_CHECKING: # Avoid circular import. @@ -46,6 +51,10 @@ class Tool(ABC): async def get_result(self, context: "ConversationContext") -> Any: pass + @abstractmethod + async def get_result_parsable_context(self, context: "ConversationContext") -> Any: + pass + class HarmonyBrowserTool(Tool): def __init__(self): @@ -81,6 +90,9 @@ class HarmonyBrowserTool(Tool): tool_output_msgs.append(msg) return tool_output_msgs + async def get_result_parsable_context(self, context: "ConversationContext") -> Any: + raise NotImplementedError("Not implemented yet") + @property def tool_config(self) -> Any: return self.browser_tool.tool_config @@ -138,6 +150,38 @@ class HarmonyPythonTool(Tool): tool_output_msgs.append(msg) return tool_output_msgs + async def get_result_parsable_context(self, context: "ConversationContext") -> Any: + """ + This function converts parsable context types to harmony and + back so we can use GPTOSS demo python tool + """ + from vllm.entrypoints.context import ParsableContext + + assert isinstance(context, ParsableContext) + + last_msg = context.parser.response_messages[-1] + args = json.loads(last_msg.arguments) + + last_msg_harmony = Message( + author=Author(role="assistant", name=None), + content=[TextContent(text=args["code"])], + channel="analysis", + recipient="python", + content_type="code", + ) + + tool_output_msgs = [] + async for msg in self.python_tool.process(last_msg_harmony): + processed = ResponseFunctionToolCallOutputItem( + id=f"fco_{random_uuid()}", + type="function_call_output", + call_id=f"call_{random_uuid()}", + output=msg.content[0].text, + status="completed", + ) + tool_output_msgs.append(processed) + return tool_output_msgs + @property def tool_config(self) -> Any: return self.python_tool.tool_config From e7296b08da66b4c79eb43d0932a3d8628178d036 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Fri, 5 Dec 2025 08:54:26 -0800 Subject: [PATCH 007/133] [bugfix] Pass globals to aot_compiled function (#29428) Signed-off-by: angelayi --- vllm/compilation/decorators.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index eed7795cdb349..6bb66ce3eec0c 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -409,7 +409,9 @@ def _support_torch_compile( open(aot_compilation_path, "rb") as f, ): start_monitoring_torch_compile(self.vllm_config) - loaded_fn = torch.compiler.load_compiled_function(f) + loaded_fn = torch.compiler.load_compiled_function( + f, f_globals=self.forward.__globals__ + ) _verify_source_unchanged(loaded_fn.source_info(), self.vllm_config) loaded_fn.disable_guard_check() self.aot_compiled_fn = loaded_fn From 78c44fd722fe3ce010b0ba9c5e7349fe4094b39d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 5 Dec 2025 18:17:36 +0100 Subject: [PATCH 008/133] [NIXL] Small cleanup of unused variables (#29618) Signed-off-by: NickLucche Co-authored-by: Cyrus Leung --- .../kv_connector/unit/test_nixl_connector.py | 2 +- .../kv_connector/v1/nixl_connector.py | 20 +++++++------------ 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index ae4125d54190c..65db16f48c2c9 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1307,7 +1307,7 @@ def test_shutdown_cleans_up_resources(dist_init): patch.object(nixl_wrapper, "remove_remote_agent") as mock_rem_agent, patch.object(nixl_wrapper, "deregister_memory") as mock_dereg, ): - worker._recving_transfers = {"req1": [(123, time.perf_counter())]} + worker._recving_transfers = {"req1": [123]} worker.src_xfer_side_handle = 456 worker.dst_xfer_side_handles = {"engine1": 789} worker._remote_agents = {"engine1": {0: "agent1"}} diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 49330abcec324..649e54adaba49 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -55,7 +55,7 @@ if TYPE_CHECKING: from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request -Transfer = tuple[int, float] # (xfer_handle, start_time) +TransferHandle = int EngineId = str ReqId = str @@ -874,7 +874,7 @@ class NixlConnectorWorker: # In progress transfers. # [req_id -> list[handle]] self._recving_metadata: dict[ReqId, ReqMeta] = {} - self._recving_transfers = defaultdict[ReqId, list[Transfer]](list) + self._recving_transfers = defaultdict[ReqId, list[TransferHandle]](list) # Track the expiration time of requests that are waiting to be sent. self._reqs_to_send: dict[ReqId, float] = {} # Set of requests that have been part of a batch, regardless of status. @@ -1201,14 +1201,11 @@ class NixlConnectorWorker: # Enable different block lengths for different layers when MLA is used. self.block_len_per_layer = list[int]() self.slot_size_per_layer = list[int]() # HD bytes in kv terms - self.device_id = self.tp_rank for layer_name, cache_or_caches in xfer_buffers.items(): cache_list = cache_or_caches if split_k_and_v else [cache_or_caches] for cache in cache_list: base_addr = cache.data_ptr() - if not self.use_host_buffer and current_platform.is_cuda_alike(): - self.device_id = cache.device.index if base_addr in seen_base_addresses: continue @@ -1251,8 +1248,7 @@ class NixlConnectorWorker: "All kv cache tensors must have the same size" ) # Need to make sure the device ID is non-negative for NIXL, - # Torch uses -1 to indicate CPU tensors while NIXL uses explicit - # memory type. + # Torch uses -1 to indicate CPU tensors. self.device_id = max(cache.get_device(), 0) caches_data.append( (base_addr, curr_tensor_size_bytes, self.device_id, "") @@ -1842,9 +1838,7 @@ class NixlConnectorWorker: self._reqs_to_send.pop(req_id, None) return notified_req_ids - def _pop_done_transfers( - self, transfers: dict[str, list[tuple[int, float]]] - ) -> set[str]: + def _pop_done_transfers(self, transfers: dict[str, list[int]]) -> set[str]: """ Pop completed xfers by checking for DONE state. Args: @@ -1855,7 +1849,7 @@ class NixlConnectorWorker: done_req_ids: set[str] = set() for req_id, handles in list(transfers.items()): in_progress = False - for handle, xfer_start_time in handles: + for handle in handles: try: xfer_state = self.nixl_wrapper.check_xfer_state(handle) if xfer_state == "DONE": @@ -2120,7 +2114,7 @@ class NixlConnectorWorker: self.nixl_wrapper.transfer(handle) # Use handle to check completion in future step(). - self._recving_transfers[request_id].append((handle, time.perf_counter())) + self._recving_transfers[request_id].append(handle) except Exception: logger.exception( "NIXL transfer setup/initiation failed for request %s. " @@ -2251,7 +2245,7 @@ class NixlConnectorWorker: """Shutdown the connector worker.""" self._handshake_initiation_executor.shutdown(wait=False) for handles in self._recving_transfers.values(): - for handle, _ in handles: + for handle in handles: self.nixl_wrapper.release_xfer_handle(handle) self._recving_transfers.clear() if self.src_xfer_side_handle: From dc264bcea17b279173c7cbb99645c35a56bed778 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 5 Dec 2025 09:28:32 -0800 Subject: [PATCH 009/133] [BugFix] Eagerly abort cancelled final-step requests (#29987) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, when requests are cancelled while executing their final step, "completion" is handled based on normal stop processing (e.g. length or stop token), so the abort has no effect. This is typically not a problem, but when a kv connector is involved it thinks the request completed successfully rather than being aborted. This is problematic for disaggregated prefill which will free kv cache blocks if the request was aborted but not if it completed successfully—since the cancelled request will never be sent to the decode side, kv cache blocks remain pinned until the fall-back timeout expires. The problem is exacerbated when many requests are cancelled and/or there are large prefills whose forward pass takes a long time (since the window is bigger). This PR fixes the problem by processing pending aborts immediately prior to processing model output each step; we process only aborts, not new requests, since it's preferable for latency to process model outputs before new incoming requests. Fixes #26400. Signed-off-by: Nick Hill --- tests/v1/engine/test_abort_final_step.py | 311 +++++++++++++++++++++++ vllm/v1/engine/core.py | 37 ++- vllm/v1/worker/gpu_worker.py | 10 +- 3 files changed, 352 insertions(+), 6 deletions(-) create mode 100644 tests/v1/engine/test_abort_final_step.py diff --git a/tests/v1/engine/test_abort_final_step.py b/tests/v1/engine/test_abort_final_step.py new file mode 100644 index 0000000000000..560c5c2b1e300 --- /dev/null +++ b/tests/v1/engine/test_abort_final_step.py @@ -0,0 +1,311 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Test for the fix in PR #29987: Eagerly abort cancelled final-step requests. + +This test verifies that when a request is aborted during its final execution +step (when it would naturally complete), it is properly marked as aborted +rather than being treated as normally completed. + +The test uses a dummy KV connector to verify that the connector receives +the correct finish status (FINISHED_ABORTED, not FINISHED_LENGTH_CAPPED). +""" + +import asyncio +import tempfile +import time +from pathlib import Path +from typing import Any +from unittest.mock import patch + +import pytest + +from vllm import SamplingParams +from vllm.config import KVTransferConfig, VllmConfig +from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, + KVConnectorMetadata, + KVConnectorRole, +) +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.platforms import current_platform +from vllm.sampling_params import RequestOutputKind +from vllm.utils.torch_utils import set_default_torch_num_threads +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.request import Request + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True) + +TEXT_PROMPT = "Hello" + + +class DummyKVConnectorMetadata(KVConnectorMetadata): + """Dummy metadata for the test connector.""" + + def __init__(self): + self.requests: list = [] + + +class DummyKVConnector(KVConnectorBase_V1): + """ + Dummy KV connector that captures request finish statuses to a file. + This is used to verify the fix - without the fix, a request aborted + during its final step would be captured as FINISHED_LENGTH_CAPPED + instead of FINISHED_ABORTED. + + The connector runs in a separate process, so we write statuses to a file + that can be read by the test process. + """ + + def __init__( + self, + vllm_config: VllmConfig, + role: KVConnectorRole, + kv_cache_config: KVCacheConfig | None = None, + ): + super().__init__(vllm_config, role, kv_cache_config) + # Get the status file path from extra config + extra_config = vllm_config.kv_transfer_config.kv_connector_extra_config or {} + self.status_file = extra_config.get("status_file") + # Log that we were initialized + if self.status_file: + try: + with open(self.status_file, "a") as f: + f.write(f"INIT:{role.name}\n") + except Exception: + pass + + def get_num_new_matched_tokens( + self, + request: Request, + num_computed_tokens: int, + ) -> tuple[int | None, bool]: + return (0, False) + + def update_state_after_alloc( + self, + request: Request, + blocks: Any, + num_external_tokens: int, + ): + pass + + def build_connector_meta( + self, scheduler_output: SchedulerOutput + ) -> KVConnectorMetadata: + return DummyKVConnectorMetadata() + + def request_finished( + self, + request: Request, + block_ids: list[int], + ) -> tuple[bool, dict[str, Any] | None]: + """Capture the request status when finished by writing to a file.""" + if self.status_file: + try: + with open(self.status_file, "a") as f: + # Write the status name (e.g., "FINISHED_ABORTED") + f.write(f"{request.status.name}\n") + except Exception as e: + # Log but don't fail - this is just test instrumentation + print(f"[DummyKVConnector] Failed to write status: {e}") + return False, None + + def start_load_kv(self, forward_context: Any, **kwargs: Any) -> None: + pass + + def wait_for_layer_load(self, layer_name: str) -> None: + pass + + def save_kv_layer( + self, + layer_name: str, + kv_layer: Any, + attn_metadata: Any, + **kwargs: Any, + ) -> None: + pass + + def wait_for_save(self): + pass + + +# Register the dummy connector +KVConnectorFactory.register_connector( + "DummyKVConnector", __name__, DummyKVConnector.__name__ +) + + +@pytest.mark.parametrize("async_scheduling", [False, True]) +@pytest.mark.asyncio +async def test_abort_during_final_step(async_scheduling: bool): + """ + Test that a request aborted during its final execution step is treated as + aborted rather than completed. + + This test: + 1. Monkeypatches execute_model to wait for a file to be deleted + 2. Configures a dummy KV connector to capture finish statuses + 3. Starts a request with max_tokens=1 (will complete on first decode step) + 4. Aborts the request, then deletes the file to unblock execute_model + 5. Verifies the KV connector received FINISHED_ABORTED not FINISHED_LENGTH_CAPPED + + See https://github.com/vllm-project/vllm/pull/29987. + + Without the fix, the KV connector would see FINISHED_LENGTH_CAPPED because + update_from_output() would mark the request as completed before processing + the abort. This causes KV cache blocks to not be freed properly in + disaggregated prefill scenarios. + + With the fix, _process_aborts_queue() runs before update_from_output(), so the + abort takes precedence and the KV connector sees FINISHED_ABORTED. + """ + + # Create three temporary files: + # 1. ready_file: deleted by execute_model to signal it has started + # 2. block_file: execute_model waits for this to be deleted + # 3. status_file: KV connector writes finish statuses here + with tempfile.NamedTemporaryFile(delete=False) as f: + ready_file = Path(f.name) + with tempfile.NamedTemporaryFile(delete=False) as f2: + block_file = Path(f2.name) + with tempfile.NamedTemporaryFile(delete=False, mode="w") as f3: + status_file = Path(f3.name) + + try: + # Get the original execute_model method + from vllm.v1.worker.gpu_worker import Worker + + original_execute_model = Worker.execute_model + + def execute_model_with_wait(self, scheduler_output): + # Signal that execute_model has been called by deleting ready_file + if ready_file.exists(): + ready_file.unlink() + + # Wait for the block file to be deleted (triggered from test after abort) + # This runs in the worker process (after fork), so we poll the filesystem + while block_file.exists(): + time.sleep(0.01) + return original_execute_model(self, scheduler_output) + + # Patch execute_model to inject the wait + # This happens before the worker process is forked, so the patch applies there + with patch.object(Worker, "execute_model", execute_model_with_wait): + request_id = "test-abort-final-step" + + # Configure engine with dummy KV connector + # Pass the status file path so the connector can write to it + kv_transfer_config = KVTransferConfig( + kv_connector="DummyKVConnector", + kv_role="kv_both", + kv_connector_extra_config={"status_file": str(status_file)}, + ) + engine_args = AsyncEngineArgs( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + async_scheduling=async_scheduling, + kv_transfer_config=kv_transfer_config, + ) + + with set_default_torch_num_threads(1): + engine = AsyncLLM.from_engine_args(engine_args) + + try: + # Create a request that will complete after just 1 token + sampling_params = SamplingParams( + max_tokens=1, + ignore_eos=True, + output_kind=RequestOutputKind.DELTA, + ) + + # Start generation in a task + outputs = [] + + async def generate(): + async for output in engine.generate( + request_id=request_id, + prompt=TEXT_PROMPT, + sampling_params=sampling_params, + ): + outputs.append(output) + + gen_task = asyncio.create_task(generate()) + + # Wait for execute_model to signal it has started (with timeout) + timeout = 5.0 # 5 second timeout + start_time = time.time() + while ready_file.exists(): + if time.time() - start_time > timeout: + raise TimeoutError( + "Timeout waiting for execute_model to start. " + "The monkeypatch may not be working correctly, " + "for example if spawn was used instead of fork." + ) + await asyncio.sleep(0.01) + + # Abort the request while execute_model is blocked + await engine.abort(request_id) + + # Now unblock execute_model by deleting the file + # The abort should be processed before the model output + block_file.unlink() + + # Wait for generation to complete + await gen_task + + # Give the scheduler a moment to finish cleanup + await asyncio.sleep(0.1) + + # Verify we got output + assert len(outputs) > 0, "Should have received at least one output" + + # The final output should have finish_reason="abort" + final_output = outputs[-1] + assert final_output.finished, ( + "Final output should be marked as finished" + ) + assert final_output.outputs[0].finish_reason == "abort", ( + f"Expected finish_reason='abort' but got " + f"'{final_output.outputs[0].finish_reason}'. " + ) + + with open(status_file) as f4: + status_lines = f4.read().strip().split("\n") + # Filter for actual finish statuses (not INIT or empty lines) + captured_statuses = [ + line + for line in status_lines + if line and line.startswith("FINISHED_") + ] + + assert len(captured_statuses) >= 1, ( + f"Expected at least 1 captured finish status, got " + f"{len(captured_statuses)}. File content: {status_lines}" + ) + + assert "FINISHED_ABORTED" in captured_statuses, ( + f"KV connector should see FINISHED_ABORTED but got " + f"{captured_statuses}. " + ) + + # Verify cleanup + assert not engine.output_processor.has_unfinished_requests() + + finally: + # Shutdown the engine + engine.shutdown() + + finally: + # Clean up temporary files if they still exist + if ready_file.exists(): + ready_file.unlink() + if block_file.exists(): + block_file.unlink() + if status_file.exists(): + status_file.unlink() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 61b8422dd6633..8e34dfcea7f61 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -204,6 +204,8 @@ class EngineCore: ) self.async_scheduling = vllm_config.scheduler_config.async_scheduling + self.aborts_queue = queue.Queue[list[str]]() + # Mark the startup heap as static so that it's ignored by GC. # Reduces pause times of oldest generation collections. freeze_gc_heap() @@ -347,6 +349,9 @@ class EngineCore: if model_output is None: model_output = self.model_executor.sample_tokens(grammar_output) + # Before processing the model output, process any aborts that happened + # during the model execution. + self._process_aborts_queue() engine_core_outputs = self.scheduler.update_from_output( scheduler_output, model_output ) @@ -440,6 +445,9 @@ class EngineCore: with self.log_error_detail(scheduler_output): model_output = future.result() + # Before processing the model output, process any aborts that happened + # during the model execution. + self._process_aborts_queue() engine_core_outputs = self.scheduler.update_from_output( scheduler_output, model_output ) @@ -458,6 +466,18 @@ class EngineCore: return engine_core_outputs, model_executed + def _process_aborts_queue(self): + if not self.aborts_queue.empty(): + request_ids = [] + while not self.aborts_queue.empty(): + ids = self.aborts_queue.get_nowait() + if isinstance(ids, str): + # Should be a list here, but also handle string just in case. + ids = (ids,) + request_ids.extend(ids) + # More efficient to abort all as a single batch. + self.abort_requests(request_ids) + def shutdown(self): self.structured_output_manager.clear_backend() if self.model_executor: @@ -871,9 +891,13 @@ class EngineCoreProc(EngineCore): and not self.scheduler.has_requests() and not self.batch_queue ): - if logger.isEnabledFor(DEBUG) and self.input_queue.empty(): - logger.debug("EngineCore waiting for work.") - waited = True + if self.input_queue.empty(): + # Drain aborts queue; all aborts are also processed via input_queue. + with self.aborts_queue.mutex: + self.aborts_queue.queue.clear() + if logger.isEnabledFor(DEBUG): + logger.debug("EngineCore waiting for work.") + waited = True req = self.input_queue.get() self._handle_client_request(*req) @@ -1027,6 +1051,13 @@ class EngineCoreProc(EngineCore): else: request = generic_decoder.decode(data_frames) + if request_type == EngineCoreRequestType.ABORT: + # Aborts are added to *both* queues, allows us to eagerly + # process aborts while also ensuring ordering in the input + # queue to avoid leaking requests. This is ok because + # aborting in the scheduler is idempotent. + self.aborts_queue.put_nowait(request) + # Push to input queue for core busy loop. self.input_queue.put_nowait((request_type, request)) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a133575cbbced..d189d0860b9a7 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -51,7 +51,6 @@ from vllm.v1.outputs import ( ModelRunnerOutput, ) from vllm.v1.utils import report_usage_stats -from vllm.v1.worker.gpu_model_runner import GPUModelRunner from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.worker_base import WorkerBase @@ -59,6 +58,7 @@ logger = init_logger(__name__) if TYPE_CHECKING: from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + from vllm.v1.worker.gpu_model_runner import GPUModelRunner class Worker(WorkerBase): @@ -259,7 +259,11 @@ class Worker(WorkerBase): self.vllm_config, self.device ) else: - self.model_runner = GPUModelRunner(self.vllm_config, self.device) + from vllm.v1.worker.gpu_model_runner import ( + GPUModelRunner as GPUModelRunnerV1, + ) + + self.model_runner = GPUModelRunnerV1(self.vllm_config, self.device) if self.rank == 0: # If usage stat is enabled, collect relevant info. @@ -556,7 +560,7 @@ class Worker(WorkerBase): and forward_pass ): # currently only supported by V1 GPUModelRunner - assert isinstance(self.model_runner, GPUModelRunner) + assert not self.use_v2_model_runner num_scheduled_tokens_np = np.array( list(scheduler_output.num_scheduled_tokens.values()), dtype=np.int32, From dff0a2b39475096f5456721bfc8df3c7fea3cc57 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 5 Dec 2025 17:43:48 +0000 Subject: [PATCH 010/133] [NIXL] Add remote_request_id to kv_transfer_params (#29665) Signed-off-by: Mark McLoughlin --- .../v1/kv_connector/unit/test_nixl_connector.py | 6 ++++++ tests/v1/kv_connector/unit/utils.py | 1 + .../kv_connector/v1/nixl_connector.py | 17 ++++++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 65db16f48c2c9..5045ae0eef33f 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -470,6 +470,7 @@ class TestNixlHandshake: num_xfers + 6, ], "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + "remote_request_id": f"prefill-{request_id}", "remote_host": "localhost", "remote_port": 1234, "remote_tp_size": 1, @@ -536,6 +537,7 @@ class TestNixlHandshake: kv_transfer_params={ "remote_block_ids": [4, 5, 6], "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + "remote_request_id": "prefill-id", "remote_host": "localhost", "remote_port": 1234, "remote_tp_size": prefill_tp_size, @@ -591,6 +593,7 @@ class TestNixlHandshake: kv_transfer_params={ "remote_block_ids": [4, 5, 6], "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + "remote_request_id": f"prefill-id-{i}", "remote_host": "localhost", "remote_port": 1234, "remote_tp_size": 1, @@ -754,6 +757,7 @@ def test_kv_connector_stats(dist_init): kv_transfer_params={ "remote_block_ids": [4, 5, 6], "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + "remote_request_id": f"prefill-{request_id}", "remote_host": "localhost", "remote_port": 1234, "remote_tp_size": 1, @@ -1470,6 +1474,7 @@ def test_handshake_failure_returns_finished(dist_init): kv_transfer_params={ "remote_block_ids": [4, 5, 6], "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + "remote_request_id": f"prefill-{request_id}", "remote_host": "localhost", "remote_port": 1234, "remote_tp_size": 1, @@ -1519,6 +1524,7 @@ def test_transfer_setup_failure_returns_finished(dist_init): kv_transfer_params={ "remote_block_ids": [10, 11, 12], "remote_engine_id": FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + "remote_request_id": f"prefill-{request_id}", "remote_host": "localhost", "remote_port": 1234, "remote_tp_size": 1, diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index cea41c3ab18a6..58f1a7282352b 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -194,6 +194,7 @@ def create_request( do_remote_prefill=True, do_remote_decode=False, remote_engine_id="my-engine-id", + remote_request_id=f"prefill-{request_id}", remote_block_ids=list(range(num_remote_blocks)), remote_host="my-host", remote_port=1234, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 649e54adaba49..7aa12e9993c76 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -71,8 +71,9 @@ ReqId = str # # Version History: # 1: Initial version with compatibility checking +# 2: Add remote_request_id to kv_transfer_params # -NIXL_CONNECTOR_VERSION: int = 1 +NIXL_CONNECTOR_VERSION: int = 2 GET_META_MSG = b"get_meta_msg" @@ -210,6 +211,7 @@ class ReqMeta: remote_host: str remote_port: int remote_engine_id: str + remote_request_id: str tp_size: int @@ -236,6 +238,7 @@ class NixlConnectorMetadata(KVConnectorMetadata): local_physical_block_ids=local_block_ids, remote_block_ids=kv_transfer_params["remote_block_ids"], remote_engine_id=kv_transfer_params["remote_engine_id"], + remote_request_id=kv_transfer_params["remote_request_id"], remote_host=kv_transfer_params["remote_host"], remote_port=kv_transfer_params["remote_port"], # P workers don't need to receive tp_size from proxy here. @@ -622,7 +625,12 @@ class NixlConnectorScheduler: if params.get("remote_block_ids"): if all( p in params - for p in ("remote_engine_id", "remote_host", "remote_port") + for p in ( + "remote_engine_id", + "remote_request_id", + "remote_host", + "remote_port", + ) ): # If remote_blocks and num_external_tokens = 0, we have # a full prefix cache hit on the D worker. We need to call @@ -751,6 +759,7 @@ class NixlConnectorScheduler: do_remote_decode=False, remote_block_ids=block_ids, remote_engine_id=self.engine_id, + remote_request_id=request.request_id, remote_host=self.side_channel_host, remote_port=self.side_channel_port, tp_size=self.vllm_config.parallel_config.tensor_parallel_size, @@ -1964,6 +1973,7 @@ class NixlConnectorWorker: self._read_blocks( request_id=req_id, dst_engine_id=meta.remote_engine_id, + remote_request_id=meta.remote_request_id, local_block_ids=meta.local_physical_block_ids, remote_block_ids=meta.remote_block_ids, ) @@ -1974,6 +1984,7 @@ class NixlConnectorWorker: remote_block_ids: list[int], dst_engine_id: str, request_id: str, + remote_request_id: str, ): block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id) if block_size_ratio > 1: @@ -2006,7 +2017,7 @@ class NixlConnectorWorker: # Number of D TP workers that will read from dst P. Propagate tp_ratio # on notification so that dst worker can wait before freeing blocks. tp_ratio = self.kv_topo.tp_ratio_from_engine_id(dst_engine_id) - notif_id = f"{request_id}:{tp_ratio}".encode() + notif_id = f"{remote_request_id}:{tp_ratio}".encode() # Full prefix cache hit: do not need to read remote blocks, # just notify P worker that we have the blocks we need. From 66e674cdd549e89b341fb33405a2c02ec5f5ae8d Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 5 Dec 2025 12:48:43 -0500 Subject: [PATCH 011/133] [Attention][UX][1/N] Add AttentionConfig and change attention env vars to CLI arguments (#26315) Signed-off-by: Matthew Bonanni Signed-off-by: Matthew Bonanni Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Lucas Wilkinson --- tests/compile/test_fusion_attn.py | 5 +- .../kv_connector/unit/test_nixl_connector.py | 39 ++++- tests/v1/worker/test_gpu_model_runner.py | 158 +++++++++--------- vllm/attention/backends/abstract.py | 26 ++- vllm/attention/layer.py | 4 +- vllm/attention/selector.py | 137 +-------------- vllm/attention/utils/fa_utils.py | 11 +- vllm/config/__init__.py | 3 + vllm/config/attention.py | 114 +++++++++++++ vllm/config/model.py | 13 -- vllm/config/vllm.py | 7 + vllm/engine/arg_utils.py | 31 +++- vllm/model_executor/models/config.py | 11 +- vllm/model_executor/models/vision.py | 7 +- vllm/platforms/cuda.py | 23 +-- vllm/utils/flashinfer.py | 37 ++-- vllm/v1/attention/backends/flash_attn.py | 9 +- vllm/v1/attention/backends/flashinfer.py | 26 +-- vllm/v1/attention/backends/mla/common.py | 19 ++- .../attention/backends/mla/flashattn_mla.py | 5 +- vllm/v1/attention/backends/rocm_attn.py | 2 +- vllm/v1/attention/backends/triton_attn.py | 5 +- 22 files changed, 367 insertions(+), 325 deletions(-) create mode 100644 vllm/config/attention.py diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 9b4486e56c73e..db95dff5e0fc7 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -12,13 +12,13 @@ from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import Attention -from vllm.attention.selector import global_force_attn_backend_context_manager from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.matcher_utils import QUANT_OPS from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass from vllm.config import ( + AttentionConfig, CacheConfig, CompilationConfig, CompilationMode, @@ -335,6 +335,7 @@ def test_attention_quant_pattern( custom_ops=custom_ops_list, ), cache_config=CacheConfig(cache_dtype="fp8"), + attention_config=AttentionConfig(backend=backend), ) # Create test inputs @@ -352,7 +353,6 @@ def test_attention_quant_pattern( with ( set_current_vllm_config(vllm_config_unfused), set_forward_context(attn_metadata=None, vllm_config=vllm_config_unfused), - global_force_attn_backend_context_manager(backend), ): model_unfused = model_class( num_qo_heads=num_qo_heads, @@ -378,7 +378,6 @@ def test_attention_quant_pattern( with ( set_current_vllm_config(vllm_config), set_forward_context(attn_metadata=None, vllm_config=vllm_config), - global_force_attn_backend_context_manager(backend), ): model_fused = model_class( num_qo_heads=num_qo_heads, diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 5045ae0eef33f..ec9ff7315d097 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1151,13 +1151,29 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): } # Store tensor info for validation - expected_tensor_size = shared_tensor[0].element_size() * shared_tensor[0].numel() - expected_base_addrs = [ - shared_tensor[0].data_ptr(), - shared_tensor[1].data_ptr(), - unique_tensor[0].data_ptr(), - unique_tensor[1].data_ptr(), - ] + test_shape = backend_cls.get_kv_cache_shape( + num_blocks=1, block_size=16, num_kv_heads=1, head_size=1 + ) + is_blocks_first = len(test_shape) == 5 and test_shape[0] == 1 + + if is_blocks_first: + expected_tensor_size = shared_tensor.element_size() * shared_tensor.numel() + expected_base_addrs = [ + shared_tensor.data_ptr(), + unique_tensor.data_ptr(), + ] + expected_num_entries = 2 + else: + expected_tensor_size = ( + shared_tensor[0].element_size() * shared_tensor[0].numel() + ) + expected_base_addrs = [ + shared_tensor[0].data_ptr(), + shared_tensor[1].data_ptr(), + unique_tensor[0].data_ptr(), + unique_tensor[1].data_ptr(), + ] + expected_num_entries = 4 with ( patch( @@ -1192,7 +1208,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): # Verify get_reg_descs was called with caches_data assert mock_wrapper_instance.get_reg_descs.called caches_data, _ = mock_wrapper_instance.get_reg_descs.call_args[0] - assert len(caches_data) == 4 + assert len(caches_data) == expected_num_entries for i, cache_entry in enumerate(caches_data): base_addr, size, _tp_rank, _ = cache_entry @@ -1214,7 +1230,12 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): f"Expected {expected_blocks_count} blocks, got {len(blocks_data)}" ) - expected_block_len = expected_tensor_size // 2 + num_blocks = 2 + if is_blocks_first: + expected_block_len = expected_tensor_size // num_blocks // 2 + else: + expected_block_len = expected_tensor_size // num_blocks + for i, block_entry in enumerate(blocks_data): block_start_addr, block_len, tp_rank = block_entry assert block_len == expected_block_len, ( diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 459abcfdd53cf..7b8c4268a5237 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -6,8 +6,10 @@ import pytest import torch from vllm.attention.backends.abstract import MultipleOf +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import Attention from vllm.config import ( + AttentionConfig, CacheConfig, ModelConfig, ParallelConfig, @@ -765,7 +767,7 @@ def test_init_kv_cache_with_kv_sharing_valid(): current_platform.is_rocm(), reason="Attention backend FLASHINFER is not supported on ROCm.", ) -def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): +def test_hybrid_attention_mamba_tensor_shapes(): """ The GPU model runner creates different views into the KVCacheTensors for the attention and mamba layers @@ -806,11 +808,13 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): cache_dtype="auto", ) parallel_config = ParallelConfig() + attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER) vllm_config = VllmConfig( model_config=model_config, cache_config=cache_config, scheduler_config=scheduler_config, parallel_config=parallel_config, + attention_config=attention_config, ) layer_0 = "model.layers.0.self_attn.attn" @@ -820,8 +824,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): layer_4 = "model.layers.4.mixer" layer_5 = "model.layers.5.mixer" - with set_current_vllm_config(vllm_config), monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + with set_current_vllm_config(vllm_config): hf_config = vllm_config.model_config.hf_config fwd_context = {} for key in [layer_0, layer_1]: @@ -851,10 +854,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): ) # suppress var not used error assert fwd_context is not None - vllm_ctx = vllm_config.compilation_config.static_forward_context - - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + vllm_ctx = vllm_config.compilation_config.static_forward_context runner = GPUModelRunner(vllm_config, DEVICE) kv_cache_spec = runner.get_kv_cache_spec() @@ -865,94 +865,94 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): )[0] runner.initialize_kv_cache(kv_cache_config) - # random partition of blocks - # blocks0 will be assigned to attention layers - # blocks1 will be assigned to mamba layers - num_blocks = kv_cache_config.num_blocks - ind = np.arange(num_blocks) - np.random.shuffle(ind) - blocks0, blocks1 = ind[: (num_blocks // 2)], ind[(num_blocks // 2) :] + # random partition of blocks + # blocks0 will be assigned to attention layers + # blocks1 will be assigned to mamba layers + num_blocks = kv_cache_config.num_blocks + ind = np.arange(num_blocks) + np.random.shuffle(ind) + blocks0, blocks1 = ind[: (num_blocks // 2)], ind[(num_blocks // 2) :] - attn_shape = vllm_ctx[layer_0].kv_cache[0].shape - conv_shape = vllm_ctx[layer_2].kv_cache[0][0].shape - ssm_shape = vllm_ctx[layer_2].kv_cache[0][1].shape + attn_shape = vllm_ctx[layer_0].kv_cache[0].shape + conv_shape = vllm_ctx[layer_2].kv_cache[0][0].shape + ssm_shape = vllm_ctx[layer_2].kv_cache[0][1].shape - # assert we are using FlashInfer - assert attn_shape[0] % num_blocks == 0 - block_split_ratio = attn_shape[0] // num_blocks + # assert we are using FlashInfer + assert attn_shape[0] % num_blocks == 0 + block_split_ratio = attn_shape[0] // num_blocks - # use small blocks for testing to avoid memory issues - test_block_size = min(2, len(blocks0), len(blocks1)) + # use small blocks for testing to avoid memory issues + test_block_size = min(2, len(blocks0), len(blocks1)) - # use non-overlapping blocks to avoid data contamination - # Split kernel blocks: first half for attention, second half for mamba - mid_point = num_blocks // 2 + # use non-overlapping blocks to avoid data contamination + # Split kernel blocks: first half for attention, second half for mamba + mid_point = num_blocks // 2 - # attention uses kernel blocks from first half (mapped to logical blocks) - kv_blocks_for_attention = np.array([0, 1])[:test_block_size] + # attention uses kernel blocks from first half (mapped to logical blocks) + kv_blocks_for_attention = np.array([0, 1])[:test_block_size] - # mamba uses kernel blocks from second half - kv_blocks_for_mamba = np.array([mid_point, mid_point + 1])[:test_block_size] + # mamba uses kernel blocks from second half + kv_blocks_for_mamba = np.array([mid_point, mid_point + 1])[:test_block_size] - # create small constant tensors for testing with corrected shapes - # attention: [block_size, ...] starting from dimension 2 - attn_constant_shape = attn_shape[2:] - conv_constant_shape = conv_shape[1:] - ssm_constant_shape = ssm_shape[1:] + # create small constant tensors for testing with corrected shapes + # attention: [block_size, ...] starting from dimension 2 + attn_constant_shape = attn_shape[2:] + conv_constant_shape = conv_shape[1:] + ssm_constant_shape = ssm_shape[1:] - attn_blocks_constant = torch.full( - (test_block_size, *attn_constant_shape), device=DEVICE, fill_value=3.33 - ) - conv_blocks_constant = torch.full( - (test_block_size, *conv_constant_shape), device=DEVICE, fill_value=6.66 - ) - ssm_blocks_constant = torch.full( - (test_block_size, *ssm_constant_shape), device=DEVICE, fill_value=9.99 - ) + attn_blocks_constant = torch.full( + (test_block_size, *attn_constant_shape), device=DEVICE, fill_value=3.33 + ) + conv_blocks_constant = torch.full( + (test_block_size, *conv_constant_shape), device=DEVICE, fill_value=6.66 + ) + ssm_blocks_constant = torch.full( + (test_block_size, *ssm_constant_shape), device=DEVICE, fill_value=9.99 + ) - # Fill attention blocks with constants using kv block indices - kernel_blocks_for_attention = kv_blocks_for_attention * block_split_ratio + # Fill attention blocks with constants using kv block indices + kernel_blocks_for_attention = kv_blocks_for_attention * block_split_ratio - for layer in [layer_0, layer_1]: - # attention: kv_cache[0][kernel_block_idx, kv_idx, ...] - for i, kernel_block in enumerate(kernel_blocks_for_attention): - vllm_ctx[layer].kv_cache[0][kernel_block, :] = attn_blocks_constant[i] + for layer in [layer_0, layer_1]: + # attention: kv_cache[0][kernel_block_idx, kv_idx, ...] + for i, kernel_block in enumerate(kernel_blocks_for_attention): + vllm_ctx[layer].kv_cache[0][kernel_block, :] = attn_blocks_constant[i] - # fill mamba blocks with constants using kernel block indices - for layer in [layer_2, layer_3, layer_4, layer_5]: - # mamba: kv_cache[0][component][kernel_block_idx, ...] - for i, kv_block in enumerate(kv_blocks_for_mamba): - vllm_ctx[layer].kv_cache[0][0][kv_block, :] = conv_blocks_constant[i] - vllm_ctx[layer].kv_cache[0][1][kv_block, :] = ssm_blocks_constant[i] + # fill mamba blocks with constants using kernel block indices + for layer in [layer_2, layer_3, layer_4, layer_5]: + # mamba: kv_cache[0][component][kernel_block_idx, ...] + for i, kv_block in enumerate(kv_blocks_for_mamba): + vllm_ctx[layer].kv_cache[0][0][kv_block, :] = conv_blocks_constant[i] + vllm_ctx[layer].kv_cache[0][1][kv_block, :] = ssm_blocks_constant[i] - # verify attention and mamba contents are correct - for layer in [layer_0, layer_1]: - for i, kernel_block in enumerate(kernel_blocks_for_attention): - actual_kv = vllm_ctx[layer].kv_cache[0][kernel_block, :] - expected = attn_blocks_constant[i] + # verify attention and mamba contents are correct + for layer in [layer_0, layer_1]: + for i, kernel_block in enumerate(kernel_blocks_for_attention): + actual_kv = vllm_ctx[layer].kv_cache[0][kernel_block, :] + expected = attn_blocks_constant[i] - # Check K and V separately - assert torch.equal(actual_kv[0], expected) - assert torch.equal(actual_kv[1], expected) + # Check K and V separately + assert torch.equal(actual_kv[0], expected) + assert torch.equal(actual_kv[1], expected) - for layer in [layer_2, layer_3, layer_4, layer_5]: - for i, kv_block in enumerate(kv_blocks_for_mamba): - actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :] - actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :] - expected_conv = conv_blocks_constant[i] - expected_ssm = ssm_blocks_constant[i] + for layer in [layer_2, layer_3, layer_4, layer_5]: + for i, kv_block in enumerate(kv_blocks_for_mamba): + actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :] + actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :] + expected_conv = conv_blocks_constant[i] + expected_ssm = ssm_blocks_constant[i] - assert torch.equal(actual_conv, expected_conv) - assert torch.equal(actual_ssm, expected_ssm) + assert torch.equal(actual_conv, expected_conv) + assert torch.equal(actual_ssm, expected_ssm) - for layer in [layer_2, layer_3, layer_4, layer_5]: - for i, kv_block in enumerate(kv_blocks_for_mamba): - actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :] - actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :] - expected_conv = conv_blocks_constant[i] - expected_ssm = ssm_blocks_constant[i] - assert torch.equal(actual_conv, expected_conv) - assert torch.equal(actual_ssm, expected_ssm) + for layer in [layer_2, layer_3, layer_4, layer_5]: + for i, kv_block in enumerate(kv_blocks_for_mamba): + actual_conv = vllm_ctx[layer].kv_cache[0][0][kv_block, :] + actual_ssm = vllm_ctx[layer].kv_cache[0][1][kv_block, :] + expected_conv = conv_blocks_constant[i] + expected_ssm = ssm_blocks_constant[i] + assert torch.equal(actual_conv, expected_conv) + assert torch.equal(actual_ssm, expected_ssm) def test_hybrid_block_table_initialization(): diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index c290670eeacb0..84cca8e686075 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -289,6 +289,16 @@ class AttentionImpl(ABC, Generic[T]): # even if they can return lse (for efficiency reasons) need_to_return_lse_for_decode: bool = False + # Whether this attention implementation supports pre-quantized query input. + # When True, the attention layer will quantize queries before passing them + # to this backend, allowing torch.compile to fuse the quantization with + # previous operations. This is typically supported when using FP8 KV cache + # with compatible attention kernels (e.g., TRT-LLM). + # Subclasses should set this in __init__. + # TODO add support to more backends: + # https://github.com/vllm-project/vllm/issues/25584 + supports_quant_query_input: bool = False + dcp_world_size: int dcp_rank: int @@ -368,22 +378,6 @@ class AttentionImpl(ABC, Generic[T]): """ return False - def supports_quant_query_input(self) -> bool: - """ - Check if this attention implementation supports pre-quantized query input. - - When True, the attention layer will quantize queries before passing them - to this backend, allowing torch.compile to fuse the quantization with - previous operations. This is typically supported when using FP8 KV cache - with compatible attention kernels (e.g., TRT-LLM). - TODO add support to more backends: - https://github.com/vllm-project/vllm/issues/25584 - - Returns: - bool: True if the implementation can accept pre-quantized queries. - """ - return False - def process_weights_after_loading(self, act_dtype: torch.dtype): pass diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index da5a626171298..8a522deedf3ce 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -303,7 +303,7 @@ class Attention(nn.Module, AttentionLayerBase): self.query_quant = None if ( self.kv_cache_dtype.startswith("fp8") - and self.impl.supports_quant_query_input() + and self.impl.supports_quant_query_input ): self.query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR) @@ -338,7 +338,7 @@ class Attention(nn.Module, AttentionLayerBase): assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"} # check if query quantization is supported - if self.impl.supports_quant_query_input(): + if self.impl.supports_quant_query_input: query, _ = self.query_quant(query, self._q_scale) if self.use_output: diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index a7190df3c4f10..aeb130dfe8726 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -2,19 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect -import os -from collections.abc import Generator -from contextlib import contextmanager from functools import cache from typing import cast, get_args import torch -import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.registry import ( MAMBA_TYPE_TO_BACKEND_MAP, - AttentionBackendEnum, MambaAttentionBackendEnum, ) from vllm.config.cache import CacheDType @@ -24,60 +19,6 @@ from vllm.utils.import_utils import resolve_obj_by_qualname logger = init_logger(__name__) -def get_env_variable_attn_backend() -> AttentionBackendEnum | None: - """ - Get the backend override specified by the vLLM attention - backend environment variable, if one is specified. - - Returns: - - * AttentionBackendEnum value if an override is specified - * None otherwise - """ - backend_name = os.environ.get("VLLM_ATTENTION_BACKEND") - if backend_name is None: - return None - if backend_name == "XFORMERS": - raise ValueError( - "Attention backend 'XFORMERS' has been removed (See PR #29262 for " - "details). Please select a supported attention backend." - ) - return AttentionBackendEnum[backend_name] - - -# Global state allows a particular choice of backend -# to be forced, overriding the logic which auto-selects -# a backend based on system & workload configuration -# (default behavior if this variable is None) -# -# THIS SELECTION TAKES PRECEDENCE OVER THE -# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE -forced_attn_backend: AttentionBackendEnum | None = None - - -def global_force_attn_backend(attn_backend: AttentionBackendEnum | None) -> None: - """ - Force all attention operations to use a specified backend. - - Passing `None` for the argument re-enables automatic - backend selection., - - Arguments: - - * attn_backend: backend selection (None to revert to auto) - """ - global forced_attn_backend - forced_attn_backend = attn_backend - - -def get_global_forced_attn_backend() -> AttentionBackendEnum | None: - """ - Get the currently-forced choice of attention backend, - or None if auto-selection is currently enabled. - """ - return forced_attn_backend - - def get_attn_backend( head_size: int, dtype: torch.dtype, @@ -97,7 +38,13 @@ def get_attn_backend( f"Valid values are: {valid_cache_dtypes}" ) + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + backend_enum = vllm_config.attention_config.backend + return _cached_get_attn_backend( + backend=backend_enum, head_size=head_size, dtype=dtype, kv_cache_dtype=cast(CacheDType | None, kv_cache_dtype), @@ -111,6 +58,7 @@ def get_attn_backend( @cache def _cached_get_attn_backend( + backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: CacheDType | None, @@ -120,39 +68,6 @@ def _cached_get_attn_backend( use_sparse: bool = False, attn_type: str | None = None, ) -> type[AttentionBackend]: - # Check whether a particular choice of backend was - # previously forced. - # - # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND - # ENVIRONMENT VARIABLE. - selected_backend = None - backend_by_global_setting: AttentionBackendEnum | None = ( - get_global_forced_attn_backend() - ) - if backend_by_global_setting is not None: - selected_backend = backend_by_global_setting - else: - # Check the environment variable and override if specified - backend_by_env_var: str | None = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - if backend_by_env_var.endswith("_VLLM_V1"): - logger.warning( - "The suffix '_VLLM_V1' in the environment variable " - "VLLM_ATTENTION_BACKEND is no longer necessary as " - "V0 backends have been deprecated. " - "Please remove this suffix from your " - "environment variable setting.", - ) - backend_by_env_var = backend_by_env_var.removesuffix("_VLLM_V1") - try: - selected_backend = AttentionBackendEnum[backend_by_env_var] - except KeyError as e: - raise ValueError( - f"Invalid attention backend: '{backend_by_env_var}'. Valid " - f"backends are: {list(AttentionBackendEnum.__members__.keys())}" - ) from e - - # get device-specific attn_backend from vllm.platforms import current_platform sig = inspect.signature(current_platform.get_attn_backend_cls) @@ -163,7 +78,7 @@ def _cached_get_attn_backend( "remove it from your plugin code." ) attention_cls = current_platform.get_attn_backend_cls( - selected_backend, + backend, head_size, dtype, kv_cache_dtype, @@ -176,7 +91,7 @@ def _cached_get_attn_backend( ) else: attention_cls = current_platform.get_attn_backend_cls( - selected_backend, + backend, head_size, dtype, kv_cache_dtype, @@ -232,37 +147,3 @@ def _cached_get_mamba_attn_backend( mamba_attn_backend = selected_backend.get_class() return mamba_attn_backend - - -@contextmanager -def global_force_attn_backend_context_manager( - attn_backend: AttentionBackendEnum, -) -> Generator[None, None, None]: - """ - Globally force a vLLM attention backend override within a - context manager, reverting the global attention backend - override to its prior state upon exiting the context - manager. - - Arguments: - - * attn_backend: attention backend to force - - Returns: - - * Generator - """ - - # Save the current state of the global backend override (if any) - original_value = get_global_forced_attn_backend() - - # Globally force the new backend override - global_force_attn_backend(attn_backend) - - # Yield control back to the enclosed code block - try: - yield - finally: - # Revert the original global backend override, if any - global_force_attn_backend(original_value) - _cached_get_attn_backend.cache_clear() diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index 8a46587473e43..e38c88f4838d1 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm import envs from vllm.logger import init_logger from vllm.platforms import current_platform @@ -49,10 +48,12 @@ def get_flash_attn_version(requires_alibi: bool = False) -> int | None: 3 if (device_capability.major == 9 and is_fa_version_supported(3)) else 2 ) - # 2. override if passed by environment - if envs.VLLM_FLASH_ATTN_VERSION is not None: - assert envs.VLLM_FLASH_ATTN_VERSION in [2, 3] - fa_version = envs.VLLM_FLASH_ATTN_VERSION + # 2. override if passed by environment or config + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + if vllm_config.attention_config.flash_attn_version is not None: + fa_version = vllm_config.attention_config.flash_attn_version # 3. fallback for unsupported combinations if device_capability.major == 10 and fa_version == 3: diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index dd76a722106ef..0f84f3ca9d3e3 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.config.attention import AttentionConfig from vllm.config.cache import CacheConfig from vllm.config.compilation import ( CompilationConfig, @@ -46,6 +47,8 @@ from vllm.config.vllm import ( # __all__ should only contain classes and functions. # Types and globals should be imported from their respective modules. __all__ = [ + # From vllm.config.attention + "AttentionConfig", # From vllm.config.cache "CacheConfig", # From vllm.config.compilation diff --git a/vllm/config/attention.py b/vllm/config/attention.py new file mode 100644 index 0000000000000..dd62d88826bd6 --- /dev/null +++ b/vllm/config/attention.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any, Literal + +from pydantic import field_validator +from pydantic.dataclasses import dataclass + +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.config.utils import config +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@config +@dataclass +class AttentionConfig: + """Configuration for attention mechanisms in vLLM.""" + + backend: AttentionBackendEnum | None = None + """Attention backend to use. If None, will be selected automatically.""" + + flash_attn_version: Literal[2, 3] | None = None + """Force vllm to use a specific flash-attention version (2 or 3). + Only valid when using the flash-attention backend.""" + + use_prefill_decode_attention: bool = False + """Use separate prefill and decode kernels for attention instead of + the unified triton kernel.""" + + flash_attn_max_num_splits_for_cuda_graph: int = 32 + """Flash Attention max number splits for cuda graph decode.""" + + use_cudnn_prefill: bool = False + """Whether to use cudnn prefill.""" + + use_trtllm_ragged_deepseek_prefill: bool = False + """Whether to use TRTLLM ragged deepseek prefill.""" + + use_trtllm_attention: bool | None = None + """If set to True/False, use or don't use the TRTLLM attention backend + in flashinfer. If None, auto-detect the attention backend in flashinfer.""" + + disable_flashinfer_prefill: bool = False + """Whether to disable flashinfer prefill.""" + + disable_flashinfer_q_quantization: bool = False + """If set, when using fp8 kv, do not quantize Q to fp8.""" + + def compute_hash(self) -> str: + """ + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + from vllm.config.utils import get_hash_factors, hash_factors + + ignored_factors: list[str] = [] + factors = get_hash_factors(self, ignored_factors) + return hash_factors(factors) + + @field_validator("backend", mode="before") + @classmethod + def validate_backend_before(cls, value: Any) -> Any: + """Enable parsing of the `backend` enum type from string.""" + if isinstance(value, str): + return AttentionBackendEnum[value.upper()] + return value + + def _set_from_env_if_set(self, field_name: str, env_var_name: str) -> None: + """Set field from env var if set, with deprecation warning.""" + from vllm import envs + + if envs.is_set(env_var_name): + value = getattr(envs, env_var_name) + if field_name == "backend": + value = self.validate_backend_before(value) + setattr(self, field_name, value) + logger.warning_once( + "Using %s environment variable is deprecated and will be removed in " + "v0.14.0 or v1.0.0, whichever is soonest. Please use " + "--attention-config.%s command line argument or " + "AttentionConfig(%s=...) config field instead.", + env_var_name, + field_name, + field_name, + ) + + def __post_init__(self) -> None: + self._set_from_env_if_set("backend", "VLLM_ATTENTION_BACKEND") + self._set_from_env_if_set("flash_attn_version", "VLLM_FLASH_ATTN_VERSION") + self._set_from_env_if_set( + "use_prefill_decode_attention", "VLLM_V1_USE_PREFILL_DECODE_ATTENTION" + ) + self._set_from_env_if_set( + "flash_attn_max_num_splits_for_cuda_graph", + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", + ) + self._set_from_env_if_set("use_cudnn_prefill", "VLLM_USE_CUDNN_PREFILL") + self._set_from_env_if_set( + "use_trtllm_ragged_deepseek_prefill", + "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", + ) + self._set_from_env_if_set("use_trtllm_attention", "VLLM_USE_TRTLLM_ATTENTION") + self._set_from_env_if_set( + "disable_flashinfer_prefill", "VLLM_DISABLE_FLASHINFER_PREFILL" + ) + self._set_from_env_if_set( + "disable_flashinfer_q_quantization", + "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", + ) diff --git a/vllm/config/model.py b/vllm/config/model.py index ae5189ce68d9a..5be7d5e7f2df7 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -4,7 +4,6 @@ import warnings from collections.abc import Callable from dataclasses import InitVar, field -from importlib.util import find_spec from typing import TYPE_CHECKING, Any, Literal, cast, get_args import torch @@ -467,18 +466,6 @@ class ModelConfig: self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) - if ( - (backend := envs.VLLM_ATTENTION_BACKEND) - and backend == "FLASHINFER" - and find_spec("flashinfer") is None - ): - raise ValueError( - "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " - "module was not found. See " - "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 - "for instructions on how to install it." - ) - from vllm.platforms import current_platform if self.override_attention_dtype is not None and not current_platform.is_rocm(): diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 823bd96db9ac9..ce3d3b20865db 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -27,6 +27,7 @@ from vllm.transformers_utils.runai_utils import is_runai_obj_uri from vllm.utils import random_uuid from vllm.utils.hashing import safe_hash +from .attention import AttentionConfig from .cache import CacheConfig from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode from .device import DeviceConfig @@ -192,6 +193,8 @@ class VllmConfig: """Device configuration.""" load_config: LoadConfig = Field(default_factory=LoadConfig) """Load configuration.""" + attention_config: AttentionConfig = Field(default_factory=AttentionConfig) + """Attention configuration.""" lora_config: LoRAConfig | None = None """LoRA configuration.""" speculative_config: SpeculativeConfig | None = None @@ -279,6 +282,10 @@ class VllmConfig: vllm_factors.append(self.load_config.compute_hash()) else: vllm_factors.append("None") + if self.attention_config: + vllm_factors.append(self.attention_config.compute_hash()) + else: + vllm_factors.append("None") if self.lora_config: vllm_factors.append(self.lora_config.compute_hash()) else: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 883ae370f9e74..aad0719548d18 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -34,6 +34,7 @@ from typing_extensions import TypeIs import vllm.envs as envs from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( + AttentionConfig, CacheConfig, CompilationConfig, ConfigType, @@ -527,6 +528,7 @@ class EngineArgs: pooler_config: PoolerConfig | None = ModelConfig.pooler_config compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config") + attention_config: AttentionConfig = get_field(VllmConfig, "attention_config") worker_cls: str = ParallelConfig.worker_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls @@ -542,6 +544,7 @@ class EngineArgs: ) model_impl: str = ModelConfig.model_impl override_attention_dtype: str = ModelConfig.override_attention_dtype + attention_backend: AttentionBackendEnum | None = AttentionConfig.backend calculate_kv_scales: bool = CacheConfig.calculate_kv_scales mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype @@ -580,6 +583,8 @@ class EngineArgs: # CompilationConfig object if isinstance(self.compilation_config, dict): self.compilation_config = CompilationConfig(**self.compilation_config) + if isinstance(self.attention_config, dict): + self.attention_config = AttentionConfig(**self.attention_config) if isinstance(self.eplb_config, dict): self.eplb_config = EPLBConfig(**self.eplb_config) # Setup plugins @@ -717,6 +722,16 @@ class EngineArgs: "--pt-load-map-location", **load_kwargs["pt_load_map_location"] ) + # Attention arguments + attention_kwargs = get_kwargs(AttentionConfig) + attention_group = parser.add_argument_group( + title="AttentionConfig", + description=AttentionConfig.__doc__, + ) + attention_group.add_argument( + "--attention-backend", **attention_kwargs["backend"] + ) + # Structured outputs arguments structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig) structured_outputs_group = parser.add_argument_group( @@ -1140,6 +1155,9 @@ class EngineArgs: vllm_group.add_argument( "--compilation-config", "-cc", **vllm_kwargs["compilation_config"] ) + vllm_group.add_argument( + "--attention-config", "-ac", **vllm_kwargs["attention_config"] + ) vllm_group.add_argument( "--additional-config", **vllm_kwargs["additional_config"] ) @@ -1693,6 +1711,16 @@ class EngineArgs: if model_config.quantization == "bitsandbytes": self.quantization = self.load_format = "bitsandbytes" + # Attention config overrides + attention_config = copy.deepcopy(self.attention_config) + if self.attention_backend is not None: + if attention_config.backend is not None: + raise ValueError( + "attention_backend and attention_config.backend " + "are mutually exclusive" + ) + attention_config.backend = self.attention_backend + load_config = self.create_load_config() # Pass reasoning_parser into StructuredOutputsConfig @@ -1750,9 +1778,10 @@ class EngineArgs: parallel_config=parallel_config, scheduler_config=scheduler_config, device_config=device_config, + load_config=load_config, + attention_config=attention_config, lora_config=lora_config, speculative_config=speculative_config, - load_config=load_config, structured_outputs_config=self.structured_outputs_config, observability_config=observability_config, compilation_config=compilation_config, diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index fbeb28a1c0b36..55dd6e50ad249 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -4,7 +4,7 @@ from copy import deepcopy from math import lcm from typing import TYPE_CHECKING -import vllm.envs as envs +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform @@ -331,6 +331,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): # Enable FULL_AND_PIECEWISE by default MambaModelConfig.verify_and_update_config(vllm_config) + attention_config = vllm_config.attention_config cache_config = vllm_config.cache_config model_config = vllm_config.model_config parallel_config = vllm_config.parallel_config @@ -347,7 +348,9 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): # * CUTLASS_MLA backend: kernel_block_size 128 alignment # * Other MLA backends: kernel_block_size 64 alignment if model_config.use_mla: - use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA" + use_cutlass_mla = ( + attention_config.backend == AttentionBackendEnum.CUTLASS_MLA + ) kernel_block_alignment_size = 128 if use_cutlass_mla else 64 attn_page_size_1_token = MLAAttentionSpec( block_size=1, @@ -361,8 +364,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): current_platform.is_device_capability(100) and model_config.get_head_size() == 256 and ( - envs.VLLM_ATTENTION_BACKEND is None - or envs.VLLM_ATTENTION_BACKEND == "FLASHINFER" + attention_config.backend is None + or attention_config.backend == AttentionBackendEnum.FLASHINFER ) ): # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that` diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index e5d70eb7bc2fc..7602eca9c3257 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -11,7 +11,7 @@ import torch from transformers import PretrainedConfig from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.config import VllmConfig +from vllm.config import VllmConfig, get_current_vllm_config from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -91,10 +91,7 @@ def get_vit_attn_backend( if attn_backend_override is not None: return attn_backend_override - # Lazy import to avoid circular dependency - from vllm.attention.selector import get_env_variable_attn_backend - - selected_backend: AttentionBackendEnum | None = get_env_variable_attn_backend() + selected_backend = get_current_vllm_config().attention_config.backend if selected_backend is not None: return selected_backend diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 1467ca71efec1..7e6ce6aeef536 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -14,7 +14,6 @@ from typing_extensions import ParamSpec # import custom ops, trigger op registration import vllm._C # noqa -import vllm.envs as envs from vllm.attention.backends.abstract import AttentionType from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger @@ -149,6 +148,8 @@ class CudaPlatformBase(Platform): @classmethod def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: + from vllm.attention.backends.registry import AttentionBackendEnum + parallel_config = vllm_config.parallel_config model_config = vllm_config.model_config @@ -171,7 +172,7 @@ class CudaPlatformBase(Platform): and cache_config.block_size is not None ): use_sparse = hasattr(vllm_config.model_config.hf_config, "index_topk") - # If `VLLM_ATTENTION_BACKEND` is not set and we are using MLA, + # If `--attention-config.backend` is not set and we are using MLA, # then we default to FlashMLA backend for non-blackwell GPUs, # else we default to CutlassMLA. For each case, we force the # required block_size. @@ -179,23 +180,25 @@ class CudaPlatformBase(Platform): use_cutlass_mla = False use_flashinfer_mla = False - if envs.VLLM_ATTENTION_BACKEND is None: + if vllm_config.attention_config.backend is None: # Default case if cls.is_device_capability(100): # Blackwell => Force CutlassMLA. use_cutlass_mla = True - # TODO: This does not work, because the - # global_force_attn_backend_context_manager is not set. - # See vllm/attention/selector.py:_cached_get_attn_backend - envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA" + # Set the backend in AttentionConfig so it's used during + # backend selection + vllm_config.attention_config.backend = ( + AttentionBackendEnum.CUTLASS_MLA + ) else: # Not Blackwell use_flashmla = True else: # Forced case - use_flashmla = envs.VLLM_ATTENTION_BACKEND == "FLASHMLA" - use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA" - use_flashinfer_mla = envs.VLLM_ATTENTION_BACKEND == "FLASHINFER_MLA" + backend = vllm_config.attention_config.backend + use_flashmla = backend == AttentionBackendEnum.FLASHMLA + use_cutlass_mla = backend == AttentionBackendEnum.CUTLASS_MLA + use_flashinfer_mla = backend == AttentionBackendEnum.FLASHINFER_MLA from vllm.attention.ops.flashmla import is_flashmla_dense_supported diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 9f9976d52b4ae..7aaf690cbaa13 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -267,21 +267,16 @@ def supports_trtllm_attention() -> bool: return current_platform.is_device_capability(100) and has_nvidia_artifactory() -@functools.cache -def _force_use_trtllm_attention(env_value: bool | None) -> bool | None: - """Cache the env value for VLLM_USE_TRTLLM_ATTENTION""" - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) - return env_value - - def force_use_trtllm_attention() -> bool | None: """ - Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set, + Return `None` if --attention-config.use_trtllm_attention is not set, return `True` if TRTLLM attention is forced to be used, return `False` if TRTLLM attention is forced to be not used. """ - return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION) + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + return vllm_config.attention_config.use_trtllm_attention def can_use_trtllm_attention(num_qo_heads: int, num_kv_heads: int) -> bool: @@ -307,7 +302,7 @@ def use_trtllm_attention( """Return `True` if TRTLLM attention is used.""" force_use_trtllm = force_use_trtllm_attention() - # Environment variable is set to 0 - respect it + # CLI argument is set to 0 - respect it if force_use_trtllm is not None and not force_use_trtllm: return False @@ -324,7 +319,7 @@ def use_trtllm_attention( if force_use_trtllm: logger.warning_once( "TRTLLM attention is not supported on this platform, " - "but VLLM_USE_TRTLLM_ATTENTION is set to 1" + "but --attention-config.use_trtllm_attention is set to 1" ) return False @@ -333,7 +328,8 @@ def use_trtllm_attention( if force_use_trtllm: logger.warning_once( "TRTLLM attention is not supported for this combination of " - "query and key heads, but VLLM_USE_TRTLLM_ATTENTION is set to 1" + "query and key heads, but --attention-config.use_trtllm_attention is " + "set to 1" ) return False @@ -354,7 +350,7 @@ def use_trtllm_attention( return True if force_use_trtllm is None: - # Environment variable not set - use auto-detection + # CLI argument not set - use auto-detection if is_prefill: # Prefill auto-detection use_trtllm = kv_cache_dtype == "auto" @@ -367,8 +363,10 @@ def use_trtllm_attention( logger.warning_once("Using TRTLLM decode attention (auto-detected).") return use_trtllm - # Environment variable is set to 1 - respect it - logger.info_once("Using TRTLLM attention (VLLM_USE_TRTLLM_ATTENTION is set to 1)") + # CLI argument is set to 1 - respect it + logger.info_once( + "Using TRTLLM attention (--attention-config.use_trtllm_attention is set to 1)" + ) return True @@ -500,12 +498,6 @@ def flashinfer_scaled_fp8_mm( return output -@functools.cache -def flashinfer_disable_q_quantization() -> bool: - """Cache result which only depends on the environment""" - return envs.VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION - - __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", @@ -526,7 +518,6 @@ __all__ = [ "supports_trtllm_attention", "can_use_trtllm_attention", "use_trtllm_attention", - "flashinfer_disable_q_quantization", "flashinfer_scaled_fp4_mm", "flashinfer_scaled_fp8_mm", ] diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index fb080b0b33bc0..f5ad98cf2125c 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -8,7 +8,6 @@ from typing import ClassVar import numpy as np import torch -from vllm import envs from vllm.attention.backends.abstract import ( AttentionBackend, AttentionImpl, @@ -264,6 +263,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad self.parallel_config = vllm_config.parallel_config self.cache_config = vllm_config.cache_config self.compilation_config = vllm_config.compilation_config + self.attention_config = vllm_config.attention_config self.num_heads_q = self.model_config.get_num_attention_heads( self.parallel_config @@ -304,7 +304,9 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad # When using cuda graph, we need to set the upper bound of the # number of splits so that large enough intermediate buffers are # pre-allocated during capture. - self.max_num_splits = envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + self.max_num_splits = ( + self.attention_config.flash_attn_max_num_splits_for_cuda_graph + ) # Sliding window size to be used with the AOT scheduler will be # populated on first build() call. @@ -554,8 +556,7 @@ class FlashAttentionImpl(AttentionImpl): "heads in the layer" ) - def supports_quant_query_input(self) -> bool: - return True + self.supports_quant_query_input = True def forward( self, diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 3d9640a2d4024..8e9d764e4a123 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -26,7 +26,7 @@ from vllm.attention.backends.abstract import ( ) from vllm.attention.ops.common import cp_lse_ag_out_rs from vllm.attention.ops.merge_attn_states import merge_attn_states -from vllm.config import CUDAGraphMode, VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig, get_current_vllm_config from vllm.config.cache import CacheDType from vllm.distributed.parallel_state import get_dcp_group from vllm.logger import init_logger @@ -43,7 +43,6 @@ from vllm.platforms.interface import DeviceCapability from vllm.triton_utils import tl, triton from vllm.utils.flashinfer import ( can_use_trtllm_attention, - flashinfer_disable_q_quantization, use_trtllm_attention, ) from vllm.utils.math_utils import cdiv @@ -362,7 +361,8 @@ class FlashInferBackend(AttentionBackend): supports_trtllm_attention, ) - # Respect explicit disable flag (e.g., VLLM_USE_TRTLLM_ATTENTION=0) + # Respect explicit disable flag (e.g., + # --attention-config.use_trtllm_attention=0) if force_use_trtllm_attention() is False: return False @@ -500,11 +500,14 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.kv_cache_dtype = self.kv_cache_spec.dtype # Use model dtype as q dtype when TRTLLM attn is not supported, or - # VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION is set to 1. Otherwise, try to - # use fp8 q if kv cache is fp8, and will fall back to model dtype + # --attention-config.disable_flashinfer_q_quantization is set to 1. Otherwise, + # try to use fp8 q if kv cache is fp8, and will fall back to model dtype # if TRTLLM attention kernel is not used when building attn metadata can_use_trtllm = can_use_trtllm_attention(self.num_qo_heads, self.num_kv_heads) - if can_use_trtllm and not flashinfer_disable_q_quantization(): + if ( + can_use_trtllm + and not vllm_config.attention_config.disable_flashinfer_q_quantization + ): self.q_data_type = self.kv_cache_dtype else: self.q_data_type = self.model_config.dtype @@ -1035,6 +1038,11 @@ class FlashInferImpl(AttentionImpl): self.sinks = sinks self.support_trtllm_attn = can_use_trtllm_attention(num_heads, num_kv_heads) + vllm_config = get_current_vllm_config() + self.supports_quant_query_input = ( + self.support_trtllm_attn + and not vllm_config.attention_config.disable_flashinfer_q_quantization + ) self.bmm1_scale: float | None = None self.bmm2_scale: float | None = None self.o_sf_scale: float | None = None @@ -1046,12 +1054,6 @@ class FlashInferImpl(AttentionImpl): and quant_key in (kFp8StaticTensorSym, kNvfp4Quant) ) - def supports_quant_query_input(self) -> bool: - if flashinfer_disable_q_quantization(): - return False - - return self.support_trtllm_attn - # FlashInfer requires attention sinks to be float32 def process_weights_after_loading(self, act_dtype: torch.dtype): if self.sinks is not None and self.sinks.dtype != torch.float32: diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 180625b6ce897..309ddee4fc2f0 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -438,19 +438,25 @@ A = TypeVar("A") def use_flashinfer_prefill() -> bool: # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() return ( - not envs.VLLM_DISABLE_FLASHINFER_PREFILL + not vllm_config.attention_config.disable_flashinfer_prefill and flashinfer_available - and not envs.VLLM_USE_CUDNN_PREFILL - and not envs.VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL + and not vllm_config.attention_config.use_cudnn_prefill + and not vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill and current_platform.is_device_capability(100) ) def use_cudnn_prefill() -> bool: + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() return ( flashinfer_available - and envs.VLLM_USE_CUDNN_PREFILL + and vllm_config.attention_config.use_cudnn_prefill and current_platform.is_device_capability(100) and has_nvidia_artifactory() ) @@ -458,9 +464,12 @@ def use_cudnn_prefill() -> bool: def use_trtllm_ragged_deepseek_prefill() -> bool: """Check if TRT-LLM ragged DeepSeek prefill should be used.""" + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() return ( flashinfer_available - and envs.VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL + and vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill and current_platform.is_device_capability(100) ) diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index d369814c10b6f..eccf4ec791095 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -6,7 +6,6 @@ from typing import ClassVar import torch -from vllm import envs from vllm.attention.backends.abstract import ( AttentionLayer, AttentionType, @@ -131,7 +130,9 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] # When using cuda graph, we need to set the upper bound of the # number of splits so that large enough intermediate buffers are # pre-allocated during capture. - self.max_num_splits = envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + self.max_num_splits = ( + vllm_config.attention_config.flash_attn_max_num_splits_for_cuda_graph + ) if vllm_is_batch_invariant(): self.max_num_splits = 1 diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index 868143cc192e7..e2410a70b1a63 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -165,7 +165,7 @@ class RocmAttentionBackend(AttentionBackend): raise ValueError( f"Head size {head_size} is not supported by {attn_type}. " f"Supported head sizes are: {cls.get_supported_head_sizes()}. " - "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " + "Set --attention-config.backend=FLEX_ATTENTION to use " "FlexAttention backend which supports all head sizes." ) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index d051a89f03bb4..3b17c4bcd89cc 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -210,9 +210,6 @@ class TritonAttentionImpl(AttentionImpl): def fused_output_quant_supported(self, quant_key: QuantKey): return quant_key == kFp8StaticTensorSym - def supports_quant_query_input(self) -> bool: - return current_platform.is_cuda() - def __init__( self, num_heads: int, @@ -262,6 +259,8 @@ class TritonAttentionImpl(AttentionImpl): f"num_heads: {num_heads}." ) + self.supports_quant_query_input = current_platform.is_cuda() + def forward( self, layer: torch.nn.Module, From 4e26d3b09e9895279bf127f225c5f45922612261 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Fri, 5 Dec 2025 19:17:32 +0100 Subject: [PATCH 012/133] [Compile] Conditional compilation. Introduce compile_ranges (#24252) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič Signed-off-by: Luka Govedič Signed-off-by: ilmarkov Signed-off-by: Luka Govedič Signed-off-by: ProExpertProg Co-authored-by: Luka Govedič Co-authored-by: Luka Govedič Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Luka Govedič --- tests/compile/distributed/test_fusions_e2e.py | 23 ++- tests/compile/test_compile_ranges.py | 168 ++++++++++++++++++ tests/conftest.py | 14 ++ vllm/compilation/backends.py | 147 ++++++--------- vllm/compilation/collective_fusion.py | 146 +++++++-------- vllm/compilation/compiler_interface.py | 40 ++--- vllm/compilation/inductor_pass.py | 11 +- vllm/compilation/pass_manager.py | 16 +- vllm/compilation/piecewise_backend.py | 123 ++++++++----- vllm/compilation/sequence_parallelism.py | 5 +- vllm/config/compilation.py | 38 +++- vllm/config/utils.py | 34 +++- vllm/config/vllm.py | 48 +++++ vllm/v1/worker/gpu_worker.py | 35 +++- vllm/v1/worker/utils.py | 2 +- 15 files changed, 582 insertions(+), 268 deletions(-) create mode 100644 tests/compile/test_compile_ranges.py diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 5d2786e122a61..75a81efedea3b 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -298,10 +298,14 @@ def test_tp2_attn_quant_allreduce_rmsnorm( r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", log_holder.text, ) - assert len(log_matches) == 2, log_holder.text + # 2 for each compile range + # (global compile range can be split due to fuse_allreduce_rmsnorm) + num_compile_ranges = len(compilation_config.get_compile_ranges()) + assert num_compile_ranges in [1, 2] - assert int(log_matches[0]) == matches.attention_fusion - assert int(log_matches[1]) == matches.attention_fusion + assert len(log_matches) == 2 * num_compile_ranges, log_holder.text + + assert all(int(log_match) == matches.attention_fusion for log_match in log_matches) log_matches = re.findall( r"collective_fusion.py:\d+] Replaced (\d+) patterns", @@ -312,6 +316,12 @@ def test_tp2_attn_quant_allreduce_rmsnorm( assert int(log_matches[0]) == matches.allreduce_fusion assert int(log_matches[1]) == matches.allreduce_fusion + log_matches = re.findall( + r"pass_manager.py:\d+] Skipping .*AllReduceFusionPass.* with compile range", + log_holder.text, + ) + assert len(log_matches) == 2 * (num_compile_ranges - 1), log_holder.text + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( @@ -446,7 +456,6 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg # No cudagraphs by default if compilation_config.cudagraph_mode is None: compilation_config.cudagraph_mode = CUDAGraphMode.NONE - llm = LLM( model=model, compilation_config=compilation_config, @@ -459,3 +468,9 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Get the compile ranges split points after vllm config post init + # in order to compute compile ranges correctly + compilation_config.compile_ranges_split_points = ( + llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points + ) diff --git a/tests/compile/test_compile_ranges.py b/tests/compile/test_compile_ranges.py new file mode 100644 index 0000000000000..d849a8617ebd2 --- /dev/null +++ b/tests/compile/test_compile_ranges.py @@ -0,0 +1,168 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +import torch +from torch import fx as fx +from torch import nn + +# This import automatically registers `torch.ops.silly.attention` +import tests.compile.silly_attention # noqa +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import support_torch_compile +from vllm.compilation.inductor_pass import ( + InductorPass, + get_pass_context, +) +from vllm.config import ( + VllmConfig, + set_current_vllm_config, +) +from vllm.config.compilation import CompilationConfig, CompilationMode +from vllm.config.scheduler import SchedulerConfig +from vllm.config.utils import Range +from vllm.forward_context import set_forward_context + +BATCH_SIZE = 64 +MLP_SIZE = 128 + + +@support_torch_compile +class TestModel(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + x + attn_output = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, attn_output) + x = attn_output + x = x * 3 + return x + + +@torch.inference_mode +def run_model(vllm_config: VllmConfig, model: nn.Module, batch_sizes: list[int]): + with set_forward_context({}, vllm_config=vllm_config): + model(torch.randn(BATCH_SIZE, MLP_SIZE)) + for batch_size in batch_sizes: + model(torch.randn(batch_size, MLP_SIZE)) + + +class PostGradRangeChecker(InductorPass): + def __init__(self, ranges: list[Range]): + self.ranges = ranges + self.num_calls = 0 + + def __call__(self, graph: fx.Graph): + compile_range = get_pass_context().compile_range + assert compile_range in self.ranges, ( + f"Compile range {compile_range} not in {self.ranges}" + ) + self.num_calls += 1 + + def uuid(self) -> str: + state: dict[str, Any] = {} + return InductorPass.hash_dict(state) + + +def test_compile_ranges(use_fresh_inductor_cache): + post_grad_range_checker = PostGradRangeChecker( + [ + Range(start=1, end=8), + Range(start=16, end=16), + Range(start=9, end=32), + Range(start=64, end=64), + Range(start=33, end=8192), + ] + ) + torch.set_default_device("cuda") + vllm_config = VllmConfig( + scheduler_config=SchedulerConfig( + max_num_batched_tokens=8192, + ), + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + compile_ranges_split_points=[8, 32], + compile_sizes=[16, 64, 128], + inductor_compile_config={ + "post_grad_custom_post_pass": post_grad_range_checker, + }, + ), + ) + + with set_current_vllm_config(vllm_config): + model = TestModel(vllm_config=vllm_config, prefix="").eval() + # Number of compilations: 3 for each compile range + 2 compile sizes + batch_sizes = [1, 4, 16, 24, 48, 64, 8192] + + with compilation_counter.expect( + num_graphs_seen=1, + num_piecewise_graphs_seen=1, + num_backend_compilations=5, + ): + run_model(vllm_config, model, batch_sizes) + assert post_grad_range_checker.num_calls == 5 + + +def test_compile_config_get_compile_ranges(): + compilation_config = CompilationConfig( + compile_ranges_split_points=[8, 32], + ) + VllmConfig( + scheduler_config=SchedulerConfig( + max_num_batched_tokens=8192, + ), + compilation_config=compilation_config, + ) + assert compilation_config.get_compile_ranges() == [ + Range(start=1, end=8), + Range(start=9, end=32), + Range(start=33, end=8192), + ] + + +def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache): + # To force multiple compilations, we disable the compile cache + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + post_grad_range_checker = PostGradRangeChecker( + ranges=[ + Range(start=1, end=8), + Range(start=9, end=8192), + ] + ) + scheduler_config = SchedulerConfig( + max_num_batched_tokens=8192, + ) + torch.set_default_device("cuda") + + def create_vllm_config(): + return VllmConfig( + scheduler_config=scheduler_config, + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + compile_ranges_split_points=[8], + inductor_compile_config={ + "post_grad_custom_post_pass": post_grad_range_checker, + }, + ), + ) + + vllm_config_1 = create_vllm_config() + with set_current_vllm_config(vllm_config_1): + model1 = TestModel(vllm_config=vllm_config_1, prefix="").eval() + batch_sizes = [1, 16] + run_model(vllm_config_1, model1, batch_sizes) + assert post_grad_range_checker.num_calls == 2 + + post_grad_range_checker.num_calls = 0 + # Create a new vllm config with the new pass context + vllm_config_2 = create_vllm_config() + with set_current_vllm_config(vllm_config_2): + model2 = TestModel(vllm_config=vllm_config_2, prefix="").eval() + batch_sizes = [4, 32] + run_model(vllm_config_2, model2, batch_sizes) + # Check that cache is used, so the number of calls + # should be 0 + assert post_grad_range_checker.num_calls == 0 diff --git a/tests/conftest.py b/tests/conftest.py index 204452b5835ce..0d456fb364cda 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -67,6 +67,9 @@ from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_num_threads +from torch._inductor.utils import fresh_cache + + if TYPE_CHECKING: from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from transformers.generation.utils import GenerateOutput @@ -1465,3 +1468,14 @@ def clean_gpu_memory_between_tests(): if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() + + +@pytest.fixture +def use_fresh_inductor_cache(): + """ + Use a fresh inductor cache for the test. + This is useful to ensure that the test is not affected by the + previous test calls. + """ + with fresh_cache(): + yield diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index b5b7fe2b76c27..26f4f16a845a2 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -26,7 +26,7 @@ from vllm.compilation.partition_rules import ( should_split, ) from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig -from vllm.config.utils import hash_factors +from vllm.config.utils import Range, hash_factors from vllm.logger import init_logger from vllm.logging_utils import lazy from vllm.platforms import current_platform @@ -90,7 +90,7 @@ class CompilerManager: """ def __init__(self, compilation_config: CompilationConfig): - self.cache: dict[tuple[int | None, int, str], Any] = dict() + self.cache: dict[tuple[Range, int, str], Any] = dict() self.is_cache_updated = False self.compilation_config = compilation_config self.compiler = make_compiler(compilation_config) @@ -99,11 +99,11 @@ class CompilerManager: return self.compiler.compute_hash(vllm_config) @contextmanager - def compile_context(self, runtime_shape: int | None = None): + def compile_context(self, compile_range: Range): """Provide compilation context for the duration of compilation to set any torch global properties we want to scope to a single Inductor compilation (e.g. partition rules, pass context).""" - with pass_context(runtime_shape): + with pass_context(compile_range): if self.compilation_config.use_inductor_graph_partition: with inductor_partition_rule_context( self.compilation_config.splitting_ops @@ -159,29 +159,21 @@ class CompilerManager: graph: fx.GraphModule, example_inputs: list[Any], graph_index: int, - runtime_shape: int | None = None, + compile_range: Range, ) -> Callable | None: - if (runtime_shape, graph_index, self.compiler.name) not in self.cache: + if (compile_range, graph_index, self.compiler.name) not in self.cache: return None - handle = self.cache[(runtime_shape, graph_index, self.compiler.name)] + handle = self.cache[(compile_range, graph_index, self.compiler.name)] compiled_graph = self.compiler.load( - handle, graph, example_inputs, graph_index, runtime_shape + handle, graph, example_inputs, graph_index, compile_range + ) + logger.debug( + "Directly load the %s-th graph for compile range %sfrom %s via handle %s", + graph_index, + str(compile_range), + self.compiler.name, + handle, ) - if runtime_shape is None: - logger.debug( - "Directly load the %s-th graph for dynamic shape from %s via handle %s", - graph_index, - self.compiler.name, - handle, - ) - else: - logger.debug( - "Directly load the %s-th graph for shape %s from %s via handle %s", - graph_index, - str(runtime_shape), - self.compiler.name, - handle, - ) return compiled_graph def compile( @@ -190,9 +182,9 @@ class CompilerManager: example_inputs, additional_inductor_config, compilation_config: CompilationConfig, + compile_range: Range, graph_index: int = 0, num_graphs: int = 1, - runtime_shape: int | None = None, ) -> Any: if graph_index == 0: # before compiling the first graph, record the start time @@ -204,7 +196,7 @@ class CompilerManager: compiled_graph = None # try to load from the cache - compiled_graph = self.load(graph, example_inputs, graph_index, runtime_shape) + compiled_graph = self.load(graph, example_inputs, graph_index, compile_range) if compiled_graph is not None: if graph_index == num_graphs - 1: # after loading the last graph for this shape, record the time. @@ -212,19 +204,12 @@ class CompilerManager: now = time.time() elapsed = now - compilation_start_time compilation_config.compilation_time += elapsed - if runtime_shape is None: - logger.info( - "Directly load the compiled graph(s) for dynamic shape " - "from the cache, took %.3f s", - elapsed, - ) - else: - logger.info( - "Directly load the compiled graph(s) for shape %s " - "from the cache, took %.3f s", - str(runtime_shape), - elapsed, - ) + logger.info( + "Directly load the compiled graph(s) for compile range %s " + "from the cache, took %.3f s", + str(compile_range), + elapsed, + ) return compiled_graph # no compiler cached the graph, or the cache is disabled, @@ -233,14 +218,15 @@ class CompilerManager: # Let compile_fx generate a key for us maybe_key = None else: - maybe_key = f"artifact_shape_{runtime_shape}_subgraph_{graph_index}" - - with self.compile_context(runtime_shape): + maybe_key = "artifact_compile_range_" + maybe_key += f"{compile_range.start}_{compile_range.end}" + maybe_key += f"_subgraph_{graph_index}" + with self.compile_context(compile_range): compiled_graph, handle = self.compiler.compile( graph, example_inputs, additional_inductor_config, - runtime_shape, + compile_range, maybe_key, ) @@ -248,55 +234,34 @@ class CompilerManager: # store the artifact in the cache if is_compile_cache_enabled(additional_inductor_config) and handle is not None: - self.cache[(runtime_shape, graph_index, self.compiler.name)] = handle + self.cache[(compile_range, graph_index, self.compiler.name)] = handle compilation_counter.num_cache_entries_updated += 1 self.is_cache_updated = True if graph_index == 0: # adds some info logging for the first graph - if runtime_shape is None: - logger.info_once( - "Cache the graph for dynamic shape for later use", scope="local" - ) - else: - logger.info_once( - "Cache the graph of shape %s for later use", - str(runtime_shape), - scope="local", - ) - if runtime_shape is None: - logger.debug( - "Store the %s-th graph for dynamic shape from %s via handle %s", - graph_index, - self.compiler.name, - handle, - ) - else: - logger.debug( - "Store the %s-th graph for shape %s from %s via handle %s", - graph_index, - str(runtime_shape), - self.compiler.name, - handle, + logger.info_once( + "Cache the graph of compile range %s for later use", + str(compile_range), ) + logger.debug( + "Store the %s-th graph for compile range%s from %s via handle %s", + graph_index, + str(compile_range), + self.compiler.name, + handle, + ) # after compiling the last graph, record the end time if graph_index == num_graphs - 1: now = time.time() elapsed = now - compilation_start_time compilation_config.compilation_time += elapsed - if runtime_shape is None: - logger.info_once( - "Compiling a graph for dynamic shape takes %.2f s", - elapsed, - scope="local", - ) - else: - logger.info_once( - "Compiling a graph for shape %s takes %.2f s", - runtime_shape, - elapsed, - scope="local", - ) + logger.info_once( + "Compiling a graph for compile range %s takes %.2f s", + str(compile_range), + elapsed, + scope="local", + ) return compiled_graph @@ -427,19 +392,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): sym_shape_indices = [ i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] - global compilation_start_time - compiled_graph_for_dynamic_shape = ( - self.vllm_backend.compiler_manager.compile( - submod, - args, - self.vllm_backend.inductor_config, - self.compilation_config, - graph_index=index, - num_graphs=len(self.compile_submod_names), - runtime_shape=None, - ) - ) # Lazy import here to avoid circular import from .piecewise_backend import PiecewiseBackend @@ -449,7 +402,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): index, len(self.compile_submod_names), sym_shape_indices, - compiled_graph_for_dynamic_shape, self.vllm_backend, ) @@ -589,8 +541,13 @@ class VllmBackend: ) else: # Config should automatically wrap all inductor passes - assert isinstance(self.inductor_config[self.pass_key], InductorPass) - self.pass_manager.add(self.inductor_config[self.pass_key]) + assert isinstance( + self.compilation_config.inductor_compile_config[self.pass_key], + InductorPass, + ) + self.pass_manager.add( + self.compilation_config.inductor_compile_config[self.pass_key] + ) self.inductor_config[self.pass_key] = self.pass_manager def __call__( diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 69d4606d73ebd..2717738dd7c29 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -10,6 +10,7 @@ from torch._inductor.pattern_matcher import PatternMatcherPass from torch.distributed._symmetric_memory import enable_symm_mem_for_group from vllm.config import VllmConfig +from vllm.config.utils import Range from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, @@ -431,7 +432,7 @@ class AsyncTPPass(VllmPatternMatcherPass): self.dump_patterns(config, self.patterns) - def is_applicable(self, shape: int | None) -> bool: + def is_applicable_for_range(self, compile_range: Range) -> bool: # This pass is applied on top of the sequence parallelism pass. # It inherits the same applicability condition as `SequenceParallelismPass`. # See `SequenceParallelismPass.is_applicable` for more details. @@ -441,7 +442,7 @@ class AsyncTPPass(VllmPatternMatcherPass): ): return True tp_size = get_tensor_model_parallel_world_size() - return shape is not None and shape % tp_size == 0 + return compile_range.is_single_size() and compile_range.end % tp_size == 0 @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): @@ -505,91 +506,60 @@ if flashinfer_comm is not None: num_tokens, hidden_size = allreduce_in.shape element_size = allreduce_in.element_size() current_tensor_size = num_tokens * hidden_size * element_size + max_tensor_size = max_token_num * hidden_size * element_size + assert current_tensor_size <= max_tensor_size, ( + f"Current tensor size {current_tensor_size} is larger than " + f"max token num {max_token_num} * hidden size {hidden_size} * " + f"element size {element_size}" + ) + device_capability = current_platform.get_device_capability().to_int() + # Get one shot input size limit for the current world size + # for the current device capability + max_one_shot_size = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB.get( + device_capability, {} + ).get(world_size, None) + # Use one shot if no max size is specified + use_oneshot = ( + max_one_shot_size is None or current_tensor_size <= max_one_shot_size * MiB + ) - if num_tokens <= max_token_num: - device_capability = current_platform.get_device_capability().to_int() - # Get one shot input size limit for the current world size - # for the current device capability - max_one_shot_size_mb = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB.get( - device_capability, {} - ).get(world_size, None) - # Use one shot if no max size for one shot is specified - use_oneshot = ( - max_one_shot_size_mb is None - or current_tensor_size <= max_one_shot_size_mb * MiB - ) - - assert _FI_WORKSPACE_TENSOR is not None, ( - "Flashinfer must be enabled when using flashinfer" - ) - if norm_out is None: - norm_out = allreduce_in - residual_out = residual - else: - # return residual_out as allreduce_out with zeroed residual_in - # as flashinfer does not support rms_norm - # and allreduce_out together - residual_out = allreduce_in - # For the sizes that are smaller than the max size, - # we only use flashinfer one shot allreduce - flashinfer_comm.trtllm_allreduce_fusion( - allreduce_in=allreduce_in, - token_num=allreduce_in.shape[0], - residual_in=residual, - residual_out=residual_out, - norm_out=norm_out, - rms_gamma=rms_gamma, - rms_eps=rms_eps, - world_rank=world_rank, - world_size=world_size, - hidden_dim=allreduce_in.shape[-1], - workspace_ptrs=_FI_WORKSPACE_TENSOR, - launch_with_pdl=launch_with_pdl, - use_oneshot=use_oneshot, - trigger_completion_at_end=trigger_completion_at_end, - fp32_acc=fp32_acc, - pattern_code=pattern_code, - allreduce_out=None, - quant_out=quant_out, - scale_out=scale_out, - # in vllm we only support swizzled layout - layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, - scale_factor=scale_factor, - ) + assert _FI_WORKSPACE_TENSOR is not None, ( + "Flashinfer must be enabled when using flashinfer" + ) + if norm_out is None: + norm_out = allreduce_in + residual_out = residual else: - allreduce_out = tensor_model_parallel_all_reduce(allreduce_in) - if scale_factor is not None and scale_out is None: - # Do fused rms norm static fp8 quant fused op - if norm_out is None: - torch.ops._C.fused_add_rms_norm_static_fp8_quant( - quant_out, - allreduce_out, - residual, - rms_gamma, - scale_factor, - rms_eps, - ) - else: - torch.ops._C.rms_norm_static_fp8_quant( - quant_out, allreduce_out, rms_gamma, scale_factor, rms_eps - ) - else: - if norm_out is None: - torch.ops._C.fused_add_rms_norm( - allreduce_out, residual, rms_gamma, rms_eps - ) - norm_out = allreduce_out - else: - torch.ops._C.rms_norm(norm_out, allreduce_out, rms_gamma, rms_eps) - if scale_factor is not None and scale_out is not None: - torch.ops._C.scaled_fp4_quant( - quant_out, norm_out, scale_out, scale_factor - ) - if scale_factor is None or norm_out is not None: - # we need to return allreduce output - # in cases of non quant fused AR + RMS norm - # and fused AR + RMS norm + quant without fused add - allreduce_in.copy_(allreduce_out) + # return residual_out as allreduce_out with zeroed residual_in + # as flashinfer does not support rms_norm + # and allreduce_out together + residual_out = allreduce_in + # For the sizes that are smaller than the max size, + # we only use flashinfer one shot allreduce + flashinfer_comm.trtllm_allreduce_fusion( + allreduce_in=allreduce_in, + token_num=allreduce_in.shape[0], + residual_in=residual, + residual_out=residual_out, + norm_out=norm_out, + rms_gamma=rms_gamma, + rms_eps=rms_eps, + world_rank=world_rank, + world_size=world_size, + hidden_dim=allreduce_in.shape[-1], + workspace_ptrs=_FI_WORKSPACE_TENSOR, + launch_with_pdl=launch_with_pdl, + use_oneshot=use_oneshot, + trigger_completion_at_end=trigger_completion_at_end, + fp32_acc=fp32_acc, + pattern_code=pattern_code, + allreduce_out=None, + quant_out=quant_out, + scale_out=scale_out, + # in vllm we only support swizzled layout + layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, + scale_factor=scale_factor, + ) def call_trtllm_fused_allreduce_norm_fake( allreduce_in: torch.Tensor, @@ -1128,7 +1098,8 @@ class AllReduceFusionPass(VllmPatternMatcherPass): if max_size is None: # Flashinfer doesn't support current world size logger.warning( - "Flashinfer allreduce fusion is not supported for world size %s", + "Flashinfer allreduce fusion is not supported for world size %s" + " or max size is not provided", self.tp_size, ) return @@ -1216,6 +1187,9 @@ class AllReduceFusionPass(VllmPatternMatcherPass): self.disabled = False + def is_applicable_for_range(self, compile_range: Range) -> bool: + return compile_range.end <= self.max_token_num + @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): if self.disabled: diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 7deaba1a99fad..ab56d3561c569 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -15,6 +15,7 @@ import torch.fx as fx import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.config import VllmConfig +from vllm.config.utils import Range from vllm.utils.hashing import safe_hash from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -63,16 +64,16 @@ class CompilerInterface: graph: fx.GraphModule, example_inputs: list[Any], compiler_config: dict[str, Any], - runtime_shape: int | None = None, + compile_range: Range, key: str | None = None, ) -> tuple[Callable | None, Any | None]: """ Compile the graph with the given example inputs and compiler config, - with a runtime shape. If the `runtime_shape` is None, it means - the `example_inputs` have a dynamic shape. Otherwise, the - `runtime_shape` specifies the shape of the inputs. Right now we only - support one variable shape for all inputs, which is the batchsize - (number of tokens) during inference. + with a range. The `compile_range` specifies the range of the inputs, + it could be concrete size (if compile_sizes is provided), e.g. [4, 4] + or a range [5, 8]. + Right now we only support one variable in ranges for all inputs, + which is the batchsize (number of tokens) during inference. Dynamo will make sure `graph(*example_inputs)` is valid. @@ -98,7 +99,7 @@ class CompilerInterface: graph: fx.GraphModule, example_inputs: list[Any], graph_index: int, - runtime_shape: int | None = None, + compile_range: Range, ) -> Callable: """ Load the compiled function from the handle. @@ -212,20 +213,20 @@ class InductorStandaloneAdaptor(CompilerInterface): graph: fx.GraphModule, example_inputs: list[Any], compiler_config: dict[str, Any], - runtime_shape: int | None = None, + compile_range: Range, key: str | None = None, ) -> tuple[Callable | None, Any | None]: compilation_counter.num_inductor_compiles += 1 current_config = {} if compiler_config is not None: current_config.update(compiler_config) - set_inductor_config(current_config, runtime_shape) + set_inductor_config(current_config, compile_range) set_functorch_config() - if isinstance(runtime_shape, int): + if compile_range.is_single_size(): dynamic_shapes = "from_example_inputs" else: - dynamic_shapes = "from_tracing_context" + dynamic_shapes = "from_graph" from torch._inductor import standalone_compile @@ -235,7 +236,6 @@ class InductorStandaloneAdaptor(CompilerInterface): dynamic_shapes=dynamic_shapes, options={"config_patches": current_config}, ) - # Save the compiled artifact to disk in the specified path assert key is not None path = os.path.join(self.cache_dir, key) @@ -251,7 +251,7 @@ class InductorStandaloneAdaptor(CompilerInterface): graph: fx.GraphModule, example_inputs: list[Any], graph_index: int, - runtime_shape: int | None = None, + compile_range: Range, ) -> Callable: assert isinstance(handle, tuple) assert isinstance(handle[0], str) @@ -315,7 +315,7 @@ class InductorAdaptor(CompilerInterface): graph: fx.GraphModule, example_inputs: list[Any], compiler_config: dict[str, Any], - runtime_shape: int | None = None, + compile_range: Range, key: str | None = None, ) -> tuple[Callable | None, Any | None]: compilation_counter.num_inductor_compiles += 1 @@ -329,7 +329,7 @@ class InductorAdaptor(CompilerInterface): current_config["fx_graph_cache"] = True current_config["fx_graph_remote_cache"] = False - set_inductor_config(current_config, runtime_shape) + set_inductor_config(current_config, compile_range) set_functorch_config() # inductor can inplace modify the graph, so we need to copy it @@ -512,7 +512,7 @@ class InductorAdaptor(CompilerInterface): graph: fx.GraphModule, example_inputs: list[Any], graph_index: int, - runtime_shape: int | None = None, + compile_range: Range, ) -> Callable: assert isinstance(handle, tuple) assert isinstance(handle[0], str) @@ -608,9 +608,9 @@ class InductorAdaptor(CompilerInterface): return contextlib.nullcontext() -def set_inductor_config(config, runtime_shape): - if isinstance(runtime_shape, int): - # for a specific batchsize, tuning triton kernel parameters +def set_inductor_config(config, compile_range: Range): + if compile_range.is_single_size(): + # for a specific batch size, tuning triton kernel parameters # can be beneficial config["max_autotune"] = envs.VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE config["coordinate_descent_tuning"] = ( @@ -630,7 +630,7 @@ class EagerAdaptor(CompilerInterface): graph: fx.GraphModule, example_inputs: list[Any], compiler_config: dict[str, Any], - runtime_shape: int | None = None, + compile_range: Range, key: str | None = None, ) -> tuple[Callable | None, Any | None]: compilation_counter.num_eager_compiles += 1 diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 9af635a929b4b..8159b817f637a 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -14,6 +14,7 @@ import torch from torch import fx from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily +from vllm.config.utils import Range from vllm.utils.torch_utils import is_torch_equal_or_newer if is_torch_equal_or_newer("2.6"): @@ -28,8 +29,8 @@ _pass_context = None class PassContext: - def __init__(self, runtime_shape: int | None): - self.runtime_shape = runtime_shape + def __init__(self, compile_range: Range): + self.compile_range: Range = compile_range def get_pass_context() -> PassContext: @@ -39,13 +40,13 @@ def get_pass_context() -> PassContext: @contextmanager -def pass_context(runtime_shape: int | None): +def pass_context(compile_range: Range): """A context manager that stores the current pass context, usually it is a list of sizes to specialize. """ global _pass_context prev_context = _pass_context - _pass_context = PassContext(runtime_shape) + _pass_context = PassContext(compile_range) try: yield finally: @@ -96,7 +97,7 @@ class InductorPass(CustomGraphPass): encoded = json.dumps(dict_, sort_keys=True).encode("utf-8") return hashlib.sha256(encoded).hexdigest() - def is_applicable(self, shape: int | None): + def is_applicable_for_range(self, compile_range: Range): return True diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 37f48721ea208..6848bfb6a3c53 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -24,7 +24,11 @@ if current_platform.is_cuda(): from .collective_fusion import AllReduceFusionPass, AsyncTPPass from .fix_functionalization import FixFunctionalizationPass -from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context +from .inductor_pass import ( + CustomGraphPass, + InductorPass, + get_pass_context, +) from .noop_elimination import NoOpEliminationPass logger = init_logger(__name__) @@ -70,13 +74,13 @@ class PostGradPassManager(CustomGraphPass): def __call__(self, graph: fx.Graph): VllmInductorPass.dump_prefix = 0 # reset dump index - shape = get_pass_context().runtime_shape + compile_range = get_pass_context().compile_range for pass_ in self.passes: - if pass_.is_applicable(shape): + if pass_.is_applicable_for_range(compile_range): pass_(graph) VllmInductorPass.dump_prefix += 1 else: - logger.debug("Skipping %s with shape %s", pass_, shape) + logger.debug("Skipping %s with compile range %s", pass_, compile_range) # post-cleanup goes before fix_functionalization # because it requires a functional graph @@ -133,4 +137,8 @@ class PostGradPassManager(CustomGraphPass): state["passes"].append(pass_.uuid()) state["passes"].append(self.fix_functionalization.uuid()) + # Include the compile range in the uuid to ensure that inductor + # recompiles the graph for the new dynamic compile range. + state["compile_range"] = str(get_pass_context().compile_range) + return InductorPass.hash_dict(state) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index e535d2c461c6e..129b9b5deea31 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -7,18 +7,18 @@ from typing import Any import torch.fx as fx -import vllm.envs as envs from vllm.compilation.backends import VllmBackend from vllm.compilation.monitor import end_monitoring_torch_compile from vllm.config import VllmConfig +from vllm.config.compilation import Range from vllm.logger import init_logger logger = init_logger(__name__) @dataclasses.dataclass -class ConcreteSizeEntry: - runtime_shape: int +class RangeEntry: + compile_range: Range compiled: bool = False runnable: Callable = None # type: ignore @@ -31,7 +31,6 @@ class PiecewiseBackend: piecewise_compile_index: int, total_piecewise_compiles: int, sym_shape_indices: list[int], - compiled_graph_for_general_shape: Callable, vllm_backend: VllmBackend, ): """ @@ -55,67 +54,111 @@ class PiecewiseBackend: self.is_full_graph = total_piecewise_compiles == 1 - self.compile_sizes: set[int] = set(self.compilation_config.compile_sizes) + self.compile_ranges = self.compilation_config.get_compile_ranges() + log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}" + logger.debug_once(log_string) - self.first_run_finished = False - - self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa + self.compile_sizes = self.compilation_config.compile_sizes + log_string = f"PiecewiseBackend: compile_sizes: {self.compile_sizes}" + logger.debug_once(log_string) self.sym_shape_indices = sym_shape_indices - self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + # the entries for ranges that we need to either + self.range_entries: dict[Range, RangeEntry] = {} - # the entries for different shapes that we need to compile - self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} - - # to_be_compiled_sizes tracks the remaining sizes to compile, + # to_be_compiled_ranges tracks the remaining ranges to compile, # and updates during the compilation process, so we need to copy it - self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() + self.to_be_compiled_ranges: set[Range] = set(self.compile_ranges) # We only keep compilation management inside this class directly. - for shape in self.compile_sizes: - self.concrete_size_entries[shape] = ConcreteSizeEntry( - runtime_shape=shape, - runnable=self.compiled_graph_for_general_shape, + for size in self.compile_sizes: + range = Range(start=size, end=size) + if range not in self.compile_ranges: + self.range_entries[range] = RangeEntry( + compile_range=range, + ) + self.to_be_compiled_ranges.add(range) + + for range in self.compile_ranges: + self.range_entries[range] = RangeEntry( + compile_range=range, ) def check_for_ending_compilation(self): - if self.is_last_graph and not self.to_be_compiled_sizes: + if self.is_last_graph and not self.to_be_compiled_ranges: # no specific sizes to compile # save the hash of the inductor graph for the next run self.vllm_backend.compiler_manager.save_to_file() end_monitoring_torch_compile(self.vllm_config) - def __call__(self, *args) -> Any: - if not self.first_run_finished: - self.first_run_finished = True - self.check_for_ending_compilation() - return self.compiled_graph_for_general_shape(*args) + def _fakify_args(self, args: list[Any]) -> list[Any]: + # We need to pass fake example_inputs, otherwise torch.compile + # will fakify the example_inputs potentially causing some non dynamic + # dimension to be be duck shaped to other existing shapes that have hints + # matching their values. + # This is problem because it can lead to unintended specializations! + # if the new wrongly dynamic dim is specialized + # it will force specializing the whole shape + # torch.compile probably should not accept + # non fake tensors as example inputs! + # See issue https://github.com/vllm-project/vllm/issues/27899 + fake_example_inputs = [] + for node in self.graph.graph.nodes: + # All place holders come first + if node.op == "placeholder": + fake_example_inputs.append(node.meta["example_value"]) + else: + break + assert len(fake_example_inputs) == len(args) + return fake_example_inputs - runtime_shape = args[self.sym_shape_indices[0]] + def _maybe_compile_for_range_entry(self, range_entry: RangeEntry, args) -> Any: + if not range_entry.compiled: + range_entry.compiled = True + self.to_be_compiled_ranges.remove(range_entry.compile_range) - if runtime_shape not in self.concrete_size_entries: - # we don't need to do anything for this shape - return self.compiled_graph_for_general_shape(*args) - - entry = self.concrete_size_entries[runtime_shape] - - if not entry.compiled: - entry.compiled = True - self.to_be_compiled_sizes.remove(runtime_shape) # args are real arguments - entry.runnable = self.vllm_backend.compiler_manager.compile( + # fakify for range, real args for concrete size. + # For concrete size, we clear the shape env in + # compiler_manager.compile() so no need to fakify. + args = ( + self._fakify_args(args) + if not range_entry.compile_range.is_single_size() + else args + ) + range_entry.runnable = self.vllm_backend.compiler_manager.compile( self.graph, args, self.vllm_backend.inductor_config, self.compilation_config, + compile_range=range_entry.compile_range, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, - runtime_shape=runtime_shape, ) - # finished compilations for all required shapes - if self.is_last_graph and not self.to_be_compiled_sizes: - self.check_for_ending_compilation() + self.check_for_ending_compilation() - return entry.runnable(*args) + def _find_range_for_shape(self, runtime_shape: int) -> Range | None: + # First we try to find the range entry for the concrete compile size + # If not found, we search for the range entry + # that contains the runtime shape. + if runtime_shape in self.compile_sizes: + return self.range_entries[Range(start=runtime_shape, end=runtime_shape)] + else: + for range in self.compile_ranges: + if runtime_shape in range: + return self.range_entries[range] + return None + + def __call__(self, *args) -> Any: + runtime_shape = args[self.sym_shape_indices[0]] + range_entry = self._find_range_for_shape(runtime_shape) + + assert range_entry is not None, ( + f"Shape out of considered range: {runtime_shape} " + "[1, max_num_batched_tokens]" + ) + + self._maybe_compile_for_range_entry(range_entry, args) + return range_entry.runnable(*args) diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index cf4b8118f6b5c..a4046356bcda0 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -9,6 +9,7 @@ import torch.fx as fx from torch._inductor.pattern_matcher import PatternMatcherPass from vllm.config import VllmConfig +from vllm.config.compilation import Range from vllm.distributed import get_tp_group, tensor_model_parallel_all_reduce from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -333,7 +334,7 @@ class SequenceParallelismPass(VllmPatternMatcherPass): self.dump_patterns(config, self.patterns) - def is_applicable(self, shape: int | None) -> bool: + def is_applicable_for_range(self, compile_range: Range) -> bool: # When sequence parallelism is enabled, the residual tensor from RMSNorm # needs to be split along the sequence dimension. However, this dimension # is symbolic during piecewise compilation, and splitting symbolic shapes @@ -353,7 +354,7 @@ class SequenceParallelismPass(VllmPatternMatcherPass): ): return True tp_size = get_tensor_model_parallel_world_size() - return shape is not None and shape % tp_size == 0 + return (compile_range.is_single_size()) and (compile_range.end % tp_size == 0) @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph): diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index d3d50e6ae7b2e..5f9e2cfdd789a 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -13,7 +13,13 @@ from pydantic.dataclasses import dataclass import vllm.envs as envs from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass -from vllm.config.utils import config, get_hash_factors, handle_deprecated, hash_factors +from vllm.config.utils import ( + Range, + config, + get_hash_factors, + handle_deprecated, + hash_factors, +) from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.import_utils import resolve_obj_by_qualname @@ -173,6 +179,9 @@ class PassConfig: """ MiB = 1024 * 1024 + FI_SUPPORTED_WORLD_SIZES = [2, 4, 8] + if world_size not in FI_SUPPORTED_WORLD_SIZES: + return None max_size_mb = self.fi_allreduce_fusion_max_size_mb if max_size_mb is None: max_size_mb = self.default_fi_allreduce_fusion_max_size_mb().get(world_size) @@ -379,6 +388,8 @@ class CompilationConfig: [vllm.config.CompilationConfig.cudagraph_copy_inputs] - Inductor compilation: - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] + - [`compile_ranges_split_points`] + [vllm.config.CompilationConfig.compile_ranges_split_points] - [`inductor_compile_config`] [vllm.config.CompilationConfig.inductor_compile_config] - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes] @@ -492,6 +503,21 @@ class CompilationConfig: to integers, it also supports "cudagraph_capture_sizes" to specify the sizes for cudagraph capture.""" + compile_ranges_split_points: list[int] | None = None + """Split points that represent compile ranges for inductor. + The compile ranges are + [1, split_points[0]], + [split_points[0] + 1, split_points[1]], ..., + [split_points[-1] + 1, max_num_batched_tokens]. + Compile sizes are also used single element ranges, + the range is represented as [compile_sizes[i], compile_sizes[i]]. + + If a range overlaps with the compile size, graph for compile size + will be prioritized, i.e. if we have a range [1, 8] and a compile size 4, + graph for compile size 4 will be compiled and used instead of the graph + for range [1, 8]. + """ + inductor_compile_config: dict = field(default_factory=dict) """Additional configurations for inductor. - None: use default configurations.""" @@ -1153,3 +1179,13 @@ class CompilationConfig: self.bs_to_padded_graph_size[bs] = start else: self.bs_to_padded_graph_size[bs] = end + + def get_compile_ranges(self) -> list[Range]: + """Get the compile ranges for the compilation config.""" + if self.compile_ranges_split_points is None: + return [] + split_points = sorted(set(self.compile_ranges_split_points)) + return [ + Range(start=s + 1, end=e) + for s, e in zip([0] + split_points[:-1], split_points) + ] diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 3124fcf007396..93da3fd417ace 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -10,7 +10,7 @@ import json import pathlib import textwrap from collections.abc import Iterable, Mapping, Sequence, Set -from dataclasses import MISSING, Field, field, fields, is_dataclass, replace +from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar @@ -322,3 +322,35 @@ def handle_deprecated( for new_name in new_names: setattr(config, new_name, old_val) + + +@dataclass +class Range: + """ + A range of numbers. + Inclusive of start, inclusive of end. + """ + + start: int + end: int + + def is_single_size(self) -> bool: + return self.start == self.end + + def __contains__(self, size: int) -> bool: + # Inclusive of start, inclusive of end + return self.start <= size <= self.end + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Range): + return False + return self.start == other.start and self.end == other.end + + def __hash__(self) -> int: + return hash((self.start, self.end)) + + def __str__(self) -> str: + return f"({self.start}, {self.end})" + + def __repr__(self) -> str: + return self.__str__() diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index ce3d3b20865db..47e7ffded797b 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -725,6 +725,8 @@ class VllmConfig: "--kv-sharing-fast-prefill requires changes on model side for " "correctness and to realize prefill savings. " ) + # TODO: Move after https://github.com/vllm-project/vllm/pull/26847 lands + self._set_compile_ranges() if self.model_config and self.model_config.is_encoder_decoder: from vllm.multimodal import MULTIMODAL_REGISTRY @@ -1126,6 +1128,52 @@ class VllmConfig: # complete the remaining process. self.compilation_config.post_init_cudagraph_sizes() + def _set_compile_ranges(self): + """ + Set the compile ranges for the compilation config. + """ + compilation_config = self.compilation_config + computed_compile_ranges_split_points = [] + + # The upper bound of the compile ranges is the max_num_batched_tokens + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + if max_num_batched_tokens is not None: + computed_compile_ranges_split_points.append(max_num_batched_tokens) + + # Add the compile ranges for flashinfer + if compilation_config.pass_config.fuse_allreduce_rms: + tp_size = self.parallel_config.tensor_parallel_size + max_size = compilation_config.pass_config.flashinfer_max_size(tp_size) + if max_size is not None: + max_token_num = max_size // ( + self.model_config.get_hidden_size() + * self.model_config.dtype.itemsize + ) + if ( + max_num_batched_tokens is not None + and max_token_num < max_num_batched_tokens + ): + computed_compile_ranges_split_points.append(max_token_num) + else: + logger.debug( + "Max num batched tokens below allreduce-rms fusion threshold, " + "allreduce-rms fusion will be enabled for all num_tokens." + ) + + if compilation_config.compile_ranges_split_points is not None: + for x in compilation_config.compile_ranges_split_points: + assert isinstance(x, int) + assert x > 0, f"Invalid compile range split point: {x}" + if ( + max_num_batched_tokens is not None + and x < max_num_batched_tokens + and x > 1 + ): + computed_compile_ranges_split_points.append(x) + compilation_config.compile_ranges_split_points = sorted( + computed_compile_ranges_split_points + ) + def recalculate_max_model_len(self, max_model_len: int): # Can only be called in try_verify_and_update_config model_config = self.model_config diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d189d0860b9a7..a46ec2bd118fe 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -15,6 +15,7 @@ import torch.nn as nn import vllm.envs as envs from vllm.config import CUDAGraphMode, VllmConfig +from vllm.config.compilation import CompilationMode from vllm.distributed import ( ensure_model_parallel_initialized, init_distributed_environment, @@ -407,15 +408,31 @@ class Worker(WorkerBase): self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: - # warm up sizes that are not in cudagraph capture sizes, - # but users still want to compile for better performance, - # e.g. for the max-num-batched token size in chunked prefill. - compile_sizes = self.vllm_config.compilation_config.compile_sizes - warmup_sizes = compile_sizes.copy() if compile_sizes is not None else [] - if not self.model_config.enforce_eager: - capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes - if capture_sizes is not None: - warmup_sizes = [x for x in warmup_sizes if x not in capture_sizes] + warmup_sizes = [] + + if self.vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE: + # warm up sizes that are not in cudagraph capture sizes, + # but users still want to compile for better performance, + # e.g. for the max-num-batched token size in chunked prefill. + compile_sizes = self.vllm_config.compilation_config.compile_sizes + warmup_sizes = compile_sizes.copy() if compile_sizes is not None else [] + cg_capture_sizes: list[int] = [] + + if self.vllm_config.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + cg_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes + cg_capture_sizes = [] if cg_sizes is None else cg_sizes + warmup_sizes = [x for x in warmup_sizes if x not in cg_capture_sizes] + + compile_ranges = self.vllm_config.compilation_config.get_compile_ranges() + # For each compile_range, if none of the batch sizes + # in warmup_sizes or cudagraph_capture_sizes are in the range, + # add the end of the range to ensure compilation/warmup. + all_sizes = set(cg_capture_sizes) + all_sizes.update([x for x in warmup_sizes if isinstance(x, int)]) + for compile_range in compile_ranges: + if not any(x in compile_range for x in all_sizes): + warmup_sizes.append(compile_range.end) + # We skip EPLB here since we don't want to record dummy metrics for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 427a0d296b253..0b0e2006d73d2 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -337,7 +337,7 @@ def is_residual_scattered_for_sp( The residual tensor is scattered across tensor parallel ranks when sequence parallelism and tensor parallelism is enabled. - This follows the same logic as SequenceParallelismPass.is_applicable(): + This follows the same logic as SequenceParallelismPass.is_applicable_for_range(): - In full-graph compilation mode (no splitting ops or using inductor graph partition), SP is always applied - Otherwise, SP is only applied for specific shapes in compile_sizes From adb315060cd9943b38e68141cd9a54421144ae64 Mon Sep 17 00:00:00 2001 From: Tova Movshovitz Date: Fri, 5 Dec 2025 20:33:26 +0200 Subject: [PATCH 013/133] [KVConnector][Feature] Support KV connector cache reset via /reset_prefix_cache (#27170) Signed-off-by: tovam Signed-off-by: Tova Movshovitz Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../kv_transfer/kv_connector/v1/base.py | 14 ++++++++++ .../kv_connector/v1/multi_connector.py | 4 +++ vllm/engine/protocol.py | 6 +++-- vllm/entrypoints/llm.py | 8 ++++-- vllm/entrypoints/openai/api_server.py | 21 ++++++++++++--- vllm/v1/core/sched/interface.py | 4 ++- vllm/v1/core/sched/scheduler.py | 22 +++++++++++++++- vllm/v1/engine/async_llm.py | 8 ++++-- vllm/v1/engine/core.py | 8 ++++-- vllm/v1/engine/core_client.py | 26 +++++++++++++------ vllm/v1/engine/llm_engine.py | 8 ++++-- 11 files changed, 105 insertions(+), 24 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index d37ec25675b72..8e9182a9bca4c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -573,3 +573,17 @@ class KVConnectorBase_V1(ABC): expose connector transfer stats via Prometheus. """ return None + + def reset_cache(self) -> bool | None: + """ + Reset the connector's internal cache. + + Returns: + bool: True if the cache was successfully reset, False otherwise. + """ + logger.debug( + "Connector cache reset requested, but %s does not implement reset_cache().", + type(self).__name__, + ) + + return None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 51d5df6c6ba15..c80dc1a567fdb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -452,3 +452,7 @@ class MultiConnector(KVConnectorBase_V1): per_engine_labelvalues, prom_metrics, ) + + def reset_cache(self) -> bool: + results = [c.reset_cache() is not False for c in self._connectors] + return all(results) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 1b6330c9f9b65..d94951a0cffc8 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -116,8 +116,10 @@ class EngineClient(ABC): ... @abstractmethod - async def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: - """Reset the prefix cache""" + async def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: + """Reset the prefix cache and optionally any configured connector cache""" ... @abstractmethod diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 481a47a97f7d4..add9176340a9e 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1491,8 +1491,12 @@ class LLM: def stop_profile(self) -> None: self.llm_engine.stop_profile() - def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: - return self.llm_engine.reset_prefix_cache(reset_running_requests) + def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: + return self.llm_engine.reset_prefix_cache( + reset_running_requests, reset_connector + ) def sleep(self, level: int = 1): """ diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2fa6afa2bacb5..7be601d824f34 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -663,14 +663,27 @@ if envs.VLLM_SERVER_DEV_MODE: @router.post("/reset_prefix_cache") async def reset_prefix_cache( - raw_request: Request, reset_running_requests: bool = Query(default=False) + raw_request: Request, + reset_running_requests: bool = Query(default=False), + reset_external: bool = Query(default=False), ): """ - Reset the prefix cache. Note that we currently do not check if the - prefix cache is successfully reset in the API server. + Reset the local prefix cache. + + Optionally, if the query parameter `reset_external=true` + also resets the external (connector-managed) prefix cache. + + Note that we currently do not check if the prefix cache + is successfully reset in the API server. + + Example: + POST /reset_prefix_cache?reset_external=true """ logger.info("Resetting prefix cache...") - await engine_client(raw_request).reset_prefix_cache(reset_running_requests) + + await engine_client(raw_request).reset_prefix_cache( + reset_running_requests, reset_external + ) return Response(status_code=200) @router.post("/reset_mm_cache") diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index c2f503ef2354e..596ab05ad320a 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -152,7 +152,9 @@ class SchedulerInterface(ABC): return self.has_unfinished_requests() or self.has_finished_requests() @abstractmethod - def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: """Reset the prefix cache for KV cache. This is particularly required when the model weights are live-updated. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 75a7385df38b1..0a8efa2fd512f 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1380,7 +1380,9 @@ class Scheduler(SchedulerInterface): def has_finished_requests(self) -> bool: return len(self.finished_req_ids) > 0 - def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: """Reset the KV prefix cache. If reset_running_requests is True, all the running requests will be @@ -1418,8 +1420,26 @@ class Scheduler(SchedulerInterface): "the presence of running requests waiting for remote KV transfer, " "which is not supported yet." ) + + if reset_connector: + reset_successful = self.reset_connector_cache() and reset_successful + return reset_successful + def reset_connector_cache(self) -> bool: + if self.connector is None: + logger.warning("reset_connector called but no KV connector is configured.") + return False + + if self.connector.reset_cache() is False: + return False + + if self.log_stats: + assert self.connector_prefix_cache_stats is not None + self.connector_prefix_cache_stats.reset = True + + return True + def make_stats( self, spec_decoding_stats: SpecDecodingStats | None = None, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ec5d6e95ce3aa..fd7e04dc02082 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -749,8 +749,12 @@ class AsyncLLM(EngineClient): self.input_processor.clear_mm_cache() await self.engine_core.reset_mm_cache_async() - async def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: - return await self.engine_core.reset_prefix_cache_async(reset_running_requests) + async def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: + return await self.engine_core.reset_prefix_cache_async( + reset_running_requests, reset_connector + ) async def sleep(self, level: int = 1) -> None: await self.reset_prefix_cache() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8e34dfcea7f61..3d3a1e138ddef 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -503,8 +503,12 @@ class EngineCore: self.model_executor.reset_mm_cache() - def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: - return self.scheduler.reset_prefix_cache(reset_running_requests) + def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: + return self.scheduler.reset_prefix_cache( + reset_running_requests, reset_connector + ) def sleep(self, level: int = 1): self.model_executor.sleep(level) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index afa0593921d06..c936646aa7993 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -138,7 +138,9 @@ class EngineCoreClient(ABC): def reset_mm_cache(self) -> None: raise NotImplementedError - def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: raise NotImplementedError def sleep(self, level: int = 1) -> None: @@ -209,7 +211,7 @@ class EngineCoreClient(ABC): raise NotImplementedError async def reset_prefix_cache_async( - self, reset_running_requests: bool = False + self, reset_running_requests: bool = False, reset_connector: bool = False ) -> bool: raise NotImplementedError @@ -289,8 +291,12 @@ class InprocClient(EngineCoreClient): def reset_mm_cache(self) -> None: self.engine_core.reset_mm_cache() - def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: - return self.engine_core.reset_prefix_cache(reset_running_requests) + def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: + return self.engine_core.reset_prefix_cache( + reset_running_requests, reset_connector + ) def sleep(self, level: int = 1) -> None: self.engine_core.sleep(level) @@ -753,8 +759,12 @@ class SyncMPClient(MPClient): def reset_mm_cache(self) -> None: self.call_utility("reset_mm_cache") - def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: - return self.call_utility("reset_prefix_cache", reset_running_requests) + def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: + return self.call_utility( + "reset_prefix_cache", reset_running_requests, reset_connector + ) def add_lora(self, lora_request: LoRARequest) -> bool: return self.call_utility("add_lora", lora_request) @@ -958,10 +968,10 @@ class AsyncMPClient(MPClient): await self.call_utility_async("reset_mm_cache") async def reset_prefix_cache_async( - self, reset_running_requests: bool = False + self, reset_running_requests: bool = False, reset_connector: bool = False ) -> bool: return await self.call_utility_async( - "reset_prefix_cache", reset_running_requests + "reset_prefix_cache", reset_running_requests, reset_connector ) async def sleep_async(self, level: int = 1) -> None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8772f2e488dc0..4c31291005477 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -328,8 +328,12 @@ class LLMEngine: self.input_processor.clear_mm_cache() self.engine_core.reset_mm_cache() - def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: - return self.engine_core.reset_prefix_cache(reset_running_requests) + def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: + return self.engine_core.reset_prefix_cache( + reset_running_requests, reset_connector + ) def sleep(self, level: int = 1): self.engine_core.sleep(level) From bff78310d979bf516b45d8908b5e621a170578b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 5 Dec 2025 20:23:33 +0100 Subject: [PATCH 014/133] [Enc-Dec] Fix OOT tokenizer issue (#30144) Signed-off-by: NickLucche --- vllm/inputs/preprocess.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 2893a56b1190f..0372b06d0017f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -198,7 +198,7 @@ class InputPreprocessor: ) -> dict[str, Any]: kwargs = dict[str, Any]() - if self.model_config.hf_config.model_type == "whisper": + if self.model_config.is_encoder_decoder: # For Whisper, special tokens should be provided by the user based # on the task and language of their request. Also needed to avoid # appending an EOS token to the prompt which disrupts generation. @@ -573,7 +573,6 @@ class InputPreprocessor: """ encoder_inputs: SingletonInputs decoder_inputs: SingletonInputs | None - if is_explicit_encoder_decoder_prompt(prompt): # `cast` is needed for mypy, but not pyright prompt_ = cast(ExplicitEncoderDecoderPrompt, prompt) @@ -585,7 +584,9 @@ class InputPreprocessor: if (decoder_input := prompt_["decoder_prompt"]) is None: decoder_inputs = None else: - decoder_inputs = self._prompt_to_llm_inputs(decoder_input) + decoder_inputs = self._prompt_to_llm_inputs( + decoder_input, tokenization_kwargs=tokenization_kwargs + ) # For multimodal model, override decoder prompt from processor # with explicit decoder prompt. if self.model_config.is_multimodal_model: From 3633035a3fdee20cca8a8deb72490dc9cacea0f8 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 5 Dec 2025 14:41:40 -0500 Subject: [PATCH 015/133] [Misc] Rename CohereForAI references to CohereLabs (#30147) Signed-off-by: Russell Bryant --- docs/models/supported_models.md | 2 +- examples/offline_inference/vision_language.py | 2 +- examples/offline_inference/vision_language_multi_image.py | 2 +- tests/distributed/test_pipeline_parallel.py | 2 +- tests/models/multimodal/generation/test_common.py | 4 ++-- tests/models/registry.py | 6 +++--- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 96d5ec25c0064..1089de87b994e 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -666,7 +666,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|--------|-------------------|----------------------|---------------------------| | `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | -| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | +| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ | | `BeeForConditionalGeneration` | Bee-8B | T + IE+ | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ | | `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 0888a9d60a3fa..22802dddf7893 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -72,7 +72,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: # Aya Vision def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - model_name = "CohereForAI/aya-vision-8b" + model_name = "CohereLabs/aya-vision-8b" engine_args = EngineArgs( model=model_name, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 560ca768d1a6c..28c466c03dfa5 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -76,7 +76,7 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "CohereForAI/aya-vision-8b" + model_name = "CohereLabs/aya-vision-8b" engine_args = EngineArgs( model=model_name, diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 89f035d2cdd6f..cc6251514c3dc 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -109,7 +109,7 @@ TEXT_GENERATION_MODELS = { "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(), "bigscience/bloomz-1b1": PPTestSettings.fast(), "zai-org/chatglm3-6b": PPTestSettings.fast(), - "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"), + "CohereLabs/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"), "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"), "Deci/DeciLM-7B-instruct": PPTestSettings.fast(), "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(), diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index f896126a49089..fd26b838ae209 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -278,7 +278,7 @@ VLM_TEST_SETTINGS = { marks=[large_gpu_mark(min_gb=64)], ), "aya_vision": VLMTestInfo( - models=["CohereForAI/aya-vision-8b"], + models=["CohereLabs/aya-vision-8b"], test_type=(VLMTestType.IMAGE), prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 single_image_prompts=IMAGE_ASSETS.prompts( @@ -294,7 +294,7 @@ VLM_TEST_SETTINGS = { vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}, ), "aya_vision-multi_image": VLMTestInfo( - models=["CohereForAI/aya-vision-8b"], + models=["CohereLabs/aya-vision-8b"], test_type=(VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 single_image_prompts=IMAGE_ASSETS.prompts( diff --git a/tests/models/registry.py b/tests/models/registry.py index 352abdd2da9a0..020cb749341a6 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -211,10 +211,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True, ), "CohereForCausalLM": _HfExamplesInfo( - "CohereForAI/c4ai-command-r-v01", trust_remote_code=True + "CohereLabs/c4ai-command-r-v01", trust_remote_code=True ), "Cohere2ForCausalLM": _HfExamplesInfo( - "CohereForAI/c4ai-command-r7b-12-2024", + "CohereLabs/c4ai-command-r7b-12-2024", trust_remote_code=True, ), "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"), @@ -581,7 +581,7 @@ _AUTOMATIC_CONVERTED_MODELS = { _MULTIMODAL_EXAMPLE_MODELS = { # [Decoder-only] "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), - "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), + "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"), "BeeForConditionalGeneration": _HfExamplesInfo( "Open-Bee/Bee-8B-RL", trust_remote_code=True, From e23ca3a0e8bbf8d1e386bd619e0056ec7255810c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 5 Dec 2025 20:47:37 +0100 Subject: [PATCH 016/133] [CI] Re-use whisper_client for all tests (#30148) Signed-off-by: NickLucche --- .../test_transcription_validation_whisper.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py index 47cd7b1f12d00..3c507ee0a3fa7 100644 --- a/tests/entrypoints/openai/test_transcription_validation_whisper.py +++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py @@ -32,24 +32,20 @@ async def whisper_client(server): @pytest.mark.asyncio -async def test_basic_audio(mary_had_lamb): - server_args = ["--enforce-eager"] - +async def test_basic_audio(whisper_client, mary_had_lamb): # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. - with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server: - client = remote_server.get_async_client() - transcription = await client.audio.transcriptions.create( - model=MODEL_NAME, - file=mary_had_lamb, - language="en", - response_format="text", - temperature=0.0, - ) - out = json.loads(transcription) - out_text = out["text"] - out_usage = out["usage"] - assert "Mary had a little lamb," in out_text - assert out_usage["seconds"] == 16, out_usage["seconds"] + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + ) + out = json.loads(transcription) + out_text = out["text"] + out_usage = out["usage"] + assert "Mary had a little lamb," in out_text + assert out_usage["seconds"] == 16, out_usage["seconds"] @pytest.mark.asyncio From 962d703818c038b6ea2ae18eab8061a6f2fb8f65 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Fri, 5 Dec 2025 13:57:26 -0600 Subject: [PATCH 017/133] [Bugfix][llama4_eagle] Fix missing 'lm_head' attribute (#29926) Signed-off-by: Divakar Verma --- tests/v1/e2e/test_spec_decode.py | 6 +++++- vllm/model_executor/models/llama4_eagle.py | 13 +++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 5246ea6517f6c..575a6a151f579 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -402,7 +402,11 @@ def test_eagle_correctness( # Scout requires default backend selection # because vision encoder has head_dim 88 being incompatible # with FLASH_ATTN and needs to fall back to Flex Attn - pass + + # pass if not ROCm + if current_platform.is_rocm(): + # TODO: Enable Flex Attn for spec_decode on ROCm + pytest.skip("Flex Attn for spec_decode not supported on ROCm currently") else: m.setenv("VLLM_MLA_DISABLE", "1") m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index 0146b30579287..02f5b5ff639bd 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -28,7 +28,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.torchao import TorchAOConfig -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama4 import Llama4DecoderLayer, Llama4ForCausalLM from vllm.model_executor.models.utils import extract_layer_index @@ -182,6 +185,12 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM): self.config.vocab_size, scale=logit_scale ) + self.lm_head = ParallelLMHead( + self.config.draft_vocab_size, + self.config.hidden_size, + prefix=maybe_prefix(prefix, "lm_head"), + ) + # Set MoE hyperparameters self.set_moe_parameters() @@ -211,6 +220,6 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM): loader = AutoWeightsLoader( self, # lm_head is tied with target model (Llama4ForCausalLM) - skip_prefixes=(["lm_head."]), + skip_prefixes=([]), ) loader.load_weights(map(transform, weights)) From 77e44728090cf11b6ece2ae25d7c732a065fbb45 Mon Sep 17 00:00:00 2001 From: Bangsheng Tang <5318912+bangshengtang@users.noreply.github.com> Date: Fri, 5 Dec 2025 13:33:42 -0800 Subject: [PATCH 018/133] let draft model follow target model's config_format (#30152) --- vllm/config/speculative.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index c6d6f705f535c..bf533bf14e55c 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -337,6 +337,7 @@ class SpeculativeConfig: enforce_eager=self.target_model_config.enforce_eager, max_logprobs=self.target_model_config.max_logprobs, hf_overrides=SpeculativeConfig.hf_config_override, + config_format=self.target_model_config.config_format, ) # Automatically detect the method From 7b5575fa7dcf76ac86ab8d18501b9cc04f74f6bb Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 5 Dec 2025 16:42:12 -0500 Subject: [PATCH 019/133] [Bug] Fix vLLM config is not set error (#29999) Signed-off-by: yewentao256 --- .../layers/fused_moe/cutlass_moe.py | 2 + .../fused_moe/fused_moe_modular_method.py | 6 ++ .../layers/fused_moe/modular_kernel.py | 57 ++++++++++--------- .../compressed_tensors_moe.py | 3 + .../quantization/utils/flashinfer_utils.py | 6 ++ 5 files changed, 47 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 6753a19250b3b..30144ca5452eb 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -460,6 +460,7 @@ def cutlass_moe_fp8( expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, global_num_experts: int = -1, + parallel_config=None, ) -> torch.Tensor: """ This function computes a a8w8-quantized Mixture of Experts (MoE) layer @@ -537,6 +538,7 @@ def cutlass_moe_fp8( c_strides2=c_strides2, quant_config=quant_config, ), + parallel_config=parallel_config, ) return fn( diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index c23c41df226f0..b33e7fd8a0215 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -44,6 +44,11 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): prepare_finalize: FusedMoEPrepareAndFinalize, shared_experts: torch.nn.Module | None, ) -> "FusedMoEModularMethod": + parallel_config = getattr( + getattr(moe_layer, "vllm_config", None), + "parallel_config", + None, + ) return FusedMoEModularMethod( old_quant_method, FusedMoEModularKernel( @@ -51,6 +56,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): old_quant_method.select_gemm_impl(prepare_finalize, moe_layer), shared_experts, getattr(moe_layer, "shared_experts_stream", None), + parallel_config=parallel_config, ), ) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index b2af58cdca887..51d3299e7ddf1 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -10,7 +10,7 @@ from typing import final import torch import vllm.envs as envs -from vllm.config import get_current_vllm_config +from vllm.config import ParallelConfig, get_current_vllm_config from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig @@ -716,6 +716,7 @@ class FusedMoEModularKernel(torch.nn.Module): fused_experts: FusedMoEPermuteExpertsUnpermute, shared_experts: torch.nn.Module | None = None, shared_experts_stream: torch.cuda.Stream | None = None, + parallel_config: ParallelConfig | None = None, ): super().__init__() self.prepare_finalize = prepare_finalize @@ -723,6 +724,14 @@ class FusedMoEModularKernel(torch.nn.Module): self.shared_experts = shared_experts self.shared_experts_stream = shared_experts_stream + # cache whether this worker is using DP+EP + if parallel_config is None: + parallel_config = get_current_vllm_config().parallel_config + self.is_dp_ep = ( + parallel_config.data_parallel_size > 1 + and parallel_config.enable_expert_parallel + ) + self._post_init_setup() assert ( prepare_finalize.activation_format == fused_experts.activation_formats[0] @@ -811,33 +820,27 @@ class FusedMoEModularKernel(torch.nn.Module): is_forward_context_available() and get_forward_context().attn_metadata is None ) - if is_profile_run and self.fused_experts.supports_chunking(): - parallel_config = get_current_vllm_config().parallel_config - is_dp_ep = ( - parallel_config.data_parallel_size > 1 - and parallel_config.enable_expert_parallel + if is_profile_run and self.fused_experts.supports_chunking() and self.is_dp_ep: + max_workspace_13, max_workspace_2, max_fused_out_shape = ( + self.fused_experts.workspace_shapes( + envs.VLLM_FUSED_MOE_CHUNK_SIZE, + N, + K, + top_k, + global_num_experts, + local_num_experts, + expert_tokens_meta, + ) + ) + buffers.workspace13.get( + max_workspace_13, device=device, dtype=workspace_dtype + ) + buffers.workspace2.get( + max_workspace_2, device=device, dtype=workspace_dtype + ) + buffers.fused_out.get( + max_fused_out_shape, device=device, dtype=workspace_dtype ) - if is_dp_ep: - max_workspace_13, max_workspace_2, max_fused_out_shape = ( - self.fused_experts.workspace_shapes( - envs.VLLM_FUSED_MOE_CHUNK_SIZE, - N, - K, - top_k, - global_num_experts, - local_num_experts, - expert_tokens_meta, - ) - ) - buffers.workspace13.get( - max_workspace_13, device=device, dtype=workspace_dtype - ) - buffers.workspace2.get( - max_workspace_2, device=device, dtype=workspace_dtype - ) - buffers.fused_out.get( - max_fused_out_shape, device=device, dtype=workspace_dtype - ) # Get intermediate workspace shapes based off the chunked M size. workspace13_shape, workspace2_shape, _ = self.fused_experts.workspace_shapes( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index d7fb6d2ca367d..8013b29f733bb 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1287,6 +1287,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): ab_strides2=self.ab_strides2, c_strides1=self.c_strides1, c_strides2=self.ab_strides1_c_strides2, + parallel_config=getattr( + getattr(layer, "vllm_config", None), "parallel_config", None + ), ) else: diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index eef7a0896c375..00c2720a34875 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -247,6 +247,11 @@ def flashinfer_cutlass_moe_fp8( assert quant_config is not None # Construct modular kernel with block-scale support when requested. + parallel_config = getattr( + getattr(layer, "vllm_config", None), + "parallel_config", + None, + ) fused_experts = mk.FusedMoEModularKernel( build_flashinfer_fp8_cutlass_moe_prepare_finalize( moe=moe, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale @@ -257,6 +262,7 @@ def flashinfer_cutlass_moe_fp8( out_dtype=hidden_states.dtype, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, ), + parallel_config=parallel_config, ) return fused_experts( From 02a41691932683aa544b8a0139586f43e2f8b4bd Mon Sep 17 00:00:00 2001 From: Deboleina Date: Fri, 5 Dec 2025 22:03:29 -0500 Subject: [PATCH 020/133] [Tests] Tool call tests for openai/gpt-oss-20b (#26237) Signed-off-by: Debolina Roy --- requirements/rocm-test.txt | 1 + .../tool_parsers/test_openai_tool_parser.py | 359 ++++++++++++++++++ 2 files changed, 360 insertions(+) create mode 100644 tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 9d3d711c33797..f25835c68ddcf 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -49,6 +49,7 @@ blobfile==3.0.0 # Multi-Modal Models Test decord==0.6.0 # video processing, required by entrypoints/openai/test_video.py +rapidfuzz==3.12.1 # OpenAI compatibility and testing gpt-oss==0.0.8 diff --git a/tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py new file mode 100644 index 0000000000000..7cb87fd13ecfa --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_openai_tool_parser.py @@ -0,0 +1,359 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import jsonschema +import openai +import pytest +import pytest_asyncio +from rapidfuzz import fuzz + +from ....utils import RemoteOpenAIServer + +MODEL_NAME = "openai/gpt-oss-20b" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--max-model-len", + "8192", + "--enforce-eager", + "--enable-auto-tool-choice", + "--tool-call-parser", + "openai", + ] + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + """Async fixture providing an OpenAI-compatible vLLM client.""" + async with server.get_async_client() as async_client: + yield async_client + + +# ========================================================== +# Tool Definitions +# ========================================================== +TOOLS = [ + { + "type": "function", + "function": { + "name": "calculator", + "description": "Performs basic arithmetic calculations.", + "parameters": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": ( + "Arithmetic expression to evaluate, e.g. '123 + 456'." + ), + } + }, + "required": ["expression"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_time", + "description": "Retrieves the current local time for a given city.", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "City name, e.g. 'New York'.", + } + }, + "required": ["city"], + }, + }, + }, +] + + +# ========================================================== +# Message Examples +# ========================================================== +MESSAGES_CALC = [ + {"role": "user", "content": "Calculate 123 + 456 using the calculator."} +] + +MESSAGES_GET_TIME = [ + {"role": "user", "content": "What is the current time in New York?"} +] + +MESSAGES_MULTIPLE_CALLS = [ + { + "role": "system", + "content": ( + "You can call multiple tools. " + "When using more than one, return single JSON object with tool_calls array" + "containing each tool call with its function name and arguments. " + "Do not output multiple JSON objects separately." + ), + }, + { + "role": "user", + "content": "First, calculate 7 * 8 using the calculator. " + "Then, use get_time to tell me the current time in New York.", + }, +] + +MESSAGES_INVALID_CALL = [ + { + "role": "user", + "content": "Can you help with something, " + "but don’t actually perform any calculation?", + } +] + + +# Expected outputs +FUNC_CALC = "calculator" +FUNC_ARGS_CALC = '{"expression":"123 + 456"}' + +FUNC_TIME = "get_time" +FUNC_ARGS_TIME = '{"city": "New York"}' + + +# ========================================================== +# Utility to extract reasoning and tool calls +# ========================================================== +def extract_reasoning_and_calls(chunks: list) -> tuple[str, list[str], list[str]]: + """ + Extract accumulated reasoning text and tool call arguments + from streaming chunks. + """ + reasoning_content: str = "" + tool_calls: dict[int, dict[str, str]] = {} + + for chunk in chunks: + choice = getattr(chunk.choices[0], "delta", None) + if not choice: + continue + + if hasattr(choice, "reasoning_content") and choice.reasoning_content: + reasoning_content += choice.reasoning_content + + for tc in getattr(choice, "tool_calls", []) or []: + idx = getattr(tc, "index", 0) + tool_entry = tool_calls.setdefault(idx, {"name": "", "arguments": ""}) + + if getattr(tc, "function", None): + func = tc.function + if getattr(func, "name", None): + tool_entry["name"] = func.name + if getattr(func, "arguments", None): + tool_entry["arguments"] += func.arguments + + function_names: list[str] = [v["name"] for _, v in sorted(tool_calls.items())] + arguments: list[str] = [v["arguments"] for _, v in sorted(tool_calls.items())] + + return reasoning_content, arguments, function_names + + +# ========================================================== +# Test Scenarios +# ========================================================== +@pytest.mark.asyncio +async def test_calculator_tool_call_and_argument_accuracy(client: openai.AsyncOpenAI): + """Verify calculator tool call is made and arguments are accurate.""" + + response = await client.chat.completions.create( + model=MODEL_NAME, + messages=MESSAGES_CALC, + tools=TOOLS, + temperature=0.0, + stream=False, + ) + + message = response.choices[0].message + tool_calls = getattr(message, "tool_calls", []) + assert tool_calls, "No tool calls detected" + + calc_call = next((c for c in tool_calls if c.function.name == FUNC_CALC), None) + assert calc_call, "Calculator function not called" + + raw_args = calc_call.function.arguments + assert raw_args, "Calculator arguments missing" + assert "123" in raw_args and "456" in raw_args, ( + f"Expected values not in raw arguments: {raw_args}" + ) + + try: + parsed_args = json.loads(raw_args) + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON in calculator arguments: {raw_args}") + + expected_expr = "123 + 456" + actual_expr = parsed_args.get("expression", "") + similarity = fuzz.ratio(actual_expr, expected_expr) + + assert similarity > 90, ( + f"Expression mismatch: expected '{expected_expr}' " + f"got '{actual_expr}' (similarity={similarity}%)" + ) + + +@pytest.mark.asyncio +async def test_streaming_tool_call_get_time_with_reasoning(client: openai.AsyncOpenAI): + """Verify streamed reasoning and tool call behavior for get_time.""" + + stream = await client.chat.completions.create( + model=MODEL_NAME, + messages=MESSAGES_GET_TIME, + tools=TOOLS, + temperature=0.0, + stream=True, + ) + + chunks = [chunk async for chunk in stream] + reasoning, arguments, function_names = extract_reasoning_and_calls(chunks) + + assert FUNC_TIME in function_names, "get_time function not called" + + assert any("New York" in arg for arg in arguments), ( + f"Expected get_time arguments for New York not found in {arguments}" + ) + + assert len(reasoning) > 0, "Expected reasoning content missing" + + assert any(keyword in reasoning for keyword in ["New York", "time", "current"]), ( + f"Reasoning is not relevant to the request: {reasoning}" + ) + + +@pytest.mark.asyncio +async def test_streaming_multiple_tools(client: openai.AsyncOpenAI): + """Test streamed multi-tool response with reasoning.""" + stream = await client.chat.completions.create( + model=MODEL_NAME, + messages=MESSAGES_MULTIPLE_CALLS, + tools=TOOLS, + temperature=0.0, + stream=True, + ) + + chunks = [chunk async for chunk in stream] + reasoning, arguments, function_names = extract_reasoning_and_calls(chunks) + + try: + assert FUNC_CALC in function_names, ( + f"Calculator tool missing — found {function_names}" + ) + assert FUNC_TIME in function_names, ( + f"Time tool missing — found {function_names}" + ) + assert len(reasoning) > 0, "Expected reasoning content in streamed response" + except AssertionError as e: + print(f"ERROR: {e}") + + +@pytest.mark.asyncio +async def test_invalid_tool_call(client: openai.AsyncOpenAI): + """ + Verify that ambiguous instructions that should not trigger a tool + do not produce any tool calls. + """ + response = await client.chat.completions.create( + model=MODEL_NAME, + messages=MESSAGES_INVALID_CALL, + tools=TOOLS, + temperature=0.0, + stream=False, + ) + + message = response.choices[0].message + + assert message is not None, "Expected message in response" + assert hasattr(message, "content"), "Expected 'content' field in message" + + tool_calls = getattr(message, "tool_calls", []) + assert not tool_calls, ( + f"Model unexpectedly attempted a tool call on invalid input: {tool_calls}" + ) + + +@pytest.mark.asyncio +async def test_tool_call_with_temperature(client: openai.AsyncOpenAI): + """ + Verify model produces valid tool or text output + under non-deterministic sampling. + """ + response = await client.chat.completions.create( + model=MODEL_NAME, + messages=MESSAGES_CALC, + tools=TOOLS, + temperature=0.7, + stream=False, + ) + + message = response.choices[0].message + assert message is not None, "Expected non-empty message in response" + assert message.tool_calls or message.content, ( + "Response missing both text and tool calls" + ) + + print(f"\nTool calls: {message.tool_calls}") + print(f"Text: {message.content}") + + +@pytest.mark.asyncio +async def test_tool_response_schema_accuracy(client: openai.AsyncOpenAI): + """Validate that tool call arguments adhere to their declared JSON schema.""" + response = await client.chat.completions.create( + model=MODEL_NAME, + messages=MESSAGES_MULTIPLE_CALLS, + tools=TOOLS, + temperature=0.0, + ) + + calls = response.choices[0].message.tool_calls + assert calls, "No tool calls produced" + + for call in calls: + func_name = call.function.name + args = json.loads(call.function.arguments) + + schema: dict[str, object] | None = None + for tool_entry in TOOLS: + function_def = tool_entry.get("function") + if ( + function_def + and isinstance(function_def, dict) + and function_def.get("name") == func_name + ): + schema = function_def.get("parameters") + break + + assert schema is not None, f"No matching tool schema found for {func_name}" + + jsonschema.validate(instance=args, schema=schema) + + +@pytest.mark.asyncio +async def test_semantic_consistency_with_temperature(client: openai.AsyncOpenAI): + """Test that temperature variation doesn't cause contradictory reasoning.""" + responses = [] + for temp in [0.0, 0.5, 1.0]: + resp = await client.chat.completions.create( + model=MODEL_NAME, + messages=MESSAGES_CALC, + tools=TOOLS, + temperature=temp, + ) + text = (resp.choices[0].message.content or "").strip() + responses.append(text) + + # Compare fuzzy similarity between low- and mid-temperature outputs + low_mid_sim = fuzz.ratio(responses[0], responses[1]) + assert low_mid_sim > 60, ( + f"Semantic drift too large between T=0.0 and T=0.5 ({low_mid_sim}%)" + ) From dc839ad03d31104c8ebcb0b8f5a75021f1796760 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 5 Dec 2025 22:52:11 -0600 Subject: [PATCH 021/133] [CI/Build][AMD][Quantization] Fix test_int8_kernel.py by updating int8_utils to use hip.libdevice.round (#30151) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../layers/quantization/utils/int8_utils.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index 925d0a516ce63..32192225f61e2 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -83,26 +83,11 @@ def block_dequant( if current_platform.is_rocm(): - from triton.language import core - - # NOTE: This can be removed when hip.libdevice.round() is available. - @core.extern - def round_f32(arg0, _builder=None): - return core.extern_elementwise( - "", - "", - [arg0], - { - (core.dtype("fp32"),): ("llvm.round", core.dtype("fp32")), - (core.dtype("fp64"),): ("llvm.round", core.dtype("fp64")), - }, - is_pure=True, - _builder=_builder, - ) @triton.jit def round_int8(x): - return round_f32(x).to(tl.int8) + return tl.extra.hip.libdevice.round(x).to(tl.int8) + else: @triton.jit From 7e31c3a3f699b2ec4bf18cb195edaf7270c1434c Mon Sep 17 00:00:00 2001 From: Samuel Shen <102553648+sammshen@users.noreply.github.com> Date: Fri, 5 Dec 2025 20:53:34 -0800 Subject: [PATCH 022/133] [CI]: Remove unnecessary imports from test_lmache_integration (#30157) Signed-off-by: Samuel Shen Co-authored-by: Samuel Shen --- .../unit/test_lmcache_integration.py | 62 +------------------ 1 file changed, 2 insertions(+), 60 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_lmcache_integration.py b/tests/v1/kv_connector/unit/test_lmcache_integration.py index 33418edc325af..cfe8d810cf98a 100644 --- a/tests/v1/kv_connector/unit/test_lmcache_integration.py +++ b/tests/v1/kv_connector/unit/test_lmcache_integration.py @@ -64,22 +64,6 @@ def test_multimodal_interface(): assumes(PlaceholderRange, "offset") assumes(PlaceholderRange, "length") - # test a minimal case - import torch - - from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import ( - apply_mm_hashes_to_token_ids, - ) - - token_ids = torch.arange(10, dtype=torch.long) - mm_hashes = ["0000", "1111"] # hex repr of 0 and 4369 - mm_positions = [ - PlaceholderRange(offset=0, length=4), - PlaceholderRange(offset=5, length=4), - ] - apply_mm_hashes_to_token_ids(token_ids, mm_hashes, mm_positions) - assert token_ids.tolist() == [0, 0, 0, 0, 4, 4369, 4369, 4369, 4369, 9] - @pytest.mark.skipif( current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm" @@ -122,16 +106,6 @@ def test_config_interface(): assumes(CacheConfig, "block_size") assumes(CacheConfig, "gpu_memory_utilization") - # mla metadata minimal cases - from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import ( - mla_enabled, - ) - - model_config = ModelConfig(model="deepseek-ai/DeepSeek-R1") - assert mla_enabled(model_config) - model_config = ModelConfig(model="Qwen/Qwen3-0.6B") - assert not mla_enabled(model_config) - # kv metadata minimal case from vllm.utils.torch_utils import get_kv_cache_torch_dtype @@ -139,7 +113,7 @@ def test_config_interface(): parallel_config = ParallelConfig() cache_config = CacheConfig(cache_dtype="bfloat16") kv_dtype = get_kv_cache_torch_dtype(cache_config.cache_dtype, model_config.dtype) - use_mla = mla_enabled(model_config) + use_mla = False chunk_size = 256 num_layer = model_config.get_num_layers(parallel_config) num_kv_head = model_config.get_num_kv_heads(parallel_config) @@ -184,43 +158,11 @@ def test_request_interface(): assumes(req, "num_tokens") assumes(req, "kv_transfer_params", is_instance_of=(dict, NoneType)) - from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem + from vllm.multimodal.inputs import MultiModalFeatureSpec assumes(MultiModalFeatureSpec, "identifier") assumes(MultiModalFeatureSpec, "mm_position") - # minimal case: - from vllm.multimodal.inputs import PlaceholderRange - - request = Request( - request_id="test_request", - prompt_token_ids=[1, 2, 3], - sampling_params=SamplingParams(max_tokens=10), - pooling_params=None, - eos_token_id=100, - lora_request=None, - mm_features=[ - MultiModalFeatureSpec( - modality="image", - identifier="0000", - data=MultiModalKwargsItem.dummy("dummy_m"), - mm_position=PlaceholderRange(offset=0, length=10), - ) - ], - ) - - from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils import ( - extract_mm_features, - ) - - mm_hashes, mm_positions = extract_mm_features(request) - assert isinstance(mm_hashes, list) - assert len(mm_hashes) == 1 - assert isinstance(mm_positions, list) - assert len(mm_positions) == 1 - assert mm_positions[0].offset == 0 - assert mm_positions[0].length == 10 - def test_new_request_interface(): # protect against interface changes From bf4a901af91e431a4cdb51ed4557b31bf89c0e5d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 6 Dec 2025 04:53:52 +0000 Subject: [PATCH 023/133] Better error when world size is larger than node and `distributed_executor_backend` is not set (#30140) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/parallel.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 3a768bcd4f2ce..0327832c4fb8c 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -184,13 +184,14 @@ class ParallelConfig: distributed_executor_backend: ( str | DistributedExecutorBackend | type[Executor] | None ) = None - """Backend to use for distributed model - workers, either "ray" or "mp" (multiprocessing). If the product - of pipeline_parallel_size and tensor_parallel_size is less than - or equal to the number of GPUs available, "mp" will be used to - keep processing on a single host. Otherwise, this will default - to "ray" if Ray is installed and fail otherwise. Note that tpu - only support Ray for distributed inference.""" + """Backend to use for distributed model workers, either "ray" or "mp" + (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size + is less than or equal to the number of GPUs available, "mp" will be used to + keep processing on a single host. Otherwise, an error will be raised. To use "mp" + you must also set nnodes, and to use "ray" you must manually set + distributed_executor_backend to "ray". + + Note that tpu only support Ray for distributed inference.""" worker_cls: str = "auto" """The full name of the worker class to use. If "auto", the worker class @@ -566,8 +567,11 @@ class ParallelConfig: ): gpu_count = cuda_device_count_stateless() raise ValueError( - f"Tensor parallel size ({self.world_size}) cannot be " - f"larger than the number of available GPUs ({gpu_count})." + f"World size ({self.world_size}) is larger than the number of " + f"available GPUs ({gpu_count}) in this node. If this is " + "intentional and you are using:\n" + "- ray, set '--distributed-executor-backend ray'.\n" + "- multiprocessing, set '--nnodes' appropriately." ) elif self.data_parallel_backend == "ray": logger.info( From 62079d86004448a42c4205d00e4a977fe85b69a6 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 5 Dec 2025 22:54:17 -0600 Subject: [PATCH 024/133] [CI/Build][AMD] Skip marlin, machete, and hadacore tests since these require _C functions not defined for ROCm (#30109) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/quantization/test_hadacore.py | 7 +++++++ tests/kernels/quantization/test_machete_mm.py | 6 ++++++ tests/kernels/quantization/test_marlin_gemm.py | 8 ++++++++ 3 files changed, 21 insertions(+) diff --git a/tests/kernels/quantization/test_hadacore.py b/tests/kernels/quantization/test_hadacore.py index 3ccee9db048cf..7a5c7fbd55f72 100644 --- a/tests/kernels/quantization/test_hadacore.py +++ b/tests/kernels/quantization/test_hadacore.py @@ -8,6 +8,13 @@ import torch from compressed_tensors.transform import deterministic_hadamard_matrix from vllm import _custom_ops as ops +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "These tests require hadacore_transform, not supported on ROCm.", + allow_module_level=True, + ) @pytest.mark.parametrize("batch_size", [1, 32]) diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index efa81de158d38..7f4ce2a085807 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -23,6 +23,12 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +if current_platform.is_rocm(): + pytest.skip( + "These tests require machete_prepack_B, not supported on ROCm.", + allow_module_level=True, + ) + CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index 59516db1b115d..995e777bb5e8b 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -56,6 +56,14 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +if current_platform.is_rocm(): + pytest.skip( + "These tests require gptq_marlin_repack," + "marlin_int4_fp8_preprocess, gptq_marlin_24_gemm," + "or gptq_marlin_gemm which are not supported on ROCm.", + allow_module_level=True, + ) + ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] USE_ATOMIC_ADD_OPTS = [False, True] From c4d62618ca7cf8507c1e357fa1180fb162c670fa Mon Sep 17 00:00:00 2001 From: yuttian1 Date: Sat, 6 Dec 2025 12:54:38 +0800 Subject: [PATCH 025/133] Fix AWQ MoE marlin check issue in marlin_utils.py for AMD backend (#30102) Signed-off-by: yuttian1 --- vllm/model_executor/layers/quantization/utils/marlin_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 14337ee1d7bee..072b46f055210 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -179,6 +179,8 @@ def check_marlin_supports_shape( def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool: + if current_platform.is_rocm(): + return False output_size_per_partition = ( getattr(layer, "output_size_per_partition", None) or layer.output_size ) @@ -195,6 +197,8 @@ def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool: def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool: + if current_platform.is_rocm(): + return False hidden_size = layer.hidden_size intermediate_size_per_partition = layer.intermediate_size_per_partition # apply_router_weight_on_input is not supported for moe marlin From e3fbb6f152fe721506f22b3d8d2ec70c10229569 Mon Sep 17 00:00:00 2001 From: Dongjie Zou <85092850+baonudesifeizhai@users.noreply.github.com> Date: Fri, 5 Dec 2025 23:55:09 -0500 Subject: [PATCH 026/133] fix#30092 Kimi-Linear model loading failure with missing indexer_rotary_emb (#30093) Signed-off-by: baonudesifeizhai --- vllm/model_executor/layers/mla.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index dad960160f2ad..1656f4deb6717 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -24,9 +24,9 @@ class MLAModules: q_b_proj: torch.nn.Module | None q_proj: torch.nn.Module | None indexer: torch.nn.Module | None - indexer_rotary_emb: torch.nn.Module | None is_sparse: bool topk_indices_buffer: torch.Tensor | None + indexer_rotary_emb: torch.nn.Module | None = None @CustomOp.register("multi_head_latent_attention") From e858bc4d1490808e23bbc32e17202ebbf8713a76 Mon Sep 17 00:00:00 2001 From: Peter Salas Date: Fri, 5 Dec 2025 20:55:43 -0800 Subject: [PATCH 027/133] [Model] Add support for transformer-based Ultravox v0.7 projector (#30089) Signed-off-by: Peter Salas --- vllm/model_executor/models/ultravox.py | 96 +++++++++++++++++++-- vllm/transformers_utils/configs/ultravox.py | 2 + 2 files changed, 91 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 26a8355cd22b5..2444159b2ad61 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -4,15 +4,21 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" +import copy from collections.abc import Iterable, Mapping, Sequence +from types import SimpleNamespace from typing import Annotated, Any, Literal, TypeAlias import torch from torch import nn from torch.nn import functional as F from transformers import BatchFeature, ProcessorMixin +from transformers.modeling_utils import ModuleUtilsMixin from transformers.models.whisper import WhisperFeatureExtractor -from transformers.models.whisper.modeling_whisper import WhisperEncoder +from transformers.models.whisper.modeling_whisper import ( + WhisperEncoder, + WhisperEncoderLayer, +) from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions @@ -282,7 +288,7 @@ class StackAudioFrames(nn.Module): return audio_embeds -class UltravoxProjector(nn.Module): +class UltravoxFeedForwardProjector(nn.Module): def __init__(self, config: UltravoxConfig): super().__init__() self.hidden_dim = config.hidden_size @@ -310,7 +316,9 @@ class UltravoxProjector(nn.Module): self.ln_mid = nn.Identity() self.ln_post = RMSNorm(dim_out) - def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + def forward( + self, audio_features: torch.Tensor, audio_token_len: torch.Tensor + ) -> torch.Tensor: audio_features = self._pad_and_stack(audio_features) audio_features = self.ln_pre(audio_features) hidden_states = self.linear_1(audio_features) @@ -321,6 +329,70 @@ class UltravoxProjector(nn.Module): return hidden_states +class UltravoxTransformerProjector(nn.Module, ModuleUtilsMixin): + def __init__(self, config: UltravoxConfig): + super().__init__() + self.config = SimpleNamespace(is_decoder=False) + + self._pad_and_stack = StackAudioFrames(config.stack_factor) + dim_in = config.audio_config.hidden_size * config.stack_factor + + projector_audio_config = copy.deepcopy(config.audio_config) + + self.ln_pre = RMSNorm(dim_in) + self.linear_in = nn.Linear(dim_in, projector_audio_config.d_model) + + self.embed_positions = nn.Embedding( + projector_audio_config.max_source_positions, + projector_audio_config.d_model, + ) + + self.layers = nn.ModuleList( + [ + WhisperEncoderLayer(projector_audio_config) + for _ in range(config.num_projector_layers) + ] + ) + + self.ln_post = RMSNorm(projector_audio_config.d_model) + self.linear_out = nn.Linear( + projector_audio_config.d_model, config.text_config.hidden_size + ) + + def forward( + self, audio_features: torch.Tensor, audio_token_len: torch.Tensor + ) -> torch.Tensor: + audio_features = self._pad_and_stack(audio_features) + + max_len_stacked = audio_features.shape[1] + attention_mask = torch.arange(max_len_stacked, device=audio_features.device)[ + None, : + ].lt(audio_token_len[:, None]) + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, attention_mask.shape, audio_features.dtype + ) + + hidden_states = self.ln_pre(audio_features) + hidden_states = self.linear_in(hidden_states) + + positions = self.embed_positions( + torch.arange(hidden_states.size(1), device=hidden_states.device) + ) + hidden_states = hidden_states + positions + + for layer in self.layers: + layer_outputs = layer( + hidden_states, + attention_mask=extended_attention_mask, + layer_head_mask=None, + ) + hidden_states = layer_outputs[0] + + hidden_states = self.ln_post(hidden_states) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + class ModifiedWhisperEncoder(WhisperEncoder): """ Encoder portion of OpenAI's Whisper model. @@ -464,7 +536,10 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): prefix="audio_tower.", ) ) - self.multi_modal_projector = UltravoxProjector(config) + if config.num_projector_layers > 0: + self.multi_modal_projector = UltravoxTransformerProjector(config) + else: + self.multi_modal_projector = UltravoxFeedForwardProjector(config) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, hf_config=config.wrapped_model_config, @@ -496,7 +571,10 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): ) def _audio_features_to_embeddings( - self, input_features: torch.Tensor, audio_lens: torch.Tensor + self, + input_features: torch.Tensor, + audio_lens: torch.Tensor, + audio_token_len: torch.Tensor, ) -> torch.Tensor: audio_features = input_features.to(self.audio_tower.dtype) batch_size = audio_features.size(0) @@ -512,7 +590,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): batch_features = batch_features.to(self.audio_tower.dtype) # Process through projector - batch_embeddings = self.multi_modal_projector(batch_features) + batch_embeddings = self.multi_modal_projector( + batch_features, audio_token_len[start:end] + ) audio_embeddings.append(batch_embeddings) # Concatenate results @@ -559,7 +639,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): audio_lens = audio_input["lens"] audio_token_len = audio_input["token_len"] - embeddings = self._audio_features_to_embeddings(audio_features, audio_lens) + embeddings = self._audio_features_to_embeddings( + audio_features, audio_lens, audio_token_len + ) # We should flatten and concatenate embeddings based on token lengths # For example, with token_len = [4, 2, 3], flattened_embeddings will be diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index fc0360a9ecb4e..395b3130d40af 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -61,6 +61,7 @@ class UltravoxConfig(transformers.PretrainedConfig): norm_init: float = 0.4, projector_act: str = "swiglu", projector_ln_mid: bool = False, + num_projector_layers: int = 0, **kwargs, ): self.ignore_index = ignore_index @@ -71,6 +72,7 @@ class UltravoxConfig(transformers.PretrainedConfig): self.norm_init = norm_init self.projector_act = projector_act self.projector_ln_mid = projector_ln_mid + self.num_projector_layers = num_projector_layers # N.B. May set the wrapped_model_config below. self.text_model_id = text_model_id From 40a046cd82af87e65d3be9db8bd27a4be65f1b00 Mon Sep 17 00:00:00 2001 From: Rohan Potdar <66227218+Rohan138@users.noreply.github.com> Date: Fri, 5 Dec 2025 22:56:40 -0600 Subject: [PATCH 028/133] [Bugfix]: Fix `TokenizerLike` interface (#30009) Signed-off-by: Rohan138 --- vllm/benchmarks/datasets.py | 59 ++++++++++++++++++---------------- vllm/benchmarks/serve.py | 32 +++++++++--------- vllm/benchmarks/throughput.py | 22 +++++++++---- vllm/config/model.py | 3 +- vllm/tokenizers/deepseekv32.py | 3 ++ vllm/tokenizers/mistral.py | 6 +++- vllm/tokenizers/protocol.py | 3 ++ vllm/tokenizers/registry.py | 2 +- 8 files changed, 78 insertions(+), 52 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 638ece26071ef..49ee0faf049d1 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -32,7 +32,6 @@ from typing import Any, cast import numpy as np from PIL import Image -from transformers import PreTrainedTokenizerBase from typing_extensions import deprecated from vllm.lora.request import LoRARequest @@ -189,7 +188,7 @@ class BenchmarkDataset(ABC): @abstractmethod def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, request_id_prefix: str = "", no_oversample: bool = False, @@ -201,7 +200,7 @@ class BenchmarkDataset(ABC): for generating a list of SampleRequest objects. Args: - tokenizer (PreTrainedTokenizerBase): The tokenizer to be used + tokenizer (TokenizerLike): The tokenizer to be used for processing the dataset's text. num_requests (int): The number of sample requests to generate. request_id_prefix (str): The prefix of request_id. @@ -380,7 +379,7 @@ def process_video(video: Any) -> Mapping[str, Any]: def gen_prompt_decode_to_target_len( - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, token_sequence: list[int], target_token_len: int, max_retry: int = 10, @@ -468,7 +467,7 @@ class RandomDataset(BenchmarkDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, request_id_prefix: str = "", no_oversample: bool = False, @@ -580,7 +579,7 @@ class RandomDataset(BenchmarkDataset): range_ratio: float, input_len: int, output_len: int, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Get the sampling parameters for the dataset. @@ -626,7 +625,7 @@ class RandomDataset(BenchmarkDataset): def generate_token_sequence( self, *, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, prefix_token_ids: list[int], prefix_len: int, vocab_size: int, @@ -686,7 +685,7 @@ class RandomDatasetForReranking(RandomDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, request_id_prefix: str = "", range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO, @@ -716,7 +715,11 @@ class RandomDatasetForReranking(RandomDataset): doc_lens, _, doc_offsets = self.get_sampling_params( num_requests, range_ratio, doc_len_param, 0, tokenizer ) + vocab_size = tokenizer.vocab_size + prohibited_tokens = tokenizer.all_special_ids + all_tokens = np.arange(vocab_size) + allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens))) query_prompt, query_input_len, token_mismatch_total = ( self.generate_token_sequence( @@ -727,6 +730,7 @@ class RandomDatasetForReranking(RandomDataset): input_len=query_len, offset=int(query_offsets[0]), index=0, + allowed_tokens=allowed_tokens, ) ) @@ -740,6 +744,7 @@ class RandomDatasetForReranking(RandomDataset): input_len=int(doc_lens[i]), offset=int(doc_offsets[i]), index=i + 1, + allowed_tokens=allowed_tokens, ) token_mismatch_total += token_mismatch requests.append((prompt, total_input_len)) @@ -1077,7 +1082,7 @@ class RandomMultiModalDataset(RandomDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, request_id_prefix: str = "", no_oversample: bool = False, @@ -1231,7 +1236,7 @@ class ShareGPTDataset(BenchmarkDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, lora_path: str | None = None, max_loras: int | None = None, @@ -1633,7 +1638,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): ) -def get_samples(args, tokenizer) -> list[SampleRequest]: +def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: if not hasattr(args, "request_id_prefix"): args.request_id_prefix = "" @@ -1971,7 +1976,7 @@ class CustomDataset(BenchmarkDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, lora_path: str | None = None, max_loras: int | None = None, @@ -2101,7 +2106,7 @@ class SonnetDataset(BenchmarkDataset): def sample( self, - tokenizer, + tokenizer: TokenizerLike, num_requests: int, prefix_len: int = DEFAULT_PREFIX_LEN, input_len: int = DEFAULT_INPUT_LEN, @@ -2202,7 +2207,7 @@ class BurstGPTDataset(BenchmarkDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, max_loras: int | None = None, lora_path: str | None = None, @@ -2287,7 +2292,7 @@ class ConversationDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, enable_multimodal_chat: bool = False, @@ -2347,7 +2352,7 @@ class MultiModalConversationDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, enable_multimodal_chat: bool = False, @@ -2416,7 +2421,7 @@ class VisionArenaDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, enable_multimodal_chat: bool = False, @@ -2470,7 +2475,7 @@ class MMVUDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, enable_multimodal_chat: bool = False, @@ -2531,7 +2536,7 @@ class InstructCoderDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, enable_multimodal_chat: bool = False, @@ -2595,7 +2600,7 @@ class MTBenchDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, enable_multimodal_chat: bool = False, @@ -2661,7 +2666,7 @@ class BlazeditDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, skip_chat_template: bool = False, @@ -2742,7 +2747,7 @@ class AIMODataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, request_id_prefix: str = "", @@ -2852,7 +2857,7 @@ class NextEditPredictionDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, request_id_prefix: str = "", no_oversample: bool = False, @@ -2924,7 +2929,7 @@ class ASRDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, request_id_prefix: str = "", @@ -3002,7 +3007,7 @@ class MLPerfDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, request_id_prefix: str = "", @@ -3081,7 +3086,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, prefix_len: int = DEFAULT_PREFIX_LEN, suffix_len: int = DEFAULT_SUFFIX_LEN, @@ -3167,7 +3172,7 @@ class MMStarDataset(HuggingFaceDataset): def sample( self, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, num_requests: int, output_len: int | None = None, enable_multimodal_chat: bool = False, diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 890cd7e089fd6..568290aa894ff 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -36,7 +36,6 @@ from typing import Any, Literal import aiohttp import numpy as np from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase from vllm.benchmarks.datasets import SampleRequest, add_dataset_parser, get_samples from vllm.benchmarks.lib.endpoint_request_func import ( @@ -47,7 +46,7 @@ from vllm.benchmarks.lib.endpoint_request_func import ( ) from vllm.benchmarks.lib.ready_checker import wait_for_endpoint from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm.tokenizers import get_tokenizer +from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.utils.gc_utils import freeze_gc_heap from vllm.utils.network_utils import join_host_port @@ -286,7 +285,7 @@ def calculate_metrics( input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], dur_s: float, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, selected_percentiles: list[float], goodput_config_dict: dict[str, float], ) -> tuple[BenchmarkMetrics, list[int]]: @@ -489,7 +488,7 @@ async def benchmark( base_url: str, model_id: str, model_name: str, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, input_requests: list[SampleRequest], logprobs: int | None, request_rate: float, @@ -1032,6 +1031,19 @@ def add_cli_args(parser: argparse.ArgumentParser): type=str, help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) + parser.add_argument( + "--tokenizer-mode", + type=str, + default="auto", + help="""Tokenizer mode:\n + - "auto" will use the tokenizer from `mistral_common` for Mistral models + if available, otherwise it will use the "hf" tokenizer.\n + - "hf" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n + - Other custom values can be supported via plugins.""", + ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( "--logprobs", @@ -1228,18 +1240,6 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Common prefix length shared by all prompts (used by random dataset)", ) - parser.add_argument( - "--tokenizer-mode", - type=str, - default="auto", - choices=["auto", "slow", "mistral", "custom"], - help='The tokenizer mode.\n\n* "auto" will use the ' - 'fast tokenizer if available.\n* "slow" will ' - "always use the slow tokenizer. \n* " - '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.', - ) - parser.add_argument( "--served-model-name", type=str, diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 23b5faa1b2c32..ea693613fdd16 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -14,7 +14,7 @@ from typing import Any import torch import uvloop from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase +from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase from vllm.benchmarks.datasets import ( AIMODataset, @@ -35,6 +35,7 @@ from vllm.inputs import TextPrompt, TokensPrompt from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams +from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.utils.async_utils import merge_async_iterators @@ -246,12 +247,15 @@ async def run_vllm_async( def run_hf( requests: list[SampleRequest], model: str, - tokenizer: PreTrainedTokenizerBase, + tokenizer: TokenizerLike, n: int, max_batch_size: int, trust_remote_code: bool, disable_detokenize: bool = False, ) -> float: + assert isinstance(tokenizer, PreTrainedTokenizerBase), ( + "the hf backend only supports HF tokenizers" + ) llm = AutoModelForCausalLM.from_pretrained( model, dtype=torch.float16, trust_remote_code=trust_remote_code ) @@ -692,15 +696,21 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace): - if args.tokenizer is None: - args.tokenizer = args.model validate_args(args) if args.seed is None: args.seed = 0 random.seed(args.seed) # Sample the requests. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code + if ( + args.backend == "hf" or args.backend == "mii" + ) and args.tokenizer_mode == "auto": + # mistral_common tokenizer is only supported on vllm and vllm-chat backends; + # for hf and mii backends, we use hf tokenizer + args.tokenizer_mode = "hf" + tokenizer = get_tokenizer( + args.tokenizer, + tokenizer_mode=args.tokenizer_mode, + trust_remote_code=args.trust_remote_code, ) requests = get_requests(args, tokenizer) is_multi_modal = any(request.multi_modal_data is not None for request in requests) diff --git a/vllm/config/model.py b/vllm/config/model.py index 5be7d5e7f2df7..509a9c5e162f7 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -136,7 +136,8 @@ class ModelConfig: name or path will be used.""" tokenizer_mode: TokenizerMode | str = "auto" """Tokenizer mode:\n - - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n + - "auto" will use the tokenizer from `mistral_common` for Mistral models + if available, otherwise it will use the "hf" tokenizer.\n - "hf" will use the fast tokenizer if available.\n - "slow" will always use the slow tokenizer.\n - "mistral" will always use the tokenizer from `mistral_common`.\n diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index 1140357cf861d..b0490dacbe2d4 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -54,6 +54,9 @@ class DeepseekV32Tokenizer(HfTokenizer): prompt_str = encode_messages(messages, **encode_config) # type: ignore return prompt_str + def num_special_tokens_to_add(self) -> int: + return len(self.encode("")) + @property def all_special_tokens(self) -> list[str]: return self.tokenizer.all_special_tokens diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 37d67607c2cfe..1f44037dd55ec 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -309,6 +309,9 @@ class MistralTokenizer(TokenizerLike): for i in all_special_ids ] + def num_special_tokens_to_add(self) -> int: + return len(self.encode("")) + # the following attributes are set to fit vLLM's design and are used # by the structured output backends. @property @@ -421,6 +424,7 @@ class MistralTokenizer(TokenizerLike): ) -> list[int]: add_generation_prompt = kwargs.pop("add_generation_prompt", False) continue_final_message = kwargs.get("continue_final_message", False) + tokenize = kwargs.get("tokenize", True) padding = kwargs.get("padding", False) truncation = kwargs.get("truncation", False) max_length = kwargs.get("max_length") @@ -433,7 +437,7 @@ class MistralTokenizer(TokenizerLike): conversation=messages, tools=tools, continue_final_message=continue_final_message, - tokenize=True, + tokenize=tokenize, padding=padding, truncation=truncation, max_length=max_length, diff --git a/vllm/tokenizers/protocol.py b/vllm/tokenizers/protocol.py index 6c807bd998781..d6a3b0ba9b5f5 100644 --- a/vllm/tokenizers/protocol.py +++ b/vllm/tokenizers/protocol.py @@ -22,6 +22,9 @@ class TokenizerLike(Protocol): ) -> "TokenizerLike": raise NotImplementedError + def num_special_tokens_to_add(self) -> int: + raise NotImplementedError + @property def all_special_tokens(self) -> list[str]: raise NotImplementedError diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index 87048f2ec7845..1d44feeee500f 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -183,7 +183,7 @@ def get_tokenizer( "`tokenizer_mode='custom'` when initializing vLLM.", tokenizer_args, str(tokenizer_kwargs), - tokenizer_mode, + tokenizer_name, ) tokenizer_mode = str(tokenizer_name) From b12f4a983077f0f085e3734d4d5b0c25f2576cec Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 5 Dec 2025 22:57:38 -0600 Subject: [PATCH 029/133] [CI/Build][AMD] Use ROCM_ATTN instead of FLASH_ATTN test for test_register_kv_caches for ROCm and update test for TRITON_ATTN (#29985) Signed-off-by: Randall Smith Co-authored-by: Randall Smith Co-authored-by: TJian --- .../kv_connector/unit/test_nixl_connector.py | 48 ++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index ec9ff7315d097..53da09cfbc21d 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -41,6 +41,7 @@ from vllm.distributed.kv_transfer.kv_transfer_state import ( has_kv_transfer_group, ) from vllm.forward_context import ForwardContext +from vllm.platforms import current_platform from vllm.platforms.interface import Platform from vllm.sampling_params import SamplingParams from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend @@ -1111,7 +1112,26 @@ def _run_abort_timeout_test(llm: LLM, timeout: int): llm.llm_engine.engine_core.shutdown() -@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "TRITON_ATTN"]) +@pytest.mark.parametrize( + "attn_backend", + [ + pytest.param( + "FLASH_ATTN", + marks=pytest.mark.skipif( + current_platform.is_rocm(), + reason="Attention backend FLASH_ATTN is not supported on ROCm", + ), + ), + pytest.param( + "ROCM_ATTN", + marks=pytest.mark.skipif( + not current_platform.is_rocm(), + reason="Attention backend ROCM_ATTN is only supported on ROCm", + ), + ), + "TRITON_ATTN", + ], +) def test_register_kv_caches(dist_init, attn_backend, monkeypatch): """ Test that register_kv_caches() properly calls nixl_wrapper methods with @@ -1133,6 +1153,10 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend backend_cls = FlashAttentionBackend + elif attn_backend == "ROCM_ATTN": + from vllm.v1.attention.backends.rocm_attn import RocmAttentionBackend + + backend_cls = RocmAttentionBackend else: # TRITON_ATTN from vllm.v1.attention.backends.triton_attn import TritonAttentionBackend @@ -1151,6 +1175,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): } # Store tensor info for validation + test_shape = backend_cls.get_kv_cache_shape( num_blocks=1, block_size=16, num_kv_heads=1, head_size=1 ) @@ -1175,17 +1200,18 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): ] expected_num_entries = 4 + nixl_module = "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector" with ( - patch( - "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper" - ) as mock_nixl_wrapper, - patch( - "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event" - ), - patch( - "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread" - ) as mock_thread, - ): # noqa: E501 + patch(f"{nixl_module}.NixlWrapper") as mock_nixl_wrapper, + patch(f"{nixl_module}.threading.Event"), + patch(f"{nixl_module}.threading.Thread") as mock_thread, + patch(f"{nixl_module}.get_attn_backend") as mock_get_attn_backend, + ): + # Ensure get_attn_backend returns the correct value due to + # _cached_get_attn_backend returning the backend from previous + # test run if not mocking. + mock_get_attn_backend.return_value = backend_cls + # Create connector connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) connector.connector_worker = FakeNixlConnectorWorker( From 4026ae31e910d50da2b80c1c386f1d1db7f1b7d8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 5 Dec 2025 20:59:04 -0800 Subject: [PATCH 030/133] [Misc] Move `disable_nccl_for_dp_synchronization` init logic into `VllmConfig` (#30161) Signed-off-by: Nick Hill --- vllm/config/vllm.py | 9 +++++++++ vllm/engine/arg_utils.py | 6 ------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 47e7ffded797b..b99be1e5dfbc8 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -586,6 +586,15 @@ class VllmConfig: else: self.scheduler_config.async_scheduling = True + if ( + self.scheduler_config.async_scheduling + and not self.parallel_config.disable_nccl_for_dp_synchronization + ): + logger.info( + "Disabling NCCL for DP synchronization when using async scheduling." + ) + self.parallel_config.disable_nccl_for_dp_synchronization = True + from vllm.platforms import current_platform if ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index aad0719548d18..ceac5407af6e2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1602,12 +1602,6 @@ class EngineArgs: model_config.skip_tokenizer_init = True logger.info("Skipping tokenizer initialization for tokens-only mode.") - if self.async_scheduling and not self.disable_nccl_for_dp_synchronization: - logger.info( - "Disabling NCCL for DP synchronization when using async scheduling." - ) - self.disable_nccl_for_dp_synchronization = True - parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, From a238cbd89d07b4b0ed8fb3dff3c219a3ee3a1651 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 5 Dec 2025 21:42:47 -0800 Subject: [PATCH 031/133] [Model Runner V2] Support min-p sampling (#30171) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/sample/metadata.py | 13 +++++++ vllm/v1/worker/gpu/sample/min_p.py | 53 +++++++++++++++++++++++++++ vllm/v1/worker/gpu/sample/sampler.py | 4 ++ vllm/v1/worker/gpu/states.py | 7 ++++ 4 files changed, 77 insertions(+) create mode 100644 vllm/v1/worker/gpu/sample/min_p.py diff --git a/vllm/v1/worker/gpu/sample/metadata.py b/vllm/v1/worker/gpu/sample/metadata.py index 040771c051bb4..f10c72049cbae 100644 --- a/vllm/v1/worker/gpu/sample/metadata.py +++ b/vllm/v1/worker/gpu/sample/metadata.py @@ -13,6 +13,7 @@ class SamplingMetadata: top_p: torch.Tensor | None top_k: torch.Tensor | None + min_p: torch.Tensor | None repetition_penalty: torch.Tensor frequency_penalty: torch.Tensor @@ -44,6 +45,7 @@ class SamplingMetadata: # top_k = torch.full((num_reqs,), 20, dtype=torch.int32, device=device) top_p = None top_k = None + min_p = torch.zeros(num_reqs, dtype=torch.float32, device=device) # NOTE(woosuk): We must set penalties to their default values to make sure # the penalties kernel does not touch the placeholder bin_counts tensors. repetition_penalty = torch.ones(num_reqs, dtype=torch.float32, device=device) @@ -64,6 +66,7 @@ class SamplingMetadata: temperature=temperature, top_p=top_p, top_k=top_k, + min_p=min_p, repetition_penalty=repetition_penalty, frequency_penalty=frequency_penalty, presence_penalty=presence_penalty, @@ -85,6 +88,8 @@ def _expand_sampling_metadata_kernel( expanded_top_p_ptr, top_k_ptr, expanded_top_k_ptr, + min_p_ptr, + expanded_min_p_ptr, rep_penalty_ptr, expanded_rep_penalty_ptr, freq_penalty_ptr, @@ -115,6 +120,10 @@ def _expand_sampling_metadata_kernel( top_k = tl.load(top_k_ptr + req_idx) tl.store(expanded_top_k_ptr + start_idx + block, top_k, mask=mask) + if min_p_ptr is not None: + min_p = tl.load(min_p_ptr + req_idx) + tl.store(expanded_min_p_ptr + start_idx + block, min_p, mask=mask) + rep_penalty = tl.load(rep_penalty_ptr + req_idx) tl.store(expanded_rep_penalty_ptr + start_idx + block, rep_penalty, mask=mask) @@ -138,6 +147,7 @@ def expand_sampling_metadata( expanded_temp = create_empty(sampling_metadata.temperature) expanded_top_p = create_empty(sampling_metadata.top_p) expanded_top_k = create_empty(sampling_metadata.top_k) + expanded_min_p = create_empty(sampling_metadata.min_p) expanded_repetition_penalty = create_empty(sampling_metadata.repetition_penalty) expanded_frequency_penalty = create_empty(sampling_metadata.frequency_penalty) expanded_presence_penalty = create_empty(sampling_metadata.presence_penalty) @@ -151,6 +161,8 @@ def expand_sampling_metadata( expanded_top_p, sampling_metadata.top_k, expanded_top_k, + sampling_metadata.min_p, + expanded_min_p, sampling_metadata.repetition_penalty, expanded_repetition_penalty, sampling_metadata.frequency_penalty, @@ -166,6 +178,7 @@ def expand_sampling_metadata( temperature=expanded_temp, top_p=expanded_top_p, top_k=expanded_top_k, + min_p=expanded_min_p, seeds=expanded_seeds, repetition_penalty=expanded_repetition_penalty, frequency_penalty=expanded_frequency_penalty, diff --git a/vllm/v1/worker/gpu/sample/min_p.py b/vllm/v1/worker/gpu/sample/min_p.py new file mode 100644 index 0000000000000..0638818006f50 --- /dev/null +++ b/vllm/v1/worker/gpu/sample/min_p.py @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.triton_utils import tl, triton + + +@triton.jit +def _min_p_kernel( + logits_ptr, + logits_stride, + min_p_ptr, + vocab_size, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + min_p = tl.load(min_p_ptr + req_idx).to(tl.float32) + if min_p == 0.0: + return + + max_val = float("-inf") + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + logits = tl.load( + logits_ptr + req_idx * logits_stride + block, mask=mask, other=float("-inf") + ) + max_val = tl.max(tl.maximum(logits, max_val)) + max_val = max_val.to(tl.float32) # type: ignore + + threshold = max_val + tl.log(min_p) + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + logits = tl.load( + logits_ptr + req_idx * logits_stride + block, mask=mask, other=float("-inf") + ) + logits = tl.where(logits < threshold, float("-inf"), logits) + tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask) + + +def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None: + if min_p is None: + return + num_reqs, vocab_size = logits.shape + BLOCK_SIZE = 1024 + _min_p_kernel[(num_reqs,)]( + logits, + logits.stride(0), + min_p, + vocab_size, + BLOCK_SIZE=BLOCK_SIZE, + ) diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py index 3429dd3e4d0fb..9a4224d8fddef 100644 --- a/vllm/v1/worker/gpu/sample/sampler.py +++ b/vllm/v1/worker/gpu/sample/sampler.py @@ -9,6 +9,7 @@ from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata +from vllm.v1.worker.gpu.sample.min_p import apply_min_p from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature @@ -61,6 +62,9 @@ class Sampler: # Apply penalties and temperature in place. apply_penalties_and_temperature(logits, sampling_metadata) + # Apply min_p in place. + apply_min_p(logits, sampling_metadata.min_p) + # Apply top_k and/or top_p. This might return a new tensor. logits = apply_top_k_top_p( logits, sampling_metadata.top_k, sampling_metadata.top_p ) diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py index 367348c4a18f7..6823c0c8ee5c7 100644 --- a/vllm/v1/worker/gpu/states.py +++ b/vllm/v1/worker/gpu/states.py @@ -87,6 +87,7 @@ class RequestState: self.temperature = self._make_param(self.max_num_reqs, torch.float32) self.top_p = self._make_param(self.max_num_reqs, torch.float32) self.top_k = self._make_param(self.max_num_reqs, torch.int32) + self.min_p = self._make_param(self.max_num_reqs, torch.float32) self.repetition_penalty = self._make_param(self.max_num_reqs, torch.float32) self.frequency_penalty = self._make_param(self.max_num_reqs, torch.float32) self.presence_penalty = self._make_param(self.max_num_reqs, torch.float32) @@ -162,6 +163,7 @@ class RequestState: else: top_k = self.vocab_size self.top_k.np[req_idx] = top_k + self.min_p.np[req_idx] = sampling_params.min_p self.repetition_penalty.np[req_idx] = sampling_params.repetition_penalty self.frequency_penalty.np[req_idx] = sampling_params.frequency_penalty self.presence_penalty.np[req_idx] = sampling_params.presence_penalty @@ -217,6 +219,10 @@ class RequestState: no_top_k = np.all(top_k == self.vocab_size) top_k = self.top_k.copy_np_to_gpu(top_k) if not no_top_k else None + min_p = self.min_p.np[idx_mapping_np] + no_min_p = np.all(min_p == 0.0) + min_p = self.min_p.copy_np_to_gpu(min_p) if not no_min_p else None + rep_penalty = self.repetition_penalty.np[idx_mapping_np] rep_penalty = self.repetition_penalty.copy_np_to_gpu(rep_penalty) freq_penalty = self.frequency_penalty.np[idx_mapping_np] @@ -236,6 +242,7 @@ class RequestState: temperature=temperature, top_p=top_p, top_k=top_k, + min_p=min_p, repetition_penalty=rep_penalty, frequency_penalty=freq_penalty, presence_penalty=pres_penalty, From d6aeaddf4a6201e35ec89bcd4b3719e4e7293f1f Mon Sep 17 00:00:00 2001 From: kx <1670186653@qq.com> Date: Sat, 6 Dec 2025 15:11:31 +0800 Subject: [PATCH 032/133] [bugfix] fix type[AttentionBackend] bug in kv_connector_base_v1 (#30051) Signed-off-by: 01267596 Co-authored-by: 01267596 --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 8e9182a9bca4c..91f6443f92cbe 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -239,7 +239,7 @@ class KVConnectorBase_V1(ABC): return def register_cross_layers_kv_cache( - self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] + self, kv_cache: torch.Tensor, attn_backend: type["AttentionBackend"] ): """ Initialize with a single KV cache tensor used by all layers. From 6476382384d3560367e30732995e2456ec569c5d Mon Sep 17 00:00:00 2001 From: redwrasse Date: Fri, 5 Dec 2025 23:39:56 -0800 Subject: [PATCH 033/133] prefix caching design doc sha256 now default (#29261) Signed-off-by: redwrasse --- docs/design/prefix_caching.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index cf792fdabe1a6..6f2eb3062b3b9 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -22,8 +22,8 @@ In the example above, the KV cache in the first block can be uniquely identified We only cache full blocks. !!! note "Note 2" - The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we advise to use SHA256** as hash function instead of the default builtin hash. - SHA256 is supported since vLLM v0.8.3 and must be enabled with a command line argument. It comes with a performance impact of about 100-200ns per token (~6ms for 50k tokens of context). + The above hash key structure is not 100% collision free. Theoretically it’s still possible for the different prefix tokens to have the same hash value. To avoid any hash collisions **in a multi-tenant setup, we use SHA256** as hash function instead of the builtin hash. + SHA256 is supported since vLLM v0.8.3 and the default since v0.10.2. It comes with a negligible performance impact of about 75ns per token (<4ms for 50k tokens of context). **A hashing example with multi-modality inputs** In this example, we illustrate how prefix caching works with multi-modality inputs (e.g., images). Assuming we have a request with the following messages: From c46b932df2b801ba0a6452e436268f086029d82b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 6 Dec 2025 15:57:28 +0800 Subject: [PATCH 034/133] [Chore] Deprecate `SupportsMultiModal.merge_by_field_config` (#30170) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/aria.py | 2 -- vllm/model_executor/models/aya_vision.py | 2 -- vllm/model_executor/models/blip2.py | 2 -- vllm/model_executor/models/chameleon.py | 2 -- vllm/model_executor/models/clip.py | 1 - vllm/model_executor/models/cohere2_vision.py | 2 -- vllm/model_executor/models/deepseek_ocr.py | 2 -- vllm/model_executor/models/deepseek_vl2.py | 2 -- vllm/model_executor/models/dots_ocr.py | 2 -- vllm/model_executor/models/ernie45_vl.py | 2 -- vllm/model_executor/models/fuyu.py | 2 -- vllm/model_executor/models/gemma3_mm.py | 2 -- vllm/model_executor/models/gemma3n_mm.py | 1 - vllm/model_executor/models/glm4_1v.py | 2 -- vllm/model_executor/models/glm4v.py | 2 -- vllm/model_executor/models/granite_speech.py | 1 - vllm/model_executor/models/hunyuan_vision.py | 1 - .../models/hyperclovax_vision.py | 2 -- vllm/model_executor/models/idefics3.py | 2 -- vllm/model_executor/models/interfaces.py | 25 ++++++++++++++++--- vllm/model_executor/models/interns1.py | 2 -- vllm/model_executor/models/internvl.py | 2 -- vllm/model_executor/models/keye.py | 2 -- vllm/model_executor/models/kimi_vl.py | 2 -- vllm/model_executor/models/llava.py | 2 -- vllm/model_executor/models/llava_next.py | 2 -- .../model_executor/models/llava_next_video.py | 2 -- vllm/model_executor/models/llava_onevision.py | 2 -- vllm/model_executor/models/midashenglm.py | 2 -- vllm/model_executor/models/minicpmv.py | 2 -- vllm/model_executor/models/minimax_vl_01.py | 2 -- vllm/model_executor/models/mistral3.py | 2 -- vllm/model_executor/models/mllama4.py | 2 -- vllm/model_executor/models/molmo.py | 2 -- .../model_executor/models/nano_nemotron_vl.py | 2 -- vllm/model_executor/models/nemotron_vl.py | 2 -- vllm/model_executor/models/opencua.py | 1 - vllm/model_executor/models/ovis.py | 2 -- vllm/model_executor/models/ovis2_5.py | 2 -- vllm/model_executor/models/paddleocr_vl.py | 2 -- vllm/model_executor/models/paligemma.py | 2 -- vllm/model_executor/models/phi3v.py | 2 -- vllm/model_executor/models/phi4mm.py | 2 -- vllm/model_executor/models/pixtral.py | 2 -- .../models/qwen2_5_omni_thinker.py | 2 -- vllm/model_executor/models/qwen2_5_vl.py | 1 - vllm/model_executor/models/qwen2_audio.py | 2 -- vllm/model_executor/models/qwen2_vl.py | 1 - .../models/qwen3_omni_moe_thinker.py | 2 -- vllm/model_executor/models/qwen3_vl.py | 1 - vllm/model_executor/models/qwen_vl.py | 2 -- vllm/model_executor/models/siglip.py | 1 - vllm/model_executor/models/skyworkr1v.py | 2 -- vllm/model_executor/models/step3_vl.py | 2 -- vllm/model_executor/models/tarsier.py | 2 -- vllm/model_executor/models/terratorch.py | 1 - .../models/transformers/multimodal.py | 2 +- vllm/model_executor/models/ultravox.py | 2 -- vllm/model_executor/models/voxtral.py | 2 -- vllm/model_executor/models/whisper.py | 1 - vllm/multimodal/utils.py | 1 - 61 files changed, 23 insertions(+), 110 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 3d07e6b612ca3..c6d7f19cbe90d 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -499,8 +499,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): model to perform tasks that involve both image and text inputs. """ - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # mapping for new names in checkpoint saved after transformers v4.52 diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 0ada2ed5028bb..ee9e210a3240f 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -318,8 +318,6 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int: dummy_inputs=AyaVisionDummyInputsBuilder, ) class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # mapping for new names in checkpoint saved after transformers v4.52 diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index f71b9c01d359d..1244f97a1bd68 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -523,8 +523,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): class Blip2ForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant ): - merge_by_field_config = True - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 3aa01bb1905fe..dfc05a366b286 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -918,8 +918,6 @@ class ChameleonModel(nn.Module): class ChameleonForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant ): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index b8af3050990bc..22f3ecad748e6 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): is_pooling_model = True packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} - merge_by_field_config = True @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 139ccba9df6d8..07dc7a01dc316 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -331,8 +331,6 @@ class Cohere2VisionMultiModalProcessor( dummy_inputs=Cohere2VisionDummyInputsBuilder, ) class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.vision_tower.": "vision_tower.", diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index a612ebd956282..1f07381c0cbd0 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -344,8 +344,6 @@ class DeepseekOCRMultiModalProcessor( dummy_inputs=DeepseekOCRDummyInputsBuilder, ) class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # map prefix for language backbone diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 56c1a87a25401..9f8faf9ed91ce 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -344,8 +344,6 @@ class DeepseekVL2MultiModalProcessor( dummy_inputs=DeepseekVL2DummyInputsBuilder, ) class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "language.": "language_model.", diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 5cc2a48f26d64..da19d8fdb15e0 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -690,8 +690,6 @@ class DotsVisionTransformer(nn.Module): dummy_inputs=DotsOCRDummyInputsBuilder, ) class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ ".attn.qkv_proj.": ".attn.qkv.", diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 81663dd7bbb45..3305b6a0e58f0 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -1254,8 +1254,6 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing class Ernie4_5_VLMoeForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 269c36ab5b9c7..8a7a3dd771c38 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -260,8 +260,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): dummy_inputs=FuyuDummyInputsBuilder, ) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.vision_embed_tokens.": "vision_embed_tokens.", diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 43c69e5e13992..e8dec36a1c5b8 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -483,8 +483,6 @@ class Gemma3MultiModalProjector(nn.Module): class Gemma3ForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA ): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 6ae76976eb46c..7036118ada084 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -463,7 +463,6 @@ class Gemma3nMultimodalEmbedder(nn.Module): class Gemma3nForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsTranscription ): - merge_by_field_config = True supported_languages = ISO639_1_SUPPORTED_LANGS packed_modules_mapping = { diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 5ba3c0a35928d..3cb53f2cbabee 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1424,8 +1424,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): class Glm4vForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 514082cf60ce2..ec5af94e297c1 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -561,8 +561,6 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): class GLM4VForCausalLM( ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): - merge_by_field_config = True - packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"], diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index accf7e6ef2f47..a4e50f4086281 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -564,7 +564,6 @@ class GraniteSpeechForConditionalGeneration( SupportsLoRA, SupportsTranscription, ): - merge_by_field_config = True supported_languages = ISO639_1_SUPPORTED_LANGS packed_modules_mapping = { diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index 5aef09ca9c256..52ce9564c8d79 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -786,7 +786,6 @@ class HunYuanVLForConditionalGeneration( SupportsQuant, SupportsXDRoPE, ): - merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw"} # To ensure correct weight loading and mapping. diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index db46353efde5c..3a083870e4b5a 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -592,8 +592,6 @@ class HCXVisionCAbstractor(nn.Module): dummy_inputs=HCXVisionDummyInputsBuilder, ) class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 7c3933c6feb7e..0eed464487865 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -576,8 +576,6 @@ class Idefics3Model(nn.Module): dummy_inputs=Idefics3DummyInputsBuilder, ) class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 01b3e7827424d..416ab236cd18d 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -78,9 +78,9 @@ class SupportsMultiModal(Protocol): `multimodal_config.mm_encoder_tp_mode="data"`. """ - merge_by_field_config: ClassVar[bool] = True + merge_by_field_config: ClassVar[bool | None] = None """ - A flag that indicates which implementation of + [DEPRECATED] A flag that indicates which implementation of `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use. """ @@ -260,7 +260,26 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ... def supports_multimodal( model: type[object] | object, ) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]: - return getattr(model, "supports_multimodal", False) + res = getattr(model, "supports_multimodal", False) + + if res: + # We can remove this starting from v0.14 + merge_by_field_config = getattr(model, "merge_by_field_config", None) + if merge_by_field_config is False: + raise ValueError( + "`merge_by_field_config=False` is no longer effective, " + "please update your model to consider the new batching logic " + "in `group_mm_kwargs_by_modality` (refer to " + "https://github.com/vllm-project/vllm/issues/26149), " + "and then remove the override from your model." + ) + if merge_by_field_config is True: + logger.warning_once( + "`merge_by_field_config=True` is redundant, " + "please remove the override from your model." + ) + + return res def supports_multimodal_raw_input_only(model: type[object] | object) -> bool: diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index c2195fd0cb88d..18985cefbf5ea 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -509,8 +509,6 @@ class InternS1MultiModalProcessor(BaseMultiModalProcessor[InternS1ProcessingInfo class InternS1ForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA ): - merge_by_field_config = True - # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index fccddf3a6b293..15f7d4f418e48 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1074,8 +1074,6 @@ class InternVLMultiModalProcessor( dummy_inputs=InternVLDummyInputsBuilder, ) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): - merge_by_field_config = True - supports_encoder_tp_data = True @classmethod diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 09acf8372e168..f31da0ee302b3 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1292,8 +1292,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): class BaseKeyeModule(nn.Module): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 8167b82f32330..85267ccda8a91 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -298,8 +298,6 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]): dummy_inputs=KimiVLDummyInputsBuilder, ) class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - supports_encoder_tp_data = True @classmethod diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c1fb2d4f4af7d..66a327bb7603d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -506,8 +506,6 @@ def init_vision_tower_for_llava( dummy_inputs=LlavaDummyInputsBuilder, ) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index b995cac47ac1c..526846d0d9812 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -223,8 +223,6 @@ class LlavaNextMultiModalProcessor( dummy_inputs=LlavaDummyInputsBuilder, ) class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # mapping for new names in checkpoint saved after transformers v4.52 diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 902c598c226f0..cd55cfec6cdec 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -299,8 +299,6 @@ class LlavaNextMultiModalProjector(nn.Module): dummy_inputs=LlavaNextVideoDummyInputsBuilder, ) class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # mapping for new names in checkpoint saved after transformers v4.52 diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 4e243ade68358..5aa8de7dc252e 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -479,8 +479,6 @@ class LlavaOnevisionMultiModalProjector(nn.Module): dummy_inputs=LlavaOnevisionDummyInputsBuilder, ) class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # mapping for new names in checkpoint saved after transformers v4.52 diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index d9b23811730d4..2d506978d266e 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -683,8 +683,6 @@ class MiDashengLMMultiModalProcessor( dummy_inputs=MiDashengLMDummyInputsBuilder, ) class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 6d0ebf5c9825c..c45bdf95e7487 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1003,8 +1003,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): instantiated. """ - merge_by_field_config = True - supports_encoder_tp_data = True @classmethod diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 0939a72ba53ec..e480454953df8 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -179,8 +179,6 @@ class MiniMaxVL01MultiModalProcessor( dummy_inputs=MiniMaxVL01DummyInputsBuilder, ) class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 1ddb470a0f93d..e9161e69e731b 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -423,8 +423,6 @@ def init_vision_tower_for_llava( class Mistral3ForConditionalGeneration( nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP ): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 286859d188d34..e944c0ee38aa1 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -741,8 +741,6 @@ class Llama4ForConditionalGeneration( SupportsEagle3, SupportsLoRA, ): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 7b53299cccbe4..a6cd9ad16c188 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1354,8 +1354,6 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]): class MolmoForCausalLM( nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsQuant ): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ # vision backbone mapping diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index c4198d36b392e..6dfab595e5b92 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1116,8 +1116,6 @@ class NanoNemotronVLDummyInputsBuilder( class NemotronH_Nano_VL_V2( nn.Module, HasInnerState, IsHybrid, SupportsMultiModal, SupportsMultiModalPruning ): - merge_by_field_config = True - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index a57668b21fb86..391980fc61f9e 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -358,8 +358,6 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): dummy_inputs=BaseInternVLDummyInputsBuilder[NemotronVLProcessingInfo], ) class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): - merge_by_field_config = True - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py index b92f0c9dac32b..76a2d1cc242c5 100644 --- a/vllm/model_executor/models/opencua.py +++ b/vllm/model_executor/models/opencua.py @@ -201,7 +201,6 @@ class OpenCUADummyInputsBuilder(Qwen2VLDummyInputsBuilder): dummy_inputs=OpenCUADummyInputsBuilder, ) class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration): - merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw"} packed_modules_mapping = { diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index a0fab820720fb..0691bbc615be9 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -414,8 +414,6 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]): dummy_inputs=OvisDummyInputsBuilder, ) class Ovis(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 85f37cfea10b1..0ad22aab748e3 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -456,8 +456,6 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]) dummy_inputs=Ovis2_5DummyInputsBuilder, ) class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 1df5ff62fa5b5..9703a5b417d02 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -1103,8 +1103,6 @@ class SiglipVisionModel(nn.Module): dummy_inputs=PaddleOCRVLDummyInputsBuilder, ) class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsMRoPE): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.": "language_model.model.", diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 9fa32f01d37a0..67240c6e71249 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -251,8 +251,6 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn dummy_inputs=PaliGemmaDummyInputsBuilder, ) class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 384572217bc19..b7ae548069f25 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -562,8 +562,6 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]): dummy_inputs=Phi3VDummyInputsBuilder, ) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.vision_embed_tokens.wte": "embed_tokens", diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 8425549a7bd20..179d5df869bea 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -984,8 +984,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): Implements the Phi-4-multimodal-instruct model in vLLM. """ - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": [ "qkv_proj", diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index cad241842cd30..faf2d80d24bba 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -365,8 +365,6 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]) dummy_inputs=PixtralDummyInputsBuilder, ) class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 1ce0fb4e4d93d..3438406c4fac1 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -773,8 +773,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration( SupportsMRoPE, Qwen2_5OmniConditionalGenerationMixin, ): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "thinker.lm_head.": "language_model.lm_head.", diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index cb521ebdf0afb..488af192bfad0 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1039,7 +1039,6 @@ class Qwen2_5_VLForConditionalGeneration( SupportsMultiModalPruning, SupportsMRoPE, ): - merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} packed_modules_mapping = { diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 7e883a393aa8d..f84ddfa84f6ab 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -313,8 +313,6 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor[Qwen2AudioProcessing dummy_inputs=Qwen2AudioDummyInputsBuilder, ) class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("audio"): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b748768498412..9da5080f84303 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1131,7 +1131,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]) class Qwen2VLForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): - merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} # To ensure correct weight loading and mapping. diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index e6979211b707f..dbe7bcd07576b 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -1131,8 +1131,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( SupportsMRoPE, Qwen3OmniMoeConditionalGenerationMixin, ): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "thinker.lm_head.": "language_model.lm_head.", diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 58721303dfc87..a5b10c95872dd 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1190,7 +1190,6 @@ class Qwen3VLForConditionalGeneration( SupportsMRoPE, SupportsEagle3, ): - merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} packed_modules_mapping = { diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 55680b8e7ddfd..caac14716782a 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -703,8 +703,6 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]): class QwenVLForConditionalGeneration( QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal ): - merge_by_field_config = True - packed_modules_mapping = { "c_attn": ["c_attn"], "gate_up_proj": [ diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 9db1423d98e07..2600dc1c9f79c 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -989,7 +989,6 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): is_pooling_model = True packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} - merge_by_field_config = True @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 55c25ce6190fb..f95fbffc1d0b4 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -647,8 +647,6 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing dummy_inputs=SkyworkR1VDummyInputsBuilder, ) class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 3e55ada0ed2e1..e5038e56a2708 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -916,8 +916,6 @@ class Step3VisionTransformer(nn.Module): dummy_inputs=Step3VLDummyInputsBuilder, ) class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.": "language_model.model.", diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 4d310712f303e..7e82a4d725a62 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -400,8 +400,6 @@ def init_vision_tower_for_tarsier( dummy_inputs=TarsierDummyInputsBuilder, ) class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 9f34090e31071..402081a70631e 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -227,7 +227,6 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor): dummy_inputs=TerratorchInputBuilder, ) class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal): - merge_by_field_config = True supports_multimodal_raw_input_only = True is_pooling_model = True diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py index ccf6053719871..9d77dee2810c3 100644 --- a/vllm/model_executor/models/transformers/multimodal.py +++ b/vllm/model_executor/models/transformers/multimodal.py @@ -264,7 +264,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): class MultiModalMixin(SupportsMultiModal, SupportsMRoPE): supports_multimodal_raw_input_only = True - merge_by_field_config = True + # Backwards compatibility for prev released models. State dicts back then # had different formats and cannot be loaded with `AutoModel` mapping as is hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 2444159b2ad61..32a2ba1ef38f7 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -498,8 +498,6 @@ class ModifiedWhisperEncoder(WhisperEncoder): dummy_inputs=UltravoxDummyInputsBuilder, ) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): - merge_by_field_config = True - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 45f8fa079c714..7b408248ec74c 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -330,8 +330,6 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]) class VoxtralForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription ): - merge_by_field_config = True - supported_languages = ISO639_1_SUPPORTED_LANGS packed_modules_mapping = { diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 0daf6bda61ccb..b2feff1335151 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -775,7 +775,6 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo class WhisperForConditionalGeneration( nn.Module, SupportsTranscription, SupportsMultiModal ): - merge_by_field_config = True packed_modules_mapping = { "self_attn.qkv_proj": [ "self_attn.q_proj", diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index f8e8847e8e609..9c5e3fb2b32ad 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -426,7 +426,6 @@ def group_mm_kwargs_by_modality( Yields: A tuple `(modality, num_items, grouped_kwargs)`. """ - # TODO: After v0.13, remove merge_by_field_config attribute from model impls if merge_by_field_config is not None: logger.warning_once( "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` " From 43e75930318e1cb101fb6178876b027634f91f71 Mon Sep 17 00:00:00 2001 From: Yu Jiaqi <54204033+piood@users.noreply.github.com> Date: Sat, 6 Dec 2025 17:12:53 +0800 Subject: [PATCH 035/133] Support tokenization_kwargs override (#29794) Signed-off-by: piood <2477084691@qq.com> --- tests/conftest.py | 17 ++++++++++---- .../models/multimodal/pooling/test_siglip.py | 18 +++++++++++++-- vllm/entrypoints/llm.py | 22 +++++++++++++++++-- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0d456fb364cda..9f811d5d8db2a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -405,6 +405,7 @@ class HfRunner: images: PromptImageInput | None = None, videos: PromptVideoInput | None = None, audios: PromptAudioInput | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]: if images is not None: assert len(prompts) == len(images) @@ -418,10 +419,18 @@ class HfRunner: all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = [] for i, prompt in enumerate(prompts): if isinstance(prompt, str): - processor_kwargs: dict[str, Any] = { - "text": prompt, - "return_tensors": "pt", - } + # Create a copy to avoid modifying the original dict + processor_kwargs = ( + tokenization_kwargs.copy() + if tokenization_kwargs is not None + else {} + ) + processor_kwargs.update( + { + "text": prompt, + "return_tensors": "pt", + } + ) if images is not None and (image := images[i]) is not None: processor_kwargs["images"] = image if videos is not None and (video := videos[i]) is not None: diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py index 92ae115a19831..72886cbf7f323 100644 --- a/tests/models/multimodal/pooling/test_siglip.py +++ b/tests/models/multimodal/pooling/test_siglip.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + import pytest from transformers import SiglipModel @@ -35,7 +37,11 @@ def _run_test( model: str, *, dtype: str, + tokenization_kwargs: dict[str, Any] | None = None, ) -> None: + if tokenization_kwargs is None: + tokenization_kwargs = {} + with vllm_runner( model, runner="pooling", @@ -44,10 +50,14 @@ def _run_test( max_model_len=64, gpu_memory_utilization=0.7, ) as vllm_model: - vllm_outputs = vllm_model.embed(input_texts, images=input_images) + vllm_outputs = vllm_model.embed( + input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs + ) with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model: - all_inputs = hf_model.get_inputs(input_texts, images=input_images) + all_inputs = hf_model.get_inputs( + input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs + ) all_outputs = [] for inputs in all_inputs: @@ -94,6 +104,10 @@ def test_models_text( input_images, # type: ignore model, dtype=dtype, + tokenization_kwargs={ + "padding": "max_length", + "max_length": 64, + }, # siglip2 was trained with this padding setting. ) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index add9176340a9e..913324fd5f9c3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1076,6 +1076,7 @@ class LLM: params=pooling_params, use_tqdm=use_tqdm, lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, ) outputs = self._run_engine(use_tqdm=use_tqdm) @@ -1113,6 +1114,7 @@ class LLM: use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[EmbeddingRequestOutput]: """ Generate an embedding vector for each prompt. @@ -1150,6 +1152,7 @@ class LLM: pooling_params=pooling_params, lora_request=lora_request, pooling_task="embed", + tokenization_kwargs=tokenization_kwargs, ) return [EmbeddingRequestOutput.from_base(item) for item in items] @@ -1161,6 +1164,7 @@ class LLM: use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ClassificationRequestOutput]: """ Generate class logits for each prompt. @@ -1196,6 +1200,7 @@ class LLM: pooling_params=pooling_params, lora_request=lora_request, pooling_task="classify", + tokenization_kwargs=tokenization_kwargs, ) return [ClassificationRequestOutput.from_base(item) for item in items] @@ -1209,6 +1214,7 @@ class LLM: use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[PoolingRequestOutput]: """ Generate rewards for each prompt. @@ -1236,6 +1242,7 @@ class LLM: pooling_params=pooling_params, truncate_prompt_tokens=truncate_prompt_tokens, pooling_task="token_classify", + tokenization_kwargs=tokenization_kwargs, ) def _embedding_score( @@ -1247,6 +1254,7 @@ class LLM: use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ScoringRequestOutput]: encoded_output: list[PoolingRequestOutput] = self.encode( text_1 + text_2, @@ -1255,6 +1263,7 @@ class LLM: lora_request=lora_request, pooling_params=pooling_params, pooling_task="embed", + tokenization_kwargs=tokenization_kwargs, ) encoded_output_1: list[PoolingRequestOutput] = encoded_output[0 : len(text_1)] @@ -1279,6 +1288,7 @@ class LLM: use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ScoringRequestOutput]: model_config = self.model_config @@ -1294,7 +1304,8 @@ class LLM: pooling_params.verify("score", model_config) pooling_params_list = list[PoolingParams]() - tokenization_kwargs: dict[str, Any] = {} + local_kwargs = tokenization_kwargs or {} + tokenization_kwargs = local_kwargs.copy() _validate_truncation_size( model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs @@ -1557,6 +1568,7 @@ class LLM: use_tqdm: bool | Callable[..., tqdm] = True, lora_request: Sequence[LoRARequest] | LoRARequest | None, priority: list[int] | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> None: if isinstance(prompts, (str, dict)): # Convert a single prompt to a list. @@ -1602,6 +1614,7 @@ class LLM: if isinstance(lora_request, Sequence) else lora_request, priority=priority[i] if priority else 0, + tokenization_kwargs=tokenization_kwargs, ) added_request_ids.append(request_id) except Exception as e: @@ -1665,9 +1678,12 @@ class LLM: *, lora_request: LoRARequest | None, priority: int, + tokenization_kwargs: dict[str, Any] | None = None, ) -> tuple[EngineCoreRequest, dict[str, Any]]: """Use the Processor to process inputs for LLMEngine.""" - tokenization_kwargs: dict[str, Any] = {} + + local_kwargs = tokenization_kwargs or {} + tokenization_kwargs = local_kwargs.copy() _validate_truncation_size( self.model_config.max_model_len, params.truncate_prompt_tokens, @@ -1690,6 +1706,7 @@ class LLM: params: SamplingParams | PoolingParams, lora_request: LoRARequest | None = None, priority: int = 0, + tokenization_kwargs: dict[str, Any] | None = None, ) -> str: prompt_text, _, _ = get_prompt_components(prompt) request_id = str(next(self.request_counter)) @@ -1700,6 +1717,7 @@ class LLM: params, lora_request=lora_request, priority=priority, + tokenization_kwargs=tokenization_kwargs, ) self.llm_engine.add_request( From 92c35abb242babbf592390960fb5a4155261e017 Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Sat, 6 Dec 2025 01:24:03 -0800 Subject: [PATCH 036/133] [Misc] Fix circular import in vllm.transformers_utils.config (#30179) Signed-off-by: Ye (Charlotte) Qi --- vllm/transformers_utils/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f926b523afdfa..773fc05a52ef3 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -25,7 +25,6 @@ from transformers.models.auto.tokenization_auto import get_tokenizer_config from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs -from vllm.config.utils import getattr_iter from vllm.logger import init_logger from vllm.transformers_utils.utils import parse_safetensors_file_metadata @@ -305,6 +304,8 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" + from vllm.config.utils import getattr_iter + rope_theta_names = ("rope_theta", "rotary_emb_base") rope_theta = getattr_iter(config, rope_theta_names, None) if Version(version("transformers")) < Version("5.0.0.dev0"): From 17a9abec2b7c38d55ebb6ea0dfe5cbe135993af2 Mon Sep 17 00:00:00 2001 From: Chukwuma Nwaugha <20521315+nwaughachukwuma@users.noreply.github.com> Date: Sat, 6 Dec 2025 09:42:41 +0000 Subject: [PATCH 037/133] simplify requires_files list creation (#29656) Signed-off-by: Chukwuma Nwaugha --- use_existing_torch.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/use_existing_torch.py b/use_existing_torch.py index fd4caa69ec9c1..e2d3f2ec81956 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -3,9 +3,7 @@ import glob -requires_files = glob.glob("requirements/*.txt") -requires_files += ["pyproject.toml"] -for file in requires_files: +for file in (*glob.glob("requirements/*.txt"), "pyproject.toml"): print(f">>> cleaning {file}") with open(file) as f: lines = f.readlines() @@ -17,5 +15,4 @@ for file in requires_files: f.write(line) else: print(line.strip()) - print(f"<<< done cleaning {file}") - print() + print(f"<<< done cleaning {file}\n") From 21bb323542bad9d7a7206d949f33734caf48c40c Mon Sep 17 00:00:00 2001 From: Viacheslav Date: Sat, 6 Dec 2025 15:04:14 +0300 Subject: [PATCH 038/133] Gigachat 3 tool parser and tests (#29905) Signed-off-by: Viacheslav Barinov --- docs/features/tool_calling.md | 13 ++ .../test_gigachat3_tool_parser.py | 176 ++++++++++++++++ .../openai/tool_parsers/__init__.py | 4 + .../tool_parsers/gigachat3_tool_parser.py | 190 ++++++++++++++++++ 4 files changed, 383 insertions(+) create mode 100644 tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index b6dfbf10b4568..c77fe44659790 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -376,6 +376,19 @@ Supported models: Flags: `--tool-call-parser olmo3` +### Gigachat 3 Models (`gigachat3`) + +Use chat template from the Hugging Face model files. + +Supported models: + +* `ai-sage/GigaChat3-702B-A36B-preview` +* `ai-sage/GigaChat3-702B-A36B-preview-bf16` +* `ai-sage/GigaChat3-10B-A1.8B` +* `ai-sage/GigaChat3-10B-A1.8B-bf16` + +Flags: `--tool-call-parser gigachat3` + ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. diff --git a/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py new file mode 100644 index 0000000000000..02c5189d0f6c1 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest + +from tests.entrypoints.openai.tool_parsers.utils import ( + run_tool_extraction, + run_tool_extraction_streaming, +) +from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from vllm.tokenizers import TokenizerLike + +SIMPLE_ARGS_DICT = { + "action": "create", + "id": "preferences", +} +SIMPLE_FUNCTION_JSON = json.dumps( + { + "name": "manage_user_memory", + "arguments": SIMPLE_ARGS_DICT, + }, + ensure_ascii=False, +) +SIMPLE_FUNCTION_OUTPUT = "function call" + SIMPLE_FUNCTION_JSON +SIMPLE_FUNCTION_CALL = FunctionCall( + name="manage_user_memory", + arguments=json.dumps(SIMPLE_ARGS_DICT, ensure_ascii=False), +) + + +PARAMETERLESS_FUNCTION_JSON = json.dumps( + { + "name": "manage_user_memory", + "arguments": {}, + }, + ensure_ascii=False, +) +PARAMETERLESS_FUNCTION_OUTPUT = "function call" + PARAMETERLESS_FUNCTION_JSON +PARAMETERLESS_FUNCTION_CALL = FunctionCall( + name="manage_user_memory", + arguments=json.dumps({}, ensure_ascii=False), +) + + +COMPLEX_ARGS_DICT = { + "action": "create", + "id": "preferences", + "content": { + "short_answers": True, + "hate_emojis": True, + "english_ui": False, + "russian_math_explanations": True, + }, +} +COMPLEX_FUNCTION_JSON = json.dumps( + { + "name": "manage_user_memory", + "arguments": COMPLEX_ARGS_DICT, + }, + ensure_ascii=False, +) +COMPLEX_FUNCTION_OUTPUT = "function call" + COMPLEX_FUNCTION_JSON +COMPLEX_FUNCTION_CALL = FunctionCall( + name="manage_user_memory", + arguments=json.dumps(COMPLEX_ARGS_DICT, ensure_ascii=False), +) + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_tool_call(streaming: bool, default_tokenizer: TokenizerLike): + tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")( + default_tokenizer + ) + model_output = "How can I help you today?" + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=streaming + ) + assert content == model_output + assert len(tool_calls) == 0 + + +TEST_CASES = [ + pytest.param( + True, + SIMPLE_FUNCTION_OUTPUT, + [SIMPLE_FUNCTION_CALL], + None, + id="simple_streaming", + ), + pytest.param( + False, + SIMPLE_FUNCTION_OUTPUT, + [SIMPLE_FUNCTION_CALL], + None, + id="simple_nonstreaming", + ), + pytest.param( + True, + PARAMETERLESS_FUNCTION_OUTPUT, + [PARAMETERLESS_FUNCTION_CALL], + None, + id="parameterless_streaming", + ), + pytest.param( + False, + PARAMETERLESS_FUNCTION_OUTPUT, + [PARAMETERLESS_FUNCTION_CALL], + None, + id="parameterless_nonstreaming", + ), + pytest.param( + True, + COMPLEX_FUNCTION_OUTPUT, + [COMPLEX_FUNCTION_CALL], + None, + id="complex_streaming", + ), + pytest.param( + False, + COMPLEX_FUNCTION_OUTPUT, + [COMPLEX_FUNCTION_CALL], + None, + id="complex_nonstreaming", + ), +] + + +@pytest.mark.parametrize( + "streaming, model_output, expected_tool_calls, expected_content", TEST_CASES +) +def test_tool_call( + streaming: bool, + model_output: str, + expected_tool_calls: list[FunctionCall], + expected_content: str | None, + default_tokenizer: TokenizerLike, +): + tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")( + default_tokenizer + ) + content, tool_calls = run_tool_extraction( + tool_parser, model_output, streaming=streaming + ) + assert content == expected_content + assert len(tool_calls) == len(expected_tool_calls) + for actual, expected in zip(tool_calls, expected_tool_calls): + assert actual.type == "function" + assert actual.function.name == expected.name + actual_args = json.loads(actual.function.arguments) + expected_args = json.loads(expected.arguments) + assert actual_args == expected_args + + +def test_streaming_tool_call_with_large_steps(default_tokenizer: TokenizerLike): + tool_parser: ToolParser = ToolParserManager.get_tool_parser("gigachat3")( + default_tokenizer + ) + model_output_deltas = [ + "function call", + COMPLEX_FUNCTION_JSON[:40], + COMPLEX_FUNCTION_JSON[40:], + ] + reconstructor = run_tool_extraction_streaming( + tool_parser, + model_output_deltas, + assert_one_tool_per_delta=False, + ) + assert len(reconstructor.tool_calls) == 1 + call = reconstructor.tool_calls[0] + assert call.type == "function" + assert call.function.name == "manage_user_memory" + args_dict = json.loads(call.function.arguments) + assert args_dict == COMPLEX_ARGS_DICT diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index ed43ea7eec82f..7be1263e802dc 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -134,6 +134,10 @@ _TOOL_PARSERS_TO_REGISTER = { "xlam_tool_parser", "xLAMToolParser", ), + "gigachat3": ( + "gigachat3_tool_parser", + "GigaChat3ToolParser", + ), } diff --git a/vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py new file mode 100644 index 0000000000000..dd27ffa83cfc4 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/gigachat3_tool_parser.py @@ -0,0 +1,190 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from collections.abc import Sequence + +import regex as re + +from vllm.entrypoints.chat_utils import make_tool_call_id +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParser +from vllm.logger import init_logger +from vllm.tokenizers import TokenizerLike + +logger = init_logger(__name__) + +REGEX_FUNCTION_CALL = re.compile( + r"function call(?:<\|role_sep\|>\n)?(\{.*)", + re.DOTALL, +) + +NAME_REGEX = re.compile( + r'"name"\s*:\s*"([^"]*)"', + re.DOTALL, +) + +ARGS_REGEX = re.compile( + r'"arguments"\s*:\s*(.*)', + re.DOTALL, +) + + +class GigaChat3ToolParser(ToolParser): + def __init__(self, tokenizer: TokenizerLike): + super().__init__(tokenizer) + self.tool_started: bool = False + self.tool_name_sent: bool = False + self.tool_id: str | None = None + self.prev_tool_call_arr: list[dict] = [] + self.content_buffer: str = "" + self.trigger_start = "function call{" + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + match = REGEX_FUNCTION_CALL.search(model_output) + if not match: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output, + ) + json_candidate = match.group(1).strip() + try: + data = json.loads(json_candidate) + except json.JSONDecodeError: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output, + ) + if not (isinstance(data, dict) and "name" in data and "arguments" in data): + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output, + ) + name = data["name"] + args = data["arguments"] + if not isinstance(args, str): + args = json.dumps(args, ensure_ascii=False) + + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=name, + arguments=args, + ), + ) + ] + prefix = model_output[: match.start()] + content = prefix.rstrip() if prefix and prefix.strip() else None + + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content, + ) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> DeltaMessage | None: + func_name = None + cur_args = None + if not self.tool_started: + match = REGEX_FUNCTION_CALL.search(current_text) + if match: + self.tool_started = True + self.content_buffer = "" + else: + self.content_buffer += delta_text + clean_buffer = self.content_buffer.lstrip() + is_prefix = self.trigger_start.startswith(clean_buffer) + starts_with_trigger = clean_buffer.startswith(self.trigger_start) + if is_prefix or starts_with_trigger: + return None + else: + flush_text = self.content_buffer + self.content_buffer = "" + return DeltaMessage(content=flush_text) + + match = REGEX_FUNCTION_CALL.search(current_text) + if not match: + return None + json_tail = match.group(1).strip() + name_match = NAME_REGEX.search(json_tail) + if name_match: + func_name = name_match.group(1) + args_match = ARGS_REGEX.search(json_tail) + if args_match: + cur_args = args_match.group(1).strip() + if cur_args.endswith("}"): # last '}' end of json + try: + candidate = cur_args[:-1].strip() + json.loads(candidate) + cur_args = candidate + except json.JSONDecodeError: + pass + if not self.prev_tool_call_arr: + self.prev_tool_call_arr.append({}) + if not self.tool_name_sent: + if not func_name: + return None + self.tool_name_sent = True + self.tool_id = make_tool_call_id() + self.prev_tool_call_arr[0]["name"] = func_name + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=0, + id=self.tool_id, + type="function", + function=DeltaFunctionCall( + name=func_name, + ).model_dump(exclude_none=True), + ) + ], + content=None, + ) + if cur_args is None: + return None + prev_args = self.prev_tool_call_arr[0].get("arguments", "") + if not prev_args: + delta_args = cur_args + elif cur_args.startswith(prev_args): + delta_args = cur_args[len(prev_args) :] + else: + return None + if not delta_args: + return None + self.prev_tool_call_arr[0]["arguments"] = cur_args + return DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=0, + function=DeltaFunctionCall( + arguments=delta_args, + ).model_dump(exclude_none=True), + ) + ], + content=None, + ) From 671427efbf57d4e40370a336cf1d3a4d1fd8eb91 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 6 Dec 2025 21:40:02 +0800 Subject: [PATCH 039/133] [Model] Move `multimodal_cpu_fields` definition to field config (#30181) Signed-off-by: DarkLight1337 --- tests/distributed/test_shm_storage.py | 2 +- tests/multimodal/test_cache.py | 2 +- tests/v1/test_serial_utils.py | 21 +++- vllm/model_executor/models/glm4_1v.py | 29 ++--- vllm/model_executor/models/hunyuan_vision.py | 4 +- vllm/model_executor/models/interfaces.py | 13 ++- vllm/model_executor/models/opencua.py | 2 - vllm/model_executor/models/qwen2_5_vl.py | 2 - vllm/model_executor/models/qwen2_vl.py | 12 +- vllm/model_executor/models/qwen3_vl.py | 6 +- vllm/multimodal/inputs.py | 116 +++++++++++++------ vllm/multimodal/utils.py | 8 +- vllm/v1/serial_utils.py | 11 +- vllm/v1/worker/gpu_model_runner.py | 5 - vllm/v1/worker/tpu_model_runner.py | 3 - 15 files changed, 141 insertions(+), 95 deletions(-) diff --git a/tests/distributed/test_shm_storage.py b/tests/distributed/test_shm_storage.py index b9a5c22447fd8..9ab35a292f872 100644 --- a/tests/distributed/test_shm_storage.py +++ b/tests/distributed/test_shm_storage.py @@ -28,7 +28,7 @@ def _dummy_elem(modality: str, key: str, size: int): modality=modality, key=key, data=torch.empty((size,), dtype=torch.int8), - field=MultiModalSharedField(1), + field=MultiModalSharedField(batch_size=1), ) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index e4fcc34740edb..e641b1111abaf 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -51,7 +51,7 @@ def _dummy_elem( modality=modality, key=key, data=data, - field=MultiModalSharedField(1), + field=MultiModalSharedField(batch_size=1), ) diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index 00749c5415c8e..dbbbfce97d286 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -104,22 +104,31 @@ class MyRequest(msgspec.Struct): def test_multimodal_kwargs(): e1 = MultiModalFieldElem( - "audio", "a0", torch.zeros(1000, dtype=torch.bfloat16), MultiModalBatchedField() + "audio", + "a0", + torch.zeros(1000, dtype=torch.bfloat16), + MultiModalBatchedField(), ) e2 = MultiModalFieldElem( "video", "v0", [torch.zeros(1000, dtype=torch.int8) for _ in range(4)], - MultiModalFlatField([[slice(1, 2, 3), slice(4, 5, 6)], [slice(None, 2)]], 0), + MultiModalFlatField( + slices=[[slice(1, 2, 3), slice(4, 5, 6)], [slice(None, 2)]], + dim=0, + ), ) e3 = MultiModalFieldElem( - "image", "i0", torch.zeros(1000, dtype=torch.int32), MultiModalSharedField(4) + "image", + "i0", + torch.zeros(1000, dtype=torch.int32), + MultiModalSharedField(batch_size=4), ) e4 = MultiModalFieldElem( "image", "i1", torch.zeros(1000, dtype=torch.int32), - MultiModalFlatField([slice(1, 2, 3), slice(4, 5, 6)], 2), + MultiModalFlatField(slices=[slice(1, 2, 3), slice(4, 5, 6)], dim=2), ) audio = MultiModalKwargsItem.from_elems([e1]) video = MultiModalKwargsItem.from_elems([e2]) @@ -138,8 +147,8 @@ def test_multimodal_kwargs(): total_len = sum(memoryview(x).cast("B").nbytes for x in encoded) - # expected total encoding length, should be 14306, +-20 for minor changes - assert 14275 <= total_len <= 14325 + # expected total encoding length, should be 14395, +-20 for minor changes + assert 14375 <= total_len <= 14425 decoded = decoder.decode(encoded).mm[0] assert isinstance(decoded, MultiModalKwargsItems) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 3cb53f2cbabee..39a837b789bed 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -787,10 +787,10 @@ class Glm4vVisionTransformer(nn.Module): def forward( self, x: torch.Tensor, - grid_thw: list[list[int]], + grid_thw: torch.Tensor | list[list[int]], ) -> torch.Tensor: - # Convert grid_thw to tensor (always expecting list format now) - grid_thw = torch.tensor(grid_thw, device=x.device, dtype=torch.long) + if isinstance(grid_thw, list): + grid_thw = torch.tensor(grid_thw, dtype=torch.int32) # patchify x = x.to(device=self.device, dtype=self.dtype) @@ -805,7 +805,8 @@ class Glm4vVisionTransformer(nn.Module): cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] ).cumsum(dim=0, dtype=torch.int32) - cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) + cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens]) + cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) # pre-compute max_seqlen for attn mask to reduce cuMemcpy operations max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) @@ -1548,7 +1549,6 @@ class Glm4vForConditionalGeneration( ) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 - grid_thw_list = grid_thw.tolist() if image_input["type"] == "image_embeds": image_embeds = image_input["image_embeds"].type(self.visual.dtype) @@ -1559,12 +1559,10 @@ class Glm4vForConditionalGeneration( self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d" ) else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw.tolist()) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + merge_size = self.visual.spatial_merge_size - sizes = ( - torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) - // (merge_size * merge_size) - ).tolist() + sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist() return image_embeds.split(sizes) def _process_video_input( @@ -1572,7 +1570,6 @@ class Glm4vForConditionalGeneration( ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 - grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": video_embeds = video_input["video_embeds"].type(self.visual.dtype) @@ -1588,15 +1585,11 @@ class Glm4vForConditionalGeneration( rope_type="rope_3d", ) else: - video_embeds = self.visual( - pixel_values_videos, grid_thw=grid_thw.tolist() - ) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size - sizes = ( - torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) - // (merge_size * merge_size) - ).tolist() + sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist() return video_embeds.split(sizes) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index 52ce9564c8d79..e5c1be626be07 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -563,7 +563,7 @@ def _hunyuan_vl_field_config(hf_inputs: Mapping[str, torch.Tensor]): return dict( pixel_values=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), image_embeds=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), - image_grid_thw=MultiModalFieldConfig.batched("image"), + image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True), ) @@ -786,8 +786,6 @@ class HunYuanVLForConditionalGeneration( SupportsQuant, SupportsXDRoPE, ): - multimodal_cpu_fields = {"image_grid_thw"} - # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 416ab236cd18d..607ff55835f1d 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -84,9 +84,9 @@ class SupportsMultiModal(Protocol): `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use. """ - multimodal_cpu_fields: ClassVar[Set[str]] = frozenset() + multimodal_cpu_fields: ClassVar[Set[str] | None] = None """ - A set indicating CPU-only multimodal fields. + [DEPRECATED] A set indicating CPU-only multimodal fields. """ _processor_factory: ClassVar[_ProcessorFactories] @@ -279,6 +279,15 @@ def supports_multimodal( "please remove the override from your model." ) + multimodal_cpu_fields = getattr(model, "multimodal_cpu_fields", None) + if multimodal_cpu_fields is not None: + raise ValueError( + "`multimodal_cpu_fields` is no longer effective, " + "please set `keep_on_cpu=True` in `MultiModalFieldConfig` " + "(refer to https://github.com/vllm-project/vllm/pull/30181), " + "and then remove the override from your model." + ) + return res diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py index 76a2d1cc242c5..23668cc2b746e 100644 --- a/vllm/model_executor/models/opencua.py +++ b/vllm/model_executor/models/opencua.py @@ -201,8 +201,6 @@ class OpenCUADummyInputsBuilder(Qwen2VLDummyInputsBuilder): dummy_inputs=OpenCUADummyInputsBuilder, ) class OpenCUAForConditionalGeneration(Qwen2_5_VLForConditionalGeneration): - multimodal_cpu_fields = {"image_grid_thw"} - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 488af192bfad0..3cc3a3a7873c6 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1039,8 +1039,6 @@ class Qwen2_5_VLForConditionalGeneration( SupportsMultiModalPruning, SupportsMRoPE, ): - multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} - packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 9da5080f84303..885e172d1e81a 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -811,14 +811,14 @@ def _create_qwen2vl_field_factory( image_embeds=MultiModalFieldConfig.flat_from_sizes( "image", image_embed_grid_sizes ), - image_grid_thw=MultiModalFieldConfig.batched("image"), + image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True), pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( "video", video_grid_sizes ), video_embeds=MultiModalFieldConfig.flat_from_sizes( "video", video_embed_grid_sizes ), - video_grid_thw=MultiModalFieldConfig.batched("video"), + video_grid_thw=MultiModalFieldConfig.batched("video", keep_on_cpu=True), ) return _qwen2vl_field_config @@ -1131,8 +1131,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]) class Qwen2VLForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): - multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} - # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -1393,9 +1391,11 @@ class Qwen2VLForConditionalGeneration( else: pixel_values_videos = video_input["pixel_values_videos"] if self.use_data_parallel: - grid_thw_list = grid_thw.tolist() return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d" + self.visual, + pixel_values_videos, + grid_thw.tolist(), + rope_type="rope_3d", ) else: video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index a5b10c95872dd..1add39d6b0a84 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -984,14 +984,14 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]) image_embeds=MultiModalFieldConfig.flat_from_sizes( "image", image_grid_sizes ), - image_grid_thw=MultiModalFieldConfig.batched("image"), + image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True), pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( "video", video_grid_sizes ), video_embeds=MultiModalFieldConfig.flat_from_sizes( "video", video_grid_sizes ), - video_grid_thw=MultiModalFieldConfig.batched("video"), + video_grid_thw=MultiModalFieldConfig.batched("video", keep_on_cpu=True), ) def _get_prompt_updates( @@ -1190,8 +1190,6 @@ class Qwen3VLForConditionalGeneration( SupportsMRoPE, SupportsEagle3, ): - multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} - packed_modules_mapping = { "qkv_proj": [ "q_proj", diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 32f15240cb7da..d9118f5b9e9a5 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from collections.abc import Mapping, Sequence, Set +from collections.abc import Mapping, Sequence from dataclasses import dataclass from functools import partial from itertools import accumulate @@ -223,6 +223,23 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: return a == b +def _nested_tensors_h2d( + tensors: NestedTensors, + device: torch.types.Device, +) -> NestedTensors: + if device is None: + return tensors + + return json_map_leaves( + ( + lambda x: x.to(device=device, non_blocking=True) + if isinstance(x, torch.Tensor) + else x + ), + tensors, + ) + + BatchedTensorInputs: TypeAlias = dict[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via @@ -334,7 +351,7 @@ class MultiModalFieldElem: ) # noqa: E721 -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class BaseMultiModalField(ABC): """ Defines how to interpret tensor data belonging to a keyword argument in @@ -342,6 +359,12 @@ class BaseMultiModalField(ABC): multi-modal items, and vice versa. """ + keep_on_cpu: bool = False + """ + If `True`, then this field is excluded from being moved to the accelerator + when `MultiModalKwargsItems.get_data()` is called to batch the data. + """ + def _field_factory(self, *, modality: str, key: str): f = partial( MultiModalFieldElem, @@ -386,6 +409,7 @@ class BaseMultiModalField(ABC): self, elems: list[MultiModalFieldElem], *, + device: torch.types.Device = None, pin_memory: bool = False, ) -> NestedTensors: """ @@ -399,11 +423,17 @@ class BaseMultiModalField(ABC): if len(set(field_types)) > 1: raise ValueError(f"Cannot merge different {field_types=}") + if device is not None and self.keep_on_cpu: + device = "cpu" + if pin_memory and self.keep_on_cpu: + pin_memory = False + batch = [elem.data for elem in elems] - return self._reduce_data(batch, pin_memory=pin_memory) + out = self._reduce_data(batch, pin_memory=pin_memory) + return _nested_tensors_h2d(out, device=device) -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class MultiModalBatchedField(BaseMultiModalField): """ Info: @@ -445,7 +475,7 @@ class MultiModalBatchedField(BaseMultiModalField): return batch -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class MultiModalFlatField(BaseMultiModalField): """ Info: @@ -505,7 +535,7 @@ class MultiModalFlatField(BaseMultiModalField): return [e for elem in batch for e in elem] -@dataclass(frozen=True) +@dataclass(frozen=True, kw_only=True) class MultiModalSharedField(BaseMultiModalField): """ Info: @@ -532,9 +562,10 @@ class MultiModalSharedField(BaseMultiModalField): return batch[0] +@dataclass(frozen=True) class MultiModalFieldConfig: @staticmethod - def batched(modality: str): + def batched(modality: str, *, keep_on_cpu: bool = False): """ Defines a field where an element in the batch is obtained by indexing into the first dimension of the underlying data. @@ -542,6 +573,7 @@ class MultiModalFieldConfig: Args: modality: The modality of the multi-modal item that uses this keyword argument. + keep_on_cpu: Whether to keep this field on the CPU for the model inputs. Example: @@ -558,7 +590,7 @@ class MultiModalFieldConfig: ``` """ return MultiModalFieldConfig( - field=MultiModalBatchedField(), + field=MultiModalBatchedField(keep_on_cpu=keep_on_cpu), modality=modality, ) @@ -567,6 +599,8 @@ class MultiModalFieldConfig: modality: str, slices: Sequence[slice] | Sequence[Sequence[slice]], dim: int = 0, + *, + keep_on_cpu: bool = False, ): """ Defines a field where an element in the batch is obtained by @@ -579,6 +613,7 @@ class MultiModalFieldConfig: slices (dim>0) that is used to extract the data corresponding to it. dim: The dimension to extract data, default to 0. + keep_on_cpu: Whether to keep this field on the CPU for the model inputs. Example: @@ -613,12 +648,22 @@ class MultiModalFieldConfig: ``` """ return MultiModalFieldConfig( - field=MultiModalFlatField(slices=slices, dim=dim), + field=MultiModalFlatField( + slices=slices, + dim=dim, + keep_on_cpu=keep_on_cpu, + ), modality=modality, ) @staticmethod - def flat_from_sizes(modality: str, size_per_item: "torch.Tensor", dim: int = 0): + def flat_from_sizes( + modality: str, + size_per_item: "torch.Tensor", + dim: int = 0, + *, + keep_on_cpu: bool = False, + ): """ Defines a field where an element in the batch is obtained by slicing along the first dimension of the underlying data. @@ -629,6 +674,7 @@ class MultiModalFieldConfig: size_per_item: For each multi-modal item, the size of the slice that is used to extract the data corresponding to it. dim: The dimension to slice, default to 0. + keep_on_cpu: Whether to keep this field on the CPU for the model inputs. Example: @@ -676,10 +722,20 @@ class MultiModalFieldConfig: for i in range(len(size_per_item)) ] - return MultiModalFieldConfig.flat(modality, slices, dim=dim) + return MultiModalFieldConfig.flat( + modality, + slices, + dim=dim, + keep_on_cpu=keep_on_cpu, + ) @staticmethod - def shared(modality: str, batch_size: int): + def shared( + modality: str, + batch_size: int, + *, + keep_on_cpu: bool = False, + ): """ Defines a field where an element in the batch is obtained by taking the entirety of the underlying data. @@ -690,6 +746,7 @@ class MultiModalFieldConfig: modality: The modality of the multi-modal item that uses this keyword argument. batch_size: The number of multi-modal items which share this data. + keep_on_cpu: Whether to keep this field on the CPU for the model inputs. Example: @@ -708,18 +765,15 @@ class MultiModalFieldConfig: ``` """ return MultiModalFieldConfig( - field=MultiModalSharedField(batch_size), + field=MultiModalSharedField( + batch_size=batch_size, + keep_on_cpu=keep_on_cpu, + ), modality=modality, ) - def __init__(self, field: BaseMultiModalField, modality: str) -> None: - super().__init__() - - self.field = field - self.modality = modality - - def __repr__(self) -> str: - return f"MultiModalFieldConfig(field={self.field}, modality={self.modality})" + field: BaseMultiModalField + modality: str def build_elems( self, @@ -744,7 +798,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): modality=modality, key="dummy", data=torch.empty(nbytes, dtype=torch.uint8), - field=MultiModalSharedField(1), + field=MultiModalSharedField(batch_size=1), ) return MultiModalKwargsItem.from_elems([mm_elem]) @@ -844,7 +898,6 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]): *, device: torch.types.Device = None, pin_memory: bool = False, - cpu_fields: Set[str] = frozenset(), ) -> BatchedTensorInputs: """Construct a dictionary of keyword arguments to pass to the model.""" elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) @@ -859,21 +912,14 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]): elems_by_key[key].append(elem) data = { - key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) + key: elems[0].field.reduce_data( + elems, + device=device, + pin_memory=pin_memory, + ) for key, elems in elems_by_key.items() } - if device is not None: - for k in data.keys() - cpu_fields: - data[k] = json_map_leaves( - ( - lambda x: x.to(device=device, non_blocking=True) - if isinstance(x, torch.Tensor) - else x - ), - data[k], - ) - return data diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 9c5e3fb2b32ad..d4bdc55e569b2 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -413,7 +413,7 @@ def group_mm_kwargs_by_modality( device: torch.types.Device = None, pin_memory: bool = False, merge_by_field_config: bool | None = None, - multimodal_cpu_fields: Set[str] = frozenset(), + multimodal_cpu_fields: Set[str] | None = None, ) -> Generator[tuple[str, int, BatchedTensorInputs], None, None]: """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same modality together into the same `MultiModalKwargs` instance. @@ -431,6 +431,11 @@ def group_mm_kwargs_by_modality( "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` " "is deprecated and will be removed in v0.13." ) + if multimodal_cpu_fields is not None: + logger.warning_once( + "The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` " + "is deprecated and will be removed in v0.13." + ) from vllm.multimodal.inputs import MultiModalKwargsItems @@ -440,7 +445,6 @@ def group_mm_kwargs_by_modality( mm_kwargs_data = mm_kwargs_items.get_data( device=device, pin_memory=pin_memory, - cpu_fields=multimodal_cpu_fields, ) yield modality, len(items_lst), mm_kwargs_data diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 14ae487f3eb13..a3c30e368b828 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -269,10 +269,11 @@ class MsgpackEncoder: name = MMF_CLASS_TO_FACTORY.get(field.__class__) if not name: raise TypeError(f"Unsupported field type: {field.__class__}") + # We just need to copy all of the field values in order # which will be then used to reconstruct the field. - field_values = (getattr(field, f.name) for f in dataclasses.fields(field)) - return name, *field_values + factory_kw = {f.name: getattr(field, f.name) for f in dataclasses.fields(field)} + return name, factory_kw class MsgpackDecoder: @@ -392,15 +393,15 @@ class MsgpackDecoder: obj["data"] = self._decode_nested_tensors(obj["data"]) # Reconstruct the field processor using MultiModalFieldConfig - factory_meth_name, *field_args = obj["field"] + factory_meth_name, factory_kw = obj["field"] factory_meth = getattr(MultiModalFieldConfig, factory_meth_name) # Special case: decode the union "slices" field of # MultiModalFlatField if factory_meth_name == "flat": - field_args[0] = self._decode_nested_slices(field_args[0]) + factory_kw["slices"] = self._decode_nested_slices(factory_kw["slices"]) - obj["field"] = factory_meth(None, *field_args).field + obj["field"] = factory_meth("", **factory_kw).field return MultiModalFieldElem(**obj) def _decode_nested_tensors(self, obj: Any) -> NestedTensors: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b6a8145226b3f..a50360ab08694 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1097,7 +1097,6 @@ class GPUModelRunner( device=self.device, pin_memory=self.pin_memory, merge_by_field_config=model.merge_by_field_config, - multimodal_cpu_fields=model.multimodal_cpu_fields, ): mm_kwargs_combined.update(mm_kwargs_group) @@ -2109,7 +2108,6 @@ class GPUModelRunner( mm_kwargs, device=self.device, pin_memory=self.pin_memory, - multimodal_cpu_fields=model.multimodal_cpu_fields, ): curr_group_outputs: list[torch.Tensor] = [] @@ -2135,7 +2133,6 @@ class GPUModelRunner( [video_mm_kwargs_item], device=self.device, pin_memory=self.pin_memory, - multimodal_cpu_fields=model.multimodal_cpu_fields, ) ) @@ -3887,14 +3884,12 @@ class GPUModelRunner( dummy_mm_item = dummy_mm_data[modality][0] dummy_mm_items = [dummy_mm_item] * max_items_per_batch - model = cast(SupportsMultiModal, self.model) return next( mm_kwargs_group for _, _, mm_kwargs_group in group_mm_kwargs_by_modality( dummy_mm_items, device=self.device, pin_memory=self.pin_memory, - multimodal_cpu_fields=model.multimodal_cpu_fields, ) ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 292f12969aaed..283f21b779e38 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -969,7 +969,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mm_kwargs, device=self.device, pin_memory=self.pin_memory, - multimodal_cpu_fields=model.multimodal_cpu_fields, ): # Run the encoder. # `curr_group_outputs` is either of the following: @@ -2050,14 +2049,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): dummy_mm_item = dummy_mm_data[modality][0] dummy_mm_items = [dummy_mm_item] * max_items_per_batch - model = cast(SupportsMultiModal, self.model) return next( grouped_mm_kwargs for _, _, grouped_mm_kwargs in group_mm_kwargs_by_modality( dummy_mm_items, device=self.device, pin_memory=self.pin_memory, - multimodal_cpu_fields=model.multimodal_cpu_fields, ) ) From 421125d03a110df7d49f84c7cf8ee9fa089d1dff Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Sat, 6 Dec 2025 14:34:34 -0800 Subject: [PATCH 040/133] [ez] move harmony utils to parser folder (#30117) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- tests/entrypoints/openai/parser/__init__.py | 0 tests/entrypoints/{ => openai/parser}/test_harmony_utils.py | 2 +- tests/entrypoints/openai/test_response_api_with_harmony.py | 2 +- vllm/entrypoints/context.py | 2 +- vllm/entrypoints/{ => openai/parser}/harmony_utils.py | 0 vllm/entrypoints/openai/serving_chat.py | 4 ++-- vllm/entrypoints/openai/serving_responses.py | 4 ++-- vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py | 2 +- vllm/reasoning/gptoss_reasoning_parser.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 tests/entrypoints/openai/parser/__init__.py rename tests/entrypoints/{ => openai/parser}/test_harmony_utils.py (99%) rename vllm/entrypoints/{ => openai/parser}/harmony_utils.py (100%) diff --git a/tests/entrypoints/openai/parser/__init__.py b/tests/entrypoints/openai/parser/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py similarity index 99% rename from tests/entrypoints/test_harmony_utils.py rename to tests/entrypoints/openai/parser/test_harmony_utils.py index 82ff562d5c6d2..ae6f558f22b03 100644 --- a/tests/entrypoints/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -4,7 +4,7 @@ from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem from openai_harmony import Author, Message, Role, TextContent -from vllm.entrypoints.harmony_utils import ( +from vllm.entrypoints.openai.parser.harmony_utils import ( has_custom_tools, parse_input_to_harmony_message, parse_output_message, diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 8fd3545eccffa..6f2a50020699c 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -726,7 +726,7 @@ async def test_function_calling_required(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_system_message_with_tools(client: OpenAI, model_name: str): - from vllm.entrypoints.harmony_utils import get_system_message + from vllm.entrypoints.openai.parser.harmony_utils import get_system_message # Test with custom tools enabled - commentary channel should be available sys_msg = get_system_message(with_custom_tools=True) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index f50c473d7a773..a484a437c853e 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -19,7 +19,7 @@ from vllm import envs from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, ) -from vllm.entrypoints.harmony_utils import ( +from vllm.entrypoints.openai.parser.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, render_for_completion, diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py similarity index 100% rename from vllm/entrypoints/harmony_utils.py rename to vllm/entrypoints/openai/parser/harmony_utils.py diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9b7bc461e4511..c6333d170c663 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -21,7 +21,8 @@ from vllm.entrypoints.chat_utils import ( get_history_tool_calls_cnt, make_tool_call_id, ) -from vllm.entrypoints.harmony_utils import ( +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.parser.harmony_utils import ( get_developer_message, get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, @@ -30,7 +31,6 @@ from vllm.entrypoints.harmony_utils import ( parse_input_to_harmony_message, render_for_completion, ) -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 1eb1243e7e5bc..91616a78e11dc 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -64,7 +64,8 @@ from vllm.entrypoints.context import ( SimpleContext, StreamingHarmonyContext, ) -from vllm.entrypoints.harmony_utils import ( +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.parser.harmony_utils import ( construct_harmony_previous_input_messages, get_developer_message, get_stop_tokens_for_assistant_actions, @@ -76,7 +77,6 @@ from vllm.entrypoints.harmony_utils import ( parse_response_input, render_for_completion, ) -from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( DeltaMessage, ErrorResponse, diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py index 8bdf35d408805..387e87f208e66 100644 --- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py @@ -4,7 +4,7 @@ import json from collections.abc import Sequence from typing import TYPE_CHECKING -from vllm.entrypoints.harmony_utils import parse_output_into_messages +from vllm.entrypoints.openai.parser.harmony_utils import parse_output_into_messages from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, DeltaMessage, diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index fa45b12856c7d..e0920ef3160b2 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -5,7 +5,7 @@ from collections.abc import Sequence from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.harmony_utils import parse_chat_output +from vllm.entrypoints.openai.parser.harmony_utils import parse_chat_output from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage from vllm.entrypoints.tool_server import ToolServer from vllm.logger import init_logger From 8d3da4c79ddbd199d4185c505bcf3a5d7e7a3316 Mon Sep 17 00:00:00 2001 From: AuruTus <33182215+AuruTus@users.noreply.github.com> Date: Sun, 7 Dec 2025 08:21:03 +0800 Subject: [PATCH 041/133] [MISC]: change NIXL compatibility hash logging level to debug (#30182) --- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 7aa12e9993c76..514b8534aaa6b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -189,7 +189,7 @@ def compute_nixl_compatibility_hash( } compat_hash = hash_factors(factors) - logger.info( + logger.debug( "NIXL compatibility hash: %s (model=%s, dtype=%s, num_kv_heads=%d, " "cache_dtype=%s, attn_backend=%s)", compat_hash, From cbedb703cc594632db796f9ca748ea4b7b4e8435 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Sat, 6 Dec 2025 18:53:42 -0800 Subject: [PATCH 042/133] [Frontend] Remove confusing -O.xx flag error (#30169) Signed-off-by: Yanan Cao --- tests/utils_/test_argparse_utils.py | 19 ------------------- vllm/utils/argparse_utils.py | 7 ------- 2 files changed, 26 deletions(-) diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py index 6f24c77e066a4..fbc278404e3f0 100644 --- a/tests/utils_/test_argparse_utils.py +++ b/tests/utils_/test_argparse_utils.py @@ -458,22 +458,3 @@ def test_flat_product(): (3, 4, "a", 5, 6), (3, 4, "b", 5, 6), ] - - -def test_o_dotted_syntax_error(): - """Test that -O.* dotted syntax raises a clear error message.""" - parser = FlexibleArgumentParser() - parser.add_argument("-cc", "--compilation-config", type=json.loads) - - # Test that -O.* syntax raises a clear ValueError - with pytest.raises(ValueError, match=r"The -O\.\* syntax is no longer supported"): - parser.parse_args(["-O.backend=eager"]) - - with pytest.raises(ValueError, match=r"Please use -cc\.\* instead"): - parser.parse_args(["-O.mode=2"]) - - with pytest.raises( - ValueError, - match=r"replace '-O\.cudagraph_mode=NONE' with '-cc\.cudagraph_mode=NONE'", - ): - parser.parse_args(["-O.cudagraph_mode=NONE"]) diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py index 356f383cc52bd..87ee6f54c0c9b 100644 --- a/vllm/utils/argparse_utils.py +++ b/vllm/utils/argparse_utils.py @@ -244,13 +244,6 @@ class FlexibleArgumentParser(ArgumentParser): else: key = pattern.sub(repl, arg, count=1) processed_args.append(key) - elif arg.startswith("-O."): - # Provide clear error for deprecated -O.* syntax - raise ValueError( - f"The -O.* syntax is no longer supported. " - f"Please use -cc.* instead. " - f"For example, replace '{arg}' with '{arg.replace('-O', '-cc', 1)}'" - ) elif arg.startswith("-O") and arg != "-O": # allow -O flag to be used without space, e.g. -O3 or -Odecode # also handle -O= here From dce6d229f7d405a4757aa8f0a76ba62f0e39eaa4 Mon Sep 17 00:00:00 2001 From: jeremyteboul <80506730+jeremyteboul@users.noreply.github.com> Date: Sat, 6 Dec 2025 20:34:24 -0800 Subject: [PATCH 043/133] Support multiple image/audio embeddings per requests (#29988) Signed-off-by: Jeremy Teboul Co-authored-by: Jeremy Teboul --- docs/features/multimodal_inputs.md | 10 +- tests/entrypoints/test_chat_utils.py | 178 +++++++++++++++++++++++++++ vllm/entrypoints/chat_utils.py | 30 ++--- 3 files changed, 198 insertions(+), 20 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 0adb32a7ac33c..c3fd726e9938c 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -445,7 +445,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features. -#### Audio Embeddings +#### Audio Embedding Inputs You can pass pre-computed audio embeddings similar to image embeddings: @@ -892,5 +892,11 @@ For Online Serving, you can also skip sending media if you expect cache hits wit ``` !!! note - Only one message can contain `{"type": "image_embeds"}`. + Multiple messages can now contain `{"type": "image_embeds"}`, enabling you to pass multiple image embeddings in a single request (similar to regular images). The number of embeddings is limited by `--limit-mm-per-prompt`. + + **Important**: The embedding shape format differs based on the number of embeddings: + + - **Single embedding**: 3D tensor of shape `(1, feature_size, hidden_size)` + - **Multiple embeddings**: List of 2D tensors, each of shape `(feature_size, hidden_size)` + If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 75be34820bcd7..527322c71ae4b 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -6,6 +6,7 @@ from collections.abc import Mapping from typing import Literal import pytest +import torch from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy from vllm.assets.audio import AudioAsset @@ -915,6 +916,183 @@ async def test_parse_chat_messages_audio_embeds_async( _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) +def test_parse_chat_messages_multiple_image_embeds( + phi3v_model_config_image_embeds, +): + """Test that multiple image_embeds in a single message are now supported. + + This test validates the fix for the limitation that previously only allowed + one message with {'type': 'image_embeds'}. Now multiple image embeddings + can be provided in a single request, similar to regular images. + """ + # Create two sample image embedding tensors + image_embedding_1 = torch.randn(256, 1024) + image_embedding_2 = torch.randn(128, 1024) + + # Encode them as base64 using the convenience function + base64_image_embedding_1 = tensor2base64(image_embedding_1) + base64_image_embedding_2 = tensor2base64(image_embedding_2) + + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": base64_image_embedding_1, + }, + { + "type": "image_embeds", + "image_embeds": base64_image_embedding_2, + }, + {"type": "text", "text": "Describe these two images."}, + ], + } + ], + phi3v_model_config_image_embeds, + content_format="string", + ) + + # Verify conversation structure + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.", + } + ] + + # Verify mm_data contains a list of embeddings (not a single embedding) + assert mm_data is not None + assert "image" in mm_data + assert isinstance(mm_data["image"], list) + assert len(mm_data["image"]) == 2 + + # Verify each embedding has the correct shape + assert isinstance(mm_data["image"][0], torch.Tensor) + assert mm_data["image"][0].shape == image_embedding_1.shape + assert isinstance(mm_data["image"][1], torch.Tensor) + assert mm_data["image"][1].shape == image_embedding_2.shape + + # Verify UUIDs (None since we didn't provide any) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + +def test_parse_chat_messages_multiple_image_embeds_with_uuids( + phi3v_model_config_image_embeds, +): + """Test multiple image_embeds with UUIDs. + + This validates that UUIDs are properly tracked for multiple embeddings. + """ + uuid1 = "image-uuid-1" + uuid2 = "image-uuid-2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid1, + }, + { + "type": "image_embeds", + "image_embeds": None, + "uuid": uuid2, + }, + {"type": "text", "text": "Compare these images."}, + ], + } + ], + phi3v_model_config_image_embeds, + content_format="string", + ) + + # Verify conversation structure + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\n<|image_2|>\nCompare these images.", + } + ] + + # Verify mm_data contains a list with None values (UUID references) + assert mm_data is not None + assert "image" in mm_data + assert isinstance(mm_data["image"], list) + assert len(mm_data["image"]) == 2 + assert mm_data["image"][0] is None + assert mm_data["image"][1] is None + + # Verify UUIDs are correctly tracked + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[uuid1, uuid2]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_image_embeds_async( + phi3v_model_config_image_embeds, +): + """Test multiple image_embeds with async parsing. + + This validates the AsyncMultiModalItemTracker also supports multiple embeddings. + """ + # Create two sample image embedding tensors + image_embedding_1 = torch.randn(200, 768) + image_embedding_2 = torch.randn(150, 768) + + # Encode them as base64 using the convenience function + base64_image_embedding_1 = tensor2base64(image_embedding_1) + base64_image_embedding_2 = tensor2base64(image_embedding_2) + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [ + { + "role": "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": base64_image_embedding_1, + }, + { + "type": "image_embeds", + "image_embeds": base64_image_embedding_2, + }, + {"type": "text", "text": "What do these images show?"}, + ], + } + ], + phi3v_model_config_image_embeds, + content_format="string", + ) + + # Verify conversation structure + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\n<|image_2|>\nWhat do these images show?", + } + ] + + # Await the future and verify mm_data + mm_data = await mm_future + assert mm_data is not None + assert "image" in mm_data + assert isinstance(mm_data["image"], list) + assert len(mm_data["image"]) == 2 + + # Verify each embedding has the correct shape + assert isinstance(mm_data["image"][0], torch.Tensor) + assert mm_data["image"][0].shape == image_embedding_1.shape + assert isinstance(mm_data["image"][1], torch.Tensor) + assert mm_data["image"][1].shape == image_embedding_2.shape + + # Verify UUIDs + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + @pytest.mark.asyncio async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( phi3v_model_config_image_embeds, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 077fe681bc5b8..aceaa8bd45b81 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -694,16 +694,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): raise ValueError("Mixing raw image and embedding inputs is not allowed") if "image_embeds" in uuids_by_modality: - image_embeds_uuids = uuids_by_modality["image_embeds"] - if len(image_embeds_uuids) > 1: - raise ValueError("Only one message can have {'type': 'image_embeds'}") mm_uuids["image"] = uuids_by_modality["image_embeds"] if "image" in uuids_by_modality: mm_uuids["image"] = uuids_by_modality["image"] # UUIDs of images if "audio_embeds" in uuids_by_modality: - audio_embeds_uuids = uuids_by_modality["audio_embeds"] - if len(audio_embeds_uuids) > 1: - raise ValueError("Only one message can have {'type': 'audio_embeds'}") mm_uuids["audio"] = uuids_by_modality["audio_embeds"] if "audio" in uuids_by_modality: mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios @@ -729,16 +723,16 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]): if "image_embeds" in items_by_modality: image_embeds_lst = items_by_modality["image_embeds"] - if len(image_embeds_lst) > 1: - raise ValueError("Only one message can have {'type': 'image_embeds'}") - mm_inputs["image"] = image_embeds_lst[0] + mm_inputs["image"] = ( + image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0] + ) if "image" in items_by_modality: mm_inputs["image"] = items_by_modality["image"] # A list of images if "audio_embeds" in items_by_modality: audio_embeds_lst = items_by_modality["audio_embeds"] - if len(audio_embeds_lst) > 1: - raise ValueError("Only one message can have {'type': 'audio_embeds'}") - mm_inputs["audio"] = audio_embeds_lst[0] + mm_inputs["audio"] = ( + audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0] + ) if "audio" in items_by_modality: mm_inputs["audio"] = items_by_modality["audio"] # A list of audios if "video" in items_by_modality: @@ -771,16 +765,16 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): if "image_embeds" in items_by_modality: image_embeds_lst = items_by_modality["image_embeds"] - if len(image_embeds_lst) > 1: - raise ValueError("Only one message can have {'type': 'image_embeds'}") - mm_inputs["image"] = image_embeds_lst[0] + mm_inputs["image"] = ( + image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0] + ) if "image" in items_by_modality: mm_inputs["image"] = items_by_modality["image"] # A list of images if "audio_embeds" in items_by_modality: audio_embeds_lst = items_by_modality["audio_embeds"] - if len(audio_embeds_lst) > 1: - raise ValueError("Only one message can have {'type': 'audio_embeds'}") - mm_inputs["audio"] = audio_embeds_lst[0] + mm_inputs["audio"] = ( + audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0] + ) if "audio" in items_by_modality: mm_inputs["audio"] = items_by_modality["audio"] # A list of audios if "video" in items_by_modality: From 17eb25e3271f8f22a0d8920a8115158495827cba Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Sat, 6 Dec 2025 23:44:50 -0500 Subject: [PATCH 044/133] [Perf] Enable cuda graph for deepepHT, 5.3% throughput improvement, 4.4% TTFT improvement (#29558) Signed-off-by: yewentao256 --- tests/compile/test_config.py | 66 ++++++++++++++++++++- vllm/config/compilation.py | 111 +++++++++++++++++++++++------------ vllm/config/vllm.py | 5 +- vllm/platforms/cuda.py | 38 ------------ 4 files changed, 142 insertions(+), 78 deletions(-) diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 8dd6959a01d09..0e91cf525411e 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -10,7 +10,7 @@ from pydantic import ValidationError from vllm.compilation.counter import compilation_counter from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig +from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig from vllm.config.compilation import CompilationMode, PassConfig from vllm.engine.arg_utils import EngineArgs from vllm.logger import _print_warning_once @@ -235,6 +235,70 @@ def test_splitting_ops_dynamic(): assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE +def test_moe_splitting_ops_deepep_ht_piecewise(): + # Non-inductor, non-attn-fusion case: DeepEP HT with dp>1 + # should add MoE ops to splitting_ops on top of attention ops. + config = VllmConfig( + parallel_config=ParallelConfig( + all2all_backend="deepep_high_throughput", + data_parallel_size=8, + ), + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + ), + ) + splitting_ops = config.compilation_config.splitting_ops + assert splitting_ops is not None + assert "vllm::moe_forward" in splitting_ops + assert "vllm::moe_forward_shared" in splitting_ops + + +def test_moe_splitting_ops_deepep_ht_inductor_partition(): + # Inductor partition case: user-provided splitting_ops should be + # preserved and MoE ops should be appended for DeepEP HT with dp>1. + config = VllmConfig( + parallel_config=ParallelConfig( + all2all_backend="deepep_high_throughput", + data_parallel_size=8, + ), + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + use_inductor_graph_partition=True, + splitting_ops=[ + "vllm::unified_attention", + "vllm::moe_forward", + "vllm::moe_forward_shared", + ], + ), + ) + splitting_ops = config.compilation_config.splitting_ops + assert splitting_ops == [ + "vllm::unified_attention", + "vllm::moe_forward", + "vllm::moe_forward_shared", + ] + + +def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor(): + # Pure attn-fusion case without inductor partition: even with + # DeepEP HT and dp>1, we should not re-enable piecewise compilation + # or add MoE ops into splitting_ops. + config = VllmConfig( + parallel_config=ParallelConfig( + all2all_backend="deepep_high_throughput", + data_parallel_size=8, + ), + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + pass_config={"enable_attn_fusion": True, "enable_noop": True}, + custom_ops=["+quant_fp8"], + cudagraph_mode=CUDAGraphMode.PIECEWISE, + ), + ) + assert config.compilation_config.splitting_ops == [] + assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL + + def test_should_split(): import torch diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 5f9e2cfdd789a..b79200f0e4779 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -966,7 +966,9 @@ class CompilationConfig: # May get recomputed in the model runner if adjustment is needed for spec-decode self.compute_bs_to_padded_graph_size() - def set_splitting_ops_for_v1(self): + def set_splitting_ops_for_v1( + self, all2all_backend: str | None = None, data_parallel_size: int | None = None + ): # To compatible with OOT hardware plugin platform (for example vllm-ascend) # which currently only supports sequence parallelism in eager mode. if self.mode != CompilationMode.VLLM_COMPILE: @@ -981,50 +983,83 @@ class CompilationConfig: "mode is CompilationMode.VLLM_COMPILE" ) - if self.use_inductor_graph_partition: - self.set_splitting_ops_for_inductor_graph_partition() - return + added_default_splitting_ops = False - if self.pass_config.fuse_attn_quant: - # here use_inductor_graph_partition is False + if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition: self.set_splitting_ops_for_attn_fusion() - return - - if self.splitting_ops is None: - # NOTE: When using full cudagraph, instead of setting an empty - # list and capture the full cudagraph inside the flattened fx - # graph, we keep the piecewise fx graph structure but capture - # the full cudagraph outside the fx graph. This reduces some - # cpu overhead when the runtime batch_size is not cudagraph - # captured. see https://github.com/vllm-project/vllm/pull/20059 - # for details. Make a copy to avoid mutating the class-level - # list via reference. - self.splitting_ops = list(self._attention_ops) - elif len(self.splitting_ops) == 0: - logger.warning_once("Using piecewise compilation with empty splitting_ops") - if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: + else: + if self.splitting_ops is None: + # NOTE: When using full cudagraph, instead of setting an empty + # list and capture the full cudagraph inside the flattened fx + # graph, we keep the piecewise fx graph structure but capture + # the full cudagraph outside the fx graph. This reduces some + # cpu overhead when the runtime batch_size is not cudagraph + # captured. see https://github.com/vllm-project/vllm/pull/20059 + # for details. Make a copy to avoid mutating the class-level + # list via reference. + self.splitting_ops = list(self._attention_ops) + added_default_splitting_ops = True + elif len(self.splitting_ops) == 0: logger.warning_once( - "Piecewise compilation with empty splitting_ops do not" - "contains piecewise cudagraph. Setting cudagraph_" - "mode to NONE. Hint: If you are using attention backends " - "that support cudagraph, consider manually setting " - "cudagraph_mode to FULL or FULL_DECODE_ONLY to enable " - "full cudagraphs." + "Using piecewise compilation with empty splitting_ops" ) + if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: + logger.warning_once( + "Piecewise compilation with empty splitting_ops do not" + "contains piecewise cudagraph. Setting cudagraph_" + "mode to NONE. Hint: If you are using attention " + "backends that support cudagraph, consider manually " + "setting cudagraph_mode to FULL or FULL_DECODE_ONLY " + "to enable full cudagraphs." + ) + self.cudagraph_mode = CUDAGraphMode.NONE + elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE: + logger.warning_once( + "Piecewise compilation with empty splitting_ops do " + "not contains piecewise cudagraph. Setting " + "cudagraph_mode to FULL." + ) + self.cudagraph_mode = CUDAGraphMode.FULL + self.splitting_ops = [] + + # split MoE ops for cudagraph + moe_ops = [ + "vllm::moe_forward", + "vllm::moe_forward_shared", + ] + backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND + dp_size = data_parallel_size if data_parallel_size is not None else 1 + need_moe_splitting = ( + backend == "deepep_high_throughput" + and dp_size > 1 + # pure attn-fusion without inductor partition deliberately disables + # piecewise graphs and MoE splitting. + and not ( + self.pass_config.fuse_attn_quant + and not self.use_inductor_graph_partition + ) + ) + + if need_moe_splitting and self.cudagraph_mode != CUDAGraphMode.NONE: + # if we just initialized default splitting_ops for this config, + # automatically append the MoE ops + if added_default_splitting_ops: + for op in moe_ops: + if op not in self.splitting_ops: + self.splitting_ops.append(op) + + # make sure MoE ops are split out + if not any(op in self.splitting_ops for op in moe_ops): self.cudagraph_mode = CUDAGraphMode.NONE - elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE: logger.warning_once( - "Piecewise compilation with empty splitting_ops do not " - "contains piecewise cudagraph. Setting cudagraph_mode " - "to FULL." + "DeepEP high throughput backend with data_parallel_size > 1 " + "requires splitting MoE ops from cudagraphs. Please ensure " + "'vllm::moe_forward' or 'vllm::moe_forward_shared' are " + "present in CompilationConfig.splitting_ops." ) - self.cudagraph_mode = CUDAGraphMode.FULL - self.splitting_ops = [] - - def set_splitting_ops_for_inductor_graph_partition(self): - assert self.use_inductor_graph_partition - if self.splitting_ops is None: - self.splitting_ops = list(self._attention_ops) + elif self.cudagraph_mode.has_full_cudagraphs(): + # fall back to piecewise when MoE splitting is required. + self.cudagraph_mode = CUDAGraphMode.PIECEWISE def set_splitting_ops_for_attn_fusion(self): assert self.pass_config.fuse_attn_quant diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index b99be1e5dfbc8..36e4bd159dc72 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -813,7 +813,10 @@ class VllmConfig: ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now." # Do this after all the updates to compilation_config.mode - self.compilation_config.set_splitting_ops_for_v1() + self.compilation_config.set_splitting_ops_for_v1( + all2all_backend=self.parallel_config.all2all_backend, + data_parallel_size=self.parallel_config.data_parallel_size, + ) if self.compilation_config.pass_config.enable_sp: # With pipeline parallelism or dynamo partitioning, diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 7e6ce6aeef536..37c95f48669f8 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -232,44 +232,6 @@ class CudaPlatformBase(Platform): logger.info( "Forcing kv cache block size to 64 for FlashMLASparse backend." ) - # lazy import to avoid circular import - from vllm.config import CUDAGraphMode - - compilation_config = vllm_config.compilation_config - if compilation_config.cudagraph_mode.has_full_cudagraphs(): - # decode context parallel does not support full cudagraphs - if parallel_config.decode_context_parallel_size > 1: - logger.warning_once( - "Decode context parallel (DCP) is enabled, which is " - "incompatible with full CUDA graphs. " - "Overriding cudagraph_mode to PIECEWISE." - ) - compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - # prefill context parallel do not support full cudagraphs - elif parallel_config.prefill_context_parallel_size > 1: - logger.warning_once( - "Prefill context parallel (PCP) is enabled, which is " - "incompatible with full CUDA graphs. " - "Overriding cudagraph_mode to PIECEWISE." - ) - compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - if ( - parallel_config.all2all_backend == "deepep_high_throughput" - and parallel_config.data_parallel_size > 1 - and compilation_config.cudagraph_mode != CUDAGraphMode.NONE - ): - # TODO: Piecewise Cuda graph might be enabled - # if torch compile cache key issue fixed - # See https://github.com/vllm-project/vllm/pull/25093 - logger.info( - "WideEP: Disabling CUDA Graphs since DeepEP high-throughput " - "kernels are optimized for prefill and are incompatible with " - "CUDA Graphs. " - "In order to use CUDA Graphs for decode-optimized workloads, " - "use --all2all-backend with another option, such as " - "deepep_low_latency, pplx, or allgather_reducescatter." - ) - compilation_config.cudagraph_mode = CUDAGraphMode.NONE @classmethod def get_current_memory_usage( From a49d813fa88338a4409928a6e6d2ab3d0019f83b Mon Sep 17 00:00:00 2001 From: Luke Date: Sat, 6 Dec 2025 23:13:14 -0800 Subject: [PATCH 045/133] Lazy loading to avoid importing all files (#29716) Signed-off-by: Luke --- vllm/transformers_utils/configs/__init__.py | 90 ++++++++++++--------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 0e8d167886935..e536ca8521325 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -10,46 +10,47 @@ Model configs may be defined in this directory for the following reasons: deepseek-ai/DeepSeek-V3.2-Exp. """ -from transformers import DeepseekV3Config +from __future__ import annotations -from vllm.transformers_utils.configs.afmoe import AfmoeConfig -from vllm.transformers_utils.configs.chatglm import ChatGLMConfig -from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config -from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig -from vllm.transformers_utils.configs.eagle import EAGLEConfig +import importlib -# RWConfig is for the original tiiuae/falcon-40b(-instruct) and -# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the -# `FalconConfig` class from the official HuggingFace transformers library. -from vllm.transformers_utils.configs.falcon import RWConfig -from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig -from vllm.transformers_utils.configs.hunyuan_vl import ( - HunYuanVLConfig, - HunYuanVLTextConfig, - HunYuanVLVisionConfig, -) -from vllm.transformers_utils.configs.jais import JAISConfig -from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig -from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig -from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig -from vllm.transformers_utils.configs.medusa import MedusaConfig -from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig -from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig -from vllm.transformers_utils.configs.moonvit import MoonViTConfig -from vllm.transformers_utils.configs.nemotron import NemotronConfig -from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig -from vllm.transformers_utils.configs.olmo3 import Olmo3Config -from vllm.transformers_utils.configs.ovis import OvisConfig -from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig -from vllm.transformers_utils.configs.radio import RadioConfig -from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig -from vllm.transformers_utils.configs.step3_vl import ( - Step3TextConfig, - Step3VisionEncoderConfig, - Step3VLConfig, -) -from vllm.transformers_utils.configs.tarsier2 import Tarsier2Config -from vllm.transformers_utils.configs.ultravox import UltravoxConfig +_CLASS_TO_MODULE: dict[str, str] = { + "AfmoeConfig": "vllm.transformers_utils.configs.afmoe", + "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm", + "DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2", + "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr", + "EAGLEConfig": "vllm.transformers_utils.configs.eagle", + "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo", + "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl", + "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl", + "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl", + # RWConfig is for the original tiiuae/falcon-40b(-instruct) and + # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the + # `FalconConfig` class from the official HuggingFace transformers library. + "RWConfig": "vllm.transformers_utils.configs.falcon", + "JAISConfig": "vllm.transformers_utils.configs.jais", + "Lfm2MoeConfig": "vllm.transformers_utils.configs.lfm2_moe", + "MedusaConfig": "vllm.transformers_utils.configs.medusa", + "MiDashengLMConfig": "vllm.transformers_utils.configs.midashenglm", + "MLPSpeculatorConfig": "vllm.transformers_utils.configs.mlp_speculator", + "MoonViTConfig": "vllm.transformers_utils.configs.moonvit", + "KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear", + "KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl", + "NemotronConfig": "vllm.transformers_utils.configs.nemotron", + "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h", + "Olmo3Config": "vllm.transformers_utils.configs.olmo3", + "OvisConfig": "vllm.transformers_utils.configs.ovis", + "RadioConfig": "vllm.transformers_utils.configs.radio", + "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base", + "UltravoxConfig": "vllm.transformers_utils.configs.ultravox", + "Step3VLConfig": "vllm.transformers_utils.configs.step3_vl", + "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl", + "Step3TextConfig": "vllm.transformers_utils.configs.step3_vl", + "Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next", + "Tarsier2Config": "vllm.transformers_utils.configs.tarsier2", + # Special case: DeepseekV3Config is from HuggingFace Transformers + "DeepseekV3Config": "transformers", +} __all__ = [ "AfmoeConfig", @@ -84,3 +85,16 @@ __all__ = [ "Qwen3NextConfig", "Tarsier2Config", ] + + +def __getattr__(name: str): + if name in _CLASS_TO_MODULE: + module_name = _CLASS_TO_MODULE[name] + module = importlib.import_module(module_name) + return getattr(module, name) + + raise AttributeError(f"module 'configs' has no attribute '{name}'") + + +def __dir__(): + return sorted(list(__all__)) From 27f4c2fd46b99778d7ea19dfe7751fbaab615177 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 7 Dec 2025 15:15:42 +0800 Subject: [PATCH 046/133] [Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145) Signed-off-by: DarkLight1337 --- docs/contributing/model/transcription.md | 12 +- .../distributed/test_sequence_parallelism.py | 2 + tests/compile/test_functionalization.py | 6 +- tests/compile/test_fusion.py | 6 +- tests/compile/test_fusion_attn.py | 2 + tests/compile/test_pass_manager.py | 8 +- tests/compile/test_qk_norm_rope_fusion.py | 5 +- tests/distributed/test_kvlayout.py | 3 + .../entrypoints/openai/test_chat_template.py | 22 +- .../entrypoints/openai/test_lora_resolvers.py | 21 +- tests/entrypoints/openai/test_serving_chat.py | 28 ++- .../entrypoints/openai/test_serving_engine.py | 8 +- .../entrypoints/openai/test_serving_models.py | 8 +- tests/entrypoints/test_chat_utils.py | 190 +++++++----------- tests/lora/test_lora_manager.py | 14 +- tests/lora/test_worker.py | 2 + .../test_model_load_with_params.py | 22 +- tests/models/language/pooling/test_gritlm.py | 5 +- .../multimodal/processing/test_common.py | 22 +- .../multimodal/processing/test_glm4_1v.py | 4 +- .../multimodal/processing/test_h2ovl.py | 2 +- .../multimodal/processing/test_idefics3.py | 2 +- .../multimodal/processing/test_internvl.py | 2 +- .../multimodal/processing/test_llama4.py | 2 +- .../multimodal/processing/test_llava_next.py | 6 +- .../processing/test_llava_onevision.py | 6 +- .../processing/test_minimax_vl_01.py | 4 +- .../multimodal/processing/test_mllama4.py | 2 +- .../multimodal/processing/test_nemotron_vl.py | 2 +- .../multimodal/processing/test_phi3v.py | 2 +- .../multimodal/processing/test_phi4mm.py | 2 +- .../multimodal/processing/test_qwen2_vl.py | 2 +- .../multimodal/processing/test_smolvlm.py | 2 +- .../processing/test_tensor_schema.py | 24 +-- .../processing/test_transformers.py | 5 +- tests/models/multimodal/test_mapping.py | 33 +-- tests/models/registry.py | 33 ++- tests/models/utils.py | 17 +- tests/multimodal/test_cache.py | 27 ++- tests/multimodal/test_processing.py | 24 ++- tests/multimodal/test_registry.py | 4 +- tests/test_config.py | 131 +++++++----- tests/test_inputs.py | 7 +- tests/v1/attention/utils.py | 2 + tests/v1/core/test_kv_cache_utils.py | 20 +- tests/v1/core/test_scheduler.py | 2 + tests/v1/core/utils.py | 2 + tests/v1/engine/test_engine_core.py | 2 + .../engine/test_process_multi_modal_uuids.py | 24 ++- tests/v1/kv_connector/unit/utils.py | 2 + tests/v1/spec_decode/test_eagle.py | 2 + tests/v1/spec_decode/test_mtp.py | 2 + tests/v1/spec_decode/test_ngram.py | 2 + .../test_backend_guidance.py | 12 +- .../test_reasoning_structured_output.py | 35 ++-- tests/v1/tpu/worker/test_tpu_model_runner.py | 2 + tests/v1/worker/test_gpu_model_runner.py | 3 + vllm/config/__init__.py | 3 + vllm/config/model.py | 141 +++---------- vllm/config/multimodal.py | 4 - vllm/config/renderer.py | 109 ++++++++++ vllm/config/speculative.py | 5 - vllm/config/vllm.py | 25 ++- vllm/engine/arg_utils.py | 99 +++++---- vllm/engine/protocol.py | 3 +- vllm/entrypoints/chat_utils.py | 79 ++++---- vllm/entrypoints/llm.py | 14 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 11 +- vllm/entrypoints/openai/serving_models.py | 1 + vllm/entrypoints/openai/speech_to_text.py | 10 +- vllm/entrypoints/pooling/pooling/serving.py | 2 +- vllm/entrypoints/pooling/score/serving.py | 4 +- vllm/entrypoints/score_utils.py | 13 +- vllm/entrypoints/utils.py | 8 +- vllm/inputs/preprocess.py | 9 +- vllm/model_executor/models/adapters.py | 20 +- vllm/model_executor/models/deepseek_ocr.py | 4 +- vllm/model_executor/models/deepseek_vl2.py | 4 +- vllm/model_executor/models/gemma3n_mm.py | 8 +- vllm/model_executor/models/granite_speech.py | 14 +- vllm/model_executor/models/gritlm.py | 14 +- vllm/model_executor/models/interfaces.py | 10 +- vllm/model_executor/models/interns1.py | 2 +- .../model_executor/models/nano_nemotron_vl.py | 13 +- vllm/model_executor/models/nemotron_vl.py | 2 +- vllm/model_executor/models/pixtral.py | 2 +- vllm/model_executor/models/voxtral.py | 22 +- vllm/model_executor/models/whisper.py | 14 +- vllm/multimodal/cache.py | 22 +- vllm/multimodal/processing.py | 28 ++- vllm/multimodal/registry.py | 64 +++--- vllm/tokenizers/registry.py | 24 +-- vllm/transformers_utils/processor.py | 28 ++- vllm/v1/core/encoder_cache_manager.py | 8 +- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/engine/async_llm.py | 7 +- vllm/v1/engine/input_processor.py | 7 +- vllm/v1/engine/llm_engine.py | 7 +- vllm/v1/spec_decode/eagle.py | 2 +- vllm/v1/structured_output/__init__.py | 18 +- vllm/v1/worker/gpu_model_runner.py | 7 +- vllm/v1/worker/tpu_model_runner.py | 7 +- vllm/v1/worker/utils.py | 19 +- 105 files changed, 969 insertions(+), 797 deletions(-) create mode 100644 vllm/config/renderer.py diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index fca941acd5076..c5605789022d4 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -22,7 +22,7 @@ Declare supported languages and capabilities: import torch from torch import nn - from vllm.config import ModelConfig, SpeechToTextConfig + from vllm.config import RendererConfig, SpeechToTextConfig from vllm.inputs.data import PromptType from vllm.model_executor.models.interfaces import SupportsTranscription @@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model: @classmethod def get_speech_to_text_config( cls, - model_config: ModelConfig, + renderer_config: RendererConfig, task_type: Literal["transcribe", "translate"], ) -> SpeechToTextConfig: return SpeechToTextConfig( @@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: # Return None if unknown; otherwise return an estimate. return int(audio_duration_s * stt_config.sample_rate // 320) # example @@ -216,7 +216,7 @@ Relevant server logic: prompt = self.model_cls.get_generation_prompt( audio=chunk, stt_config=self.asr_config, - model_config=self.model_config, + renderer_config=self.renderer_config, language=language, task_type=self.task_type, request_prompt=request.prompt, diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py index d9fdc3acc3d6f..77d3a24d42923 100644 --- a/tests/compile/distributed/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -17,6 +17,7 @@ from vllm.config import ( DeviceConfig, ModelConfig, PassConfig, + RendererConfig, VllmConfig, get_current_vllm_config, set_current_vllm_config, @@ -276,6 +277,7 @@ def sequence_parallelism_pass_on_test_model( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), device_config=device_config, compilation_config=compilation_config, ) diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 7585915892700..52d6fd1e5d75e 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -15,6 +15,7 @@ from vllm.config import ( CompilationConfig, ModelConfig, PassConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -219,8 +220,11 @@ def test_fix_functionalization( torch.set_default_device("cuda") torch.set_default_dtype(dtype) + model_config = ModelConfig(dtype=dtype) + vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( custom_ops=["all"], pass_config=PassConfig( diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index d0ba8385f4a01..bb4ee6b8e3eca 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -15,6 +15,7 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, + RendererConfig, VllmConfig, ) from vllm.model_executor.layers.layernorm import RMSNorm @@ -154,8 +155,11 @@ def test_fusion_rmsnorm_quant( custom_ops.append("+rms_norm") if enable_quant_fp8_custom_op: custom_ops.append("+quant_fp8") + + model_config = ModelConfig(dtype=dtype) vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index db95dff5e0fc7..f87825db29817 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -24,6 +24,7 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -325,6 +326,7 @@ def test_attention_quant_pattern( ) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=SchedulerConfig( max_num_seqs=1024, max_model_len=model_config.max_model_len, diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 6d0ba6b655031..c95e9e3ff8ae8 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -7,7 +7,7 @@ import torch from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.compilation.pass_manager import PostGradPassManager -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig # dummy custom pass that doesn't inherit @@ -43,7 +43,11 @@ class ProperPass(InductorPass): ) def test_pass_manager_uuid(callable): # Some passes need dtype to be set - config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) + model_config = ModelConfig(dtype=torch.bfloat16) + config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) pass_manager = PostGradPassManager() pass_manager.configure(config) diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index e0968ac799256..4d109015be48a 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -19,6 +19,7 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -133,8 +134,10 @@ def test_qk_norm_rope_fusion( if enable_rope_custom_op: custom_ops.append("+rotary_embedding") + model_config = ModelConfig(dtype=dtype) vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py index b190b2820451b..0d51a51a50804 100644 --- a/tests/distributed/test_kvlayout.py +++ b/tests/distributed/test_kvlayout.py @@ -5,6 +5,7 @@ from vllm.config import ( DeviceConfig, KVTransferConfig, ModelConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -47,6 +48,7 @@ def test_get_kv_connector_cache_layout_with_nixl_connector(): vllm_config = VllmConfig( device_config=DeviceConfig("cpu"), model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), kv_transfer_config=kv_transfer_config, ) with set_current_vllm_config(vllm_config): @@ -70,6 +72,7 @@ def test_get_kv_connector_cache_layout_with_multi_connector(): vllm_config = VllmConfig( device_config=DeviceConfig("cpu"), model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), kv_transfer_config=kv_transfer_config, ) with set_current_vllm_config(vllm_config): diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index 77087ac21ea8b..b050cfdb561cf 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -3,7 +3,6 @@ import pytest -from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.tokenizers import get_tokenizer @@ -107,24 +106,11 @@ def test_get_gen_prompt( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - trust_remote_code=model_info.trust_remote_code, - revision=model_info.revision, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) - # Initialize the tokenizer tokenizer = get_tokenizer( - tokenizer_name=model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) template_content = load_chat_template(chat_template=template) @@ -143,7 +129,7 @@ def test_get_gen_prompt( tokenizer=tokenizer, conversation=mock_request.messages, chat_template=mock_request.chat_template or template_content, - model_config=model_config, + renderer_config=renderer_config, tools=None, add_generation_prompt=mock_request.add_generation_prompt, continue_final_message=mock_request.continue_final_message, diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index ea6b3d812d8fe..7310c2610ce3b 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -33,26 +33,34 @@ class MockModelConfig: """Minimal mock ModelConfig for testing.""" model: str = MODEL_NAME - tokenizer: str = MODEL_NAME trust_remote_code: bool = False - tokenizer_mode: str = "auto" max_model_len: int = 100 - tokenizer_revision: str | None = None multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) logits_processors: list[str] | None = None logits_processor_pattern: str | None = None diff_sampling_param: dict | None = None - allowed_local_media_path: str = "" - allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" - skip_tokenizer_init: bool = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} +@dataclass +class MockRendererConfig: + """Minimal mock RendererConfig for testing.""" + + model_config: MockModelConfig + + tokenizer: str = MODEL_NAME + tokenizer_mode: str = "auto" + tokenizer_revision: str | None = None + skip_tokenizer_init: bool = False + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + + class MockLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str @@ -114,6 +122,7 @@ def mock_serving_setup(): mock_engine.add_lora.reset_mock() mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 9ea65f9fa6e7a..9df8f886edd96 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -346,27 +346,33 @@ class MockHFConfig: class MockModelConfig: task = "generate" runner_type = "generate" - tokenizer = MODEL_NAME trust_remote_code = False - tokenizer_mode = "auto" max_model_len = 100 - tokenizer_revision = None multimodal_config = MultiModalConfig() hf_config = MockHFConfig() logits_processors: list[str] | None = None logits_processor_pattern = None diff_sampling_param: dict | None = None - allowed_local_media_path: str = "" - allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - skip_tokenizer_init = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} +@dataclass +class MockRendererConfig: + model_config: MockModelConfig = field(default_factory=MockModelConfig) + + tokenizer = MODEL_NAME + tokenizer_mode = "auto" + tokenizer_revision = None + skip_tokenizer_init = False + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + + def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: models = OpenAIServingModels( engine_client=engine, @@ -399,6 +405,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: @dataclass class MockEngine: model_config: MockModelConfig = field(default_factory=MockModelConfig) + renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig) input_processor: MagicMock = field(default_factory=MagicMock) io_processor: MagicMock = field(default_factory=MagicMock) @@ -429,6 +436,7 @@ async def test_serving_chat_returns_correct_model_name(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -459,6 +467,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -492,6 +501,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -537,6 +547,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -583,6 +594,7 @@ async def test_serving_chat_could_load_correct_generation_config(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -629,6 +641,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -662,6 +675,7 @@ async def test_serving_chat_data_parallel_rank_extraction(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py index 956a06dc5487c..6ab0942b58da8 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_engine.py @@ -7,7 +7,7 @@ from unittest.mock import Mock import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.tokenizers import MistralTokenizer @@ -19,10 +19,16 @@ def serving() -> OpenAIServing: # Create minimal mocks engine_client = Mock() + model_config = Mock(spec=ModelConfig) model_config.max_model_len = 32768 + + renderer_config = Mock(spec=RendererConfig) + renderer_config.model_config = model_config + models = Mock(spec=OpenAIServingModels) models.model_config = model_config + models.renderer_config = renderer_config models.input_processor = Mock() models.io_processor = Mock() diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index b585835a0667a..376df6cfecb9f 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -6,7 +6,7 @@ from unittest.mock import MagicMock import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import ( ErrorResponse, @@ -27,9 +27,15 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( async def _async_serving_models_init() -> OpenAIServingModels: mock_engine_client = MagicMock(spec=EngineClient) # Set the max_model_len attribute to avoid missing attribute + mock_model_config = MagicMock(spec=ModelConfig) mock_model_config.max_model_len = 2048 + + mock_renderer_config = MagicMock(spec=RendererConfig) + mock_renderer_config.model_config = mock_model_config + mock_engine_client.model_config = mock_model_config + mock_engine_client.renderer_config = mock_renderer_config mock_engine_client.input_processor = MagicMock() mock_engine_client.io_processor = MagicMock() diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 527322c71ae4b..7b296eae7c5a2 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.chat_utils import ( _try_extract_ast, apply_mistral_chat_template, @@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system( "content": [{"type": "text", "text": "Who are you?"}], }, ], - mistral_model_config, + RendererConfig(model_config=mistral_model_config), content_format="string", ) assert conversation == [ @@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system( "content": [{"type": "text", "text": "Who are you?"}], }, ], - mistral_model_config, + RendererConfig(model_config=mistral_model_config), content_format="openai", ) assert conversation == [ @@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( ], } ], - phi3v_model_config_image_embeds, + RendererConfig(model_config=phi3v_model_config_image_embeds), content_format="string", ) @@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( ], } ], - phi3v_model_config_image_embeds, + RendererConfig(model_config=phi3v_model_config_image_embeds), content_format="string", ) @@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) assert conversation == [ @@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages( ], }, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages( ], }, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format( {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "What about this one?"}, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="openai", ) @@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ], }, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( ], }, ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl ], }, ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1945,24 +1945,11 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) - # Build the tokenizer tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) tools = ( @@ -1985,7 +1972,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): tokenizer, chat_template=None, tools=tools, - model_config=model_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) @@ -2047,24 +2034,11 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa "enable_thinking": True, } - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) - # Build the tokenizer tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2072,7 +2046,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa tokenizer, chat_template=None, tools=tools, - model_config=model_config, + model_config=renderer_config.model_config, ) with pytest.raises( ValueError, match="Found unexpected chat template kwargs from request" @@ -2143,23 +2117,11 @@ def test_resolve_content_format_hf_defined(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2167,7 +2129,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): tokenizer, chat_template=None, tools=None, - model_config=model_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) @@ -2181,7 +2143,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): None, "auto", tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2203,23 +2165,11 @@ def test_resolve_content_format_fallbacks(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2227,7 +2177,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): tokenizer, chat_template=None, tools=None, - model_config=model_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) @@ -2241,7 +2191,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): None, "auto", tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2272,15 +2222,13 @@ def test_resolve_content_format_fallbacks(model, expected_format): ], ) def test_resolve_content_format_examples(template_path, expected_format): - model_config = ModelConfig( - PHI3V_MODEL_ID, # Dummy - tokenizer=PHI3V_MODEL_ID, # Dummy - trust_remote_code=True, - ) + model = PHI3V_MODEL_ID # Dummy + model_config = ModelConfig(model, trust_remote_code=True) + renderer_config = RendererConfig(model_config=model_config, tokenizer=model) dummy_tokenizer = get_tokenizer( - PHI3V_MODEL_ID, # Dummy - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) dummy_tokenizer.chat_template = None @@ -2297,7 +2245,7 @@ def test_resolve_content_format_examples(template_path, expected_format): None, "auto", dummy_tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2332,7 +2280,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config): conversation_with_thinking, _, _ = parse_chat_messages( messages, - mistral_model_config, + RendererConfig(model_config=mistral_model_config), content_format="openai", ) @@ -2432,7 +2380,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( ], } ], - qwen2_audio_model_config, + RendererConfig(model_config=qwen2_audio_model_config), content_format="string", ) @@ -2466,7 +2414,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ], } ], - qwen2_audio_model_config, + RendererConfig(model_config=qwen2_audio_model_config), content_format="string", ) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 081f14d6fabfb..7158120fc0217 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -8,7 +8,7 @@ import torch from safetensors.torch import load_file from torch import nn -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.lora.layers import ( ColumnParallelLinearWithLoRA, @@ -422,7 +422,11 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa ) model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + lora_config=lora_config, + ) vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 @@ -525,7 +529,11 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path ) model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + lora_config=lora_config, + ) vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 54059ec561907..42d8c6202e79e 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -11,6 +11,7 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -43,6 +44,7 @@ def test_worker_apply_lora(qwen3_lora_files): vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), load_config=LoadConfig( download_dir=None, load_format="dummy", diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 489ac1e6475b9..e368671078fdd 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -42,8 +42,10 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - model_config = vllm_model.llm.llm_engine.model_config - model_tokenizer = vllm_model.llm.llm_engine.tokenizer + llm_engine = vllm_model.llm.llm_engine + model_config = llm_engine.model_config + renderer_config = llm_engine.renderer_config + tokenizer = llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -54,8 +56,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_config.tokenizer == "BAAI/bge-base-en-v1.5" - assert model_tokenizer.model_max_length == 512 + assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5" + assert tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, BertEmbeddingModel) @@ -86,8 +88,10 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - model_config = vllm_model.llm.llm_engine.model_config - model_tokenizer = vllm_model.llm.llm_engine.tokenizer + llm_engine = vllm_model.llm.llm_engine + model_config = llm_engine.model_config + renderer_config = llm_engine.renderer_config + tokenizer = llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -98,8 +102,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_config.tokenizer == "intfloat/multilingual-e5-base" - assert model_tokenizer.model_max_length == 512 + assert renderer_config.tokenizer == "intfloat/multilingual-e5-base" + assert tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, RobertaEmbeddingModel) @@ -128,7 +132,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name + assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name def check_model(model): assert isinstance(model, RobertaEmbeddingModel) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 0adc9b5cf25f6..11ee003585487 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -6,7 +6,7 @@ import pytest from scipy.spatial.distance import cosine from vllm import LLM, SamplingParams -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from ....utils import RemoteOpenAIServer @@ -31,7 +31,8 @@ def test_find_array(): dtype="bfloat16", seed=0, ) - pooling = GritLMMeanPool(model_config=model_config) + renderer_config = RendererConfig(model_config=model_config) + pooling = GritLMMeanPool(renderer_config=renderer_config) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 2e032ac4ca526..9b2b29b758765 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -25,7 +25,6 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC from vllm.tokenizers import ( MistralTokenizer, TokenizerLike, - cached_tokenizer_from_config, ) from ....multimodal.utils import random_audio, random_image, random_video @@ -212,31 +211,20 @@ def _test_processing_correctness( else: model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch) model_id = model_id_or_arch + model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - model_config = ModelConfig( - model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, + renderer_config = model_info.build_renderer_config( + model=model_id, # Ensure that the cache can fit all of the data mm_processor_cache_gb=2048, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, ) + model_config = renderer_config.model_config model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + ctx = InputProcessingContext.from_config(renderer_config) cache = MultiModalProcessorOnlyCache(model_config) processing_info = factories.info(ctx) diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index 51071c93531de..fdc6352e2ec83 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -40,7 +40,7 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"video": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {"fps": fps} @@ -79,7 +79,7 @@ def test_video_loader_consistency( mm_processor_kwargs=None, limit_mm_per_prompt={"video": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {"fps": fps} # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 1701d9dd8f011..1263d663e6af6 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -162,7 +162,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index 351b9d018eec2..bf12e79a718b7 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -38,7 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index b4994295d3a80..51f0d2e891b3f 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -116,7 +116,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index b73246b68b36a..04bc8d3f53818 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -30,7 +30,7 @@ def test_processor_override( limit_mm_per_prompt={"image": num_imgs}, mm_processor_cache_gb=mm_processor_cache_gb, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) config = processor.info.get_hf_config() tokenizer = processor.info.get_tokenizer() hf_processor = processor.info.get_hf_processor() diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index ffe7ca17b5d61..cd01002a32af2 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) info = processor.info seen_aspect_ratios = set[float]() @@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), @@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index f5c552fe6476a..be505d95a500f 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) info = processor.info seen_aspect_ratios = set[float]() @@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), @@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py index 11e0001235110..17ac54fdd0a49 100644 --- a/tests/models/multimodal/processing/test_minimax_vl_01.py +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -24,7 +24,7 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) prompt = "" * num_imgs image = Image.new("RGB", size=(364, 364)) mm_data = {"image": [image] * num_imgs} @@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index e5ff2d1391b62..9a65e2ddc85c6 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int): limit_mm_per_prompt=mm_counts, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) profiler = MultiModalProfiler(processor) decoder_dummy_data = profiler.get_decoder_dummy_data( diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 5311ab1b78c69..f3609743b7c85 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -118,7 +118,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index 8faff2611e6fe..f51bd97861783 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -39,7 +39,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py index 5391555c26675..271357b0d1507 100644 --- a/tests/models/multimodal/processing/test_phi4mm.py +++ b/tests/models/multimodal/processing/test_phi4mm.py @@ -39,7 +39,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 9f4cdb6789b2c..d65a270a7da3b 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -34,7 +34,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py index 6f77d5516d147..e0e6264de4e3a 100644 --- a/tests/models/multimodal/processing/test_smolvlm.py +++ b/tests/models/multimodal/processing/test_smolvlm.py @@ -38,7 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 5d489549c5b46..24959fa48ad6d 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -11,7 +11,7 @@ import pytest import torch.nn as nn from PIL import Image -from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config +from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config from vllm.config.multimodal import ( AudioDummyOptions, BaseDummyOptions, @@ -31,7 +31,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.platforms import current_platform -from vllm.tokenizers import cached_tokenizer_from_config from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_dtype @@ -150,7 +149,10 @@ def initialize_dummy_model( backend="nccl", ) initialize_model_parallel(tensor_model_parallel_size=1) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) with set_current_vllm_config(vllm_config=vllm_config): with set_default_torch_dtype(model_config.dtype): model = model_cls(vllm_config=vllm_config) @@ -182,19 +184,12 @@ def test_model_tensor_schema(model_id: str): else: dtype = model_info.dtype - model_config = ModelConfig( + renderer_config = model_info.build_renderer_config( model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, hf_overrides=hf_overrides_fn, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, dtype=dtype, ) + model_config = renderer_config.model_config model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) assert supports_multimodal(model_cls) @@ -212,10 +207,7 @@ def test_model_tensor_schema(model_id: str): if not any(inputs_parse_methods): pytest.skip(f"{model_arch} does not support tensor schema validation.") - ctx = InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + ctx = InputProcessingContext.from_config(renderer_config) processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() limit_mm_per_prompt = { diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py index e2a2186f470b4..c9a90eb882da4 100644 --- a/tests/models/multimodal/processing/test_transformers.py +++ b/tests/models/multimodal/processing/test_transformers.py @@ -3,7 +3,7 @@ import pytest from vllm.assets.image import ImageAsset -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -13,8 +13,9 @@ def test_multimodal_processor(model_id): model=model_id, model_impl="transformers", ) + renderer_config = RendererConfig(model_config=model_config) - mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) + mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) image_pil = ImageAsset("cherry_blossom").pil_image mm_data = {"image": image_pil} diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py index 0d2eaca95504e..73de6b5f7d2f9 100644 --- a/tests/models/multimodal/test_mapping.py +++ b/tests/models/multimodal/test_mapping.py @@ -7,7 +7,6 @@ import torch import transformers from transformers import AutoConfig, PreTrainedModel -from vllm.config import ModelConfig from vllm.model_executor.models.utils import WeightsMapper from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.transformers_utils.config import try_get_safetensors_metadata @@ -50,37 +49,11 @@ def test_hf_model_weights_mapper(model_arch: str): model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - is_mistral_model = model_arch in [ - "Mistral3ForConditionalGeneration", - "PixtralForConditionalGeneration", - "VoxtralForConditionalGeneration", - ] - - if not is_mistral_model or model_info.tokenizer_mode == "mistral": - tokenizer_mode = model_info.tokenizer_mode - else: - tokenizer_mode = "hf" - - model_id = model_info.default - - model_config = ModelConfig( - model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=tokenizer_mode, - config_format="hf", - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + model_config = model_info.build_model_config(config_format="hf") model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - original_weights = create_repo_dummy_weights(model_id) - hf_dummy_model = create_dummy_model(model_id, model_arch) + original_weights = create_repo_dummy_weights(model_config.model) + hf_dummy_model = create_dummy_model(model_config.model, model_arch) hf_converted_weights = hf_dummy_model.named_parameters() hf_converted_buffers = hf_dummy_model.named_buffers() mapper: WeightsMapper = model_cls.hf_to_vllm_mapper diff --git a/tests/models/registry.py b/tests/models/registry.py index 020cb749341a6..e2cb5bcbc6c91 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -9,7 +9,8 @@ import pytest from packaging.version import Version from transformers import __version__ as TRANSFORMERS_VERSION -from vllm.config.model import ModelDType, TokenizerMode +from vllm.config.model import ModelConfig, ModelDType +from vllm.config.renderer import RendererConfig, TokenizerMode @dataclass(frozen=True) @@ -170,6 +171,36 @@ class _HfExamplesInfo: else: pytest.skip(msg) + def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig: + if model is None: + model = self.default + + return ModelConfig( + **{ + "model": model, + "revision": self.revision, + "trust_remote_code": self.trust_remote_code, + "hf_overrides": self.hf_overrides, + "enable_prompt_embeds": self.require_embed_inputs, + "enable_mm_embeds": self.require_embed_inputs, + "enforce_eager": self.enforce_eager, + "dtype": self.dtype, + **kwargs, + } + ) + + def build_renderer_config( + self, model: str | None = None, **kwargs + ) -> RendererConfig: + model_config = self.build_model_config(model, **kwargs) + + return RendererConfig( + model_config=model_config, + tokenizer=self.tokenizer or model_config.model, + tokenizer_mode=self.tokenizer_mode, + skip_tokenizer_init=self.require_embed_inputs, + ) + _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] diff --git a/tests/models/utils.py b/tests/models/utils.py index d84b4b820533e..87292cc4538d9 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -13,7 +13,6 @@ from transformers import PretrainedConfig from vllm.config.model import ModelConfig, ModelDType, RunnerOption from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.multimodal.processing import InputProcessingContext -from vllm.tokenizers import cached_tokenizer_from_config from .. import ci_envs from .registry import HF_EXAMPLE_MODELS @@ -296,30 +295,18 @@ def build_model_context( model_config_kwargs = model_config_kwargs or {} limit_mm_per_prompt = limit_mm_per_prompt or {} - model_config = ModelConfig( + renderer_config = model_info.build_renderer_config( model_id, runner=runner, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, dtype=dtype, seed=0, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, mm_processor_cache_gb=mm_processor_cache_gb, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, **model_config_kwargs, ) - return InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + return InputProcessingContext.from_config(renderer_config) def check_embeddings_close( diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index e641b1111abaf..ce16d90130aa4 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -6,7 +6,7 @@ import numpy as np import pytest import torch -from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import ( BaseMultiModalProcessorCache, @@ -110,11 +110,14 @@ def _create_vllm_config( mm_processor_cache_gb: float, enable_ipc: bool, ): + model_config = ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_gb=mm_processor_cache_gb, + ) + return VllmConfig( - model_config=ModelConfig( - model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - mm_processor_cache_gb=mm_processor_cache_gb, - ), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2), ) @@ -506,13 +509,15 @@ def _run_test_cache_eviction_shm( def test_cache_eviction_shm_cache(): + model_config = ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_type="shm", + mm_shm_cache_max_object_size_mb=6, + mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes, + ) vllm_config = VllmConfig( - model_config=ModelConfig( - model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - mm_processor_cache_type="shm", - mm_shm_cache_max_object_size_mb=6, - mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes, - ), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), ) sender_cache = ShmObjectStoreSenderCache(vllm_config) receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock()) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 262ea42e4d0fa..adff572524a9b 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -7,7 +7,7 @@ from contextlib import nullcontext import numpy as np import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.processing import ( InputProcessingContext, @@ -920,8 +920,9 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): model=model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) + renderer_config = RendererConfig(model_config=model_config) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) + processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) processor._supported_mm_limits = {"image": num_supported} profiler = MultiModalProfiler(processor) @@ -955,8 +956,9 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): model=model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) + renderer_config = RendererConfig(model_config=model_config) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) + processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) rng = np.random.RandomState(0) image = random_image(rng, min_wh=128, max_wh=256) @@ -1012,11 +1014,13 @@ def test_hf_processor_init_kwargs( inference_kwargs, expected_kwargs, ): - ctx = InputProcessingContext( - model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), - tokenizer=None, + model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs) + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=model_id, ) + ctx = InputProcessingContext.from_config(renderer_config) processor = ctx.get_hf_processor( DummyProcessor, # type: ignore[arg-type] **inference_kwargs, @@ -1045,11 +1049,13 @@ def test_hf_processor_call_kwargs( inference_kwargs, expected_kwargs, ): - ctx = InputProcessingContext( - model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), - tokenizer=None, + model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs) + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=model_id, ) + ctx = InputProcessingContext.from_config(renderer_config) processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type] result = ctx.call_hf_processor(processor, {}, inference_kwargs) diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py index 3b01bda7f54c8..8127fac09968b 100644 --- a/tests/multimodal/test_registry.py +++ b/tests/multimodal/test_registry.py @@ -31,4 +31,6 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected): model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) - assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected + assert ( + MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected + ) diff --git a/tests/test_config.py b/tests/test_config.py index 203447cd531fb..7464fcd1e9fe5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -13,6 +13,7 @@ from vllm.config import ( CompilationConfig, ModelConfig, PoolerConfig, + RendererConfig, SchedulerConfig, VllmConfig, update_config, @@ -476,27 +477,41 @@ def test_load_config_pt_load_map_location(pt_load_map_location): ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True), ], ) -def test_get_and_verify_max_len( +def test_recalculate_max_model_len( model_id, max_model_len, expected_max_len, should_raise ): - """Test get_and_verify_max_len with different configurations.""" + """Test recalculate_max_model_len with different configurations.""" model_config = ModelConfig(model_id) if should_raise: with pytest.raises(ValueError): - model_config.get_and_verify_max_len(max_model_len) + model_config.recalculate_max_model_len( + max_model_len, + tokenizer=model_id, + tokenizer_revision=None, + ) else: - actual_max_len = model_config.get_and_verify_max_len(max_model_len) - assert actual_max_len == expected_max_len + model_config.recalculate_max_model_len( + max_model_len, + tokenizer=model_id, + tokenizer_revision=None, + ) + assert model_config.max_model_len == expected_max_len -class MockConfig: - """Simple mock object for testing maybe_pull_model_tokenizer_for_runai""" +class MockModelConfig: + """Simple mock object for testing maybe_pull_model_for_runai""" - def __init__(self, model: str, tokenizer: str): + def __init__(self, model: str): self.model = model - self.tokenizer = tokenizer - self.model_weights = None + + +class MockRendererConfig: + """Simple mock object for testing maybe_pull_tokenizer_for_runai""" + + def __init__(self, model_config: MockModelConfig): + self.model_config = model_config + self.tokenizer = model_config.model @pytest.mark.parametrize( @@ -514,59 +529,65 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url): mock_pull_files.return_value = None # Create first mock and run the method - config1 = MockConfig(model=s3_url, tokenizer=s3_url) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url) + model_config1 = MockModelConfig(model=s3_url) + renderer_config1 = MockRendererConfig(model_config=model_config1) + ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url) # Check that model and tokenizer point to existing directories - assert os.path.exists(config1.model), ( - f"Model directory does not exist: {config1.model}" + assert os.path.exists(model_config1.model), ( + f"Model directory does not exist: {model_config1.model}" ) - assert os.path.isdir(config1.model), ( - f"Model path is not a directory: {config1.model}" + assert os.path.isdir(model_config1.model), ( + f"Model path is not a directory: {model_config1.model}" ) - assert os.path.exists(config1.tokenizer), ( - f"Tokenizer directory does not exist: {config1.tokenizer}" + assert os.path.exists(renderer_config1.tokenizer), ( + f"Tokenizer directory does not exist: {renderer_config1.tokenizer}" ) - assert os.path.isdir(config1.tokenizer), ( - f"Tokenizer path is not a directory: {config1.tokenizer}" + assert os.path.isdir(renderer_config1.tokenizer), ( + f"Tokenizer path is not a directory: {renderer_config1.tokenizer}" ) # Verify that the paths are different from the original S3 URL - assert config1.model != s3_url, "Model path should be converted to local directory" - assert config1.tokenizer != s3_url, ( + assert model_config1.model != s3_url, ( + "Model path should be converted to local directory" + ) + assert renderer_config1.tokenizer != s3_url, ( "Tokenizer path should be converted to local directory" ) # Store the original paths - created_model_dir = config1.model - create_tokenizer_dir = config1.tokenizer + created_model_dir = model_config1.model + create_tokenizer_dir = renderer_config1.tokenizer # Create a new mock and run the method with the same S3 URL - config2 = MockConfig(model=s3_url, tokenizer=s3_url) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url) + model_config2 = MockModelConfig(model=s3_url) + renderer_config2 = MockRendererConfig(model_config=model_config2) + ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url) # Check that the new directories exist - assert os.path.exists(config2.model), ( - f"Model directory does not exist: {config2.model}" + assert os.path.exists(model_config2.model), ( + f"Model directory does not exist: {model_config2.model}" ) - assert os.path.isdir(config2.model), ( - f"Model path is not a directory: {config2.model}" + assert os.path.isdir(model_config2.model), ( + f"Model path is not a directory: {model_config2.model}" ) - assert os.path.exists(config2.tokenizer), ( - f"Tokenizer directory does not exist: {config2.tokenizer}" + assert os.path.exists(renderer_config2.tokenizer), ( + f"Tokenizer directory does not exist: {renderer_config2.tokenizer}" ) - assert os.path.isdir(config2.tokenizer), ( - f"Tokenizer path is not a directory: {config2.tokenizer}" + assert os.path.isdir(renderer_config2.tokenizer), ( + f"Tokenizer path is not a directory: {renderer_config2.tokenizer}" ) # Verify that the paths are deterministic (same as before) - assert config2.model == created_model_dir, ( + assert model_config2.model == created_model_dir, ( f"Model paths are not deterministic. " - f"Original: {created_model_dir}, New: {config2.model}" + f"Original: {created_model_dir}, New: {model_config2.model}" ) - assert config2.tokenizer == create_tokenizer_dir, ( + assert renderer_config2.tokenizer == create_tokenizer_dir, ( f"Tokenizer paths are not deterministic. " - f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}" + f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}" ) @@ -580,28 +601,36 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): s3_url2 = "s3://example-bucket-2/model/" # Create mocks with different S3 URLs and run the method - config1 = MockConfig(model=s3_url1, tokenizer=s3_url1) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1) + model_config1 = MockModelConfig(model=s3_url1) + renderer_config1 = MockRendererConfig(model_config=model_config1) + ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1) - config2 = MockConfig(model=s3_url2, tokenizer=s3_url2) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2) + model_config2 = MockModelConfig(model=s3_url2) + renderer_config2 = MockRendererConfig(model_config=model_config2) + ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2) # Verify that different URLs produce different directories - assert config1.model != config2.model, ( + assert model_config1.model != model_config2.model, ( f"Different S3 URLs should create different model directories. " - f"URL1 model: {config1.model}, URL2 model: {config2.model}" + f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}" ) - assert config1.tokenizer != config2.tokenizer, ( + assert renderer_config1.tokenizer != renderer_config2.tokenizer, ( f"Different S3 URLs should create different tokenizer directories. " - f"URL1 tokenizer: {config1.tokenizer}, " - f"URL2 tokenizer: {config2.tokenizer}" + f"URL1 tokenizer: {renderer_config1.tokenizer}, " + f"URL2 tokenizer: {renderer_config2.tokenizer}" ) # Verify that both sets of directories exist - assert os.path.exists(config1.model) and os.path.isdir(config1.model) - assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer) - assert os.path.exists(config2.model) and os.path.isdir(config2.model) - assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) + assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model) + assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir( + renderer_config1.tokenizer + ) + assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model) + assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir( + renderer_config2.tokenizer + ) @pytest.mark.parametrize( diff --git a/tests/test_inputs.py b/tests/test_inputs.py index c4339827de8b6..48fd076ab3c67 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.inputs import zip_enc_dec_prompts from vllm.inputs.parse import parse_raw_prompts from vllm.inputs.preprocess import InputPreprocessor @@ -108,8 +108,9 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): ) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) - tokenizer = init_tokenizer_from_config(model_config) - input_preprocessor = InputPreprocessor(model_config, tokenizer) + renderer_config = RendererConfig(model_config=model_config) + tokenizer = init_tokenizer_from_config(renderer_config) + input_preprocessor = InputPreprocessor(renderer_config, tokenizer) # HF processor adds sep token sep_token_id = tokenizer.vocab[tokenizer.sep_token] diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 6cab129c116c5..49307e3e5437d 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -16,6 +16,7 @@ from vllm.config import ( LoadConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -216,6 +217,7 @@ def create_vllm_config( return VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index fd5cf6d3e74aa..4a414bca591d1 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -8,7 +8,7 @@ import pytest import torch import vllm.v1.core.kv_cache_utils as kv_cache_utils -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import ( MultiModalFeatureSpec, @@ -667,7 +667,10 @@ def test_metrics_empty_stats(): def test_get_kv_cache_configs_multiple_workers(): model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) ref_kv_cache_spec = new_kv_cache_spec() same_kv_cache_specs = [ @@ -1136,6 +1139,7 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=scheduler_config, ) @@ -1175,6 +1179,7 @@ def test_get_max_concurrency_for_kv_cache_config(): vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=scheduler_config, ) @@ -1293,7 +1298,10 @@ def test_allocate_with_lookahead(): def test_get_kv_cache_config_one_worker(): # pass max_model_len to pass check_enough_kv_cache_memory model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # all layers are full attention -> single group @@ -1584,7 +1592,11 @@ def test_get_kv_cache_config_one_worker(): def test_get_kv_cache_configs_attention_free(): kv_cache_specs: dict[str, KVCacheSpec] = {} - vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16)) + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0]) assert kv_cache_configs == [ KVCacheConfig( diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index c6c4a5085bff7..1505415a63619 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -11,6 +11,7 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -1563,6 +1564,7 @@ def create_scheduler_with_priority( vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index f5ba613d38db1..086885c298145 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -9,6 +9,7 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -132,6 +133,7 @@ def create_scheduler( vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 48be8c15aba9e..c606100a12bf1 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -15,6 +15,7 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -522,6 +523,7 @@ def test_encoder_instance_zero_kv_cache( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, kv_transfer_config=kv_transfer_config, diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 1b11b8af49d17..85fab3a855fd7 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -5,7 +5,14 @@ import pytest from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.config import ( + CacheConfig, + DeviceConfig, + ModelConfig, + MultiModalConfig, + RendererConfig, + VllmConfig, +) from vllm.sampling_params import SamplingParams from vllm.v1.engine import input_processor as input_processor_mod from vllm.v1.engine.input_processor import InputProcessor @@ -44,22 +51,21 @@ def _mock_input_processor( monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True) model_config = ModelConfig( - skip_tokenizer_init=True, max_model_len=128, mm_processor_cache_gb=mm_cache_gb, generation_config="vllm", + ) + model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb) + + renderer_config = RendererConfig( + model_config=model_config, tokenizer="dummy", + skip_tokenizer_init=True, ) - # Minimal multimodal_config to satisfy references in - # Processor.process_inputs. - class _MockMMConfig: - def __init__(self, gb: float): - self.mm_processor_cache_gb = gb - - model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined] vllm_config = VllmConfig( model_config=model_config, + renderer_config=renderer_config, cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), device_config=DeviceConfig(device="cpu"), ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 58f1a7282352b..768b338b5fe53 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -15,6 +15,7 @@ from vllm.config import ( DeviceConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -127,6 +128,7 @@ def create_vllm_config( return VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, device_config=DeviceConfig("cpu"), diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 616e57de339e2..888ea0169b759 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -19,6 +19,7 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -61,6 +62,7 @@ def _create_proposer( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(), speculative_config=speculative_config, device_config=DeviceConfig(device=current_platform.device_type), diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 3b8813ceb818a..4483c82438530 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -18,6 +18,7 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -46,6 +47,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(), speculative_config=speculative_config, device_config=DeviceConfig(device=current_platform.device_type), diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 6bc412abe8695..2e365e08a4e72 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -4,6 +4,7 @@ import numpy as np from vllm.config import ( ModelConfig, + RendererConfig, SpeculativeConfig, VllmConfig, ) @@ -69,6 +70,7 @@ def test_ngram_proposer(): return NgramProposer( vllm_config=VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), speculative_config=SpeculativeConfig( prompt_lookup_min=min_n, prompt_lookup_max=max_n, diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py index 4c01560fc88c3..baef2459f8df0 100644 --- a/tests/v1/structured_output/test_backend_guidance.py +++ b/tests/v1/structured_output/test_backend_guidance.py @@ -6,7 +6,7 @@ from concurrent.futures import Future import pytest from transformers import AutoTokenizer -from vllm.config import StructuredOutputsConfig, VllmConfig +from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig from vllm.config.model import ModelConfig from vllm.config.parallel import ParallelConfig from vllm.config.speculative import SpeculativeConfig @@ -72,8 +72,11 @@ def test_backend_guidance_rollback_terminated(): def test_grammar_bitmask_with_specdec(): tokenizer = AutoTokenizer.from_pretrained(TOKENIZER) prompt = tokenizer.encode('{"a": "b"}') + + model_config = ModelConfig(tokenizer=TOKENIZER) vllm_config = VllmConfig( - model_config=ModelConfig(tokenizer=TOKENIZER), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER), structured_outputs_config=StructuredOutputsConfig(backend="guidance"), speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3), ) @@ -137,8 +140,11 @@ def test_grammar_init_async_and_sync(async_grammar): # Use "external_launcher" for sync mode, None for async mode executor_backend = None if async_grammar else "external_launcher" + + model_config = ModelConfig(tokenizer=TOKENIZER) vllm_config = VllmConfig( - model_config=ModelConfig(tokenizer=TOKENIZER), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER), structured_outputs_config=StructuredOutputsConfig(backend="guidance"), parallel_config=ParallelConfig(distributed_executor_backend=executor_backend), ) diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 70047a993c3f9..5901d38d1b78b 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -7,7 +7,7 @@ from unittest.mock import Mock import pytest -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig from vllm.reasoning import ReasoningParser from vllm.v1.request import Request from vllm.v1.structured_output import StructuredOutputManager @@ -17,19 +17,26 @@ class TestReasoningStructuredOutput: """Test reasoning-aware structured output functionality.""" @pytest.fixture - def mock_model_config(self): - """Create a mock ModelConfig.""" - config = Mock(spec=ModelConfig) - config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls - config.get_vocab_size = Mock(return_value=50000) + def mock_renderer_config(self): + """Create a mock RendererConfig.""" + renderer_config = Mock(spec=RendererConfig) + renderer_config.skip_tokenizer_init = ( + True # Skip tokenizer init to avoid network calls + ) + + model_config = Mock(spec=ModelConfig) + model_config.get_vocab_size = Mock(return_value=50000) + model_config.trust_remote_code = False # Add missing runner_type attribute that tokenizer initialization expects - config.runner_type = "generate" + model_config.runner_type = "generate" + renderer_config.model_config = model_config + # Add other attributes that tokenizer initialization might need - config.tokenizer = "test-tokenizer" - config.tokenizer_mode = "auto" - config.trust_remote_code = False - config.tokenizer_revision = None - return config + renderer_config.tokenizer = "test-tokenizer" + renderer_config.tokenizer_mode = "auto" + renderer_config.tokenizer_revision = None + + return renderer_config @pytest.fixture def mock_scheduler_config(self): @@ -39,10 +46,10 @@ class TestReasoningStructuredOutput: return config @pytest.fixture - def mock_vllm_config(self, mock_model_config, mock_scheduler_config): + def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config): """Create a mock VllmConfig.""" config = Mock(spec=VllmConfig) - config.model_config = mock_model_config + config.renderer_config = mock_renderer_config config.scheduler_config = mock_scheduler_config config.structured_outputs_config = Mock() config.structured_outputs_config.reasoning_parser = None diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index cfc06666e7984..080d23863652d 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -7,6 +7,7 @@ from vllm.attention.layer import Attention from vllm.config import ( CacheConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -45,6 +46,7 @@ def get_vllm_config(): ) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 7b8c4268a5237..464e3ab99c76d 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -13,6 +13,7 @@ from vllm.config import ( CacheConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -101,6 +102,7 @@ def get_vllm_config(): parallel_config = ParallelConfig() vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, parallel_config=parallel_config, @@ -811,6 +813,7 @@ def test_hybrid_attention_mamba_tensor_shapes(): attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, parallel_config=parallel_config, diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 0f84f3ca9d3e3..a4f9fd8d28292 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig from vllm.config.observability import ObservabilityConfig from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.pooler import PoolerConfig +from vllm.config.renderer import RendererConfig from vllm.config.scheduler import SchedulerConfig from vllm.config.speculative import SpeculativeConfig from vllm.config.speech_to_text import SpeechToTextConfig @@ -81,6 +82,8 @@ __all__ = [ "ParallelConfig", # From vllm.config.pooler "PoolerConfig", + # From vllm.config.renderer + "RendererConfig", # From vllm.config.scheduler "SchedulerConfig", # From vllm.config.speculative diff --git a/vllm/config/model.py b/vllm/config/model.py index 509a9c5e162f7..b0d4fb8e01e64 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -36,7 +36,6 @@ from vllm.transformers_utils.config import ( uses_xdrope_dim, ) from vllm.transformers_utils.gguf_utils import ( - is_gguf, is_remote_gguf, maybe_patch_hf_config_from_gguf, split_remote_gguf, @@ -83,7 +82,6 @@ TaskOption = Literal[ "transcription", "draft", ] -TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" @@ -131,18 +129,6 @@ class ModelConfig: Note that the model may support other tasks using the same model runner. """ - tokenizer: SkipValidation[str] = None # type: ignore - """Name or path of the Hugging Face tokenizer to use. If unspecified, model - name or path will be used.""" - tokenizer_mode: TokenizerMode | str = "auto" - """Tokenizer mode:\n - - "auto" will use the tokenizer from `mistral_common` for Mistral models - if available, otherwise it will use the "hf" tokenizer.\n - - "hf" will use the fast tokenizer if available.\n - - "slow" will always use the slow tokenizer.\n - - "mistral" will always use the tokenizer from `mistral_common`.\n - - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - - Other custom values can be supported via plugins.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.""" @@ -168,13 +154,6 @@ class ModelConfig: hf_config_path: str | None = None """Name or path of the Hugging Face config to use. If unspecified, model name or path will be used.""" - allowed_local_media_path: str = "" - """Allowing API requests to read local images or videos from directories - specified by the server file system. This is a security risk. Should only - be enabled in trusted environments.""" - allowed_media_domains: list[str] | None = None - """If set, only media URLs that belong to this domain can be used for - multi-modal inputs. """ revision: str | None = None """The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" @@ -182,10 +161,6 @@ class ModelConfig: """The specific revision to use for the model code on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - tokenizer_revision: str | None = None - """The specific revision to use for the tokenizer on the Hugging Face Hub. - It can be a branch name, a tag name, or a commit id. If unspecified, will - use the default version.""" max_model_len: SkipValidation[int] = None # type: ignore """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. @@ -230,10 +205,6 @@ class ModelConfig: preventing potential numerical issues. Note that even if this is set to False, cascade attention will be only used when the heuristic tells that it's beneficial.""" - skip_tokenizer_init: bool = False - """Skip initialization of tokenizer and detokenizer. Expects valid - `prompt_token_ids` and `None` for prompt from the input. The generated - output will contain token ids.""" enable_prompt_embeds: bool = False """If `True`, enables passing text embeddings as inputs via the `prompt_embeds` key. @@ -294,8 +265,6 @@ class ModelConfig: logits_processors: list[str | type[LogitsProcessor]] | None = None """One or more logits processors' fully-qualified class names or class definitions""" - io_processor_plugin: str | None = None - """IOProcessor plugin name to load at model startup""" # Pooler config pooler_config: PoolerConfig | None = None @@ -308,7 +277,6 @@ class ModelConfig: from the architecture of `self.model`.""" limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None enable_mm_embeds: InitVar[bool | None] = None - media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None mm_processor_kwargs: InitVar[dict[str, Any] | None] = None mm_processor_cache_gb: InitVar[float | None] = None mm_processor_cache_type: InitVar[MMCacheType | None] = None @@ -335,18 +303,12 @@ class ModelConfig: "runner", "convert", "task", - "tokenizer", - "tokenizer_mode", "seed", "hf_config_path", - "allowed_local_media_path", - "allowed_media_domains", - "tokenizer_revision", "spec_target_max_model_len", "enforce_eager", "logprobs_mode", "disable_cascade_attn", - "skip_tokenizer_init", "served_model_name", "config_format", "hf_token", @@ -354,11 +316,9 @@ class ModelConfig: "logits_processor_pattern", "override_attention_dtype", "logits_processors", - "io_processor_plugin", "pooler_config", "multimodal_config", "limit_mm_per_prompt", - "media_io_kwargs", "mm_processor_kwargs", "mm_processor_cache_gb", "mm_processor_cache_type", @@ -423,7 +383,6 @@ class ModelConfig: # Multimodal config init vars limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, enable_mm_embeds: bool | None, - media_io_kwargs: dict[str, dict[str, Any]] | None, mm_processor_kwargs: dict[str, Any] | None, mm_processor_cache_gb: float | None, mm_processor_cache_type: MMCacheType | None, @@ -438,13 +397,8 @@ class ModelConfig: self.served_model_name = get_served_model_name( self.model, self.served_model_name ) - self.model = maybe_model_redirect(self.model) - # The tokenizer is consistent with the model by default. - if self.tokenizer is None: - self.tokenizer = self.model - if self.tokenizer_revision is None: - self.tokenizer_revision = self.revision - self.tokenizer = maybe_model_redirect(self.tokenizer) + self.original_model = self.model + self.model = maybe_model_redirect(self.original_model) if isinstance(self.hf_config_path, str): self.hf_config_path = maybe_model_redirect(self.hf_config_path) @@ -465,7 +419,7 @@ class ModelConfig: hf_overrides_kw[key] = value hf_overrides_fn = None - self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) + self.maybe_pull_model_for_runai(self.model) from vllm.platforms import current_platform @@ -648,7 +602,8 @@ class ModelConfig: ) self.original_max_model_len = self.max_model_len - self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + self.recalculate_max_model_len(self.original_max_model_len) + # Init multimodal config if needed if self._model_info.supports_multimodal: if ( @@ -664,7 +619,6 @@ class ModelConfig: mm_config_kwargs = dict( limit_per_prompt=limit_mm_per_prompt, enable_mm_embeds=enable_mm_embeds, - media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, mm_processor_cache_gb=mm_processor_cache_gb, mm_processor_cache_type=mm_processor_cache_type, @@ -682,16 +636,8 @@ class ModelConfig: self.multimodal_config = MultiModalConfig(**mm_config_kwargs) - # Multimodal GGUF models must use original repo for mm processing - if is_gguf(self.tokenizer) and self.is_multimodal_model: - raise ValueError( - "Loading a multimodal GGUF model needs to use original " - "tokenizer. Please specify the unquantized hf model's " - "repo name or path using the --tokenizer argument." - ) - if self.disable_sliding_window: - # Set after get_and_verify_max_len to ensure that max_model_len + # Set after recalculate_max_model_len to ensure that max_model_len # can be correctly capped to sliding window size self.hf_text_config.sliding_window = None @@ -715,10 +661,9 @@ class ModelConfig: @model_validator(mode="after") def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": - if not isinstance(self.tokenizer, str): - raise ValueError("tokenizer must be a string after __post_init__.") if not isinstance(self.max_model_len, int): raise ValueError("max_model_len must be an integer after __post_init__.") + return self def _get_transformers_backend_cls(self) -> str: @@ -767,49 +712,17 @@ class ModelConfig: """The architecture vllm actually used.""" return self._architecture - def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None: - """Pull model/tokenizer from Object Storage to temporary - directory when needed. - - Args: - model: Model name or path - tokenizer: Tokenizer name or path - """ - - if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): + def maybe_pull_model_for_runai(self, model: str) -> None: + """Pull model from Object Storage to temporary directory when needed.""" + if not is_runai_obj_uri(model): return - if is_runai_obj_uri(model): - object_storage_model = ObjectStorageModel(url=model) - object_storage_model.pull_files( - model, allow_pattern=["*.model", "*.py", "*.json"] - ) - self.model_weights = model - self.model = object_storage_model.dir - - # If tokenizer is same as model, download to same directory - if model == tokenizer: - object_storage_model.pull_files( - model, - ignore_pattern=[ - "*.pt", - "*.safetensors", - "*.bin", - "*.tensors", - "*.pth", - ], - ) - self.tokenizer = object_storage_model.dir - return - - # Only download tokenizer if needed and not already handled - if is_runai_obj_uri(tokenizer): - object_storage_tokenizer = ObjectStorageModel(url=tokenizer) - object_storage_tokenizer.pull_files( - model, - ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], - ) - self.tokenizer = object_storage_tokenizer.dir + object_storage_model = ObjectStorageModel(url=model) + object_storage_model.pull_files( + model, allow_pattern=["*.model", "*.py", "*.json"] + ) + self.model_weights = model + self.model = object_storage_model.dir def _get_encoder_config(self): model = self.model @@ -1712,30 +1625,38 @@ class ModelConfig: return dense_modules[-1]["out_features"] return self.get_hidden_size() - def get_and_verify_max_len(self, max_model_len: int): + def recalculate_max_model_len( + self, + original_max_model_len: int | None, + *, + tokenizer: str | None = None, + tokenizer_revision: str | None = None, + ) -> None: # Consider max_model_len in tokenizer_config only when # pooling models use absolute position_embedding. + # NOTE: For simplicity we assume `args.model == args.tokenizer` + # since this is tokenizer_config = None if ( self.runner_type == "pooling" and getattr(self.hf_config, "position_embedding_type", "") == "absolute" ): tokenizer_config = try_get_tokenizer_config( - self.tokenizer, + tokenizer or self.model, trust_remote_code=self.trust_remote_code, - revision=self.tokenizer_revision, + revision=tokenizer_revision or self.revision, ) - max_model_len = _get_and_verify_max_len( + + self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, tokenizer_config=tokenizer_config, - max_model_len=max_model_len, + max_model_len=original_max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window=self.get_sliding_window(), spec_target_max_model_len=self.spec_target_max_model_len, encoder_config=self.encoder_config, ) - logger.info("Using max model len %s", max_model_len) - return max_model_len + logger.info("Using max model len %s", self.max_model_len) @property def attn_type(self) -> AttnTypeStr: diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 8a2936de96d6f..37e2f6b4d419a 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -79,10 +79,6 @@ class MultiModalConfig: WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. Only enable this flag for trusted users!""" - media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" mm_processor_kwargs: dict[str, object] | None = None """Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. Overrides for the multi-modal processor obtained diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py new file mode 100644 index 0000000000000..36a922b93ca05 --- /dev/null +++ b/vllm/config/renderer.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Literal + +from pydantic import Field, SkipValidation +from pydantic.dataclasses import dataclass + +from vllm.config.model import ModelConfig +from vllm.config.utils import config +from vllm.transformers_utils.gguf_utils import is_gguf +from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri +from vllm.transformers_utils.utils import maybe_model_redirect + +TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] + + +@config +@dataclass +class RendererConfig: + """Configuration for the renderer.""" + + # NOTE: In reality, this is a required argument. + # We provide a dummy default value here to generate the CLI args. + model_config: SkipValidation[ModelConfig] = None # type: ignore + """Provides model context to the renderer.""" + + tokenizer: str = "" + """Name or path of the Hugging Face tokenizer to use. If unspecified, model + name or path will be used.""" + tokenizer_mode: TokenizerMode | str = "auto" + """Tokenizer mode:\n + - "auto" will use the tokenizer from `mistral_common` for Mistral models + if available, otherwise it will use the "hf" tokenizer.\n + - "hf" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n + - Other custom values can be supported via plugins.""" + tokenizer_revision: str | None = None + """The specific revision to use for the tokenizer on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" + skip_tokenizer_init: bool = False + """Skip initialization of tokenizer and detokenizer. Expects valid + `prompt_token_ids` and `None` for prompt from the input. The generated + output will contain token ids.""" + + io_processor_plugin: str | None = None + """IOProcessor plugin name to load at model startup.""" + + media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" + allowed_local_media_path: str = "" + """Allowing API requests to read local images or videos from directories + specified by the server file system. This is a security risk. Should only + be enabled in trusted environments.""" + allowed_media_domains: list[str] | None = None + """If set, only media URLs that belong to this domain can be used for + multi-modal inputs. """ + + @property + def trust_remote_code(self) -> bool: + return self.model_config.trust_remote_code + + def __post_init__(self) -> None: + model_config = self.model_config + + # The tokenizer is consistent with the model by default. + if not self.tokenizer: + self.tokenizer = ( + ModelConfig.model + if model_config is None + else model_config.original_model + ) + if not self.tokenizer_revision: + self.tokenizer_revision = ( + ModelConfig.revision if model_config is None else model_config.revision + ) + + self.original_tokenizer = self.tokenizer + self.tokenizer = maybe_model_redirect(self.original_tokenizer) + self.maybe_pull_tokenizer_for_runai(self.tokenizer) + + # Multimodal GGUF models must use original repo for mm processing + is_multimodal_model = ( + ModelConfig.is_multimodal_model + if model_config is None + else model_config.is_multimodal_model + ) + if is_gguf(self.tokenizer) and is_multimodal_model: + raise ValueError( + "Loading a multimodal GGUF model needs to use original " + "tokenizer. Please specify the unquantized hf model's " + "repo name or path using the --tokenizer argument." + ) + + def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None: + """Pull tokenizer from Object Storage to temporary directory when needed.""" + if not is_runai_obj_uri(tokenizer): + return + + object_storage_tokenizer = ObjectStorageModel(url=tokenizer) + object_storage_tokenizer.pull_files( + tokenizer, + ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], + ) + self.tokenizer = object_storage_tokenizer.dir diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index bf533bf14e55c..63b63eac907d2 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -322,16 +322,11 @@ class SpeculativeConfig: self.draft_model_config = ModelConfig( model=self.model, runner="draft", - tokenizer=self.target_model_config.tokenizer, - tokenizer_mode=self.target_model_config.tokenizer_mode, trust_remote_code=self.target_model_config.trust_remote_code, - allowed_local_media_path=self.target_model_config.allowed_local_media_path, - allowed_media_domains=self.target_model_config.allowed_media_domains, dtype=self.target_model_config.dtype, seed=self.target_model_config.seed, revision=self.revision, code_revision=self.code_revision, - tokenizer_revision=self.target_model_config.tokenizer_revision, spec_target_max_model_len=self.target_model_config.max_model_len, quantization=self.quantization, enforce_eager=self.target_model_config.enforce_eager, diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 36e4bd159dc72..417797c445b95 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -39,6 +39,7 @@ from .lora import LoRAConfig from .model import ModelConfig from .observability import ObservabilityConfig from .parallel import ParallelConfig +from .renderer import RendererConfig from .scheduler import SchedulerConfig from .speculative import SpeculativeConfig from .structured_outputs import StructuredOutputsConfig @@ -181,6 +182,8 @@ class VllmConfig: # try to download a model model_config: ModelConfig = Field(default=None) """Model configuration.""" + renderer_config: RendererConfig = Field(default_factory=RendererConfig) + """Renderer configuration.""" cache_config: CacheConfig = Field(default_factory=CacheConfig) """Cache configuration.""" parallel_config: ParallelConfig = Field(default_factory=ParallelConfig) @@ -741,7 +744,7 @@ class VllmConfig: from vllm.multimodal import MULTIMODAL_REGISTRY self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config) ) logger.debug( "Encoder-decoder model detected: setting " @@ -1186,11 +1189,13 @@ class VllmConfig: computed_compile_ranges_split_points ) - def recalculate_max_model_len(self, max_model_len: int): - # Can only be called in try_verify_and_update_config - model_config = self.model_config - max_model_len = model_config.get_and_verify_max_len(max_model_len) - self.model_config.max_model_len = max_model_len + def recalculate_max_model_len(self, original_max_model_len: int | None) -> None: + # Can only be called during try_verify_and_update_config + self.model_config.recalculate_max_model_len( + original_max_model_len, + tokenizer=self.renderer_config.tokenizer, + tokenizer_revision=self.renderer_config.tokenizer_revision, + ) def try_verify_and_update_config(self): if self.model_config is None: @@ -1264,11 +1269,11 @@ class VllmConfig: return ( f"model={self.model_config.model!r}, " f"speculative_config={self.speculative_config!r}, " - f"tokenizer={self.model_config.tokenizer!r}, " - f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, " - f"tokenizer_mode={self.model_config.tokenizer_mode}, " + f"tokenizer={self.renderer_config.tokenizer!r}, " + f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, " + f"tokenizer_mode={self.renderer_config.tokenizer_mode}, " f"revision={self.model_config.revision}, " - f"tokenizer_revision={self.model_config.tokenizer_revision}, " + f"tokenizer_revision={self.renderer_config.tokenizer_revision}, " f"trust_remote_code={self.model_config.trust_remote_code}, " f"dtype={self.model_config.dtype}, " f"max_seq_len={self.model_config.max_model_len}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ceac5407af6e2..bd398abb0bf81 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -71,11 +71,11 @@ from vllm.config.model import ( ModelDType, RunnerOption, TaskOption, - TokenizerMode, ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode from vllm.config.observability import DetailedTraceModules from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy +from vllm.config.renderer import RendererConfig, TokenizerMode from vllm.config.scheduler import SchedulerPolicy from vllm.config.utils import get_field from vllm.config.vllm import OptimizationLevel @@ -355,17 +355,12 @@ class EngineArgs: model: str = ModelConfig.model served_model_name: str | list[str] | None = ModelConfig.served_model_name - tokenizer: str | None = ModelConfig.tokenizer hf_config_path: str | None = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert task: TaskOption | None = ModelConfig.task - skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds - tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode trust_remote_code: bool = ModelConfig.trust_remote_code - allowed_local_media_path: str = ModelConfig.allowed_local_media_path - allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains download_dir: str | None = LoadConfig.download_dir safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy load_format: str | LoadFormats = LoadConfig.load_format @@ -449,7 +444,6 @@ class EngineArgs: code_revision: str | None = ModelConfig.code_revision hf_token: bool | str | None = ModelConfig.hf_token hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") - tokenizer_revision: str | None = ModelConfig.tokenizer_revision quantization: QuantizationMethods | None = ModelConfig.quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce @@ -458,9 +452,6 @@ class EngineArgs: ) enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings - media_io_kwargs: dict[str, dict[str, Any]] = get_field( - MultiModalConfig, "media_io_kwargs" - ) mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb @@ -474,9 +465,19 @@ class EngineArgs: mm_encoder_attn_backend: AttentionBackendEnum | str | None = ( MultiModalConfig.mm_encoder_attn_backend ) - io_processor_plugin: str | None = None skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling video_pruning_rate: float = MultiModalConfig.video_pruning_rate + # Renderer fields + tokenizer: str | None = None + tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode + tokenizer_revision: str | None = RendererConfig.tokenizer_revision + skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init + io_processor_plugin: str | None = None + media_io_kwargs: dict[str, dict[str, Any]] = get_field( + RendererConfig, "media_io_kwargs" + ) + allowed_local_media_path: str = RendererConfig.allowed_local_media_path + allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains # LoRA fields enable_lora: bool = False max_loras: int = LoRAConfig.max_loras @@ -627,25 +628,14 @@ class EngineArgs: model_group.add_argument("--runner", **model_kwargs["runner"]) model_group.add_argument("--convert", **model_kwargs["convert"]) model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) - model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) - model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) model_group.add_argument( "--trust-remote-code", **model_kwargs["trust_remote_code"] ) model_group.add_argument("--dtype", **model_kwargs["dtype"]) model_group.add_argument("--seed", **model_kwargs["seed"]) model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"]) - model_group.add_argument( - "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"] - ) - model_group.add_argument( - "--allowed-media-domains", **model_kwargs["allowed_media_domains"] - ) model_group.add_argument("--revision", **model_kwargs["revision"]) model_group.add_argument("--code-revision", **model_kwargs["code_revision"]) - model_group.add_argument( - "--tokenizer-revision", **model_kwargs["tokenizer_revision"] - ) model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"]) model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"]) model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) @@ -657,9 +647,6 @@ class EngineArgs: model_group.add_argument( "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"] ) - model_group.add_argument( - "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"] - ) model_group.add_argument( "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"] ) @@ -698,8 +685,34 @@ class EngineArgs: model_group.add_argument( "--logits-processors", **model_kwargs["logits_processors"] ) - model_group.add_argument( - "--io-processor-plugin", **model_kwargs["io_processor_plugin"] + + # Renderer arguments + renderer_kwargs = get_kwargs(RendererConfig) + renderer_group = parser.add_argument_group( + title="RendererConfig", + description=RendererConfig.__doc__, + ) + renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"]) + renderer_group.add_argument( + "--tokenizer-mode", **renderer_kwargs["tokenizer_mode"] + ) + renderer_group.add_argument( + "--tokenizer-revision", **renderer_kwargs["tokenizer_revision"] + ) + renderer_group.add_argument( + "--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"] + ) + renderer_group.add_argument( + "--media-io-kwargs", **renderer_kwargs["media_io_kwargs"] + ) + renderer_group.add_argument( + "--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"] + ) + renderer_group.add_argument( + "--allowed-media-domains", **renderer_kwargs["allowed_media_domains"] + ) + renderer_group.add_argument( + "--io-processor-plugin", **renderer_kwargs["io_processor_plugin"] ) # Model loading arguments @@ -949,9 +962,6 @@ class EngineArgs: multimodal_group.add_argument( "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"] ) - multimodal_group.add_argument( - "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"] - ) multimodal_group.add_argument( "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"] ) @@ -1255,18 +1265,13 @@ class EngineArgs: runner=self.runner, convert=self.convert, task=self.task, - tokenizer=self.tokenizer, - tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, - allowed_local_media_path=self.allowed_local_media_path, - allowed_media_domains=self.allowed_media_domains, dtype=self.dtype, seed=self.seed, revision=self.revision, code_revision=self.code_revision, hf_token=self.hf_token, hf_overrides=self.hf_overrides, - tokenizer_revision=self.tokenizer_revision, max_model_len=self.max_model_len, quantization=self.quantization, enforce_eager=self.enforce_eager, @@ -1274,13 +1279,11 @@ class EngineArgs: logprobs_mode=self.logprobs_mode, disable_sliding_window=self.disable_sliding_window, disable_cascade_attn=self.disable_cascade_attn, - skip_tokenizer_init=self.skip_tokenizer_init, enable_prompt_embeds=self.enable_prompt_embeds, served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, enable_mm_embeds=self.enable_mm_embeds, interleave_mm_strings=self.interleave_mm_strings, - media_io_kwargs=self.media_io_kwargs, skip_mm_profiling=self.skip_mm_profiling, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, @@ -1298,7 +1301,6 @@ class EngineArgs: override_attention_dtype=self.override_attention_dtype, logits_processors=self.logits_processors, video_pruning_rate=self.video_pruning_rate, - io_processor_plugin=self.io_processor_plugin, ) def validate_tensorizer_args(self): @@ -1394,9 +1396,25 @@ class EngineArgs: ) model_config = self.create_model_config() - self.model = model_config.model - self.tokenizer = model_config.tokenizer + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=self.tokenizer or "", + tokenizer_mode=self.tokenizer_mode, + tokenizer_revision=self.tokenizer_revision, + skip_tokenizer_init=self.skip_tokenizer_init, + io_processor_plugin=self.io_processor_plugin, + media_io_kwargs=self.media_io_kwargs, + allowed_local_media_path=self.allowed_local_media_path, + allowed_media_domains=self.allowed_media_domains, + ) + model_config.recalculate_max_model_len( + model_config.original_max_model_len, + tokenizer=renderer_config.tokenizer, + tokenizer_revision=renderer_config.tokenizer_revision, + ) + + self.model = model_config.model self._check_feature_supported(model_config) self._set_default_chunked_prefill_and_prefix_caching_args(model_config) self._set_default_max_num_seqs_and_batched_tokens_args( @@ -1768,6 +1786,7 @@ class EngineArgs: ) config = VllmConfig( model_config=model_config, + renderer_config=renderer_config, cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index d94951a0cffc8..7b60e7f89861b 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Iterable, Mapping from typing import Any -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig from vllm.inputs.data import PromptType from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, RequestOutput @@ -22,6 +22,7 @@ class EngineClient(ABC): """Protocol class for Clients to Engine""" vllm_config: VllmConfig + renderer_config: RendererConfig model_config: ModelConfig input_processor: InputProcessor io_processor: IOProcessor | None diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index aceaa8bd45b81..5ad256c2f3eb3 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor from typing_extensions import Required, TypedDict from vllm import envs -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict @@ -452,9 +452,10 @@ This is needed because `lru_cache` does not cache when an exception happens. def _try_get_processor_chat_template( tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, - model_config: ModelConfig, + *, + trust_remote_code: bool, ) -> str | None: - cache_key = (tokenizer.name_or_path, model_config.trust_remote_code) + cache_key = (tokenizer.name_or_path, trust_remote_code) if cache_key in _PROCESSOR_CHAT_TEMPLATES: return _PROCESSOR_CHAT_TEMPLATES[cache_key] @@ -466,7 +467,7 @@ def _try_get_processor_chat_template( PreTrainedTokenizerFast, ProcessorMixin, ), - trust_remote_code=model_config.trust_remote_code, + trust_remote_code=trust_remote_code, ) if ( isinstance(processor, ProcessorMixin) @@ -499,7 +500,10 @@ def resolve_hf_chat_template( # 2nd priority: AutoProcessor chat template, unless tool calling is enabled if tools is None: - chat_template = _try_get_processor_chat_template(tokenizer, model_config) + chat_template = _try_get_processor_chat_template( + tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) if chat_template is not None: return chat_template @@ -513,10 +517,10 @@ def resolve_hf_chat_template( exc_info=True, ) - # 4th priority: Predefined fallbacks + # 4th priority: Predefined fallbacks] path = get_chat_template_fallback_path( model_type=model_config.hf_config.model_type, - tokenizer_name_or_path=model_config.tokenizer, + tokenizer_name_or_path=tokenizer.name_or_path, ) if path is not None: logger.info_once( @@ -538,14 +542,14 @@ def _resolve_chat_template_content_format( tools: list[dict[str, Any]] | None, tokenizer: TokenizerLike | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> _ChatTemplateContentFormat: if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - model_config=model_config, + model_config=renderer_config.model_config, ) else: hf_chat_template = None @@ -595,7 +599,7 @@ def resolve_chat_template_content_format( given_format: ChatTemplateContentFormatOption, tokenizer: TokenizerLike | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> _ChatTemplateContentFormat: if given_format != "auto": return given_format @@ -604,7 +608,7 @@ def resolve_chat_template_content_format( chat_template, tools, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) _log_chat_template_content_format( @@ -627,32 +631,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): maximum per prompt. """ - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self._model_config = model_config + self._renderer_config = renderer_config self._items_by_modality = defaultdict[str, list[_T | None]](list) self._uuids_by_modality = defaultdict[str, list[str | None]](list) @property - def model_config(self) -> ModelConfig: - return self._model_config + def renderer_config(self) -> RendererConfig: + return self._renderer_config @cached_property def model_cls(self) -> type[SupportsMultiModal]: from vllm.model_executor.model_loader import get_model_cls - model_cls = get_model_cls(self.model_config) + model_cls = get_model_cls(self.renderer_config.model_config) return cast(type[SupportsMultiModal], model_cls) @property def allowed_local_media_path(self): - return self._model_config.allowed_local_media_path + return self._renderer_config.allowed_local_media_path @property def allowed_media_domains(self): - return self._model_config.allowed_media_domains + return self._renderer_config.allowed_media_domains @property def mm_registry(self): @@ -660,7 +664,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): @cached_property def mm_processor(self): - return self.mm_registry.create_processor(self.model_config) + return self.mm_registry.create_processor(self.renderer_config) def add( self, @@ -851,19 +855,20 @@ class MultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker - multimodal_config = self._tracker.model_config.multimodal_config - media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) - self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=media_io_kwargs, + media_io_kwargs=self.renderer_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) + @property + def renderer_config(self) -> RendererConfig: + return self._tracker.renderer_config + @property def model_config(self) -> ModelConfig: - return self._tracker.model_config + return self.renderer_config.model_config def parse_image(self, image_url: str | None, uuid: str | None = None) -> None: image = self._connector.fetch_image(image_url) if image_url else None @@ -963,18 +968,20 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker - multimodal_config = self._tracker.model_config.multimodal_config - media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=media_io_kwargs, + media_io_kwargs=self.renderer_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) + @property + def renderer_config(self) -> RendererConfig: + return self._tracker.renderer_config + @property def model_config(self) -> ModelConfig: - return self._tracker.model_config + return self.renderer_config.model_config def parse_image(self, image_url: str | None, uuid: str | None = None) -> None: image_coro = self._connector.fetch_image_async(image_url) if image_url else None @@ -1604,15 +1611,17 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: def parse_chat_messages( messages: list[ChatCompletionMessageParam], - model_config: ModelConfig, + renderer_config: RendererConfig, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], MultiModalDataDict | None, MultiModalUUIDDict | None, ]: + model_config = renderer_config.model_config + conversation: list[ConversationMessage] = [] - mm_tracker = MultiModalItemTracker(model_config) + mm_tracker = MultiModalItemTracker(renderer_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1635,15 +1644,17 @@ def parse_chat_messages( def parse_chat_messages_futures( messages: list[ChatCompletionMessageParam], - model_config: ModelConfig, + renderer_config: RendererConfig, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], Awaitable[MultiModalDataDict | None], MultiModalUUIDDict | None, ]: + model_config = renderer_config.model_config + conversation: list[ConversationMessage] = [] - mm_tracker = AsyncMultiModalItemTracker(model_config) + mm_tracker = AsyncMultiModalItemTracker(renderer_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1748,14 +1759,14 @@ def apply_hf_chat_template( chat_template: str | None, tools: list[dict[str, Any]] | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, **kwargs: Any, ) -> str: hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - model_config=model_config, + model_config=renderer_config.model_config, ) if hf_chat_template is None: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 913324fd5f9c3..6b3cb26afb626 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -29,8 +29,8 @@ from vllm.config.model import ( HfOverrides, ModelDType, RunnerOption, - TokenizerMode, ) +from vllm.config.renderer import TokenizerMode from vllm.engine.arg_utils import EngineArgs from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, @@ -343,6 +343,7 @@ class LLM: logger.info("Supported tasks: %s", supported_tasks) self.supported_tasks = supported_tasks + self.renderer_config = self.llm_engine.renderer_config self.model_config = self.llm_engine.model_config self.input_processor = self.llm_engine.input_processor self.io_processor = self.llm_engine.io_processor @@ -808,13 +809,13 @@ class LLM: list_of_messages = [cast(list[ChatCompletionMessageParam], messages)] tokenizer = self.get_tokenizer() - model_config = self.model_config + renderer_config = self.renderer_config resolved_content_format = resolve_chat_template_content_format( chat_template, tools, chat_template_content_format, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) _chat_template_kwargs: dict[str, Any] = dict( @@ -833,7 +834,7 @@ class LLM: # the chat message parsing for it. conversation, mm_data, mm_uuids = parse_chat_messages( msgs, - model_config, + renderer_config, content_format=resolved_content_format, ) @@ -847,7 +848,7 @@ class LLM: prompt_str = apply_hf_chat_template( tokenizer=tokenizer, conversation=conversation, - model_config=model_config, + renderer_config=renderer_config, **_chat_template_kwargs, ) # Special tokens are already included in chat templates so @@ -1290,6 +1291,7 @@ class LLM: lora_request: list[LoRARequest] | LoRARequest | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ScoringRequestOutput]: + renderer_config = self.renderer_config model_config = self.model_config if isinstance(tokenizer, MistralTokenizer): @@ -1317,7 +1319,7 @@ class LLM: for q, d in input_pairs: _, engine_prompt = get_score_prompt( - model_config=model_config, + renderer_config=renderer_config, data_1=q, data_2=d, tokenizer=tokenizer, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 7be601d824f34..d77d611a2654d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1099,7 +1099,7 @@ async def init_app_state( logger.info("Supported tasks: %s", supported_tasks) resolved_chat_template = await process_chat_template( - args.chat_template, engine_client, vllm_config.model_config + args.chat_template, engine_client, vllm_config.renderer_config ) if args.tool_server == "demo": diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 3e421e21e3e80..a9e72fb00c5bd 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: tokenizer = await self.engine_client.get_tokenizer() diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 99936f588f28b..d887cf48d89f9 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -291,6 +291,7 @@ class OpenAIServing: self.input_processor = self.models.input_processor self.io_processor = self.models.io_processor + self.renderer_config = self.models.renderer_config self.model_config = self.models.model_config self.max_model_len = self.model_config.max_model_len @@ -1100,18 +1101,18 @@ class OpenAIServing: Sequence[RequestPrompt], list[EngineTokensPrompt], ]: - model_config = self.model_config + renderer_config = self.renderer_config resolved_content_format = resolve_chat_template_content_format( chat_template, tool_dicts, chat_template_content_format, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, - model_config, + renderer_config, content_format=resolved_content_format, ) @@ -1138,14 +1139,14 @@ class OpenAIServing: request_prompt = tokenizer.apply_chat_template( conversation=conversation, messages=messages, - model_config=model_config, + model_config=renderer_config.model_config, **_chat_template_kwargs, ) else: request_prompt = apply_hf_chat_template( tokenizer=tokenizer, conversation=conversation, - model_config=model_config, + renderer_config=renderer_config, **_chat_template_kwargs, ) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 953398a9a72ae..ec65e659383d8 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -71,6 +71,7 @@ class OpenAIServingModels: self.input_processor = self.engine_client.input_processor self.io_processor = self.engine_client.io_processor + self.renderer_config = self.engine_client.renderer_config self.model_config = self.engine_client.model_config self.max_model_len = self.model_config.max_model_len diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index cea9924ebbaca..5fd79eed1909b 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing): self.task_type = task_type self.asr_config = self.model_cls.get_speech_to_text_config( - self.model_config, task_type + self.renderer_config, task_type ) self.enable_force_include_usage = enable_force_include_usage @@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing): self.tokenizer = cast( PreTrainedTokenizerBase, get_tokenizer( - tokenizer_name=self.model_config.tokenizer, - tokenizer_mode=self.model_config.tokenizer_mode, + tokenizer_name=self.renderer_config.tokenizer, + tokenizer_mode=self.renderer_config.tokenizer_mode, ), ) @@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing): prompt = self.model_cls.get_generation_prompt( audio=chunk, stt_config=self.asr_config, - model_config=self.model_config, + renderer_config=self.renderer_config, language=language, task_type=self.task_type, request_prompt=request.prompt, @@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing): if res.prompt_token_ids is not None: num_prompt_tokens = len(res.prompt_token_ids) if audio_tokens := self.model_cls.get_num_audio_tokens( - audio_duration_s, self.asr_config, self.model_config + audio_duration_s, self.asr_config, self.renderer_config ): num_prompt_tokens += audio_tokens diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 7fb767e26d019..cd28ccba9ef95 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: tokenizer = await self.engine_client.get_tokenizer() diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index e5a66783005a6..f657fcefd3a86 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -160,10 +160,8 @@ class ServingScores(OpenAIServing): data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, ) -> tuple[str, TokensPrompt]: - model_config = self.model_config - full_prompt, engine_prompt = get_score_prompt( - model_config=model_config, + renderer_config=self.renderer_config, data_1=data_1, data_2=data_2, tokenizer=tokenizer, diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 072ddd4c90b16..561adbe454f3c 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast from torch.nn import CosineSimilarity from typing_extensions import Required, TypedDict -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.chat_utils import ( BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam, @@ -88,9 +88,9 @@ def _validate_score_input_lens( def parse_score_data( data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> tuple[str, str, MultiModalDataDict | None]: - mm_tracker = MultiModalItemTracker(model_config) + mm_tracker = MultiModalItemTracker(renderer_config) content_1 = _parse_score_content(data_1, mm_tracker) content_2 = _parse_score_content(data_2, mm_tracker) @@ -176,7 +176,7 @@ def post_process_tokens( def get_score_prompt( - model_config: ModelConfig, + renderer_config: RendererConfig, tokenizer: TokenizerLike, tokenization_kwargs: dict[str, Any], data_1: str | ScoreContentPartParam, @@ -185,11 +185,14 @@ def get_score_prompt( prompt_1, prompt_2, mm_data = parse_score_data( data_1, data_2, - model_config, + renderer_config, ) + from vllm.model_executor.model_loader import get_model_cls + model_config = renderer_config.model_config model = get_model_cls(model_config) + if supports_score_template(model): full_prompt = apply_score_template(model_config, prompt_1, prompt_2) prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index daeeb995bc749..a81f73ac9e618 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -13,7 +13,7 @@ from fastapi import Request from fastapi.responses import JSONResponse, StreamingResponse from starlette.background import BackgroundTask, BackgroundTasks -from vllm.config import ModelConfig +from vllm.config import RendererConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ( @@ -288,7 +288,7 @@ def process_lora_modules( async def process_chat_template( args_chat_template: Path | str | None, engine_client: EngineClient, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> str | None: resolved_chat_template = load_chat_template(args_chat_template) if resolved_chat_template is not None: @@ -305,7 +305,7 @@ async def process_chat_template( tokenizer=tokenizer, chat_template=None, tools=None, - model_config=model_config, + model_config=renderer_config.model_config, ) if hf_chat_template != resolved_chat_template: @@ -314,6 +314,6 @@ async def process_chat_template( "It is different from official chat template '%s'. " "This discrepancy may lead to performance degradation.", resolved_chat_template, - model_config.model, + renderer_config.model_config.model, ) return resolved_chat_template diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 0372b06d0017f..f534d102fc3b7 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -6,7 +6,7 @@ from typing import Any, cast from typing_extensions import assert_never -from vllm.config import ModelConfig +from vllm.config import RendererConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -45,14 +45,15 @@ logger = init_logger(__name__) class InputPreprocessor: def __init__( self, - model_config: ModelConfig, + renderer_config: RendererConfig, tokenizer: TokenizerLike | None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: BaseMultiModalProcessorCache | None = None, ) -> None: super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config + self.model_config = renderer_config.model_config self.tokenizer = tokenizer self.mm_registry = mm_registry self.mm_processor_cache = mm_processor_cache @@ -231,7 +232,7 @@ class InputPreprocessor: def _get_mm_processor(self) -> BaseMultiModalProcessor: if not hasattr(self, "_mm_processor"): self._mm_processor = self.mm_registry.create_processor( - self.model_config, + self.renderer_config, tokenizer=self.tokenizer, cache=self.mm_processor_cache, ) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 007d847ac3b7b..a2700bd5a5016 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax( from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader - model_config = model.vllm_config.model_config + renderer_config = model.vllm_config.renderer_config quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() @@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax( from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( - model_config.tokenizer, - revision=model_config.tokenizer_revision, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + revision=renderer_config.tokenizer_revision, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, ) false_id = tokenizer.convert_tokens_to_ids(tokens[0]) @@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader - model_config = model.vllm_config.model_config + renderer_config = model.vllm_config.renderer_config quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() @@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( - model_config.tokenizer, - revision=model_config.tokenizer_revision, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + revision=renderer_config.tokenizer_revision, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, ) token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens] diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 1f07381c0cbd0..bd472474982ff 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.projector_config = config.projector_config self.text_config = config.text_config - model_config = vllm_config.model_config - tokenizer = cached_tokenizer_from_config(model_config) + renderer_config = vllm_config.renderer_config + tokenizer = cached_tokenizer_from_config(renderer_config) self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN] self.sam_model = build_sam_vit_b() diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 9f8faf9ed91ce..be03e1df81d22 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.projector_config = config.projector_config self.text_config = config.text_config - model_config = vllm_config.model_config - tokenizer = cached_tokenizer_from_config(model_config) + renderer_config = vllm_config.renderer_config + tokenizer = cached_tokenizer_from_config(renderer_config) self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN] self.vision = self._init_vision_module( diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 7036118ada084..f82529d849a64 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -18,7 +18,7 @@ from transformers.models.gemma3n import ( ) from transformers.models.siglip import SiglipImageProcessorFast -from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration( cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: Optional[str], task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -798,7 +798,9 @@ class Gemma3nForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: return SpeechToTextConfig( # Let's set this to 30 as suggested in the docs for now, although diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index a4e50f4086281..96645f20b79df 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -34,7 +34,7 @@ import torch.nn.functional as F from torch import nn from transformers import BatchFeature, PretrainedConfig -from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear @@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, + renderer_config: RendererConfig, stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], @@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration( else: raise ValueError(f"Unsupported task type {task_type}") - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) chat = [dict(role="user", content=user_prompt)] prompt = tokenizer.apply_chat_template( chat, @@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """Get the number of audio tokens for an audio duration in sec.""" - processor = cached_processor_from_config(model_config) + processor = cached_processor_from_config(renderer_config) hop_length = processor.audio_processor.melspec_kwargs["hop_length"] proj_win_size = processor.audio_processor.projector_window_size ds_rate = processor.audio_processor.projector_downsample_rate @@ -903,7 +903,9 @@ class GraniteSpeechForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: """Get the stt config for this model.""" # Default settings are reasonable for this model and we don't currently diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 2aba626a7c737..b9f3ac8aee5f7 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -6,7 +6,7 @@ import numpy as np import torch import torch.nn as nn -from vllm.config import ModelConfig, VllmConfig +from vllm.config import RendererConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.pooler import ( DispatchPooler, @@ -29,12 +29,12 @@ logger = init_logger(__name__) class GritLMMeanPool(nn.Module): """As `MeanPool`, but only includes non-instruction tokens.""" - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config - tokenizer = cached_tokenizer_from_config(self.model_config) + tokenizer = cached_tokenizer_from_config(self.renderer_config) # Collect the tokens needed for pattern matching. # "▁<" is different from "_<". The former uses "▁" to indicate that @@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module): class GritLMPooler(Pooler): - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self.pooling = GritLMMeanPool(model_config) + self.pooling = GritLMMeanPool(renderer_config) self.head = PoolerHead(PoolerNormalize()) def get_supported_tasks(self) -> Set[PoolingTask]: @@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM): self.pooler = DispatchPooler( { "token_embed": Pooler.for_token_embed(pooler_config), - "embed": GritLMPooler(vllm_config.model_config), + "embed": GritLMPooler(vllm_config.renderer_config), } ) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 607ff55835f1d..4df91aaf8b7f2 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -19,7 +19,7 @@ from torch import Tensor from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs -from vllm.config import ModelConfig, SpeechToTextConfig +from vllm.config import RendererConfig, SpeechToTextConfig from vllm.inputs import TokensPrompt from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -887,7 +887,7 @@ class SupportsTranscription(Protocol): cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -930,7 +930,9 @@ class SupportsTranscription(Protocol): @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"] + cls, + renderer_config: RendererConfig, + task_type: Literal["transcribe", "translate"], ) -> SpeechToTextConfig: """Get the speech to text config for the ASR model.""" ... @@ -940,7 +942,7 @@ class SupportsTranscription(Protocol): cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """ Map from audio duration to number of audio tokens produced by the ASR diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 18985cefbf5ea..d75637da1d0e2 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs) hf_processor.video_processor = cached_video_processor_from_config( - self.ctx.model_config, + self.ctx.renderer_config, processor_cls=InternVLVideoProcessor, size=hf_processor.image_processor.size, **kwargs, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 6dfab595e5b92..4daaefd0c2709 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1169,16 +1169,17 @@ class NemotronH_Nano_VL_V2( self.mlp1 = self.mlp1.to(self.language_model.config.dtype) self.config = config - self.model_config = vllm_config.model_config # Pre-tokenize special tokens for video processing # to avoid repeated tokenization - tokenizer = cached_tokenizer_from_config(vllm_config.model_config) - self._img_start_token_ids = tokenizer.encode( + self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config) + self._img_start_token_ids = self._tokenizer.encode( IMG_START, add_special_tokens=False ) - self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False) - self._img_context_token_ids = tokenizer.encode( + self._img_end_token_ids = self._tokenizer.encode( + IMG_END, add_special_tokens=False + ) + self._img_context_token_ids = self._tokenizer.encode( IMG_CONTEXT, add_special_tokens=False ) @@ -1364,7 +1365,7 @@ class NemotronH_Nano_VL_V2( input_embeds for the LLM. """ device = video_embeddings.device - tokenizer = cached_tokenizer_from_config(self.model_config) + tokenizer = self._tokenizer # Generate video replacement token IDs using get_video_repl # This tokenizes each frame separator independently, then uses pre-tokenized diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 391980fc61f9e..797793e658f7b 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): def get_image_processor(self, **kwargs: object): return cached_image_processor_from_config( - self.ctx.model_config, + self.ctx.renderer_config, **kwargs, ) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index faf2d80d24bba..ebe743fa82a00 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -193,7 +193,7 @@ class PixtralProcessorAdapter: class PixtralProcessingInfo(BaseProcessingInfo): def get_tokenizer(self) -> MistralTokenizer: - tokenizer = cached_tokenizer_from_config(self.ctx.model_config) + tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config) if not isinstance(tokenizer, MistralTokenizer): raise ValueError("This model requires `--tokenizer-mode mistral`") diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 7b408248ec74c..0acd564e2e54f 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder from transformers import BatchFeature, TensorType, WhisperConfig from transformers.tokenization_utils_base import TextInput -from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -176,7 +176,7 @@ class VoxtralProcessorAdapter: class VoxtralProcessingInfo(BaseProcessingInfo): def get_tokenizer(self) -> MistralTokenizer: - tokenizer = cached_tokenizer_from_config(self.ctx.model_config) + tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config) if not isinstance(tokenizer, MistralTokenizer): raise ValueError("This model requires `--tokenizer-mode mistral`") @@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration( def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) + self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config) # update quant config to so that ignored module and target module names # match the vLLM model names @@ -450,9 +450,11 @@ class VoxtralForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) audio_config = tokenizer.instruct.audio_encoder.audio_config max_audio_clip_s = audio_config.chunk_length_s sample_rate = audio_config.sampling_rate @@ -468,17 +470,17 @@ class VoxtralForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, + renderer_config: RendererConfig, # not needed here stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, ) -> PromptType: - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless req = TranscriptionRequest( - model=model_config.model, + model=renderer_config.model_config.model, audio=RawAudio.from_audio(audio), language=language, ) @@ -494,14 +496,14 @@ class VoxtralForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """ Map from audio duration to number of audio tokens produced by the ASR model, without running a forward pass. This is used for estimating the amount of processing for this audio. """ - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) adapter = VoxtralProcessorAdapter(tokenizer) return adapter.get_num_audio_tokens( int(audio_duration_s * stt_config.sample_rate) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index b2feff1335151..6f526e3956fff 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention, MultiHeadAttention from vllm.attention.layers.cross_attention import CrossAttention -from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType @@ -811,7 +811,7 @@ class WhisperForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, # not needed here + renderer_config: RendererConfig, # not needed here stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], @@ -847,9 +847,11 @@ class WhisperForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: - processor = cached_processor_from_config(model_config) + processor = cached_processor_from_config(renderer_config) return SpeechToTextConfig( max_audio_clip_s=processor.feature_extractor.chunk_length, @@ -861,9 +863,9 @@ class WhisperForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: - processor = cached_processor_from_config(model_config) + processor = cached_processor_from_config(renderer_config) hop_length = processor.feature_extractor.hop_length assert hop_length is not None # NOTE(NickLucche) user can't pass encoder diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 67bdf5e1557f9..9c838fe679582 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -31,7 +31,7 @@ from .inputs import ( ) if TYPE_CHECKING: - from vllm.config import ModelConfig, VllmConfig + from vllm.config import ModelConfig, RendererConfig, VllmConfig from .processing import ResolvedPromptUpdate from .registry import MultiModalRegistry @@ -561,13 +561,13 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache): def _enable_processor_cache( - model_config: "ModelConfig", + renderer_config: "RendererConfig", mm_registry: "MultiModalRegistry", ) -> bool: - if not mm_registry.supports_multimodal_inputs(model_config): + if not mm_registry.supports_multimodal_inputs(renderer_config): return False - mm_config = model_config.get_multimodal_config() + mm_config = renderer_config.model_config.get_multimodal_config() return mm_config.mm_processor_cache_gb > 0 @@ -599,7 +599,7 @@ def processor_cache_from_config( """Return a `BaseMultiModalProcessorCache`, if enabled.""" model_config = vllm_config.model_config - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): @@ -611,14 +611,14 @@ def processor_cache_from_config( def processor_only_cache_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", mm_registry: "MultiModalRegistry", ): """Return a `MultiModalProcessorOnlyCache`, if enabled.""" - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(renderer_config, mm_registry): return None - return MultiModalProcessorOnlyCache(model_config) + return MultiModalProcessorOnlyCache(renderer_config.model_config) class BaseMultiModalReceiverCache( @@ -787,7 +787,7 @@ def engine_receiver_cache_from_config( """ model_config = vllm_config.model_config - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): @@ -809,9 +809,7 @@ def worker_receiver_cache_from_config( Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and mm_processor_cache_type=="shm". """ - model_config = vllm_config.model_config - - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0390773783961..81ceb76a4b961 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -23,7 +23,7 @@ import torch from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.collection_utils import flatten_2d_lists, full_groupby from vllm.utils.func_utils import get_allowed_kwarg_only_overrides @@ -53,7 +53,7 @@ if TYPE_CHECKING: from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig from .cache import BaseMultiModalProcessorCache from .profiling import BaseDummyInputsBuilder @@ -63,6 +63,7 @@ else: ProcessorMixin = object ModelConfig = object + RendererConfig = object BaseMultiModalProcessorCache = object @@ -945,12 +946,29 @@ class InputProcessingContext: modify the inputs. """ - model_config: ModelConfig - """The configuration of the model.""" + renderer_config: RendererConfig + """The configuration of the renderer.""" tokenizer: TokenizerLike | None """The tokenizer used to tokenize the inputs.""" + @classmethod + def from_config( + cls, + renderer_config: RendererConfig, + *, + tokenizer: TokenizerLike | None = None, + ): + if tokenizer is None and not renderer_config.skip_tokenizer_init: + tokenizer = cached_tokenizer_from_config(renderer_config) + + return cls(renderer_config, tokenizer) + + @property + def model_config(self) -> ModelConfig: + """The configuration of the model.""" + return self.renderer_config.model_config + def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( @@ -1047,7 +1065,7 @@ class InputProcessingContext: typ = ProcessorMixin return cached_processor_from_config( - self.model_config, + self.renderer_config, processor_cls=typ, tokenizer=self.tokenizer, **kwargs, diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 00a84f9dec4f7..e49aaa5045c62 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config +from vllm.tokenizers import TokenizerLike from .cache import BaseMultiModalProcessorCache from .processing import ( @@ -22,7 +22,7 @@ from .profiling import ( ) if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig from vllm.model_executor.models.interfaces import SupportsMultiModal logger = init_logger(__name__) @@ -114,17 +114,18 @@ class MultiModalRegistry: return mm_options if len(mm_options) > 0 else None - def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: + def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool: """ Checks if the model supports multimodal inputs. Returns True if the model is multimodal with any non-zero supported modalities, otherwise returns False, effectively running in text-only mode. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return False - info = self._create_processing_info(model_config, tokenizer=None) + info = self._create_processing_info(renderer_config, tokenizer=None) supported_modalities = info.get_supported_mm_limits() mm_config = model_config.get_multimodal_config() @@ -144,7 +145,7 @@ class MultiModalRegistry: def get_max_tokens_per_item_by_modality( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, cache: BaseMultiModalProcessorCache | None = None, profiler_limits: Mapping[str, int] | None = None, @@ -153,10 +154,11 @@ class MultiModalRegistry: Get the maximum number of tokens per data item from each modality based on underlying model configuration. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len @@ -171,7 +173,7 @@ class MultiModalRegistry: def get_mm_limits_per_prompt( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, cache: BaseMultiModalProcessorCache | None = None, ) -> Mapping[str, int]: @@ -179,10 +181,11 @@ class MultiModalRegistry: Get the maximum number of multi-modal input instances for each modality that are allowed per prompt for a model class. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -228,30 +231,21 @@ class MultiModalRegistry: assert hasattr(model_cls, "_processor_factory") return cast("SupportsMultiModal", model_cls) - def _create_processing_ctx( - self, - model_config: "ModelConfig", - tokenizer: TokenizerLike | None = None, - ) -> InputProcessingContext: - if tokenizer is None and not model_config.skip_tokenizer_init: - tokenizer = cached_tokenizer_from_config(model_config) - - return InputProcessingContext(model_config, tokenizer) - def _create_processing_info( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, tokenizer: TokenizerLike | None = None, ) -> BaseProcessingInfo: - model_cls = self._get_model_cls(model_config) + model_cls = self._get_model_cls(renderer_config.model_config) factories = model_cls._processor_factory - ctx = self._create_processing_ctx(model_config, tokenizer) + + ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer) return factories.info(ctx) def create_processor( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, tokenizer: TokenizerLike | None = None, cache: BaseMultiModalProcessorCache | None = None, @@ -259,19 +253,19 @@ class MultiModalRegistry: """ Create a multi-modal processor for a specific model and tokenizer. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") model_cls = self._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = self._create_processing_ctx(model_config, tokenizer) - + ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer) return factories.build_processor(ctx, cache=cache) def get_decoder_dummy_data( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", seq_len: int, mm_counts: Mapping[str, int] | None = None, *, @@ -280,15 +274,15 @@ class MultiModalRegistry: """ Create dummy data for profiling the memory usage of a model. - The model is identified by `model_config`. + The model is identified by `renderer_config`. """ - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options = self._extract_mm_options(model_config) + mm_options = self._extract_mm_options(renderer_config.model_config) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options) @@ -304,7 +298,7 @@ class MultiModalRegistry: def get_encoder_dummy_data( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", seq_len: int, mm_counts: Mapping[str, int] | None = None, *, @@ -313,15 +307,15 @@ class MultiModalRegistry: """ Create dummy data for profiling the memory usage of a model. - The model is identified by `model_config`. + The model is identified by `renderer_config`. """ - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options = self._extract_mm_options(model_config) + mm_options = self._extract_mm_options(renderer_config.model_config) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options) @@ -336,13 +330,15 @@ class MultiModalRegistry: return dummy_data - def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: + def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int: """ Get the maximum length of the encoder input for encoder-decoder models. """ + model_config = renderer_config.model_config if not model_config.is_encoder_decoder: return 0 - max_tokens = self.get_max_tokens_per_item_by_modality(model_config) + + max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config) if not max_tokens: # TODO - this function assumes encoder-decoder models are # multimodal. This will need to change when adding support for more diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index 1d44feeee500f..c9575511af8c9 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -24,7 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname from .protocol import TokenizerLike if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import RendererConfig logger = init_logger(__name__) @@ -205,18 +205,18 @@ def get_tokenizer( cached_get_tokenizer = lru_cache(get_tokenizer) -def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs): +def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs): return cached_get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - revision=model_config.tokenizer_revision, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + tokenizer_mode=renderer_config.tokenizer_mode, + revision=renderer_config.tokenizer_revision, + trust_remote_code=renderer_config.trust_remote_code, **kwargs, ) -def init_tokenizer_from_config(model_config: "ModelConfig"): - runner_type = model_config.runner_type +def init_tokenizer_from_config(renderer_config: "RendererConfig"): + runner_type = renderer_config.model_config.runner_type if runner_type == "generate" or runner_type == "draft": truncation_side = "left" elif runner_type == "pooling": @@ -225,9 +225,9 @@ def init_tokenizer_from_config(model_config: "ModelConfig"): assert_never(runner_type) return get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision, + renderer_config.tokenizer, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, + revision=renderer_config.tokenizer_revision, truncation_side=truncation_side, ) diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index e9864b0c1531d..bdebd2686bae2 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -23,7 +23,7 @@ from vllm.transformers_utils.utils import convert_model_repo_to_path from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) _V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor) @@ -233,17 +233,18 @@ def cached_get_processor_without_dynamic_kwargs( def cached_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, **kwargs: Any, ) -> _P: + model_config = renderer_config.model_config if is_gguf(model_config.model): - assert not is_gguf(model_config.tokenizer), ( + assert not is_gguf(renderer_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load processor." ) - model = model_config.tokenizer - revision = model_config.tokenizer_revision + model = renderer_config.tokenizer + revision = renderer_config.tokenizer_revision else: model = model_config.model revision = model_config.revision @@ -297,9 +298,11 @@ cached_get_feature_extractor = lru_cache(get_feature_extractor) def cached_feature_extractor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", **kwargs: Any, ): + model_config = renderer_config.model_config + return cached_get_feature_extractor( model_config.model, revision=model_config.revision, @@ -348,16 +351,17 @@ cached_get_image_processor = lru_cache(get_image_processor) def cached_image_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", **kwargs: Any, ): + model_config = renderer_config.model_config if is_gguf(model_config.model): - assert not is_gguf(model_config.tokenizer), ( + assert not is_gguf(renderer_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load image processor." ) - model = model_config.tokenizer - revision = model_config.tokenizer_revision + model = renderer_config.tokenizer + revision = renderer_config.tokenizer_revision else: model = model_config.model revision = model_config.revision @@ -411,10 +415,12 @@ cached_get_video_processor = lru_cache(get_video_processor) def cached_video_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", processor_cls: type[_V] | None = None, **kwargs: Any, ): + model_config = renderer_config.model_config + return cached_get_video_processor( model_config.model, revision=model_config.revision, diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 3959e9a59a53b..21315b85f22aa 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -10,7 +10,7 @@ from vllm.multimodal import MultiModalRegistry from vllm.v1.request import Request if TYPE_CHECKING: - from vllm.config import ModelConfig, SchedulerConfig + from vllm.config import RendererConfig, SchedulerConfig logger = init_logger(__name__) @@ -250,7 +250,7 @@ class EncoderCacheManager: def compute_encoder_budget( - model_config: "ModelConfig", + renderer_config: "RendererConfig", scheduler_config: "SchedulerConfig", mm_registry: MultiModalRegistry, ) -> tuple[int, int]: @@ -263,9 +263,9 @@ def compute_encoder_budget( - Space budget for encoder cache size, measured in number of tokens from the input sequence. """ - if mm_registry.supports_multimodal_inputs(model_config): + if mm_registry.supports_multimodal_inputs(renderer_config): max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( - model_config + renderer_config ) return compute_mm_encoder_budget( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 0a8efa2fd512f..96073efc5f56a 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -164,7 +164,7 @@ class Scheduler(SchedulerInterface): # This can be changed when we make encoder cache for embedding caching # across requests. encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - model_config=vllm_config.model_config, + renderer_config=vllm_config.renderer_config, scheduler_config=vllm_config.scheduler_config, mm_registry=mm_registry, ) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fd7e04dc02082..b76f9c0595d61 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -91,6 +91,7 @@ class AsyncLLM(EngineClient): # Ensure we can serialize custom transformer configs maybe_register_config_serialize_by_value() + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config @@ -108,15 +109,15 @@ class AsyncLLM(EngineClient): "enabling logging without default stat loggers." ) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.model_config) + tokenizer = init_tokenizer_from_config(self.renderer_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( self.vllm_config, - self.model_config.io_processor_plugin, + self.renderer_config.io_processor_plugin, ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index e6a94f4e3de5d..a2f6ba5be8c17 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -43,6 +43,7 @@ class InputProcessor: mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config @@ -54,7 +55,7 @@ class InputProcessor: self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry) self.input_preprocessor = InputPreprocessor( - self.model_config, + self.renderer_config, tokenizer, mm_registry, mm_processor_cache=self.mm_processor_cache, @@ -252,7 +253,7 @@ class InputProcessor: if not params.structured_outputs or not self.structured_outputs_config: return - if self.model_config.skip_tokenizer_init and params.structured_outputs: + if self.renderer_config.skip_tokenizer_init and params.structured_outputs: raise ValueError( "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) @@ -582,7 +583,7 @@ class InputProcessor: if prompt_type == "encoder" and model_config.is_multimodal_model: mm_registry = self.input_preprocessor.mm_registry mm_processor = mm_registry.create_processor( - model_config, + self.renderer_config, tokenizer=tokenizer, ) assert isinstance(mm_processor, EncDecMultiModalProcessor) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4c31291005477..ba0e1cf25cb08 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -60,6 +60,7 @@ class LLMEngine: ) -> None: self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -83,15 +84,15 @@ class LLMEngine: self.dp_group = None self.should_execute_dummy_batch = False - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.model_config) + tokenizer = init_tokenizer_from_config(self.renderer_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( self.vllm_config, - self.model_config.io_processor_plugin, + self.renderer_config.io_processor_plugin, ) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 31428db2d3afc..7976418510aab 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -85,7 +85,7 @@ class EagleProposer: # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - vllm_config.model_config + vllm_config.renderer_config ) self.attn_metadata_builder: AttentionMetadataBuilder | None = None diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 5ee88178cdf60..36aa3d9bb3f94 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -63,7 +63,7 @@ class StructuredOutputManager: max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8)) self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers) - if not self.vllm_config.model_config.skip_tokenizer_init: + if not vllm_config.renderer_config.skip_tokenizer_init: # The default max_workers if not specified is the number of # CPUs * 5, which is way too high since these tasks are CPU-bound, # not I/O bound. We also know we would never dominate CPU usage @@ -71,21 +71,15 @@ class StructuredOutputManager: # of CPUs. max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) - self.tokenizer = init_tokenizer_from_config( - model_config=self.vllm_config.model_config - ) - reasoning_parser = ( - self.vllm_config.structured_outputs_config.reasoning_parser - ) + self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config) + reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser reasoning_parser_plugin = ( - self.vllm_config.structured_outputs_config.reasoning_parser_plugin + vllm_config.structured_outputs_config.reasoning_parser_plugin ) if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3: ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin) - reasoning_parser = ( - self.vllm_config.structured_outputs_config.reasoning_parser - ) + reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser if reasoning_parser: reasoner_cls = ReasoningParserManager.get_reasoning_parser( reasoning_parser @@ -93,7 +87,7 @@ class StructuredOutputManager: self.reasoner = reasoner_cls(tokenizer=self.tokenizer) self.enable_in_reasoning = ( - self.vllm_config.structured_outputs_config.enable_in_reasoning + vllm_config.structured_outputs_config.enable_in_reasoning ) def grammar_init(self, request: Request) -> None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a50360ab08694..b3c8d4da22b63 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -271,6 +271,7 @@ class GPUModelRunner( device: torch.device, ): self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.compilation_config = vllm_config.compilation_config @@ -335,7 +336,7 @@ class GPUModelRunner( self.uses_mrope = model_config.uses_mrope self.uses_xdrope_dim = model_config.uses_xdrope_dim self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - model_config + self.renderer_config ) if self.model_config.is_encoder_decoder: @@ -558,7 +559,7 @@ class GPUModelRunner( self.mm_budget = ( MultiModalBudget( - self.model_config, + self.renderer_config, self.scheduler_config, self.mm_registry, ) @@ -3873,7 +3874,7 @@ class GPUModelRunner( assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, + renderer_config=self.renderer_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 283f21b779e38..7e2a6af68e6af 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -143,6 +143,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): original_parallel_config: ParallelConfig | None = None, ): self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config @@ -222,7 +223,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - model_config + self.renderer_config ) # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." @@ -353,7 +354,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_budget = ( MultiModalBudget( - self.model_config, + self.renderer_config, self.scheduler_config, self.mm_registry, ) @@ -2038,7 +2039,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, + renderer_config=self.renderer_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 0b0e2006d73d2..44418b9985e2e 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,7 +7,7 @@ import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import RendererConfig, SchedulerConfig, VllmConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config @@ -23,24 +23,29 @@ class MultiModalBudget: def __init__( self, - model_config: ModelConfig, + renderer_config: RendererConfig, scheduler_config: SchedulerConfig, mm_registry: MultiModalRegistry, ) -> None: super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config + self.model_config = renderer_config.model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry - self.cache = cache = processor_only_cache_from_config(model_config, mm_registry) + self.cache = cache = processor_only_cache_from_config( + renderer_config, mm_registry + ) - self.max_model_len = model_config.max_model_len + self.max_model_len = self.model_config.max_model_len self.max_num_reqs = scheduler_config.max_num_seqs - self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache) + self.mm_limits = mm_registry.get_mm_limits_per_prompt( + renderer_config, cache=cache + ) max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( - model_config, + renderer_config, cache=cache, profiler_limits=self.mm_limits, ) From e83b7e379c11bf136c1b96bc6a67b6d2207cfde4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 7 Dec 2025 16:00:22 +0800 Subject: [PATCH 047/133] Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199) --- docs/contributing/model/transcription.md | 12 +- .../distributed/test_sequence_parallelism.py | 2 - tests/compile/test_functionalization.py | 6 +- tests/compile/test_fusion.py | 6 +- tests/compile/test_fusion_attn.py | 2 - tests/compile/test_pass_manager.py | 8 +- tests/compile/test_qk_norm_rope_fusion.py | 5 +- tests/distributed/test_kvlayout.py | 3 - .../entrypoints/openai/test_chat_template.py | 22 +- .../entrypoints/openai/test_lora_resolvers.py | 21 +- tests/entrypoints/openai/test_serving_chat.py | 28 +-- .../entrypoints/openai/test_serving_engine.py | 8 +- .../entrypoints/openai/test_serving_models.py | 8 +- tests/entrypoints/test_chat_utils.py | 190 +++++++++++------- tests/lora/test_lora_manager.py | 14 +- tests/lora/test_worker.py | 2 - .../test_model_load_with_params.py | 22 +- tests/models/language/pooling/test_gritlm.py | 5 +- .../multimodal/processing/test_common.py | 22 +- .../multimodal/processing/test_glm4_1v.py | 4 +- .../multimodal/processing/test_h2ovl.py | 2 +- .../multimodal/processing/test_idefics3.py | 2 +- .../multimodal/processing/test_internvl.py | 2 +- .../multimodal/processing/test_llama4.py | 2 +- .../multimodal/processing/test_llava_next.py | 6 +- .../processing/test_llava_onevision.py | 6 +- .../processing/test_minimax_vl_01.py | 4 +- .../multimodal/processing/test_mllama4.py | 2 +- .../multimodal/processing/test_nemotron_vl.py | 2 +- .../multimodal/processing/test_phi3v.py | 2 +- .../multimodal/processing/test_phi4mm.py | 2 +- .../multimodal/processing/test_qwen2_vl.py | 2 +- .../multimodal/processing/test_smolvlm.py | 2 +- .../processing/test_tensor_schema.py | 24 ++- .../processing/test_transformers.py | 5 +- tests/models/multimodal/test_mapping.py | 33 ++- tests/models/registry.py | 33 +-- tests/models/utils.py | 17 +- tests/multimodal/test_cache.py | 27 +-- tests/multimodal/test_processing.py | 24 +-- tests/multimodal/test_registry.py | 4 +- tests/test_config.py | 131 +++++------- tests/test_inputs.py | 7 +- tests/v1/attention/utils.py | 2 - tests/v1/core/test_kv_cache_utils.py | 20 +- tests/v1/core/test_scheduler.py | 2 - tests/v1/core/utils.py | 2 - tests/v1/engine/test_engine_core.py | 2 - .../engine/test_process_multi_modal_uuids.py | 24 +-- tests/v1/kv_connector/unit/utils.py | 2 - tests/v1/spec_decode/test_eagle.py | 2 - tests/v1/spec_decode/test_mtp.py | 2 - tests/v1/spec_decode/test_ngram.py | 2 - .../test_backend_guidance.py | 12 +- .../test_reasoning_structured_output.py | 35 ++-- tests/v1/tpu/worker/test_tpu_model_runner.py | 2 - tests/v1/worker/test_gpu_model_runner.py | 3 - vllm/config/__init__.py | 3 - vllm/config/model.py | 141 ++++++++++--- vllm/config/multimodal.py | 4 + vllm/config/renderer.py | 109 ---------- vllm/config/speculative.py | 5 + vllm/config/vllm.py | 25 +-- vllm/engine/arg_utils.py | 99 ++++----- vllm/engine/protocol.py | 3 +- vllm/entrypoints/chat_utils.py | 79 ++++---- vllm/entrypoints/llm.py | 14 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 11 +- vllm/entrypoints/openai/serving_models.py | 1 - vllm/entrypoints/openai/speech_to_text.py | 10 +- vllm/entrypoints/pooling/pooling/serving.py | 2 +- vllm/entrypoints/pooling/score/serving.py | 4 +- vllm/entrypoints/score_utils.py | 13 +- vllm/entrypoints/utils.py | 8 +- vllm/inputs/preprocess.py | 9 +- vllm/model_executor/models/adapters.py | 20 +- vllm/model_executor/models/deepseek_ocr.py | 4 +- vllm/model_executor/models/deepseek_vl2.py | 4 +- vllm/model_executor/models/gemma3n_mm.py | 8 +- vllm/model_executor/models/granite_speech.py | 14 +- vllm/model_executor/models/gritlm.py | 14 +- vllm/model_executor/models/interfaces.py | 10 +- vllm/model_executor/models/interns1.py | 2 +- .../model_executor/models/nano_nemotron_vl.py | 13 +- vllm/model_executor/models/nemotron_vl.py | 2 +- vllm/model_executor/models/pixtral.py | 2 +- vllm/model_executor/models/voxtral.py | 22 +- vllm/model_executor/models/whisper.py | 14 +- vllm/multimodal/cache.py | 22 +- vllm/multimodal/processing.py | 28 +-- vllm/multimodal/registry.py | 64 +++--- vllm/tokenizers/registry.py | 24 +-- vllm/transformers_utils/processor.py | 28 +-- vllm/v1/core/encoder_cache_manager.py | 8 +- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/engine/async_llm.py | 7 +- vllm/v1/engine/input_processor.py | 7 +- vllm/v1/engine/llm_engine.py | 7 +- vllm/v1/spec_decode/eagle.py | 2 +- vllm/v1/structured_output/__init__.py | 18 +- vllm/v1/worker/gpu_model_runner.py | 7 +- vllm/v1/worker/tpu_model_runner.py | 7 +- vllm/v1/worker/utils.py | 19 +- 105 files changed, 797 insertions(+), 969 deletions(-) delete mode 100644 vllm/config/renderer.py diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index c5605789022d4..fca941acd5076 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -22,7 +22,7 @@ Declare supported languages and capabilities: import torch from torch import nn - from vllm.config import RendererConfig, SpeechToTextConfig + from vllm.config import ModelConfig, SpeechToTextConfig from vllm.inputs.data import PromptType from vllm.model_executor.models.interfaces import SupportsTranscription @@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model: @classmethod def get_speech_to_text_config( cls, - renderer_config: RendererConfig, + model_config: ModelConfig, task_type: Literal["transcribe", "translate"], ) -> SpeechToTextConfig: return SpeechToTextConfig( @@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> int | None: # Return None if unknown; otherwise return an estimate. return int(audio_duration_s * stt_config.sample_rate // 320) # example @@ -216,7 +216,7 @@ Relevant server logic: prompt = self.model_cls.get_generation_prompt( audio=chunk, stt_config=self.asr_config, - renderer_config=self.renderer_config, + model_config=self.model_config, language=language, task_type=self.task_type, request_prompt=request.prompt, diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py index 77d3a24d42923..d9fdc3acc3d6f 100644 --- a/tests/compile/distributed/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -17,7 +17,6 @@ from vllm.config import ( DeviceConfig, ModelConfig, PassConfig, - RendererConfig, VllmConfig, get_current_vllm_config, set_current_vllm_config, @@ -277,7 +276,6 @@ def sequence_parallelism_pass_on_test_model( vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), device_config=device_config, compilation_config=compilation_config, ) diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 52d6fd1e5d75e..7585915892700 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -15,7 +15,6 @@ from vllm.config import ( CompilationConfig, ModelConfig, PassConfig, - RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -220,11 +219,8 @@ def test_fix_functionalization( torch.set_default_device("cuda") torch.set_default_dtype(dtype) - model_config = ModelConfig(dtype=dtype) - vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), + model_config=ModelConfig(dtype=dtype), compilation_config=CompilationConfig( custom_ops=["all"], pass_config=PassConfig( diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index bb4ee6b8e3eca..d0ba8385f4a01 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -15,7 +15,6 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, - RendererConfig, VllmConfig, ) from vllm.model_executor.layers.layernorm import RMSNorm @@ -155,11 +154,8 @@ def test_fusion_rmsnorm_quant( custom_ops.append("+rms_norm") if enable_quant_fp8_custom_op: custom_ops.append("+quant_fp8") - - model_config = ModelConfig(dtype=dtype) vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), + model_config=ModelConfig(dtype=dtype), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index f87825db29817..db95dff5e0fc7 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -24,7 +24,6 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, - RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -326,7 +325,6 @@ def test_attention_quant_pattern( ) vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), scheduler_config=SchedulerConfig( max_num_seqs=1024, max_model_len=model_config.max_model_len, diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index c95e9e3ff8ae8..6d0ba6b655031 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -7,7 +7,7 @@ import torch from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.compilation.pass_manager import PostGradPassManager -from vllm.config import ModelConfig, RendererConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig # dummy custom pass that doesn't inherit @@ -43,11 +43,7 @@ class ProperPass(InductorPass): ) def test_pass_manager_uuid(callable): # Some passes need dtype to be set - model_config = ModelConfig(dtype=torch.bfloat16) - config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), - ) + config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) pass_manager = PostGradPassManager() pass_manager.configure(config) diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index 4d109015be48a..e0968ac799256 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -19,7 +19,6 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, - RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -134,10 +133,8 @@ def test_qk_norm_rope_fusion( if enable_rope_custom_op: custom_ops.append("+rotary_embedding") - model_config = ModelConfig(dtype=dtype) vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), + model_config=ModelConfig(dtype=dtype), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py index 0d51a51a50804..b190b2820451b 100644 --- a/tests/distributed/test_kvlayout.py +++ b/tests/distributed/test_kvlayout.py @@ -5,7 +5,6 @@ from vllm.config import ( DeviceConfig, KVTransferConfig, ModelConfig, - RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -48,7 +47,6 @@ def test_get_kv_connector_cache_layout_with_nixl_connector(): vllm_config = VllmConfig( device_config=DeviceConfig("cpu"), model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), kv_transfer_config=kv_transfer_config, ) with set_current_vllm_config(vllm_config): @@ -72,7 +70,6 @@ def test_get_kv_connector_cache_layout_with_multi_connector(): vllm_config = VllmConfig( device_config=DeviceConfig("cpu"), model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), kv_transfer_config=kv_transfer_config, ) with set_current_vllm_config(vllm_config): diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index b050cfdb561cf..77087ac21ea8b 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -3,6 +3,7 @@ import pytest +from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.tokenizers import get_tokenizer @@ -106,11 +107,24 @@ def test_get_gen_prompt( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - renderer_config = model_info.build_renderer_config(model) + model_config = ModelConfig( + model, + tokenizer=model_info.tokenizer or model, + tokenizer_mode=model_info.tokenizer_mode, + trust_remote_code=model_info.trust_remote_code, + revision=model_info.revision, + hf_overrides=model_info.hf_overrides, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype, + ) + # Initialize the tokenizer tokenizer = get_tokenizer( - renderer_config.tokenizer, - trust_remote_code=renderer_config.trust_remote_code, + tokenizer_name=model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, ) template_content = load_chat_template(chat_template=template) @@ -129,7 +143,7 @@ def test_get_gen_prompt( tokenizer=tokenizer, conversation=mock_request.messages, chat_template=mock_request.chat_template or template_content, - renderer_config=renderer_config, + model_config=model_config, tools=None, add_generation_prompt=mock_request.add_generation_prompt, continue_final_message=mock_request.continue_final_message, diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 7310c2610ce3b..ea6b3d812d8fe 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -33,34 +33,26 @@ class MockModelConfig: """Minimal mock ModelConfig for testing.""" model: str = MODEL_NAME + tokenizer: str = MODEL_NAME trust_remote_code: bool = False + tokenizer_mode: str = "auto" max_model_len: int = 100 + tokenizer_revision: str | None = None multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) logits_processors: list[str] | None = None logits_processor_pattern: str | None = None diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" + skip_tokenizer_init: bool = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} -@dataclass -class MockRendererConfig: - """Minimal mock RendererConfig for testing.""" - - model_config: MockModelConfig - - tokenizer: str = MODEL_NAME - tokenizer_mode: str = "auto" - tokenizer_revision: str | None = None - skip_tokenizer_init: bool = False - allowed_local_media_path: str = "" - allowed_media_domains: list[str] | None = None - - class MockLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str @@ -122,7 +114,6 @@ def mock_serving_setup(): mock_engine.add_lora.reset_mock() mock_engine.model_config = MockModelConfig() - mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 9df8f886edd96..9ea65f9fa6e7a 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -346,33 +346,27 @@ class MockHFConfig: class MockModelConfig: task = "generate" runner_type = "generate" + tokenizer = MODEL_NAME trust_remote_code = False + tokenizer_mode = "auto" max_model_len = 100 + tokenizer_revision = None multimodal_config = MultiModalConfig() hf_config = MockHFConfig() logits_processors: list[str] | None = None logits_processor_pattern = None diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + skip_tokenizer_init = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} -@dataclass -class MockRendererConfig: - model_config: MockModelConfig = field(default_factory=MockModelConfig) - - tokenizer = MODEL_NAME - tokenizer_mode = "auto" - tokenizer_revision = None - skip_tokenizer_init = False - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - allowed_local_media_path: str = "" - allowed_media_domains: list[str] | None = None - - def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: models = OpenAIServingModels( engine_client=engine, @@ -405,7 +399,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: @dataclass class MockEngine: model_config: MockModelConfig = field(default_factory=MockModelConfig) - renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig) input_processor: MagicMock = field(default_factory=MagicMock) io_processor: MagicMock = field(default_factory=MagicMock) @@ -436,7 +429,6 @@ async def test_serving_chat_returns_correct_model_name(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() - mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -467,7 +459,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() - mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -501,7 +492,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config - mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -547,7 +537,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config - mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -594,7 +583,6 @@ async def test_serving_chat_could_load_correct_generation_config(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config - mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -641,7 +629,6 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config - mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -675,7 +662,6 @@ async def test_serving_chat_data_parallel_rank_extraction(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() - mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py index 6ab0942b58da8..956a06dc5487c 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_engine.py @@ -7,7 +7,7 @@ from unittest.mock import Mock import pytest -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.tokenizers import MistralTokenizer @@ -19,16 +19,10 @@ def serving() -> OpenAIServing: # Create minimal mocks engine_client = Mock() - model_config = Mock(spec=ModelConfig) model_config.max_model_len = 32768 - - renderer_config = Mock(spec=RendererConfig) - renderer_config.model_config = model_config - models = Mock(spec=OpenAIServingModels) models.model_config = model_config - models.renderer_config = renderer_config models.input_processor = Mock() models.io_processor = Mock() diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index 376df6cfecb9f..b585835a0667a 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -6,7 +6,7 @@ from unittest.mock import MagicMock import pytest -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import ( ErrorResponse, @@ -27,15 +27,9 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( async def _async_serving_models_init() -> OpenAIServingModels: mock_engine_client = MagicMock(spec=EngineClient) # Set the max_model_len attribute to avoid missing attribute - mock_model_config = MagicMock(spec=ModelConfig) mock_model_config.max_model_len = 2048 - - mock_renderer_config = MagicMock(spec=RendererConfig) - mock_renderer_config.model_config = mock_model_config - mock_engine_client.model_config = mock_model_config - mock_engine_client.renderer_config = mock_renderer_config mock_engine_client.input_processor = MagicMock() mock_engine_client.io_processor = MagicMock() diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 7b296eae7c5a2..527322c71ae4b 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import ( _try_extract_ast, apply_mistral_chat_template, @@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system( "content": [{"type": "text", "text": "Who are you?"}], }, ], - RendererConfig(model_config=mistral_model_config), + mistral_model_config, content_format="string", ) assert conversation == [ @@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system( "content": [{"type": "text", "text": "Who are you?"}], }, ], - RendererConfig(model_config=mistral_model_config), + mistral_model_config, content_format="openai", ) assert conversation == [ @@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( ], } ], - RendererConfig(model_config=phi3v_model_config_image_embeds), + phi3v_model_config_image_embeds, content_format="string", ) @@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( ], } ], - RendererConfig(model_config=audio_embeds_model_config), + audio_embeds_model_config, content_format="string", ) @@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string( ], } ], - RendererConfig(model_config=audio_embeds_model_config), + audio_embeds_model_config, content_format="string", ) @@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async( ], } ], - RendererConfig(model_config=audio_embeds_model_config), + audio_embeds_model_config, content_format="string", ) @@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( ], } ], - RendererConfig(model_config=phi3v_model_config_image_embeds), + phi3v_model_config_image_embeds, content_format="string", ) @@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) assert conversation == [ @@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages( ], }, ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages( ], }, ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format( {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "What about this one?"}, ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="openai", ) @@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ], }, ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( ], } ], - RendererConfig(model_config=phi3v_model_config), + phi3v_model_config, content_format="string", ) @@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave( ], } ], - RendererConfig(model_config=phi3v_model_config_mm_interleaved), + phi3v_model_config_mm_interleaved, content_format="string", ) @@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( ], } ], - RendererConfig(model_config=phi3v_model_config_mm_interleaved), + phi3v_model_config_mm_interleaved, content_format="string", ) @@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( ], } ], - RendererConfig(model_config=phi3v_model_config_mm_interleaved), + phi3v_model_config_mm_interleaved, content_format="string", ) @@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( ], }, ], - RendererConfig(model_config=phi3v_model_config_mm_interleaved), + phi3v_model_config_mm_interleaved, content_format="string", ) @@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl ], }, ], - RendererConfig(model_config=phi3v_model_config_mm_interleaved), + phi3v_model_config_mm_interleaved, content_format="string", ) @@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ], }, ], - RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), + qwen25omni_model_config_mm_interleaved, content_format="string", ) @@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl ], }, ], - RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), + qwen25omni_model_config_mm_interleaved, content_format="string", ) @@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes ], }, ], - RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), + qwen25omni_model_config_mm_interleaved, content_format="string", ) @@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message ], }, ], - RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), + qwen25omni_model_config_mm_interleaved, content_format="string", ) @@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( ], } ], - RendererConfig(model_config=phi3v_model_config_mm_interleaved), + phi3v_model_config_mm_interleaved, content_format="string", ) @@ -1945,11 +1945,24 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - renderer_config = model_info.build_renderer_config(model) + model_config = ModelConfig( + model, + tokenizer=model_info.tokenizer or model, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, + hf_overrides=model_info.hf_overrides, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype, + ) + # Build the tokenizer tokenizer = get_tokenizer( - renderer_config.tokenizer, - trust_remote_code=renderer_config.trust_remote_code, + model, + trust_remote_code=model_config.trust_remote_code, ) tools = ( @@ -1972,7 +1985,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): tokenizer, chat_template=None, tools=tools, - model_config=renderer_config.model_config, + model_config=model_config, ) assert isinstance(chat_template, str) @@ -2034,11 +2047,24 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa "enable_thinking": True, } - renderer_config = model_info.build_renderer_config(model) + model_config = ModelConfig( + model, + tokenizer=model_info.tokenizer or model, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, + hf_overrides=model_info.hf_overrides, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype, + ) + # Build the tokenizer tokenizer = get_tokenizer( - renderer_config.tokenizer, - trust_remote_code=renderer_config.trust_remote_code, + model, + trust_remote_code=model_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2046,7 +2072,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa tokenizer, chat_template=None, tools=tools, - model_config=renderer_config.model_config, + model_config=model_config, ) with pytest.raises( ValueError, match="Found unexpected chat template kwargs from request" @@ -2117,11 +2143,23 @@ def test_resolve_content_format_hf_defined(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - renderer_config = model_info.build_renderer_config(model) + model_config = ModelConfig( + model, + tokenizer=model_info.tokenizer or model, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, + hf_overrides=model_info.hf_overrides, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype, + ) tokenizer = get_tokenizer( - renderer_config.tokenizer, - trust_remote_code=renderer_config.trust_remote_code, + model, + trust_remote_code=model_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2129,7 +2167,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): tokenizer, chat_template=None, tools=None, - model_config=renderer_config.model_config, + model_config=model_config, ) assert isinstance(chat_template, str) @@ -2143,7 +2181,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): None, "auto", tokenizer, - renderer_config=renderer_config, + model_config=model_config, ) assert resolved_format == expected_format @@ -2165,11 +2203,23 @@ def test_resolve_content_format_fallbacks(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - renderer_config = model_info.build_renderer_config(model) + model_config = ModelConfig( + model, + tokenizer=model_info.tokenizer or model, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, + hf_overrides=model_info.hf_overrides, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype, + ) tokenizer = get_tokenizer( - renderer_config.tokenizer, - trust_remote_code=renderer_config.trust_remote_code, + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2177,7 +2227,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): tokenizer, chat_template=None, tools=None, - model_config=renderer_config.model_config, + model_config=model_config, ) assert isinstance(chat_template, str) @@ -2191,7 +2241,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): None, "auto", tokenizer, - renderer_config=renderer_config, + model_config=model_config, ) assert resolved_format == expected_format @@ -2222,13 +2272,15 @@ def test_resolve_content_format_fallbacks(model, expected_format): ], ) def test_resolve_content_format_examples(template_path, expected_format): - model = PHI3V_MODEL_ID # Dummy - model_config = ModelConfig(model, trust_remote_code=True) - renderer_config = RendererConfig(model_config=model_config, tokenizer=model) + model_config = ModelConfig( + PHI3V_MODEL_ID, # Dummy + tokenizer=PHI3V_MODEL_ID, # Dummy + trust_remote_code=True, + ) dummy_tokenizer = get_tokenizer( - renderer_config.tokenizer, - trust_remote_code=renderer_config.trust_remote_code, + PHI3V_MODEL_ID, # Dummy + trust_remote_code=model_config.trust_remote_code, ) dummy_tokenizer.chat_template = None @@ -2245,7 +2297,7 @@ def test_resolve_content_format_examples(template_path, expected_format): None, "auto", dummy_tokenizer, - renderer_config=renderer_config, + model_config=model_config, ) assert resolved_format == expected_format @@ -2280,7 +2332,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config): conversation_with_thinking, _, _ = parse_chat_messages( messages, - RendererConfig(model_config=mistral_model_config), + mistral_model_config, content_format="openai", ) @@ -2380,7 +2432,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( ], } ], - RendererConfig(model_config=qwen2_audio_model_config), + qwen2_audio_model_config, content_format="string", ) @@ -2414,7 +2466,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ], } ], - RendererConfig(model_config=qwen2_audio_model_config), + qwen2_audio_model_config, content_format="string", ) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 7158120fc0217..081f14d6fabfb 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -8,7 +8,7 @@ import torch from safetensors.torch import load_file from torch import nn -from vllm.config import ModelConfig, RendererConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.lora.layers import ( ColumnParallelLinearWithLoRA, @@ -422,11 +422,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa ) model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), - lora_config=lora_config, - ) + vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config) vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 @@ -529,11 +525,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path ) model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), - lora_config=lora_config, - ) + vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config) vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 42d8c6202e79e..54059ec561907 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -11,7 +11,6 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, - RendererConfig, SchedulerConfig, VllmConfig, ) @@ -44,7 +43,6 @@ def test_worker_apply_lora(qwen3_lora_files): vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), load_config=LoadConfig( download_dir=None, load_format="dummy", diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index e368671078fdd..489ac1e6475b9 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -42,10 +42,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - llm_engine = vllm_model.llm.llm_engine - model_config = llm_engine.model_config - renderer_config = llm_engine.renderer_config - tokenizer = llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -56,8 +54,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5" - assert tokenizer.model_max_length == 512 + assert model_config.tokenizer == "BAAI/bge-base-en-v1.5" + assert model_tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, BertEmbeddingModel) @@ -88,10 +86,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - llm_engine = vllm_model.llm.llm_engine - model_config = llm_engine.model_config - renderer_config = llm_engine.renderer_config - tokenizer = llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -102,8 +98,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert renderer_config.tokenizer == "intfloat/multilingual-e5-base" - assert tokenizer.model_max_length == 512 + assert model_config.tokenizer == "intfloat/multilingual-e5-base" + assert model_tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, RobertaEmbeddingModel) @@ -132,7 +128,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name + assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name def check_model(model): assert isinstance(model, RobertaEmbeddingModel) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 11ee003585487..0adc9b5cf25f6 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -6,7 +6,7 @@ import pytest from scipy.spatial.distance import cosine from vllm import LLM, SamplingParams -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from ....utils import RemoteOpenAIServer @@ -31,8 +31,7 @@ def test_find_array(): dtype="bfloat16", seed=0, ) - renderer_config = RendererConfig(model_config=model_config) - pooling = GritLMMeanPool(renderer_config=renderer_config) + pooling = GritLMMeanPool(model_config=model_config) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 9b2b29b758765..2e032ac4ca526 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -25,6 +25,7 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC from vllm.tokenizers import ( MistralTokenizer, TokenizerLike, + cached_tokenizer_from_config, ) from ....multimodal.utils import random_audio, random_image, random_video @@ -211,20 +212,31 @@ def _test_processing_correctness( else: model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch) model_id = model_id_or_arch - model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - renderer_config = model_info.build_renderer_config( - model=model_id, + model_config = ModelConfig( + model_id, + tokenizer=model_info.tokenizer or model_id, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, + hf_overrides=model_info.hf_overrides, # Ensure that the cache can fit all of the data mm_processor_cache_gb=2048, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype, ) - model_config = renderer_config.model_config model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = InputProcessingContext.from_config(renderer_config) + ctx = InputProcessingContext( + model_config, + tokenizer=cached_tokenizer_from_config(model_config), + ) cache = MultiModalProcessorOnlyCache(model_config) processing_info = factories.info(ctx) diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index fdc6352e2ec83..51071c93531de 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -40,7 +40,7 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"video": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {"fps": fps} @@ -79,7 +79,7 @@ def test_video_loader_consistency( mm_processor_kwargs=None, limit_mm_per_prompt={"video": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {"fps": fps} # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 1263d663e6af6..1701d9dd8f011 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -162,7 +162,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index bf12e79a718b7..351b9d018eec2 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -38,7 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index 51f0d2e891b3f..b4994295d3a80 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -116,7 +116,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 04bc8d3f53818..b73246b68b36a 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -30,7 +30,7 @@ def test_processor_override( limit_mm_per_prompt={"image": num_imgs}, mm_processor_cache_gb=mm_processor_cache_gb, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) config = processor.info.get_hf_config() tokenizer = processor.info.get_tokenizer() hf_processor = processor.info.get_hf_processor() diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index cd01002a32af2..ffe7ca17b5d61 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) info = processor.info seen_aspect_ratios = set[float]() @@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) image_ratios = [ (171, 152), @@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index be505d95a500f..f5c552fe6476a 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) info = processor.info seen_aspect_ratios = set[float]() @@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) image_ratios = [ (171, 152), @@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py index 17ac54fdd0a49..11e0001235110 100644 --- a/tests/models/multimodal/processing/test_minimax_vl_01.py +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -24,7 +24,7 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) prompt = "" * num_imgs image = Image.new("RGB", size=(364, 364)) mm_data = {"image": [image] * num_imgs} @@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) image_ratios = [ (171, 152), diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index 9a65e2ddc85c6..e5ff2d1391b62 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int): limit_mm_per_prompt=mm_counts, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) profiler = MultiModalProfiler(processor) decoder_dummy_data = profiler.get_decoder_dummy_data( diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index f3609743b7c85..5311ab1b78c69 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -118,7 +118,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index f51bd97861783..8faff2611e6fe 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -39,7 +39,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py index 271357b0d1507..5391555c26675 100644 --- a/tests/models/multimodal/processing/test_phi4mm.py +++ b/tests/models/multimodal/processing/test_phi4mm.py @@ -39,7 +39,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index d65a270a7da3b..9f4cdb6789b2c 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -34,7 +34,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py index e0e6264de4e3a..6f77d5516d147 100644 --- a/tests/models/multimodal/processing/test_smolvlm.py +++ b/tests/models/multimodal/processing/test_smolvlm.py @@ -38,7 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 24959fa48ad6d..5d489549c5b46 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -11,7 +11,7 @@ import pytest import torch.nn as nn from PIL import Image -from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config +from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.config.multimodal import ( AudioDummyOptions, BaseDummyOptions, @@ -31,6 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.platforms import current_platform +from vllm.tokenizers import cached_tokenizer_from_config from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_dtype @@ -149,10 +150,7 @@ def initialize_dummy_model( backend="nccl", ) initialize_model_parallel(tensor_model_parallel_size=1) - vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), - ) + vllm_config = VllmConfig(model_config=model_config) with set_current_vllm_config(vllm_config=vllm_config): with set_default_torch_dtype(model_config.dtype): model = model_cls(vllm_config=vllm_config) @@ -184,12 +182,19 @@ def test_model_tensor_schema(model_id: str): else: dtype = model_info.dtype - renderer_config = model_info.build_renderer_config( + model_config = ModelConfig( model_id, + tokenizer=model_info.tokenizer or model_id, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, hf_overrides=hf_overrides_fn, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, dtype=dtype, ) - model_config = renderer_config.model_config model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) assert supports_multimodal(model_cls) @@ -207,7 +212,10 @@ def test_model_tensor_schema(model_id: str): if not any(inputs_parse_methods): pytest.skip(f"{model_arch} does not support tensor schema validation.") - ctx = InputProcessingContext.from_config(renderer_config) + ctx = InputProcessingContext( + model_config, + tokenizer=cached_tokenizer_from_config(model_config), + ) processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() limit_mm_per_prompt = { diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py index c9a90eb882da4..e2a2186f470b4 100644 --- a/tests/models/multimodal/processing/test_transformers.py +++ b/tests/models/multimodal/processing/test_transformers.py @@ -3,7 +3,7 @@ import pytest from vllm.assets.image import ImageAsset -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -13,9 +13,8 @@ def test_multimodal_processor(model_id): model=model_id, model_impl="transformers", ) - renderer_config = RendererConfig(model_config=model_config) - mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) + mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) image_pil = ImageAsset("cherry_blossom").pil_image mm_data = {"image": image_pil} diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py index 73de6b5f7d2f9..0d2eaca95504e 100644 --- a/tests/models/multimodal/test_mapping.py +++ b/tests/models/multimodal/test_mapping.py @@ -7,6 +7,7 @@ import torch import transformers from transformers import AutoConfig, PreTrainedModel +from vllm.config import ModelConfig from vllm.model_executor.models.utils import WeightsMapper from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.transformers_utils.config import try_get_safetensors_metadata @@ -49,11 +50,37 @@ def test_hf_model_weights_mapper(model_arch: str): model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - model_config = model_info.build_model_config(config_format="hf") + is_mistral_model = model_arch in [ + "Mistral3ForConditionalGeneration", + "PixtralForConditionalGeneration", + "VoxtralForConditionalGeneration", + ] + + if not is_mistral_model or model_info.tokenizer_mode == "mistral": + tokenizer_mode = model_info.tokenizer_mode + else: + tokenizer_mode = "hf" + + model_id = model_info.default + + model_config = ModelConfig( + model_id, + tokenizer=model_info.tokenizer or model_id, + tokenizer_mode=tokenizer_mode, + config_format="hf", + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, + hf_overrides=model_info.hf_overrides, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype, + ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - original_weights = create_repo_dummy_weights(model_config.model) - hf_dummy_model = create_dummy_model(model_config.model, model_arch) + original_weights = create_repo_dummy_weights(model_id) + hf_dummy_model = create_dummy_model(model_id, model_arch) hf_converted_weights = hf_dummy_model.named_parameters() hf_converted_buffers = hf_dummy_model.named_buffers() mapper: WeightsMapper = model_cls.hf_to_vllm_mapper diff --git a/tests/models/registry.py b/tests/models/registry.py index e2cb5bcbc6c91..020cb749341a6 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -9,8 +9,7 @@ import pytest from packaging.version import Version from transformers import __version__ as TRANSFORMERS_VERSION -from vllm.config.model import ModelConfig, ModelDType -from vllm.config.renderer import RendererConfig, TokenizerMode +from vllm.config.model import ModelDType, TokenizerMode @dataclass(frozen=True) @@ -171,36 +170,6 @@ class _HfExamplesInfo: else: pytest.skip(msg) - def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig: - if model is None: - model = self.default - - return ModelConfig( - **{ - "model": model, - "revision": self.revision, - "trust_remote_code": self.trust_remote_code, - "hf_overrides": self.hf_overrides, - "enable_prompt_embeds": self.require_embed_inputs, - "enable_mm_embeds": self.require_embed_inputs, - "enforce_eager": self.enforce_eager, - "dtype": self.dtype, - **kwargs, - } - ) - - def build_renderer_config( - self, model: str | None = None, **kwargs - ) -> RendererConfig: - model_config = self.build_model_config(model, **kwargs) - - return RendererConfig( - model_config=model_config, - tokenizer=self.tokenizer or model_config.model, - tokenizer_mode=self.tokenizer_mode, - skip_tokenizer_init=self.require_embed_inputs, - ) - _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] diff --git a/tests/models/utils.py b/tests/models/utils.py index 87292cc4538d9..d84b4b820533e 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -13,6 +13,7 @@ from transformers import PretrainedConfig from vllm.config.model import ModelConfig, ModelDType, RunnerOption from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.multimodal.processing import InputProcessingContext +from vllm.tokenizers import cached_tokenizer_from_config from .. import ci_envs from .registry import HF_EXAMPLE_MODELS @@ -295,18 +296,30 @@ def build_model_context( model_config_kwargs = model_config_kwargs or {} limit_mm_per_prompt = limit_mm_per_prompt or {} - renderer_config = model_info.build_renderer_config( + model_config = ModelConfig( model_id, runner=runner, + tokenizer=model_info.tokenizer or model_id, + tokenizer_mode=model_info.tokenizer_mode, + revision=model_info.revision, + trust_remote_code=model_info.trust_remote_code, dtype=dtype, seed=0, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, mm_processor_cache_gb=mm_processor_cache_gb, + hf_overrides=model_info.hf_overrides, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, + enforce_eager=model_info.enforce_eager, **model_config_kwargs, ) - return InputProcessingContext.from_config(renderer_config) + return InputProcessingContext( + model_config, + tokenizer=cached_tokenizer_from_config(model_config), + ) def check_embeddings_close( diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index ce16d90130aa4..e641b1111abaf 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -6,7 +6,7 @@ import numpy as np import pytest import torch -from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig +from vllm.config import ModelConfig, ParallelConfig, VllmConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import ( BaseMultiModalProcessorCache, @@ -110,14 +110,11 @@ def _create_vllm_config( mm_processor_cache_gb: float, enable_ipc: bool, ): - model_config = ModelConfig( - model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - mm_processor_cache_gb=mm_processor_cache_gb, - ) - return VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), + model_config=ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_gb=mm_processor_cache_gb, + ), parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2), ) @@ -509,15 +506,13 @@ def _run_test_cache_eviction_shm( def test_cache_eviction_shm_cache(): - model_config = ModelConfig( - model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - mm_processor_cache_type="shm", - mm_shm_cache_max_object_size_mb=6, - mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes, - ) vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), + model_config=ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_type="shm", + mm_shm_cache_max_object_size_mb=6, + mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes, + ), ) sender_cache = ShmObjectStoreSenderCache(vllm_config) receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock()) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index adff572524a9b..262ea42e4d0fa 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -7,7 +7,7 @@ from contextlib import nullcontext import numpy as np import pytest -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.processing import ( InputProcessingContext, @@ -920,9 +920,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): model=model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) - renderer_config = RendererConfig(model_config=model_config) - processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(model_config) processor._supported_mm_limits = {"image": num_supported} profiler = MultiModalProfiler(processor) @@ -956,9 +955,8 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): model=model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) - renderer_config = RendererConfig(model_config=model_config) - processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) + processor = MULTIMODAL_REGISTRY.create_processor(model_config) rng = np.random.RandomState(0) image = random_image(rng, min_wh=128, max_wh=256) @@ -1014,13 +1012,11 @@ def test_hf_processor_init_kwargs( inference_kwargs, expected_kwargs, ): - model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs) - renderer_config = RendererConfig( - model_config=model_config, - tokenizer=model_id, + ctx = InputProcessingContext( + model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), + tokenizer=None, ) - ctx = InputProcessingContext.from_config(renderer_config) processor = ctx.get_hf_processor( DummyProcessor, # type: ignore[arg-type] **inference_kwargs, @@ -1049,13 +1045,11 @@ def test_hf_processor_call_kwargs( inference_kwargs, expected_kwargs, ): - model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs) - renderer_config = RendererConfig( - model_config=model_config, - tokenizer=model_id, + ctx = InputProcessingContext( + model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), + tokenizer=None, ) - ctx = InputProcessingContext.from_config(renderer_config) processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type] result = ctx.call_hf_processor(processor, {}, inference_kwargs) diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py index 8127fac09968b..3b01bda7f54c8 100644 --- a/tests/multimodal/test_registry.py +++ b/tests/multimodal/test_registry.py @@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected): model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) - assert ( - MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected - ) + assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected diff --git a/tests/test_config.py b/tests/test_config.py index 7464fcd1e9fe5..203447cd531fb 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -13,7 +13,6 @@ from vllm.config import ( CompilationConfig, ModelConfig, PoolerConfig, - RendererConfig, SchedulerConfig, VllmConfig, update_config, @@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location): ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True), ], ) -def test_recalculate_max_model_len( +def test_get_and_verify_max_len( model_id, max_model_len, expected_max_len, should_raise ): - """Test recalculate_max_model_len with different configurations.""" + """Test get_and_verify_max_len with different configurations.""" model_config = ModelConfig(model_id) if should_raise: with pytest.raises(ValueError): - model_config.recalculate_max_model_len( - max_model_len, - tokenizer=model_id, - tokenizer_revision=None, - ) + model_config.get_and_verify_max_len(max_model_len) else: - model_config.recalculate_max_model_len( - max_model_len, - tokenizer=model_id, - tokenizer_revision=None, - ) - assert model_config.max_model_len == expected_max_len + actual_max_len = model_config.get_and_verify_max_len(max_model_len) + assert actual_max_len == expected_max_len -class MockModelConfig: - """Simple mock object for testing maybe_pull_model_for_runai""" +class MockConfig: + """Simple mock object for testing maybe_pull_model_tokenizer_for_runai""" - def __init__(self, model: str): + def __init__(self, model: str, tokenizer: str): self.model = model - - -class MockRendererConfig: - """Simple mock object for testing maybe_pull_tokenizer_for_runai""" - - def __init__(self, model_config: MockModelConfig): - self.model_config = model_config - self.tokenizer = model_config.model + self.tokenizer = tokenizer + self.model_weights = None @pytest.mark.parametrize( @@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url): mock_pull_files.return_value = None # Create first mock and run the method - model_config1 = MockModelConfig(model=s3_url) - renderer_config1 = MockRendererConfig(model_config=model_config1) - ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url) - RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url) + config1 = MockConfig(model=s3_url, tokenizer=s3_url) + ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url) # Check that model and tokenizer point to existing directories - assert os.path.exists(model_config1.model), ( - f"Model directory does not exist: {model_config1.model}" + assert os.path.exists(config1.model), ( + f"Model directory does not exist: {config1.model}" ) - assert os.path.isdir(model_config1.model), ( - f"Model path is not a directory: {model_config1.model}" + assert os.path.isdir(config1.model), ( + f"Model path is not a directory: {config1.model}" ) - assert os.path.exists(renderer_config1.tokenizer), ( - f"Tokenizer directory does not exist: {renderer_config1.tokenizer}" + assert os.path.exists(config1.tokenizer), ( + f"Tokenizer directory does not exist: {config1.tokenizer}" ) - assert os.path.isdir(renderer_config1.tokenizer), ( - f"Tokenizer path is not a directory: {renderer_config1.tokenizer}" + assert os.path.isdir(config1.tokenizer), ( + f"Tokenizer path is not a directory: {config1.tokenizer}" ) # Verify that the paths are different from the original S3 URL - assert model_config1.model != s3_url, ( - "Model path should be converted to local directory" - ) - assert renderer_config1.tokenizer != s3_url, ( + assert config1.model != s3_url, "Model path should be converted to local directory" + assert config1.tokenizer != s3_url, ( "Tokenizer path should be converted to local directory" ) # Store the original paths - created_model_dir = model_config1.model - create_tokenizer_dir = renderer_config1.tokenizer + created_model_dir = config1.model + create_tokenizer_dir = config1.tokenizer # Create a new mock and run the method with the same S3 URL - model_config2 = MockModelConfig(model=s3_url) - renderer_config2 = MockRendererConfig(model_config=model_config2) - ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url) - RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url) + config2 = MockConfig(model=s3_url, tokenizer=s3_url) + ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url) # Check that the new directories exist - assert os.path.exists(model_config2.model), ( - f"Model directory does not exist: {model_config2.model}" + assert os.path.exists(config2.model), ( + f"Model directory does not exist: {config2.model}" ) - assert os.path.isdir(model_config2.model), ( - f"Model path is not a directory: {model_config2.model}" + assert os.path.isdir(config2.model), ( + f"Model path is not a directory: {config2.model}" ) - assert os.path.exists(renderer_config2.tokenizer), ( - f"Tokenizer directory does not exist: {renderer_config2.tokenizer}" + assert os.path.exists(config2.tokenizer), ( + f"Tokenizer directory does not exist: {config2.tokenizer}" ) - assert os.path.isdir(renderer_config2.tokenizer), ( - f"Tokenizer path is not a directory: {renderer_config2.tokenizer}" + assert os.path.isdir(config2.tokenizer), ( + f"Tokenizer path is not a directory: {config2.tokenizer}" ) # Verify that the paths are deterministic (same as before) - assert model_config2.model == created_model_dir, ( + assert config2.model == created_model_dir, ( f"Model paths are not deterministic. " - f"Original: {created_model_dir}, New: {model_config2.model}" + f"Original: {created_model_dir}, New: {config2.model}" ) - assert renderer_config2.tokenizer == create_tokenizer_dir, ( + assert config2.tokenizer == create_tokenizer_dir, ( f"Tokenizer paths are not deterministic. " - f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}" + f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}" ) @@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): s3_url2 = "s3://example-bucket-2/model/" # Create mocks with different S3 URLs and run the method - model_config1 = MockModelConfig(model=s3_url1) - renderer_config1 = MockRendererConfig(model_config=model_config1) - ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1) - RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1) + config1 = MockConfig(model=s3_url1, tokenizer=s3_url1) + ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1) - model_config2 = MockModelConfig(model=s3_url2) - renderer_config2 = MockRendererConfig(model_config=model_config2) - ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2) - RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2) + config2 = MockConfig(model=s3_url2, tokenizer=s3_url2) + ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2) # Verify that different URLs produce different directories - assert model_config1.model != model_config2.model, ( + assert config1.model != config2.model, ( f"Different S3 URLs should create different model directories. " - f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}" + f"URL1 model: {config1.model}, URL2 model: {config2.model}" ) - assert renderer_config1.tokenizer != renderer_config2.tokenizer, ( + assert config1.tokenizer != config2.tokenizer, ( f"Different S3 URLs should create different tokenizer directories. " - f"URL1 tokenizer: {renderer_config1.tokenizer}, " - f"URL2 tokenizer: {renderer_config2.tokenizer}" + f"URL1 tokenizer: {config1.tokenizer}, " + f"URL2 tokenizer: {config2.tokenizer}" ) # Verify that both sets of directories exist - assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model) - assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir( - renderer_config1.tokenizer - ) - assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model) - assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir( - renderer_config2.tokenizer - ) + assert os.path.exists(config1.model) and os.path.isdir(config1.model) + assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer) + assert os.path.exists(config2.model) and os.path.isdir(config2.model) + assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) @pytest.mark.parametrize( diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 48fd076ab3c67..c4339827de8b6 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from vllm.inputs import zip_enc_dec_prompts from vllm.inputs.parse import parse_raw_prompts from vllm.inputs.preprocess import InputPreprocessor @@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): ) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) - renderer_config = RendererConfig(model_config=model_config) - tokenizer = init_tokenizer_from_config(renderer_config) - input_preprocessor = InputPreprocessor(renderer_config, tokenizer) + tokenizer = init_tokenizer_from_config(model_config) + input_preprocessor = InputPreprocessor(model_config, tokenizer) # HF processor adds sep token sep_token_id = tokenizer.vocab[tokenizer.sep_token] diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 49307e3e5437d..6cab129c116c5 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -16,7 +16,6 @@ from vllm.config import ( LoadConfig, ModelConfig, ParallelConfig, - RendererConfig, SchedulerConfig, VllmConfig, ) @@ -217,7 +216,6 @@ def create_vllm_config( return VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 4a414bca591d1..fd5cf6d3e74aa 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -8,7 +8,7 @@ import pytest import torch import vllm.v1.core.kv_cache_utils as kv_cache_utils -from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import ( MultiModalFeatureSpec, @@ -667,10 +667,7 @@ def test_metrics_empty_stats(): def test_get_kv_cache_configs_multiple_workers(): model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), - ) + vllm_config = VllmConfig(model_config=model_config) ref_kv_cache_spec = new_kv_cache_spec() same_kv_cache_specs = [ @@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len) vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), scheduler_config=scheduler_config, ) @@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config(): vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), scheduler_config=scheduler_config, ) @@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead(): def test_get_kv_cache_config_one_worker(): # pass max_model_len to pass check_enough_kv_cache_memory model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), - ) + vllm_config = VllmConfig(model_config=model_config) mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # all layers are full attention -> single group @@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker(): def test_get_kv_cache_configs_attention_free(): kv_cache_specs: dict[str, KVCacheSpec] = {} - model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), - ) + vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16)) kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0]) assert kv_cache_configs == [ KVCacheConfig( diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 1505415a63619..c6c4a5085bff7 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -11,7 +11,6 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, - RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -1564,7 +1563,6 @@ def create_scheduler_with_priority( vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 086885c298145..f5ba613d38db1 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -9,7 +9,6 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, - RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -133,7 +132,6 @@ def create_scheduler( vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index c606100a12bf1..48be8c15aba9e 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -15,7 +15,6 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, - RendererConfig, SchedulerConfig, VllmConfig, ) @@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache( vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, kv_transfer_config=kv_transfer_config, diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 85fab3a855fd7..1b11b8af49d17 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -5,14 +5,7 @@ import pytest from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import ( - CacheConfig, - DeviceConfig, - ModelConfig, - MultiModalConfig, - RendererConfig, - VllmConfig, -) +from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig from vllm.sampling_params import SamplingParams from vllm.v1.engine import input_processor as input_processor_mod from vllm.v1.engine.input_processor import InputProcessor @@ -51,21 +44,22 @@ def _mock_input_processor( monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True) model_config = ModelConfig( + skip_tokenizer_init=True, max_model_len=128, mm_processor_cache_gb=mm_cache_gb, generation_config="vllm", - ) - model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb) - - renderer_config = RendererConfig( - model_config=model_config, tokenizer="dummy", - skip_tokenizer_init=True, ) + # Minimal multimodal_config to satisfy references in + # Processor.process_inputs. + class _MockMMConfig: + def __init__(self, gb: float): + self.mm_processor_cache_gb = gb + + model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined] vllm_config = VllmConfig( model_config=model_config, - renderer_config=renderer_config, cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), device_config=DeviceConfig(device="cpu"), ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 768b338b5fe53..58f1a7282352b 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -15,7 +15,6 @@ from vllm.config import ( DeviceConfig, KVTransferConfig, ModelConfig, - RendererConfig, SchedulerConfig, VllmConfig, ) @@ -128,7 +127,6 @@ def create_vllm_config( return VllmConfig( scheduler_config=scheduler_config, model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, device_config=DeviceConfig("cpu"), diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 888ea0169b759..616e57de339e2 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -19,7 +19,6 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, - RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -62,7 +61,6 @@ def _create_proposer( vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(), speculative_config=speculative_config, device_config=DeviceConfig(device=current_platform.device_type), diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 4483c82438530..3b8813ceb818a 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -18,7 +18,6 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, - RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(), speculative_config=speculative_config, device_config=DeviceConfig(device=current_platform.device_type), diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 2e365e08a4e72..6bc412abe8695 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -4,7 +4,6 @@ import numpy as np from vllm.config import ( ModelConfig, - RendererConfig, SpeculativeConfig, VllmConfig, ) @@ -70,7 +69,6 @@ def test_ngram_proposer(): return NgramProposer( vllm_config=VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), speculative_config=SpeculativeConfig( prompt_lookup_min=min_n, prompt_lookup_max=max_n, diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py index baef2459f8df0..4c01560fc88c3 100644 --- a/tests/v1/structured_output/test_backend_guidance.py +++ b/tests/v1/structured_output/test_backend_guidance.py @@ -6,7 +6,7 @@ from concurrent.futures import Future import pytest from transformers import AutoTokenizer -from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig +from vllm.config import StructuredOutputsConfig, VllmConfig from vllm.config.model import ModelConfig from vllm.config.parallel import ParallelConfig from vllm.config.speculative import SpeculativeConfig @@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated(): def test_grammar_bitmask_with_specdec(): tokenizer = AutoTokenizer.from_pretrained(TOKENIZER) prompt = tokenizer.encode('{"a": "b"}') - - model_config = ModelConfig(tokenizer=TOKENIZER) vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER), + model_config=ModelConfig(tokenizer=TOKENIZER), structured_outputs_config=StructuredOutputsConfig(backend="guidance"), speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3), ) @@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar): # Use "external_launcher" for sync mode, None for async mode executor_backend = None if async_grammar else "external_launcher" - - model_config = ModelConfig(tokenizer=TOKENIZER) vllm_config = VllmConfig( - model_config=model_config, - renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER), + model_config=ModelConfig(tokenizer=TOKENIZER), structured_outputs_config=StructuredOutputsConfig(backend="guidance"), parallel_config=ParallelConfig(distributed_executor_backend=executor_backend), ) diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 5901d38d1b78b..70047a993c3f9 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -7,7 +7,7 @@ from unittest.mock import Mock import pytest -from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.reasoning import ReasoningParser from vllm.v1.request import Request from vllm.v1.structured_output import StructuredOutputManager @@ -17,26 +17,19 @@ class TestReasoningStructuredOutput: """Test reasoning-aware structured output functionality.""" @pytest.fixture - def mock_renderer_config(self): - """Create a mock RendererConfig.""" - renderer_config = Mock(spec=RendererConfig) - renderer_config.skip_tokenizer_init = ( - True # Skip tokenizer init to avoid network calls - ) - - model_config = Mock(spec=ModelConfig) - model_config.get_vocab_size = Mock(return_value=50000) - model_config.trust_remote_code = False + def mock_model_config(self): + """Create a mock ModelConfig.""" + config = Mock(spec=ModelConfig) + config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls + config.get_vocab_size = Mock(return_value=50000) # Add missing runner_type attribute that tokenizer initialization expects - model_config.runner_type = "generate" - renderer_config.model_config = model_config - + config.runner_type = "generate" # Add other attributes that tokenizer initialization might need - renderer_config.tokenizer = "test-tokenizer" - renderer_config.tokenizer_mode = "auto" - renderer_config.tokenizer_revision = None - - return renderer_config + config.tokenizer = "test-tokenizer" + config.tokenizer_mode = "auto" + config.trust_remote_code = False + config.tokenizer_revision = None + return config @pytest.fixture def mock_scheduler_config(self): @@ -46,10 +39,10 @@ class TestReasoningStructuredOutput: return config @pytest.fixture - def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config): + def mock_vllm_config(self, mock_model_config, mock_scheduler_config): """Create a mock VllmConfig.""" config = Mock(spec=VllmConfig) - config.renderer_config = mock_renderer_config + config.model_config = mock_model_config config.scheduler_config = mock_scheduler_config config.structured_outputs_config = Mock() config.structured_outputs_config.reasoning_parser = None diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 080d23863652d..cfc06666e7984 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -7,7 +7,6 @@ from vllm.attention.layer import Attention from vllm.config import ( CacheConfig, ModelConfig, - RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -46,7 +45,6 @@ def get_vllm_config(): ) vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 464e3ab99c76d..7b8c4268a5237 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -13,7 +13,6 @@ from vllm.config import ( CacheConfig, ModelConfig, ParallelConfig, - RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -102,7 +101,6 @@ def get_vllm_config(): parallel_config = ParallelConfig() vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, parallel_config=parallel_config, @@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes(): attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER) vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, parallel_config=parallel_config, diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index a4f9fd8d28292..0f84f3ca9d3e3 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig from vllm.config.observability import ObservabilityConfig from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.pooler import PoolerConfig -from vllm.config.renderer import RendererConfig from vllm.config.scheduler import SchedulerConfig from vllm.config.speculative import SpeculativeConfig from vllm.config.speech_to_text import SpeechToTextConfig @@ -82,8 +81,6 @@ __all__ = [ "ParallelConfig", # From vllm.config.pooler "PoolerConfig", - # From vllm.config.renderer - "RendererConfig", # From vllm.config.scheduler "SchedulerConfig", # From vllm.config.speculative diff --git a/vllm/config/model.py b/vllm/config/model.py index b0d4fb8e01e64..509a9c5e162f7 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -36,6 +36,7 @@ from vllm.transformers_utils.config import ( uses_xdrope_dim, ) from vllm.transformers_utils.gguf_utils import ( + is_gguf, is_remote_gguf, maybe_patch_hf_config_from_gguf, split_remote_gguf, @@ -82,6 +83,7 @@ TaskOption = Literal[ "transcription", "draft", ] +TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" @@ -129,6 +131,18 @@ class ModelConfig: Note that the model may support other tasks using the same model runner. """ + tokenizer: SkipValidation[str] = None # type: ignore + """Name or path of the Hugging Face tokenizer to use. If unspecified, model + name or path will be used.""" + tokenizer_mode: TokenizerMode | str = "auto" + """Tokenizer mode:\n + - "auto" will use the tokenizer from `mistral_common` for Mistral models + if available, otherwise it will use the "hf" tokenizer.\n + - "hf" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n + - Other custom values can be supported via plugins.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.""" @@ -154,6 +168,13 @@ class ModelConfig: hf_config_path: str | None = None """Name or path of the Hugging Face config to use. If unspecified, model name or path will be used.""" + allowed_local_media_path: str = "" + """Allowing API requests to read local images or videos from directories + specified by the server file system. This is a security risk. Should only + be enabled in trusted environments.""" + allowed_media_domains: list[str] | None = None + """If set, only media URLs that belong to this domain can be used for + multi-modal inputs. """ revision: str | None = None """The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" @@ -161,6 +182,10 @@ class ModelConfig: """The specific revision to use for the model code on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" + tokenizer_revision: str | None = None + """The specific revision to use for the tokenizer on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" max_model_len: SkipValidation[int] = None # type: ignore """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. @@ -205,6 +230,10 @@ class ModelConfig: preventing potential numerical issues. Note that even if this is set to False, cascade attention will be only used when the heuristic tells that it's beneficial.""" + skip_tokenizer_init: bool = False + """Skip initialization of tokenizer and detokenizer. Expects valid + `prompt_token_ids` and `None` for prompt from the input. The generated + output will contain token ids.""" enable_prompt_embeds: bool = False """If `True`, enables passing text embeddings as inputs via the `prompt_embeds` key. @@ -265,6 +294,8 @@ class ModelConfig: logits_processors: list[str | type[LogitsProcessor]] | None = None """One or more logits processors' fully-qualified class names or class definitions""" + io_processor_plugin: str | None = None + """IOProcessor plugin name to load at model startup""" # Pooler config pooler_config: PoolerConfig | None = None @@ -277,6 +308,7 @@ class ModelConfig: from the architecture of `self.model`.""" limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None enable_mm_embeds: InitVar[bool | None] = None + media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None mm_processor_kwargs: InitVar[dict[str, Any] | None] = None mm_processor_cache_gb: InitVar[float | None] = None mm_processor_cache_type: InitVar[MMCacheType | None] = None @@ -303,12 +335,18 @@ class ModelConfig: "runner", "convert", "task", + "tokenizer", + "tokenizer_mode", "seed", "hf_config_path", + "allowed_local_media_path", + "allowed_media_domains", + "tokenizer_revision", "spec_target_max_model_len", "enforce_eager", "logprobs_mode", "disable_cascade_attn", + "skip_tokenizer_init", "served_model_name", "config_format", "hf_token", @@ -316,9 +354,11 @@ class ModelConfig: "logits_processor_pattern", "override_attention_dtype", "logits_processors", + "io_processor_plugin", "pooler_config", "multimodal_config", "limit_mm_per_prompt", + "media_io_kwargs", "mm_processor_kwargs", "mm_processor_cache_gb", "mm_processor_cache_type", @@ -383,6 +423,7 @@ class ModelConfig: # Multimodal config init vars limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, enable_mm_embeds: bool | None, + media_io_kwargs: dict[str, dict[str, Any]] | None, mm_processor_kwargs: dict[str, Any] | None, mm_processor_cache_gb: float | None, mm_processor_cache_type: MMCacheType | None, @@ -397,8 +438,13 @@ class ModelConfig: self.served_model_name = get_served_model_name( self.model, self.served_model_name ) - self.original_model = self.model - self.model = maybe_model_redirect(self.original_model) + self.model = maybe_model_redirect(self.model) + # The tokenizer is consistent with the model by default. + if self.tokenizer is None: + self.tokenizer = self.model + if self.tokenizer_revision is None: + self.tokenizer_revision = self.revision + self.tokenizer = maybe_model_redirect(self.tokenizer) if isinstance(self.hf_config_path, str): self.hf_config_path = maybe_model_redirect(self.hf_config_path) @@ -419,7 +465,7 @@ class ModelConfig: hf_overrides_kw[key] = value hf_overrides_fn = None - self.maybe_pull_model_for_runai(self.model) + self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) from vllm.platforms import current_platform @@ -602,8 +648,7 @@ class ModelConfig: ) self.original_max_model_len = self.max_model_len - self.recalculate_max_model_len(self.original_max_model_len) - + self.max_model_len = self.get_and_verify_max_len(self.max_model_len) # Init multimodal config if needed if self._model_info.supports_multimodal: if ( @@ -619,6 +664,7 @@ class ModelConfig: mm_config_kwargs = dict( limit_per_prompt=limit_mm_per_prompt, enable_mm_embeds=enable_mm_embeds, + media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, mm_processor_cache_gb=mm_processor_cache_gb, mm_processor_cache_type=mm_processor_cache_type, @@ -636,8 +682,16 @@ class ModelConfig: self.multimodal_config = MultiModalConfig(**mm_config_kwargs) + # Multimodal GGUF models must use original repo for mm processing + if is_gguf(self.tokenizer) and self.is_multimodal_model: + raise ValueError( + "Loading a multimodal GGUF model needs to use original " + "tokenizer. Please specify the unquantized hf model's " + "repo name or path using the --tokenizer argument." + ) + if self.disable_sliding_window: - # Set after recalculate_max_model_len to ensure that max_model_len + # Set after get_and_verify_max_len to ensure that max_model_len # can be correctly capped to sliding window size self.hf_text_config.sliding_window = None @@ -661,9 +715,10 @@ class ModelConfig: @model_validator(mode="after") def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": + if not isinstance(self.tokenizer, str): + raise ValueError("tokenizer must be a string after __post_init__.") if not isinstance(self.max_model_len, int): raise ValueError("max_model_len must be an integer after __post_init__.") - return self def _get_transformers_backend_cls(self) -> str: @@ -712,17 +767,49 @@ class ModelConfig: """The architecture vllm actually used.""" return self._architecture - def maybe_pull_model_for_runai(self, model: str) -> None: - """Pull model from Object Storage to temporary directory when needed.""" - if not is_runai_obj_uri(model): + def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None: + """Pull model/tokenizer from Object Storage to temporary + directory when needed. + + Args: + model: Model name or path + tokenizer: Tokenizer name or path + """ + + if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): return - object_storage_model = ObjectStorageModel(url=model) - object_storage_model.pull_files( - model, allow_pattern=["*.model", "*.py", "*.json"] - ) - self.model_weights = model - self.model = object_storage_model.dir + if is_runai_obj_uri(model): + object_storage_model = ObjectStorageModel(url=model) + object_storage_model.pull_files( + model, allow_pattern=["*.model", "*.py", "*.json"] + ) + self.model_weights = model + self.model = object_storage_model.dir + + # If tokenizer is same as model, download to same directory + if model == tokenizer: + object_storage_model.pull_files( + model, + ignore_pattern=[ + "*.pt", + "*.safetensors", + "*.bin", + "*.tensors", + "*.pth", + ], + ) + self.tokenizer = object_storage_model.dir + return + + # Only download tokenizer if needed and not already handled + if is_runai_obj_uri(tokenizer): + object_storage_tokenizer = ObjectStorageModel(url=tokenizer) + object_storage_tokenizer.pull_files( + model, + ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], + ) + self.tokenizer = object_storage_tokenizer.dir def _get_encoder_config(self): model = self.model @@ -1625,38 +1712,30 @@ class ModelConfig: return dense_modules[-1]["out_features"] return self.get_hidden_size() - def recalculate_max_model_len( - self, - original_max_model_len: int | None, - *, - tokenizer: str | None = None, - tokenizer_revision: str | None = None, - ) -> None: + def get_and_verify_max_len(self, max_model_len: int): # Consider max_model_len in tokenizer_config only when # pooling models use absolute position_embedding. - # NOTE: For simplicity we assume `args.model == args.tokenizer` - # since this is tokenizer_config = None if ( self.runner_type == "pooling" and getattr(self.hf_config, "position_embedding_type", "") == "absolute" ): tokenizer_config = try_get_tokenizer_config( - tokenizer or self.model, + self.tokenizer, trust_remote_code=self.trust_remote_code, - revision=tokenizer_revision or self.revision, + revision=self.tokenizer_revision, ) - - self.max_model_len = _get_and_verify_max_len( + max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, tokenizer_config=tokenizer_config, - max_model_len=original_max_model_len, + max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window=self.get_sliding_window(), spec_target_max_model_len=self.spec_target_max_model_len, encoder_config=self.encoder_config, ) - logger.info("Using max model len %s", self.max_model_len) + logger.info("Using max model len %s", max_model_len) + return max_model_len @property def attn_type(self) -> AttnTypeStr: diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 37e2f6b4d419a..8a2936de96d6f 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -79,6 +79,10 @@ class MultiModalConfig: WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. Only enable this flag for trusted users!""" + media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" mm_processor_kwargs: dict[str, object] | None = None """Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. Overrides for the multi-modal processor obtained diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py deleted file mode 100644 index 36a922b93ca05..0000000000000 --- a/vllm/config/renderer.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Literal - -from pydantic import Field, SkipValidation -from pydantic.dataclasses import dataclass - -from vllm.config.model import ModelConfig -from vllm.config.utils import config -from vllm.transformers_utils.gguf_utils import is_gguf -from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri -from vllm.transformers_utils.utils import maybe_model_redirect - -TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] - - -@config -@dataclass -class RendererConfig: - """Configuration for the renderer.""" - - # NOTE: In reality, this is a required argument. - # We provide a dummy default value here to generate the CLI args. - model_config: SkipValidation[ModelConfig] = None # type: ignore - """Provides model context to the renderer.""" - - tokenizer: str = "" - """Name or path of the Hugging Face tokenizer to use. If unspecified, model - name or path will be used.""" - tokenizer_mode: TokenizerMode | str = "auto" - """Tokenizer mode:\n - - "auto" will use the tokenizer from `mistral_common` for Mistral models - if available, otherwise it will use the "hf" tokenizer.\n - - "hf" will use the fast tokenizer if available.\n - - "slow" will always use the slow tokenizer.\n - - "mistral" will always use the tokenizer from `mistral_common`.\n - - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - - Other custom values can be supported via plugins.""" - tokenizer_revision: str | None = None - """The specific revision to use for the tokenizer on the Hugging Face Hub. - It can be a branch name, a tag name, or a commit id. If unspecified, will - use the default version.""" - skip_tokenizer_init: bool = False - """Skip initialization of tokenizer and detokenizer. Expects valid - `prompt_token_ids` and `None` for prompt from the input. The generated - output will contain token ids.""" - - io_processor_plugin: str | None = None - """IOProcessor plugin name to load at model startup.""" - - media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" - allowed_local_media_path: str = "" - """Allowing API requests to read local images or videos from directories - specified by the server file system. This is a security risk. Should only - be enabled in trusted environments.""" - allowed_media_domains: list[str] | None = None - """If set, only media URLs that belong to this domain can be used for - multi-modal inputs. """ - - @property - def trust_remote_code(self) -> bool: - return self.model_config.trust_remote_code - - def __post_init__(self) -> None: - model_config = self.model_config - - # The tokenizer is consistent with the model by default. - if not self.tokenizer: - self.tokenizer = ( - ModelConfig.model - if model_config is None - else model_config.original_model - ) - if not self.tokenizer_revision: - self.tokenizer_revision = ( - ModelConfig.revision if model_config is None else model_config.revision - ) - - self.original_tokenizer = self.tokenizer - self.tokenizer = maybe_model_redirect(self.original_tokenizer) - self.maybe_pull_tokenizer_for_runai(self.tokenizer) - - # Multimodal GGUF models must use original repo for mm processing - is_multimodal_model = ( - ModelConfig.is_multimodal_model - if model_config is None - else model_config.is_multimodal_model - ) - if is_gguf(self.tokenizer) and is_multimodal_model: - raise ValueError( - "Loading a multimodal GGUF model needs to use original " - "tokenizer. Please specify the unquantized hf model's " - "repo name or path using the --tokenizer argument." - ) - - def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None: - """Pull tokenizer from Object Storage to temporary directory when needed.""" - if not is_runai_obj_uri(tokenizer): - return - - object_storage_tokenizer = ObjectStorageModel(url=tokenizer) - object_storage_tokenizer.pull_files( - tokenizer, - ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], - ) - self.tokenizer = object_storage_tokenizer.dir diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 63b63eac907d2..bf533bf14e55c 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -322,11 +322,16 @@ class SpeculativeConfig: self.draft_model_config = ModelConfig( model=self.model, runner="draft", + tokenizer=self.target_model_config.tokenizer, + tokenizer_mode=self.target_model_config.tokenizer_mode, trust_remote_code=self.target_model_config.trust_remote_code, + allowed_local_media_path=self.target_model_config.allowed_local_media_path, + allowed_media_domains=self.target_model_config.allowed_media_domains, dtype=self.target_model_config.dtype, seed=self.target_model_config.seed, revision=self.revision, code_revision=self.code_revision, + tokenizer_revision=self.target_model_config.tokenizer_revision, spec_target_max_model_len=self.target_model_config.max_model_len, quantization=self.quantization, enforce_eager=self.target_model_config.enforce_eager, diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 417797c445b95..36e4bd159dc72 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -39,7 +39,6 @@ from .lora import LoRAConfig from .model import ModelConfig from .observability import ObservabilityConfig from .parallel import ParallelConfig -from .renderer import RendererConfig from .scheduler import SchedulerConfig from .speculative import SpeculativeConfig from .structured_outputs import StructuredOutputsConfig @@ -182,8 +181,6 @@ class VllmConfig: # try to download a model model_config: ModelConfig = Field(default=None) """Model configuration.""" - renderer_config: RendererConfig = Field(default_factory=RendererConfig) - """Renderer configuration.""" cache_config: CacheConfig = Field(default_factory=CacheConfig) """Cache configuration.""" parallel_config: ParallelConfig = Field(default_factory=ParallelConfig) @@ -744,7 +741,7 @@ class VllmConfig: from vllm.multimodal import MULTIMODAL_REGISTRY self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config) + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) ) logger.debug( "Encoder-decoder model detected: setting " @@ -1189,13 +1186,11 @@ class VllmConfig: computed_compile_ranges_split_points ) - def recalculate_max_model_len(self, original_max_model_len: int | None) -> None: - # Can only be called during try_verify_and_update_config - self.model_config.recalculate_max_model_len( - original_max_model_len, - tokenizer=self.renderer_config.tokenizer, - tokenizer_revision=self.renderer_config.tokenizer_revision, - ) + def recalculate_max_model_len(self, max_model_len: int): + # Can only be called in try_verify_and_update_config + model_config = self.model_config + max_model_len = model_config.get_and_verify_max_len(max_model_len) + self.model_config.max_model_len = max_model_len def try_verify_and_update_config(self): if self.model_config is None: @@ -1269,11 +1264,11 @@ class VllmConfig: return ( f"model={self.model_config.model!r}, " f"speculative_config={self.speculative_config!r}, " - f"tokenizer={self.renderer_config.tokenizer!r}, " - f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, " - f"tokenizer_mode={self.renderer_config.tokenizer_mode}, " + f"tokenizer={self.model_config.tokenizer!r}, " + f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, " + f"tokenizer_mode={self.model_config.tokenizer_mode}, " f"revision={self.model_config.revision}, " - f"tokenizer_revision={self.renderer_config.tokenizer_revision}, " + f"tokenizer_revision={self.model_config.tokenizer_revision}, " f"trust_remote_code={self.model_config.trust_remote_code}, " f"dtype={self.model_config.dtype}, " f"max_seq_len={self.model_config.max_model_len}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bd398abb0bf81..ceac5407af6e2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -71,11 +71,11 @@ from vllm.config.model import ( ModelDType, RunnerOption, TaskOption, + TokenizerMode, ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode from vllm.config.observability import DetailedTraceModules from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy -from vllm.config.renderer import RendererConfig, TokenizerMode from vllm.config.scheduler import SchedulerPolicy from vllm.config.utils import get_field from vllm.config.vllm import OptimizationLevel @@ -355,12 +355,17 @@ class EngineArgs: model: str = ModelConfig.model served_model_name: str | list[str] | None = ModelConfig.served_model_name + tokenizer: str | None = ModelConfig.tokenizer hf_config_path: str | None = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert task: TaskOption | None = ModelConfig.task + skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds + tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode trust_remote_code: bool = ModelConfig.trust_remote_code + allowed_local_media_path: str = ModelConfig.allowed_local_media_path + allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains download_dir: str | None = LoadConfig.download_dir safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy load_format: str | LoadFormats = LoadConfig.load_format @@ -444,6 +449,7 @@ class EngineArgs: code_revision: str | None = ModelConfig.code_revision hf_token: bool | str | None = ModelConfig.hf_token hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") + tokenizer_revision: str | None = ModelConfig.tokenizer_revision quantization: QuantizationMethods | None = ModelConfig.quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce @@ -452,6 +458,9 @@ class EngineArgs: ) enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings + media_io_kwargs: dict[str, dict[str, Any]] = get_field( + MultiModalConfig, "media_io_kwargs" + ) mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb @@ -465,19 +474,9 @@ class EngineArgs: mm_encoder_attn_backend: AttentionBackendEnum | str | None = ( MultiModalConfig.mm_encoder_attn_backend ) + io_processor_plugin: str | None = None skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling video_pruning_rate: float = MultiModalConfig.video_pruning_rate - # Renderer fields - tokenizer: str | None = None - tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode - tokenizer_revision: str | None = RendererConfig.tokenizer_revision - skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init - io_processor_plugin: str | None = None - media_io_kwargs: dict[str, dict[str, Any]] = get_field( - RendererConfig, "media_io_kwargs" - ) - allowed_local_media_path: str = RendererConfig.allowed_local_media_path - allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains # LoRA fields enable_lora: bool = False max_loras: int = LoRAConfig.max_loras @@ -628,14 +627,25 @@ class EngineArgs: model_group.add_argument("--runner", **model_kwargs["runner"]) model_group.add_argument("--convert", **model_kwargs["convert"]) model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) + model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) + model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) model_group.add_argument( "--trust-remote-code", **model_kwargs["trust_remote_code"] ) model_group.add_argument("--dtype", **model_kwargs["dtype"]) model_group.add_argument("--seed", **model_kwargs["seed"]) model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"]) + model_group.add_argument( + "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"] + ) + model_group.add_argument( + "--allowed-media-domains", **model_kwargs["allowed_media_domains"] + ) model_group.add_argument("--revision", **model_kwargs["revision"]) model_group.add_argument("--code-revision", **model_kwargs["code_revision"]) + model_group.add_argument( + "--tokenizer-revision", **model_kwargs["tokenizer_revision"] + ) model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"]) model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"]) model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) @@ -647,6 +657,9 @@ class EngineArgs: model_group.add_argument( "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"] ) + model_group.add_argument( + "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"] + ) model_group.add_argument( "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"] ) @@ -685,34 +698,8 @@ class EngineArgs: model_group.add_argument( "--logits-processors", **model_kwargs["logits_processors"] ) - - # Renderer arguments - renderer_kwargs = get_kwargs(RendererConfig) - renderer_group = parser.add_argument_group( - title="RendererConfig", - description=RendererConfig.__doc__, - ) - renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"]) - renderer_group.add_argument( - "--tokenizer-mode", **renderer_kwargs["tokenizer_mode"] - ) - renderer_group.add_argument( - "--tokenizer-revision", **renderer_kwargs["tokenizer_revision"] - ) - renderer_group.add_argument( - "--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"] - ) - renderer_group.add_argument( - "--media-io-kwargs", **renderer_kwargs["media_io_kwargs"] - ) - renderer_group.add_argument( - "--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"] - ) - renderer_group.add_argument( - "--allowed-media-domains", **renderer_kwargs["allowed_media_domains"] - ) - renderer_group.add_argument( - "--io-processor-plugin", **renderer_kwargs["io_processor_plugin"] + model_group.add_argument( + "--io-processor-plugin", **model_kwargs["io_processor_plugin"] ) # Model loading arguments @@ -962,6 +949,9 @@ class EngineArgs: multimodal_group.add_argument( "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"] ) + multimodal_group.add_argument( + "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"] + ) multimodal_group.add_argument( "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"] ) @@ -1265,13 +1255,18 @@ class EngineArgs: runner=self.runner, convert=self.convert, task=self.task, + tokenizer=self.tokenizer, + tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, + allowed_local_media_path=self.allowed_local_media_path, + allowed_media_domains=self.allowed_media_domains, dtype=self.dtype, seed=self.seed, revision=self.revision, code_revision=self.code_revision, hf_token=self.hf_token, hf_overrides=self.hf_overrides, + tokenizer_revision=self.tokenizer_revision, max_model_len=self.max_model_len, quantization=self.quantization, enforce_eager=self.enforce_eager, @@ -1279,11 +1274,13 @@ class EngineArgs: logprobs_mode=self.logprobs_mode, disable_sliding_window=self.disable_sliding_window, disable_cascade_attn=self.disable_cascade_attn, + skip_tokenizer_init=self.skip_tokenizer_init, enable_prompt_embeds=self.enable_prompt_embeds, served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, enable_mm_embeds=self.enable_mm_embeds, interleave_mm_strings=self.interleave_mm_strings, + media_io_kwargs=self.media_io_kwargs, skip_mm_profiling=self.skip_mm_profiling, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, @@ -1301,6 +1298,7 @@ class EngineArgs: override_attention_dtype=self.override_attention_dtype, logits_processors=self.logits_processors, video_pruning_rate=self.video_pruning_rate, + io_processor_plugin=self.io_processor_plugin, ) def validate_tensorizer_args(self): @@ -1396,25 +1394,9 @@ class EngineArgs: ) model_config = self.create_model_config() - renderer_config = RendererConfig( - model_config=model_config, - tokenizer=self.tokenizer or "", - tokenizer_mode=self.tokenizer_mode, - tokenizer_revision=self.tokenizer_revision, - skip_tokenizer_init=self.skip_tokenizer_init, - io_processor_plugin=self.io_processor_plugin, - media_io_kwargs=self.media_io_kwargs, - allowed_local_media_path=self.allowed_local_media_path, - allowed_media_domains=self.allowed_media_domains, - ) - - model_config.recalculate_max_model_len( - model_config.original_max_model_len, - tokenizer=renderer_config.tokenizer, - tokenizer_revision=renderer_config.tokenizer_revision, - ) - self.model = model_config.model + self.tokenizer = model_config.tokenizer + self._check_feature_supported(model_config) self._set_default_chunked_prefill_and_prefix_caching_args(model_config) self._set_default_max_num_seqs_and_batched_tokens_args( @@ -1786,7 +1768,6 @@ class EngineArgs: ) config = VllmConfig( model_config=model_config, - renderer_config=renderer_config, cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 7b60e7f89861b..d94951a0cffc8 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Iterable, Mapping from typing import Any -from vllm.config import ModelConfig, RendererConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.inputs.data import PromptType from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, RequestOutput @@ -22,7 +22,6 @@ class EngineClient(ABC): """Protocol class for Clients to Engine""" vllm_config: VllmConfig - renderer_config: RendererConfig model_config: ModelConfig input_processor: InputProcessor io_processor: IOProcessor | None diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 5ad256c2f3eb3..aceaa8bd45b81 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor from typing_extensions import Required, TypedDict from vllm import envs -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict @@ -452,10 +452,9 @@ This is needed because `lru_cache` does not cache when an exception happens. def _try_get_processor_chat_template( tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, - *, - trust_remote_code: bool, + model_config: ModelConfig, ) -> str | None: - cache_key = (tokenizer.name_or_path, trust_remote_code) + cache_key = (tokenizer.name_or_path, model_config.trust_remote_code) if cache_key in _PROCESSOR_CHAT_TEMPLATES: return _PROCESSOR_CHAT_TEMPLATES[cache_key] @@ -467,7 +466,7 @@ def _try_get_processor_chat_template( PreTrainedTokenizerFast, ProcessorMixin, ), - trust_remote_code=trust_remote_code, + trust_remote_code=model_config.trust_remote_code, ) if ( isinstance(processor, ProcessorMixin) @@ -500,10 +499,7 @@ def resolve_hf_chat_template( # 2nd priority: AutoProcessor chat template, unless tool calling is enabled if tools is None: - chat_template = _try_get_processor_chat_template( - tokenizer, - trust_remote_code=model_config.trust_remote_code, - ) + chat_template = _try_get_processor_chat_template(tokenizer, model_config) if chat_template is not None: return chat_template @@ -517,10 +513,10 @@ def resolve_hf_chat_template( exc_info=True, ) - # 4th priority: Predefined fallbacks] + # 4th priority: Predefined fallbacks path = get_chat_template_fallback_path( model_type=model_config.hf_config.model_type, - tokenizer_name_or_path=tokenizer.name_or_path, + tokenizer_name_or_path=model_config.tokenizer, ) if path is not None: logger.info_once( @@ -542,14 +538,14 @@ def _resolve_chat_template_content_format( tools: list[dict[str, Any]] | None, tokenizer: TokenizerLike | None, *, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> _ChatTemplateContentFormat: if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - model_config=renderer_config.model_config, + model_config=model_config, ) else: hf_chat_template = None @@ -599,7 +595,7 @@ def resolve_chat_template_content_format( given_format: ChatTemplateContentFormatOption, tokenizer: TokenizerLike | None, *, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> _ChatTemplateContentFormat: if given_format != "auto": return given_format @@ -608,7 +604,7 @@ def resolve_chat_template_content_format( chat_template, tools, tokenizer, - renderer_config=renderer_config, + model_config=model_config, ) _log_chat_template_content_format( @@ -631,32 +627,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): maximum per prompt. """ - def __init__(self, renderer_config: RendererConfig): + def __init__(self, model_config: ModelConfig): super().__init__() - self._renderer_config = renderer_config + self._model_config = model_config self._items_by_modality = defaultdict[str, list[_T | None]](list) self._uuids_by_modality = defaultdict[str, list[str | None]](list) @property - def renderer_config(self) -> RendererConfig: - return self._renderer_config + def model_config(self) -> ModelConfig: + return self._model_config @cached_property def model_cls(self) -> type[SupportsMultiModal]: from vllm.model_executor.model_loader import get_model_cls - model_cls = get_model_cls(self.renderer_config.model_config) + model_cls = get_model_cls(self.model_config) return cast(type[SupportsMultiModal], model_cls) @property def allowed_local_media_path(self): - return self._renderer_config.allowed_local_media_path + return self._model_config.allowed_local_media_path @property def allowed_media_domains(self): - return self._renderer_config.allowed_media_domains + return self._model_config.allowed_media_domains @property def mm_registry(self): @@ -664,7 +660,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): @cached_property def mm_processor(self): - return self.mm_registry.create_processor(self.renderer_config) + return self.mm_registry.create_processor(self.model_config) def add( self, @@ -855,20 +851,19 @@ class MultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker + multimodal_config = self._tracker.model_config.multimodal_config + media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) + self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=self.renderer_config.media_io_kwargs, + media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) - @property - def renderer_config(self) -> RendererConfig: - return self._tracker.renderer_config - @property def model_config(self) -> ModelConfig: - return self.renderer_config.model_config + return self._tracker.model_config def parse_image(self, image_url: str | None, uuid: str | None = None) -> None: image = self._connector.fetch_image(image_url) if image_url else None @@ -968,20 +963,18 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker + multimodal_config = self._tracker.model_config.multimodal_config + media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=self.renderer_config.media_io_kwargs, + media_io_kwargs=media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) - @property - def renderer_config(self) -> RendererConfig: - return self._tracker.renderer_config - @property def model_config(self) -> ModelConfig: - return self.renderer_config.model_config + return self._tracker.model_config def parse_image(self, image_url: str | None, uuid: str | None = None) -> None: image_coro = self._connector.fetch_image_async(image_url) if image_url else None @@ -1611,17 +1604,15 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: def parse_chat_messages( messages: list[ChatCompletionMessageParam], - renderer_config: RendererConfig, + model_config: ModelConfig, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], MultiModalDataDict | None, MultiModalUUIDDict | None, ]: - model_config = renderer_config.model_config - conversation: list[ConversationMessage] = [] - mm_tracker = MultiModalItemTracker(renderer_config) + mm_tracker = MultiModalItemTracker(model_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1644,17 +1635,15 @@ def parse_chat_messages( def parse_chat_messages_futures( messages: list[ChatCompletionMessageParam], - renderer_config: RendererConfig, + model_config: ModelConfig, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], Awaitable[MultiModalDataDict | None], MultiModalUUIDDict | None, ]: - model_config = renderer_config.model_config - conversation: list[ConversationMessage] = [] - mm_tracker = AsyncMultiModalItemTracker(renderer_config) + mm_tracker = AsyncMultiModalItemTracker(model_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1759,14 +1748,14 @@ def apply_hf_chat_template( chat_template: str | None, tools: list[dict[str, Any]] | None, *, - renderer_config: RendererConfig, + model_config: ModelConfig, **kwargs: Any, ) -> str: hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - model_config=renderer_config.model_config, + model_config=model_config, ) if hf_chat_template is None: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 6b3cb26afb626..913324fd5f9c3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -29,8 +29,8 @@ from vllm.config.model import ( HfOverrides, ModelDType, RunnerOption, + TokenizerMode, ) -from vllm.config.renderer import TokenizerMode from vllm.engine.arg_utils import EngineArgs from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, @@ -343,7 +343,6 @@ class LLM: logger.info("Supported tasks: %s", supported_tasks) self.supported_tasks = supported_tasks - self.renderer_config = self.llm_engine.renderer_config self.model_config = self.llm_engine.model_config self.input_processor = self.llm_engine.input_processor self.io_processor = self.llm_engine.io_processor @@ -809,13 +808,13 @@ class LLM: list_of_messages = [cast(list[ChatCompletionMessageParam], messages)] tokenizer = self.get_tokenizer() - renderer_config = self.renderer_config + model_config = self.model_config resolved_content_format = resolve_chat_template_content_format( chat_template, tools, chat_template_content_format, tokenizer, - renderer_config=renderer_config, + model_config=model_config, ) _chat_template_kwargs: dict[str, Any] = dict( @@ -834,7 +833,7 @@ class LLM: # the chat message parsing for it. conversation, mm_data, mm_uuids = parse_chat_messages( msgs, - renderer_config, + model_config, content_format=resolved_content_format, ) @@ -848,7 +847,7 @@ class LLM: prompt_str = apply_hf_chat_template( tokenizer=tokenizer, conversation=conversation, - renderer_config=renderer_config, + model_config=model_config, **_chat_template_kwargs, ) # Special tokens are already included in chat templates so @@ -1291,7 +1290,6 @@ class LLM: lora_request: list[LoRARequest] | LoRARequest | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ScoringRequestOutput]: - renderer_config = self.renderer_config model_config = self.model_config if isinstance(tokenizer, MistralTokenizer): @@ -1319,7 +1317,7 @@ class LLM: for q, d in input_pairs: _, engine_prompt = get_score_prompt( - renderer_config=renderer_config, + model_config=model_config, data_1=q, data_2=d, tokenizer=tokenizer, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index d77d611a2654d..7be601d824f34 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1099,7 +1099,7 @@ async def init_app_state( logger.info("Supported tasks: %s", supported_tasks) resolved_chat_template = await process_chat_template( - args.chat_template, engine_client, vllm_config.renderer_config + args.chat_template, engine_client, vllm_config.model_config ) if args.tool_server == "demo": diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a9e72fb00c5bd..3e421e21e3e80 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) - if self.renderer_config.skip_tokenizer_init: + if self.model_config.skip_tokenizer_init: tokenizer = None else: tokenizer = await self.engine_client.get_tokenizer() diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d887cf48d89f9..99936f588f28b 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -291,7 +291,6 @@ class OpenAIServing: self.input_processor = self.models.input_processor self.io_processor = self.models.io_processor - self.renderer_config = self.models.renderer_config self.model_config = self.models.model_config self.max_model_len = self.model_config.max_model_len @@ -1101,18 +1100,18 @@ class OpenAIServing: Sequence[RequestPrompt], list[EngineTokensPrompt], ]: - renderer_config = self.renderer_config + model_config = self.model_config resolved_content_format = resolve_chat_template_content_format( chat_template, tool_dicts, chat_template_content_format, tokenizer, - renderer_config=renderer_config, + model_config=model_config, ) conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, - renderer_config, + model_config, content_format=resolved_content_format, ) @@ -1139,14 +1138,14 @@ class OpenAIServing: request_prompt = tokenizer.apply_chat_template( conversation=conversation, messages=messages, - model_config=renderer_config.model_config, + model_config=model_config, **_chat_template_kwargs, ) else: request_prompt = apply_hf_chat_template( tokenizer=tokenizer, conversation=conversation, - renderer_config=renderer_config, + model_config=model_config, **_chat_template_kwargs, ) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index ec65e659383d8..953398a9a72ae 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -71,7 +71,6 @@ class OpenAIServingModels: self.input_processor = self.engine_client.input_processor self.io_processor = self.engine_client.io_processor - self.renderer_config = self.engine_client.renderer_config self.model_config = self.engine_client.model_config self.max_model_len = self.model_config.max_model_len diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 5fd79eed1909b..cea9924ebbaca 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing): self.task_type = task_type self.asr_config = self.model_cls.get_speech_to_text_config( - self.renderer_config, task_type + self.model_config, task_type ) self.enable_force_include_usage = enable_force_include_usage @@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing): self.tokenizer = cast( PreTrainedTokenizerBase, get_tokenizer( - tokenizer_name=self.renderer_config.tokenizer, - tokenizer_mode=self.renderer_config.tokenizer_mode, + tokenizer_name=self.model_config.tokenizer, + tokenizer_mode=self.model_config.tokenizer_mode, ), ) @@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing): prompt = self.model_cls.get_generation_prompt( audio=chunk, stt_config=self.asr_config, - renderer_config=self.renderer_config, + model_config=self.model_config, language=language, task_type=self.task_type, request_prompt=request.prompt, @@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing): if res.prompt_token_ids is not None: num_prompt_tokens = len(res.prompt_token_ids) if audio_tokens := self.model_cls.get_num_audio_tokens( - audio_duration_s, self.asr_config, self.renderer_config + audio_duration_s, self.asr_config, self.model_config ): num_prompt_tokens += audio_tokens diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index cd28ccba9ef95..7fb767e26d019 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) - if self.renderer_config.skip_tokenizer_init: + if self.model_config.skip_tokenizer_init: tokenizer = None else: tokenizer = await self.engine_client.get_tokenizer() diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index f657fcefd3a86..e5a66783005a6 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -160,8 +160,10 @@ class ServingScores(OpenAIServing): data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, ) -> tuple[str, TokensPrompt]: + model_config = self.model_config + full_prompt, engine_prompt = get_score_prompt( - renderer_config=self.renderer_config, + model_config=model_config, data_1=data_1, data_2=data_2, tokenizer=tokenizer, diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 561adbe454f3c..072ddd4c90b16 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast from torch.nn import CosineSimilarity from typing_extensions import Required, TypedDict -from vllm.config import ModelConfig, RendererConfig +from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import ( BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam, @@ -88,9 +88,9 @@ def _validate_score_input_lens( def parse_score_data( data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> tuple[str, str, MultiModalDataDict | None]: - mm_tracker = MultiModalItemTracker(renderer_config) + mm_tracker = MultiModalItemTracker(model_config) content_1 = _parse_score_content(data_1, mm_tracker) content_2 = _parse_score_content(data_2, mm_tracker) @@ -176,7 +176,7 @@ def post_process_tokens( def get_score_prompt( - renderer_config: RendererConfig, + model_config: ModelConfig, tokenizer: TokenizerLike, tokenization_kwargs: dict[str, Any], data_1: str | ScoreContentPartParam, @@ -185,14 +185,11 @@ def get_score_prompt( prompt_1, prompt_2, mm_data = parse_score_data( data_1, data_2, - renderer_config, + model_config, ) - from vllm.model_executor.model_loader import get_model_cls - model_config = renderer_config.model_config model = get_model_cls(model_config) - if supports_score_template(model): full_prompt = apply_score_template(model_config, prompt_1, prompt_2) prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index a81f73ac9e618..daeeb995bc749 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -13,7 +13,7 @@ from fastapi import Request from fastapi.responses import JSONResponse, StreamingResponse from starlette.background import BackgroundTask, BackgroundTasks -from vllm.config import RendererConfig +from vllm.config import ModelConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ( @@ -288,7 +288,7 @@ def process_lora_modules( async def process_chat_template( args_chat_template: Path | str | None, engine_client: EngineClient, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> str | None: resolved_chat_template = load_chat_template(args_chat_template) if resolved_chat_template is not None: @@ -305,7 +305,7 @@ async def process_chat_template( tokenizer=tokenizer, chat_template=None, tools=None, - model_config=renderer_config.model_config, + model_config=model_config, ) if hf_chat_template != resolved_chat_template: @@ -314,6 +314,6 @@ async def process_chat_template( "It is different from official chat template '%s'. " "This discrepancy may lead to performance degradation.", resolved_chat_template, - renderer_config.model_config.model, + model_config.model, ) return resolved_chat_template diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index f534d102fc3b7..0372b06d0017f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -6,7 +6,7 @@ from typing import Any, cast from typing_extensions import assert_never -from vllm.config import RendererConfig +from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -45,15 +45,14 @@ logger = init_logger(__name__) class InputPreprocessor: def __init__( self, - renderer_config: RendererConfig, + model_config: ModelConfig, tokenizer: TokenizerLike | None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: BaseMultiModalProcessorCache | None = None, ) -> None: super().__init__() - self.renderer_config = renderer_config - self.model_config = renderer_config.model_config + self.model_config = model_config self.tokenizer = tokenizer self.mm_registry = mm_registry self.mm_processor_cache = mm_processor_cache @@ -232,7 +231,7 @@ class InputPreprocessor: def _get_mm_processor(self) -> BaseMultiModalProcessor: if not hasattr(self, "_mm_processor"): self._mm_processor = self.mm_registry.create_processor( - self.renderer_config, + self.model_config, tokenizer=self.tokenizer, cache=self.mm_processor_cache, ) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index a2700bd5a5016..007d847ac3b7b 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax( from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader - renderer_config = model.vllm_config.renderer_config + model_config = model.vllm_config.model_config quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() @@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax( from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( - renderer_config.tokenizer, - revision=renderer_config.tokenizer_revision, - tokenizer_mode=renderer_config.tokenizer_mode, - trust_remote_code=renderer_config.trust_remote_code, + model_config.tokenizer, + revision=model_config.tokenizer_revision, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, ) false_id = tokenizer.convert_tokens_to_ids(tokens[0]) @@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader - renderer_config = model.vllm_config.renderer_config + model_config = model.vllm_config.model_config quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() @@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( - renderer_config.tokenizer, - revision=renderer_config.tokenizer_revision, - tokenizer_mode=renderer_config.tokenizer_mode, - trust_remote_code=renderer_config.trust_remote_code, + model_config.tokenizer, + revision=model_config.tokenizer_revision, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, ) token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens] diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index bd472474982ff..1f07381c0cbd0 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.projector_config = config.projector_config self.text_config = config.text_config - renderer_config = vllm_config.renderer_config - tokenizer = cached_tokenizer_from_config(renderer_config) + model_config = vllm_config.model_config + tokenizer = cached_tokenizer_from_config(model_config) self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN] self.sam_model = build_sam_vit_b() diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index be03e1df81d22..9f8faf9ed91ce 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.projector_config = config.projector_config self.text_config = config.text_config - renderer_config = vllm_config.renderer_config - tokenizer = cached_tokenizer_from_config(renderer_config) + model_config = vllm_config.model_config + tokenizer = cached_tokenizer_from_config(model_config) self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN] self.vision = self._init_vision_module( diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index f82529d849a64..7036118ada084 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -18,7 +18,7 @@ from transformers.models.gemma3n import ( ) from transformers.models.siglip import SiglipImageProcessorFast -from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig +from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration( cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, language: Optional[str], task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -798,9 +798,7 @@ class Gemma3nForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, - renderer_config: RendererConfig, - task_type: str, + cls, model_config: ModelConfig, task_type: str ) -> SpeechToTextConfig: return SpeechToTextConfig( # Let's set this to 30 as suggested in the docs for now, although diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 96645f20b79df..a4e50f4086281 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -34,7 +34,7 @@ import torch.nn.functional as F from torch import nn from transformers import BatchFeature, PretrainedConfig -from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear @@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - renderer_config: RendererConfig, + model_config: ModelConfig, stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], @@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration( else: raise ValueError(f"Unsupported task type {task_type}") - tokenizer = cached_tokenizer_from_config(renderer_config) + tokenizer = cached_tokenizer_from_config(model_config) chat = [dict(role="user", content=user_prompt)] prompt = tokenizer.apply_chat_template( chat, @@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> int | None: """Get the number of audio tokens for an audio duration in sec.""" - processor = cached_processor_from_config(renderer_config) + processor = cached_processor_from_config(model_config) hop_length = processor.audio_processor.melspec_kwargs["hop_length"] proj_win_size = processor.audio_processor.projector_window_size ds_rate = processor.audio_processor.projector_downsample_rate @@ -903,9 +903,7 @@ class GraniteSpeechForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, - renderer_config: RendererConfig, - task_type: str, + cls, model_config: ModelConfig, task_type: str ) -> SpeechToTextConfig: """Get the stt config for this model.""" # Default settings are reasonable for this model and we don't currently diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index b9f3ac8aee5f7..2aba626a7c737 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -6,7 +6,7 @@ import numpy as np import torch import torch.nn as nn -from vllm.config import RendererConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.pooler import ( DispatchPooler, @@ -29,12 +29,12 @@ logger = init_logger(__name__) class GritLMMeanPool(nn.Module): """As `MeanPool`, but only includes non-instruction tokens.""" - def __init__(self, renderer_config: RendererConfig): + def __init__(self, model_config: ModelConfig): super().__init__() - self.renderer_config = renderer_config + self.model_config = model_config - tokenizer = cached_tokenizer_from_config(self.renderer_config) + tokenizer = cached_tokenizer_from_config(self.model_config) # Collect the tokens needed for pattern matching. # "▁<" is different from "_<". The former uses "▁" to indicate that @@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module): class GritLMPooler(Pooler): - def __init__(self, renderer_config: RendererConfig): + def __init__(self, model_config: ModelConfig): super().__init__() - self.pooling = GritLMMeanPool(renderer_config) + self.pooling = GritLMMeanPool(model_config) self.head = PoolerHead(PoolerNormalize()) def get_supported_tasks(self) -> Set[PoolingTask]: @@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM): self.pooler = DispatchPooler( { "token_embed": Pooler.for_token_embed(pooler_config), - "embed": GritLMPooler(vllm_config.renderer_config), + "embed": GritLMPooler(vllm_config.model_config), } ) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 4df91aaf8b7f2..607ff55835f1d 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -19,7 +19,7 @@ from torch import Tensor from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs -from vllm.config import RendererConfig, SpeechToTextConfig +from vllm.config import ModelConfig, SpeechToTextConfig from vllm.inputs import TokensPrompt from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -887,7 +887,7 @@ class SupportsTranscription(Protocol): cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -930,9 +930,7 @@ class SupportsTranscription(Protocol): @classmethod def get_speech_to_text_config( - cls, - renderer_config: RendererConfig, - task_type: Literal["transcribe", "translate"], + cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"] ) -> SpeechToTextConfig: """Get the speech to text config for the ASR model.""" ... @@ -942,7 +940,7 @@ class SupportsTranscription(Protocol): cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> int | None: """ Map from audio duration to number of audio tokens produced by the ASR diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index d75637da1d0e2..18985cefbf5ea 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs) hf_processor.video_processor = cached_video_processor_from_config( - self.ctx.renderer_config, + self.ctx.model_config, processor_cls=InternVLVideoProcessor, size=hf_processor.image_processor.size, **kwargs, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 4daaefd0c2709..6dfab595e5b92 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1169,17 +1169,16 @@ class NemotronH_Nano_VL_V2( self.mlp1 = self.mlp1.to(self.language_model.config.dtype) self.config = config + self.model_config = vllm_config.model_config # Pre-tokenize special tokens for video processing # to avoid repeated tokenization - self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config) - self._img_start_token_ids = self._tokenizer.encode( + tokenizer = cached_tokenizer_from_config(vllm_config.model_config) + self._img_start_token_ids = tokenizer.encode( IMG_START, add_special_tokens=False ) - self._img_end_token_ids = self._tokenizer.encode( - IMG_END, add_special_tokens=False - ) - self._img_context_token_ids = self._tokenizer.encode( + self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False) + self._img_context_token_ids = tokenizer.encode( IMG_CONTEXT, add_special_tokens=False ) @@ -1365,7 +1364,7 @@ class NemotronH_Nano_VL_V2( input_embeds for the LLM. """ device = video_embeddings.device - tokenizer = self._tokenizer + tokenizer = cached_tokenizer_from_config(self.model_config) # Generate video replacement token IDs using get_video_repl # This tokenizes each frame separator independently, then uses pre-tokenized diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 797793e658f7b..391980fc61f9e 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): def get_image_processor(self, **kwargs: object): return cached_image_processor_from_config( - self.ctx.renderer_config, + self.ctx.model_config, **kwargs, ) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index ebe743fa82a00..faf2d80d24bba 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -193,7 +193,7 @@ class PixtralProcessorAdapter: class PixtralProcessingInfo(BaseProcessingInfo): def get_tokenizer(self) -> MistralTokenizer: - tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config) + tokenizer = cached_tokenizer_from_config(self.ctx.model_config) if not isinstance(tokenizer, MistralTokenizer): raise ValueError("This model requires `--tokenizer-mode mistral`") diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 0acd564e2e54f..7b408248ec74c 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder from transformers import BatchFeature, TensorType, WhisperConfig from transformers.tokenization_utils_base import TextInput -from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig +from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -176,7 +176,7 @@ class VoxtralProcessorAdapter: class VoxtralProcessingInfo(BaseProcessingInfo): def get_tokenizer(self) -> MistralTokenizer: - tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config) + tokenizer = cached_tokenizer_from_config(self.ctx.model_config) if not isinstance(tokenizer, MistralTokenizer): raise ValueError("This model requires `--tokenizer-mode mistral`") @@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration( def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config) + self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) # update quant config to so that ignored module and target module names # match the vLLM model names @@ -450,11 +450,9 @@ class VoxtralForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, - renderer_config: RendererConfig, - task_type: str, + cls, model_config: ModelConfig, task_type: str ) -> SpeechToTextConfig: - tokenizer = cached_tokenizer_from_config(renderer_config) + tokenizer = cached_tokenizer_from_config(model_config) audio_config = tokenizer.instruct.audio_encoder.audio_config max_audio_clip_s = audio_config.chunk_length_s sample_rate = audio_config.sampling_rate @@ -470,17 +468,17 @@ class VoxtralForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - renderer_config: RendererConfig, # not needed here + model_config: ModelConfig, stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, ) -> PromptType: - tokenizer = cached_tokenizer_from_config(renderer_config) + tokenizer = cached_tokenizer_from_config(model_config) audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless req = TranscriptionRequest( - model=renderer_config.model_config.model, + model=model_config.model, audio=RawAudio.from_audio(audio), language=language, ) @@ -496,14 +494,14 @@ class VoxtralForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> int | None: """ Map from audio duration to number of audio tokens produced by the ASR model, without running a forward pass. This is used for estimating the amount of processing for this audio. """ - tokenizer = cached_tokenizer_from_config(renderer_config) + tokenizer = cached_tokenizer_from_config(model_config) adapter = VoxtralProcessorAdapter(tokenizer) return adapter.get_num_audio_tokens( int(audio_duration_s * stt_config.sample_rate) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 6f526e3956fff..b2feff1335151 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention, MultiHeadAttention from vllm.attention.layers.cross_attention import CrossAttention -from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType @@ -811,7 +811,7 @@ class WhisperForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - renderer_config: RendererConfig, # not needed here + model_config: ModelConfig, # not needed here stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], @@ -847,11 +847,9 @@ class WhisperForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, - renderer_config: RendererConfig, - task_type: str, + cls, model_config: ModelConfig, task_type: str ) -> SpeechToTextConfig: - processor = cached_processor_from_config(renderer_config) + processor = cached_processor_from_config(model_config) return SpeechToTextConfig( max_audio_clip_s=processor.feature_extractor.chunk_length, @@ -863,9 +861,9 @@ class WhisperForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> int | None: - processor = cached_processor_from_config(renderer_config) + processor = cached_processor_from_config(model_config) hop_length = processor.feature_extractor.hop_length assert hop_length is not None # NOTE(NickLucche) user can't pass encoder diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 9c838fe679582..67bdf5e1557f9 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -31,7 +31,7 @@ from .inputs import ( ) if TYPE_CHECKING: - from vllm.config import ModelConfig, RendererConfig, VllmConfig + from vllm.config import ModelConfig, VllmConfig from .processing import ResolvedPromptUpdate from .registry import MultiModalRegistry @@ -561,13 +561,13 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache): def _enable_processor_cache( - renderer_config: "RendererConfig", + model_config: "ModelConfig", mm_registry: "MultiModalRegistry", ) -> bool: - if not mm_registry.supports_multimodal_inputs(renderer_config): + if not mm_registry.supports_multimodal_inputs(model_config): return False - mm_config = renderer_config.model_config.get_multimodal_config() + mm_config = model_config.get_multimodal_config() return mm_config.mm_processor_cache_gb > 0 @@ -599,7 +599,7 @@ def processor_cache_from_config( """Return a `BaseMultiModalProcessorCache`, if enabled.""" model_config = vllm_config.model_config - if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): + if not _enable_processor_cache(model_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): @@ -611,14 +611,14 @@ def processor_cache_from_config( def processor_only_cache_from_config( - renderer_config: "RendererConfig", + model_config: "ModelConfig", mm_registry: "MultiModalRegistry", ): """Return a `MultiModalProcessorOnlyCache`, if enabled.""" - if not _enable_processor_cache(renderer_config, mm_registry): + if not _enable_processor_cache(model_config, mm_registry): return None - return MultiModalProcessorOnlyCache(renderer_config.model_config) + return MultiModalProcessorOnlyCache(model_config) class BaseMultiModalReceiverCache( @@ -787,7 +787,7 @@ def engine_receiver_cache_from_config( """ model_config = vllm_config.model_config - if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): + if not _enable_processor_cache(model_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): @@ -809,7 +809,9 @@ def worker_receiver_cache_from_config( Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and mm_processor_cache_type=="shm". """ - if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): + model_config = vllm_config.model_config + + if not _enable_processor_cache(model_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 81ceb76a4b961..0390773783961 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -23,7 +23,7 @@ import torch from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config +from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.collection_utils import flatten_2d_lists, full_groupby from vllm.utils.func_utils import get_allowed_kwarg_only_overrides @@ -53,7 +53,7 @@ if TYPE_CHECKING: from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin - from vllm.config import ModelConfig, RendererConfig + from vllm.config import ModelConfig from .cache import BaseMultiModalProcessorCache from .profiling import BaseDummyInputsBuilder @@ -63,7 +63,6 @@ else: ProcessorMixin = object ModelConfig = object - RendererConfig = object BaseMultiModalProcessorCache = object @@ -946,29 +945,12 @@ class InputProcessingContext: modify the inputs. """ - renderer_config: RendererConfig - """The configuration of the renderer.""" + model_config: ModelConfig + """The configuration of the model.""" tokenizer: TokenizerLike | None """The tokenizer used to tokenize the inputs.""" - @classmethod - def from_config( - cls, - renderer_config: RendererConfig, - *, - tokenizer: TokenizerLike | None = None, - ): - if tokenizer is None and not renderer_config.skip_tokenizer_init: - tokenizer = cached_tokenizer_from_config(renderer_config) - - return cls(renderer_config, tokenizer) - - @property - def model_config(self) -> ModelConfig: - """The configuration of the model.""" - return self.renderer_config.model_config - def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( @@ -1065,7 +1047,7 @@ class InputProcessingContext: typ = ProcessorMixin return cached_processor_from_config( - self.renderer_config, + self.model_config, processor_cls=typ, tokenizer=self.tokenizer, **kwargs, diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index e49aaa5045c62..00a84f9dec4f7 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from .cache import BaseMultiModalProcessorCache from .processing import ( @@ -22,7 +22,7 @@ from .profiling import ( ) if TYPE_CHECKING: - from vllm.config import ModelConfig, RendererConfig + from vllm.config import ModelConfig from vllm.model_executor.models.interfaces import SupportsMultiModal logger = init_logger(__name__) @@ -114,18 +114,17 @@ class MultiModalRegistry: return mm_options if len(mm_options) > 0 else None - def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool: + def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: """ Checks if the model supports multimodal inputs. Returns True if the model is multimodal with any non-zero supported modalities, otherwise returns False, effectively running in text-only mode. """ - model_config = renderer_config.model_config if not model_config.is_multimodal_model: return False - info = self._create_processing_info(renderer_config, tokenizer=None) + info = self._create_processing_info(model_config, tokenizer=None) supported_modalities = info.get_supported_mm_limits() mm_config = model_config.get_multimodal_config() @@ -145,7 +144,7 @@ class MultiModalRegistry: def get_max_tokens_per_item_by_modality( self, - renderer_config: "RendererConfig", + model_config: "ModelConfig", *, cache: BaseMultiModalProcessorCache | None = None, profiler_limits: Mapping[str, int] | None = None, @@ -154,11 +153,10 @@ class MultiModalRegistry: Get the maximum number of tokens per data item from each modality based on underlying model configuration. """ - model_config = renderer_config.model_config if not model_config.is_multimodal_model: return {} - processor = self.create_processor(renderer_config, cache=cache) + processor = self.create_processor(model_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len @@ -173,7 +171,7 @@ class MultiModalRegistry: def get_mm_limits_per_prompt( self, - renderer_config: "RendererConfig", + model_config: "ModelConfig", *, cache: BaseMultiModalProcessorCache | None = None, ) -> Mapping[str, int]: @@ -181,11 +179,10 @@ class MultiModalRegistry: Get the maximum number of multi-modal input instances for each modality that are allowed per prompt for a model class. """ - model_config = renderer_config.model_config if not model_config.is_multimodal_model: return {} - processor = self.create_processor(renderer_config, cache=cache) + processor = self.create_processor(model_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -231,21 +228,30 @@ class MultiModalRegistry: assert hasattr(model_cls, "_processor_factory") return cast("SupportsMultiModal", model_cls) + def _create_processing_ctx( + self, + model_config: "ModelConfig", + tokenizer: TokenizerLike | None = None, + ) -> InputProcessingContext: + if tokenizer is None and not model_config.skip_tokenizer_init: + tokenizer = cached_tokenizer_from_config(model_config) + + return InputProcessingContext(model_config, tokenizer) + def _create_processing_info( self, - renderer_config: "RendererConfig", + model_config: "ModelConfig", *, tokenizer: TokenizerLike | None = None, ) -> BaseProcessingInfo: - model_cls = self._get_model_cls(renderer_config.model_config) + model_cls = self._get_model_cls(model_config) factories = model_cls._processor_factory - - ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer) + ctx = self._create_processing_ctx(model_config, tokenizer) return factories.info(ctx) def create_processor( self, - renderer_config: "RendererConfig", + model_config: "ModelConfig", *, tokenizer: TokenizerLike | None = None, cache: BaseMultiModalProcessorCache | None = None, @@ -253,19 +259,19 @@ class MultiModalRegistry: """ Create a multi-modal processor for a specific model and tokenizer. """ - model_config = renderer_config.model_config if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") model_cls = self._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer) + ctx = self._create_processing_ctx(model_config, tokenizer) + return factories.build_processor(ctx, cache=cache) def get_decoder_dummy_data( self, - renderer_config: "RendererConfig", + model_config: "ModelConfig", seq_len: int, mm_counts: Mapping[str, int] | None = None, *, @@ -274,15 +280,15 @@ class MultiModalRegistry: """ Create dummy data for profiling the memory usage of a model. - The model is identified by `renderer_config`. + The model is identified by `model_config`. """ - processor = self.create_processor(renderer_config, cache=cache) + processor = self.create_processor(model_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options = self._extract_mm_options(renderer_config.model_config) + mm_options = self._extract_mm_options(model_config) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options) @@ -298,7 +304,7 @@ class MultiModalRegistry: def get_encoder_dummy_data( self, - renderer_config: "RendererConfig", + model_config: "ModelConfig", seq_len: int, mm_counts: Mapping[str, int] | None = None, *, @@ -307,15 +313,15 @@ class MultiModalRegistry: """ Create dummy data for profiling the memory usage of a model. - The model is identified by `renderer_config`. + The model is identified by `model_config`. """ - processor = self.create_processor(renderer_config, cache=cache) + processor = self.create_processor(model_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options = self._extract_mm_options(renderer_config.model_config) + mm_options = self._extract_mm_options(model_config) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options) @@ -330,15 +336,13 @@ class MultiModalRegistry: return dummy_data - def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int: + def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: """ Get the maximum length of the encoder input for encoder-decoder models. """ - model_config = renderer_config.model_config if not model_config.is_encoder_decoder: return 0 - - max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config) + max_tokens = self.get_max_tokens_per_item_by_modality(model_config) if not max_tokens: # TODO - this function assumes encoder-decoder models are # multimodal. This will need to change when adding support for more diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index c9575511af8c9..1d44feeee500f 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -24,7 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname from .protocol import TokenizerLike if TYPE_CHECKING: - from vllm.config import RendererConfig + from vllm.config import ModelConfig logger = init_logger(__name__) @@ -205,18 +205,18 @@ def get_tokenizer( cached_get_tokenizer = lru_cache(get_tokenizer) -def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs): +def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs): return cached_get_tokenizer( - renderer_config.tokenizer, - tokenizer_mode=renderer_config.tokenizer_mode, - revision=renderer_config.tokenizer_revision, - trust_remote_code=renderer_config.trust_remote_code, + model_config.tokenizer, + tokenizer_mode=model_config.tokenizer_mode, + revision=model_config.tokenizer_revision, + trust_remote_code=model_config.trust_remote_code, **kwargs, ) -def init_tokenizer_from_config(renderer_config: "RendererConfig"): - runner_type = renderer_config.model_config.runner_type +def init_tokenizer_from_config(model_config: "ModelConfig"): + runner_type = model_config.runner_type if runner_type == "generate" or runner_type == "draft": truncation_side = "left" elif runner_type == "pooling": @@ -225,9 +225,9 @@ def init_tokenizer_from_config(renderer_config: "RendererConfig"): assert_never(runner_type) return get_tokenizer( - renderer_config.tokenizer, - tokenizer_mode=renderer_config.tokenizer_mode, - trust_remote_code=renderer_config.trust_remote_code, - revision=renderer_config.tokenizer_revision, + model_config.tokenizer, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.tokenizer_revision, truncation_side=truncation_side, ) diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index bdebd2686bae2..e9864b0c1531d 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -23,7 +23,7 @@ from vllm.transformers_utils.utils import convert_model_repo_to_path from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: - from vllm.config import ModelConfig, RendererConfig + from vllm.config import ModelConfig _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) _V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor) @@ -233,18 +233,17 @@ def cached_get_processor_without_dynamic_kwargs( def cached_processor_from_config( - renderer_config: "RendererConfig", + model_config: "ModelConfig", processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, **kwargs: Any, ) -> _P: - model_config = renderer_config.model_config if is_gguf(model_config.model): - assert not is_gguf(renderer_config.tokenizer), ( + assert not is_gguf(model_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load processor." ) - model = renderer_config.tokenizer - revision = renderer_config.tokenizer_revision + model = model_config.tokenizer + revision = model_config.tokenizer_revision else: model = model_config.model revision = model_config.revision @@ -298,11 +297,9 @@ cached_get_feature_extractor = lru_cache(get_feature_extractor) def cached_feature_extractor_from_config( - renderer_config: "RendererConfig", + model_config: "ModelConfig", **kwargs: Any, ): - model_config = renderer_config.model_config - return cached_get_feature_extractor( model_config.model, revision=model_config.revision, @@ -351,17 +348,16 @@ cached_get_image_processor = lru_cache(get_image_processor) def cached_image_processor_from_config( - renderer_config: "RendererConfig", + model_config: "ModelConfig", **kwargs: Any, ): - model_config = renderer_config.model_config if is_gguf(model_config.model): - assert not is_gguf(renderer_config.tokenizer), ( + assert not is_gguf(model_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load image processor." ) - model = renderer_config.tokenizer - revision = renderer_config.tokenizer_revision + model = model_config.tokenizer + revision = model_config.tokenizer_revision else: model = model_config.model revision = model_config.revision @@ -415,12 +411,10 @@ cached_get_video_processor = lru_cache(get_video_processor) def cached_video_processor_from_config( - renderer_config: "RendererConfig", + model_config: "ModelConfig", processor_cls: type[_V] | None = None, **kwargs: Any, ): - model_config = renderer_config.model_config - return cached_get_video_processor( model_config.model, revision=model_config.revision, diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 21315b85f22aa..3959e9a59a53b 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -10,7 +10,7 @@ from vllm.multimodal import MultiModalRegistry from vllm.v1.request import Request if TYPE_CHECKING: - from vllm.config import RendererConfig, SchedulerConfig + from vllm.config import ModelConfig, SchedulerConfig logger = init_logger(__name__) @@ -250,7 +250,7 @@ class EncoderCacheManager: def compute_encoder_budget( - renderer_config: "RendererConfig", + model_config: "ModelConfig", scheduler_config: "SchedulerConfig", mm_registry: MultiModalRegistry, ) -> tuple[int, int]: @@ -263,9 +263,9 @@ def compute_encoder_budget( - Space budget for encoder cache size, measured in number of tokens from the input sequence. """ - if mm_registry.supports_multimodal_inputs(renderer_config): + if mm_registry.supports_multimodal_inputs(model_config): max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( - renderer_config + model_config ) return compute_mm_encoder_budget( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 96073efc5f56a..0a8efa2fd512f 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -164,7 +164,7 @@ class Scheduler(SchedulerInterface): # This can be changed when we make encoder cache for embedding caching # across requests. encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - renderer_config=vllm_config.renderer_config, + model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, mm_registry=mm_registry, ) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b76f9c0595d61..fd7e04dc02082 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -91,7 +91,6 @@ class AsyncLLM(EngineClient): # Ensure we can serialize custom transformer configs maybe_register_config_serialize_by_value() - self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config @@ -109,15 +108,15 @@ class AsyncLLM(EngineClient): "enabling logging without default stat loggers." ) - if self.renderer_config.skip_tokenizer_init: + if self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.renderer_config) + tokenizer = init_tokenizer_from_config(self.model_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( self.vllm_config, - self.renderer_config.io_processor_plugin, + self.model_config.io_processor_plugin, ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index a2f6ba5be8c17..e6a94f4e3de5d 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -43,7 +43,6 @@ class InputProcessor: mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: self.vllm_config = vllm_config - self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config @@ -55,7 +54,7 @@ class InputProcessor: self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry) self.input_preprocessor = InputPreprocessor( - self.renderer_config, + self.model_config, tokenizer, mm_registry, mm_processor_cache=self.mm_processor_cache, @@ -253,7 +252,7 @@ class InputProcessor: if not params.structured_outputs or not self.structured_outputs_config: return - if self.renderer_config.skip_tokenizer_init and params.structured_outputs: + if self.model_config.skip_tokenizer_init and params.structured_outputs: raise ValueError( "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) @@ -583,7 +582,7 @@ class InputProcessor: if prompt_type == "encoder" and model_config.is_multimodal_model: mm_registry = self.input_preprocessor.mm_registry mm_processor = mm_registry.create_processor( - self.renderer_config, + model_config, tokenizer=tokenizer, ) assert isinstance(mm_processor, EncDecMultiModalProcessor) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index ba0e1cf25cb08..4c31291005477 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -60,7 +60,6 @@ class LLMEngine: ) -> None: self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config - self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -84,15 +83,15 @@ class LLMEngine: self.dp_group = None self.should_execute_dummy_batch = False - if self.renderer_config.skip_tokenizer_init: + if self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.renderer_config) + tokenizer = init_tokenizer_from_config(self.model_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( self.vllm_config, - self.renderer_config.io_processor_plugin, + self.model_config.io_processor_plugin, ) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 7976418510aab..31428db2d3afc 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -85,7 +85,7 @@ class EagleProposer: # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - vllm_config.renderer_config + vllm_config.model_config ) self.attn_metadata_builder: AttentionMetadataBuilder | None = None diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 36aa3d9bb3f94..5ee88178cdf60 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -63,7 +63,7 @@ class StructuredOutputManager: max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8)) self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers) - if not vllm_config.renderer_config.skip_tokenizer_init: + if not self.vllm_config.model_config.skip_tokenizer_init: # The default max_workers if not specified is the number of # CPUs * 5, which is way too high since these tasks are CPU-bound, # not I/O bound. We also know we would never dominate CPU usage @@ -71,15 +71,21 @@ class StructuredOutputManager: # of CPUs. max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) - self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config) - reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser + self.tokenizer = init_tokenizer_from_config( + model_config=self.vllm_config.model_config + ) + reasoning_parser = ( + self.vllm_config.structured_outputs_config.reasoning_parser + ) reasoning_parser_plugin = ( - vllm_config.structured_outputs_config.reasoning_parser_plugin + self.vllm_config.structured_outputs_config.reasoning_parser_plugin ) if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3: ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin) - reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser + reasoning_parser = ( + self.vllm_config.structured_outputs_config.reasoning_parser + ) if reasoning_parser: reasoner_cls = ReasoningParserManager.get_reasoning_parser( reasoning_parser @@ -87,7 +93,7 @@ class StructuredOutputManager: self.reasoner = reasoner_cls(tokenizer=self.tokenizer) self.enable_in_reasoning = ( - vllm_config.structured_outputs_config.enable_in_reasoning + self.vllm_config.structured_outputs_config.enable_in_reasoning ) def grammar_init(self, request: Request) -> None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b3c8d4da22b63..a50360ab08694 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -271,7 +271,6 @@ class GPUModelRunner( device: torch.device, ): self.vllm_config = vllm_config - self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.compilation_config = vllm_config.compilation_config @@ -336,7 +335,7 @@ class GPUModelRunner( self.uses_mrope = model_config.uses_mrope self.uses_xdrope_dim = model_config.uses_xdrope_dim self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - self.renderer_config + model_config ) if self.model_config.is_encoder_decoder: @@ -559,7 +558,7 @@ class GPUModelRunner( self.mm_budget = ( MultiModalBudget( - self.renderer_config, + self.model_config, self.scheduler_config, self.mm_registry, ) @@ -3874,7 +3873,7 @@ class GPUModelRunner( assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - renderer_config=self.renderer_config, + model_config=self.model_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 7e2a6af68e6af..283f21b779e38 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -143,7 +143,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): original_parallel_config: ParallelConfig | None = None, ): self.vllm_config = vllm_config - self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config @@ -223,7 +222,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - self.renderer_config + model_config ) # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." @@ -354,7 +353,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_budget = ( MultiModalBudget( - self.renderer_config, + self.model_config, self.scheduler_config, self.mm_registry, ) @@ -2039,7 +2038,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - renderer_config=self.renderer_config, + model_config=self.model_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 44418b9985e2e..0b0e2006d73d2 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,7 +7,7 @@ import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention -from vllm.config import RendererConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config @@ -23,29 +23,24 @@ class MultiModalBudget: def __init__( self, - renderer_config: RendererConfig, + model_config: ModelConfig, scheduler_config: SchedulerConfig, mm_registry: MultiModalRegistry, ) -> None: super().__init__() - self.renderer_config = renderer_config - self.model_config = renderer_config.model_config + self.model_config = model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry - self.cache = cache = processor_only_cache_from_config( - renderer_config, mm_registry - ) + self.cache = cache = processor_only_cache_from_config(model_config, mm_registry) - self.max_model_len = self.model_config.max_model_len + self.max_model_len = model_config.max_model_len self.max_num_reqs = scheduler_config.max_num_seqs - self.mm_limits = mm_registry.get_mm_limits_per_prompt( - renderer_config, cache=cache - ) + self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache) max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( - renderer_config, + model_config, cache=cache, profiler_limits=self.mm_limits, ) From 1b0482b9d1b8a082ff94a95516ec78349de6f515 Mon Sep 17 00:00:00 2001 From: Yifan Qiao Date: Sun, 7 Dec 2025 00:39:21 -0800 Subject: [PATCH 048/133] [Misc][Core] Remove unused `req_index` increment in scheduler (#30176) Signed-off-by: Yifan Qiao --- vllm/v1/core/sched/scheduler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 0a8efa2fd512f..d858e840039c4 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -607,7 +607,6 @@ class Scheduler(SchedulerInterface): self._update_connector_prefix_cache_stats(request) - req_index += 1 self.running.append(request) if self.log_stats: request.record_event( From 879ddb09c3b7b5a21f75e46ded148ce70f1c486e Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sun, 7 Dec 2025 17:58:47 +0800 Subject: [PATCH 049/133] [Kernel][MoE] optimize `moe_align_block_size` (#29642) Signed-off-by: Jinzhen Lin Co-authored-by: Jee Jee Li --- .../kernels/benchmark_moe_align_block_size.py | 21 ++- csrc/moe/moe_align_sum_kernels.cu | 152 ++++++++++++------ csrc/moe/moe_ops.h | 3 +- csrc/moe/torch_bindings.cpp | 3 +- tests/kernels/moe/test_moe.py | 16 +- .../kernels/moe/test_moe_align_block_size.py | 22 ++- vllm/_custom_ops.py | 2 + .../layers/fused_moe/fused_marlin_moe.py | 6 +- .../layers/fused_moe/fused_moe.py | 9 +- .../layers/fused_moe/moe_align_block_size.py | 24 ++- 10 files changed, 195 insertions(+), 63 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py index f540cff6261a8..5f9a131f79b0e 100644 --- a/benchmarks/kernels/benchmark_moe_align_block_size.py +++ b/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: num_tokens_range = [1, 16, 256, 4096] num_experts_range = [16, 64, 224, 256, 280, 512] topk_range = [1, 2, 8] -configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range)) +ep_size_range = [1, 8] +configs = list( + itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range) +) @triton.testing.perf_report( triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk"], + x_names=["num_tokens", "num_experts", "topk", "ep_size"], x_vals=configs, line_arg="provider", line_vals=["vllm"], @@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range args={}, ) ) -def benchmark(num_tokens, num_experts, topk, provider): +def benchmark(num_tokens, num_experts, topk, ep_size, provider): """Benchmark function for Triton.""" block_size = 256 + torch.cuda.manual_seed_all(0) topk_ids = get_topk_ids(num_tokens, num_experts, topk) + e_map = None + if ep_size != 1: + local_e = num_experts // ep_size + e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e] + e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32) + e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32) + quantiles = [0.5, 0.2, 0.8] if provider == "vllm": ms, min_ms, max_ms = triton.testing.do_bench( - lambda: moe_align_block_size(topk_ids, block_size, num_experts), + lambda: moe_align_block_size( + topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True + ), quantiles=quantiles, ) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index b3d0c0aa58e9e..ddcdcc38b4fea 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -83,14 +83,22 @@ template __global__ void moe_align_block_size_kernel( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, - int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, + int32_t* __restrict__ total_tokens_post_pad, + int32_t* __restrict__ expert_map, int32_t num_experts, int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, - size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded) { + size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded, + bool has_expert_map) { extern __shared__ int32_t shared_counts[]; - // Initialize sorted_token_ids with numel - for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { - sorted_token_ids[it] = numel; + // Use a separate threadblock to fill sorted_token_ids. + // This is safe since the current kernel does not use sorted_token_ids. + if (blockIdx.x == 1) { + // Initialize sorted_token_ids with numel + for (size_t it = threadIdx.x; it < max_num_tokens_padded; + it += blockDim.x) { + sorted_token_ids[it] = numel; + } + return; } const int warp_id = threadIdx.x / WARP_SIZE; @@ -112,6 +120,11 @@ __global__ void moe_align_block_size_kernel( if (expert_id >= num_experts) { continue; } + if (has_expert_map) { + expert_id = expert_map[expert_id]; + // filter invalid experts + if (expert_id == -1) continue; + } int warp_idx = expert_id / experts_per_warp; int expert_offset = expert_id % experts_per_warp; atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1); @@ -163,7 +176,8 @@ template __global__ void count_and_sort_expert_tokens_kernel( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, - size_t numel, int32_t num_experts) { + int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, + bool has_expert_map) { const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; const size_t stride = blockDim.x * gridDim.x; @@ -172,6 +186,11 @@ __global__ void count_and_sort_expert_tokens_kernel( if (expert_id >= num_experts) { continue; } + if (has_expert_map) { + expert_id = expert_map[expert_id]; + // filter invalid experts + if (expert_id == -1) continue; + } int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); sorted_token_ids[rank_post_pad] = i; } @@ -193,50 +212,69 @@ __global__ void moe_sum_kernel( } } -template +template __global__ void moe_align_block_size_small_batch_expert_kernel( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, - int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts, - int32_t block_size, size_t numel, int32_t max_num_tokens_padded) { - // Initialize sorted_token_ids with numel - for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { - sorted_token_ids[it] = numel; + int32_t* __restrict__ total_tokens_post_pad, + int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size, + size_t numel, int32_t max_num_tokens_padded, bool has_expert_map) { + // Use an additional group of threads to fill sorted_token_ids. + // Since the current kernel will use sorted_token_ids afterward, + // we fill sorted_token_ids within the same threadblock to make + // synchronization easier. + if (threadIdx.x < fill_threads) { + // Initialize sorted_token_ids with numel + for (size_t it = threadIdx.x; it < max_num_tokens_padded; + it += fill_threads) { + sorted_token_ids[it] = numel; + } + // Three __syncthreads() corresponding to the other threads + __syncthreads(); + __syncthreads(); + __syncthreads(); + return; } - const size_t tid = threadIdx.x; - const size_t stride = blockDim.x; + const size_t tid = threadIdx.x - fill_threads; + const size_t stride = blockDim.x - fill_threads; extern __shared__ int32_t shared_mem[]; int32_t* cumsum = shared_mem; int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1); for (int i = 0; i < num_experts; ++i) { - tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0; + tokens_cnts[(tid + 1) * num_experts + i] = 0; } for (size_t i = tid; i < numel; i += stride) { - ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]]; + int32_t expert_id = topk_ids[i]; + if (has_expert_map) { + expert_id = expert_map[expert_id]; + // filter invalid expert + if (expert_id == -1) continue; + } + ++tokens_cnts[(tid + 1) * num_experts + expert_id]; } __syncthreads(); - if (threadIdx.x < num_experts) { - tokens_cnts[threadIdx.x] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[i * num_experts + threadIdx.x] += - tokens_cnts[(i - 1) * num_experts + threadIdx.x]; + if (tid < num_experts) { + tokens_cnts[tid] = 0; + for (int i = 1; i <= stride; ++i) { + tokens_cnts[i * num_experts + tid] += + tokens_cnts[(i - 1) * num_experts + tid]; } } __syncthreads(); - if (threadIdx.x == 0) { + if (tid == 0) { cumsum[0] = 0; for (int i = 1; i <= num_experts; ++i) { cumsum[i] = cumsum[i - 1] + - CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) * + CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) * block_size; } *total_tokens_post_pad = static_cast(cumsum[num_experts]); @@ -244,26 +282,30 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( __syncthreads(); - if (threadIdx.x < num_experts) { - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[i / block_size] = threadIdx.x; + if (tid < num_experts) { + for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) { + expert_ids[i / block_size] = tid; } } // Fill remaining expert_ids with 0 - const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x; + const size_t fill_start_idx = cumsum[num_experts] / block_size + tid; const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); - for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) { + for (size_t i = fill_start_idx; i < expert_ids_size; i += stride) { expert_ids[i] = 0; } for (size_t i = tid; i < numel; i += stride) { int32_t expert_id = topk_ids[i]; + if (has_expert_map) { + expert_id = expert_map[expert_id]; + // filter invalid expert + if (expert_id == -1) continue; + } int32_t rank_post_pad = - tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id]; + tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id]; sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[threadIdx.x * num_experts + expert_id]; + ++tokens_cnts[tid * num_experts + expert_id]; } } @@ -275,7 +317,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad) { + torch::Tensor num_tokens_post_pad, + std::optional maybe_expert_map) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); int64_t padded_num_experts = @@ -287,14 +330,19 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, // BlockScan uses 1024 threads and assigns one thread per expert. TORCH_CHECK(padded_num_experts < 1024, "padded_num_experts must be less than 1024"); + auto options_int = + torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); + bool has_expert_map = maybe_expert_map.has_value(); + torch::Tensor expert_map; + if (has_expert_map) { + expert_map = maybe_expert_map.value(); + } else { + expert_map = torch::empty({0}, options_int); + } VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { // calc needed amount of shared mem for `cumsum` tensors - auto options_int = - torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); - torch::Tensor cumsum_buffer = - torch::empty({num_experts + 1}, options_int); bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64); @@ -304,30 +352,41 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, ((threads + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); + // threadIdx.x >= fill_threads: counting experts and aligning + // threadIdx.x < fill_threads: filling sorted_token_ids + constexpr int32_t fill_threads = 256; auto small_batch_expert_kernel = vllm::moe::moe_align_block_size_small_batch_expert_kernel< - scalar_t>; - small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>( + scalar_t, fill_threads>; + small_batch_expert_kernel<<<1, fill_threads + threads, + shared_mem_size, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), - num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel(), sorted_token_ids.size(0)); + num_tokens_post_pad.data_ptr(), + expert_map.data_ptr(), num_experts, block_size, + topk_ids.numel(), sorted_token_ids.size(0), has_expert_map); } else { + torch::Tensor cumsum_buffer = + torch::empty({num_experts + 1}, options_int); auto align_kernel = vllm::moe::moe_align_block_size_kernel; size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp); size_t shared_mem_size = num_warps * experts_per_warp * sizeof(int32_t); - align_kernel<<<1, threads, shared_mem_size, stream>>>( + // launch two threadblocks + // blockIdx.x == 0: counting experts and aligning + // blockIdx.x == 1: filling sorted_token_ids + align_kernel<<<2, threads, shared_mem_size, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), - num_tokens_post_pad.data_ptr(), num_experts, - padded_num_experts, experts_per_warp, block_size, - topk_ids.numel(), cumsum_buffer.data_ptr(), - sorted_token_ids.size(0)); + num_tokens_post_pad.data_ptr(), + expert_map.data_ptr(), num_experts, padded_num_experts, + experts_per_warp, block_size, topk_ids.numel(), + cumsum_buffer.data_ptr(), sorted_token_ids.size(0), + has_expert_map); const int block_threads = std::min(256, (int)threads); const int num_blocks = @@ -340,7 +399,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, sort_kernel<<>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), - cumsum_buffer.data_ptr(), topk_ids.numel(), num_experts); + cumsum_buffer.data_ptr(), expert_map.data_ptr(), + topk_ids.numel(), num_experts, has_expert_map); } }); } diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 11c6875f7f1d0..4c7accf03440a 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -11,7 +11,8 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output); void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad); + torch::Tensor num_tokens_post_pad, + std::optional maybe_expert_map); void batched_moe_align_block_size(int64_t max_tokens_per_batch, int64_t block_size, diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index e0a8280722f3c..fca57c31caf8e 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -19,7 +19,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "moe_align_block_size(Tensor topk_ids, int num_experts," " int block_size, Tensor! sorted_token_ids," " Tensor! experts_ids," - " Tensor! num_tokens_post_pad) -> ()"); + " Tensor! num_tokens_post_pad," + " Tensor? maybe_expert_map) -> ()"); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); // Aligning the number of tokens to be processed by each expert such diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index bacf6f37f2b08..82659276af37c 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -955,9 +955,22 @@ def test_fused_marlin_moe_with_bias(m): torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0) -def test_moe_align_block_size_opcheck(): +@pytest.mark.parametrize("ep_size", [1, 2]) +def test_moe_align_block_size_opcheck(ep_size): num_experts = 4 block_size = 4 + + expert_map = None + if ep_size != 1: + local_num_experts = num_experts // ep_size + expert_ids = torch.randint( + 0, num_experts, (local_num_experts,), device="cuda", dtype=torch.int32 + ) + expert_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32) + expert_map[expert_ids] = torch.arange( + local_num_experts, device="cuda", dtype=torch.int32 + ) + topk_ids = torch.randint(0, num_experts, (3, 4), dtype=torch.int32, device="cuda") max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) @@ -980,6 +993,7 @@ def test_moe_align_block_size_opcheck(): sorted_ids, expert_ids, num_tokens_post_pad, + expert_map, ), ) diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index 8975f00bd4c6e..1abfc11fb460e 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -106,6 +106,8 @@ def torch_moe_align_block_size( max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) if pad_sorted_ids: max_num_tokens_padded = round_up(max_num_tokens_padded, block_size) + if topk_ids.numel() < num_experts: + max_num_tokens_padded = topk_ids.numel() * block_size flattened_token_indices = torch.arange( topk_ids.numel(), device=topk_ids.device, dtype=torch.int32 @@ -126,6 +128,8 @@ def torch_moe_align_block_size( ) for expert_id in range(num_experts): original_count = expert_token_counts[expert_id] + if expert_map is not None and expert_map[expert_id] == -1: + continue if original_count > 0: expert_padded_counts[expert_id] = ( (original_count + block_size - 1) // block_size @@ -143,6 +147,9 @@ def torch_moe_align_block_size( current_pos = 0 current_block = 0 for expert_id in range(num_experts): + if expert_map is not None and expert_map[expert_id] == -1: + continue + expert_mask = sorted_expert_ids == expert_id expert_tokens = sorted_token_indices[expert_mask] num_expert_tokens = expert_tokens.shape[0] @@ -153,7 +160,13 @@ def torch_moe_align_block_size( ) expert_blocks_needed = expert_padded_counts[expert_id] // block_size - expert_ids[current_block : current_block + expert_blocks_needed] = expert_id + + expert_id_new = expert_id + if expert_map is not None: + expert_id_new = expert_map[expert_id] + expert_ids[current_block : current_block + expert_blocks_needed] = ( + expert_id_new + ) current_pos += expert_padded_counts[expert_id] current_block += expert_blocks_needed @@ -163,8 +176,6 @@ def torch_moe_align_block_size( [total_padded_tokens], dtype=torch.int32, device=topk_ids.device ) - if expert_map is not None: - expert_ids = expert_map[expert_ids] return sorted_token_ids, expert_ids, num_tokens_post_pad @@ -229,9 +240,9 @@ def test_moe_align_block_size( ) -@pytest.mark.parametrize("m", [16, 32]) +@pytest.mark.parametrize("m", [16, 32, 2048]) @pytest.mark.parametrize("topk", [2, 4]) -@pytest.mark.parametrize("num_experts", [8]) +@pytest.mark.parametrize("num_experts", [8, 64]) @pytest.mark.parametrize("block_size", [64]) @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_moe_align_block_size_with_expert_map( @@ -253,6 +264,7 @@ def test_moe_align_block_size_with_expert_map( block_size=block_size, num_experts=num_experts, expert_map=expert_map, + ignore_invalid_experts=True, ) golden_sorted_ids, golden_expert_ids, golden_num_tokens = ( torch_moe_align_block_size( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e60158898685a..94e27545239f4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1877,6 +1877,7 @@ def moe_align_block_size( sorted_token_ids: torch.Tensor, experts_ids: torch.Tensor, num_tokens_post_pad: torch.Tensor, + expert_map: torch.Tensor | None = None, ) -> None: torch.ops._moe_C.moe_align_block_size( topk_ids, @@ -1885,6 +1886,7 @@ def moe_align_block_size( sorted_token_ids, experts_ids, num_tokens_post_pad, + expert_map, ) diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 9c377db720132..92d72b75656cd 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -316,7 +316,11 @@ def fused_marlin_moe( if global_num_experts == -1: global_num_experts = E sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( - topk_ids, block_size_m, global_num_experts, expert_map + topk_ids, + block_size_m, + global_num_experts, + expert_map, + ignore_invalid_experts=True, ) assert activation is not None diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index df208eae2e71c..f3c158ee2f9dc 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1887,7 +1887,11 @@ def fused_experts_impl( ) sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( - curr_topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map + curr_topk_ids, + config["BLOCK_SIZE_M"], + global_num_experts, + expert_map, + ignore_invalid_experts=True, ) invoke_fused_moe_kernel( @@ -1946,6 +1950,9 @@ def fused_experts_impl( block_shape=block_shape, ) + if expert_map is not None: + intermediate_cache3.zero_() + invoke_fused_moe_kernel( qintermediate_cache2, w2, diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index 7f6155997264d..7fc8bfcf824d9 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -14,6 +14,7 @@ def moe_align_block_size( num_experts: int, expert_map: torch.Tensor | None = None, pad_sorted_ids: bool = False, + ignore_invalid_experts: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Aligns the token distribution across experts to be compatible with block @@ -35,7 +36,13 @@ def moe_align_block_size( expert parallel shard. If the expert is not in the current expert parallel shard, the mapping is set to -1. - pad_sorted_ids: A flag indicating whether the sorted_token_ids length - should be padded to a multiple of block_size, + should be padded to a multiple of block_size, + - ignore_invalid_experts: A flag indicating whether to ignore invalid + experts. When False, all expert_ids in topk_ids will participate in + counting and ranking, but invalid experts in expert_ids will be marked + as -1. When True, all invalid expert_ids in topk_ids will be ignored + and will not participate in counting or ranking, and there will be no + -1 in expert_ids. Returns: - sorted_token_ids: A tensor containing the sorted token indices according @@ -67,6 +74,10 @@ def moe_align_block_size( max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) if pad_sorted_ids: max_num_tokens_padded = round_up(max_num_tokens_padded, block_size) + if topk_ids.numel() < num_experts: + max_num_tokens_padded = min( + topk_ids.numel() * block_size, max_num_tokens_padded + ) sorted_ids = torch.empty( (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device ) @@ -77,9 +88,16 @@ def moe_align_block_size( num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) ops.moe_align_block_size( - topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + expert_map if ignore_invalid_experts else None, ) - if expert_map is not None: + + if expert_map is not None and not ignore_invalid_experts: expert_ids = expert_map[expert_ids] return sorted_ids, expert_ids, num_tokens_post_pad From b0f4866a77f22720a5e295cbcffee5cbe9fc2b56 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 7 Dec 2025 20:27:11 +0800 Subject: [PATCH 050/133] [CI/Build]Temporary workaround for test_default_mm_loras timeout (#30202) Signed-off-by: Jee Jee Li --- tests/lora/test_default_mm_loras.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/lora/test_default_mm_loras.py b/tests/lora/test_default_mm_loras.py index 407b29fdd1d58..1d16862b30e52 100644 --- a/tests/lora/test_default_mm_loras.py +++ b/tests/lora/test_default_mm_loras.py @@ -13,6 +13,7 @@ from huggingface_hub import snapshot_download from vllm.lora.request import LoRARequest from ..conftest import AudioTestAssets, VllmRunner +from ..utils import create_new_process_for_each_test MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct") AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora") @@ -60,6 +61,7 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs) assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix) +@create_new_process_for_each_test() def test_active_default_mm_lora( vllm_runner: type[VllmRunner], audio_assets: AudioTestAssets, @@ -74,6 +76,7 @@ def test_active_default_mm_lora( ) +@create_new_process_for_each_test() def test_inactive_default_mm_lora( vllm_runner: type[VllmRunner], audio_assets: AudioTestAssets, @@ -89,6 +92,7 @@ def test_inactive_default_mm_lora( ) +@create_new_process_for_each_test() def test_default_mm_lora_succeeds_with_redundant_lora_request( vllm_runner: type[VllmRunner], audio_assets: AudioTestAssets, @@ -103,6 +107,7 @@ def test_default_mm_lora_succeeds_with_redundant_lora_request( ) +@create_new_process_for_each_test() def test_default_mm_lora_fails_with_overridden_lora_request( vllm_runner: type[VllmRunner], audio_assets: AudioTestAssets, @@ -118,6 +123,7 @@ def test_default_mm_lora_fails_with_overridden_lora_request( ) +@create_new_process_for_each_test() def test_default_mm_lora_does_not_expand_string_reqs(vllm_runner): class MockEngineException(Exception): pass From 541a2ef892720489f770569417bc1bc4436dbb21 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Sun, 7 Dec 2025 07:31:14 -0500 Subject: [PATCH 051/133] [Perf] Deepgemm fused layout kernel for activations, 4.3% throughput improvement, 10.7% TTFT improvement. (#29546) Signed-off-by: yewentao256 --- csrc/ops.h | 8 + .../w8a8/fp8/per_token_group_quant.cu | 185 ++++++++++++++++++ csrc/torch_bindings.cpp | 9 + .../layers/fused_moe/deep_gemm_moe.py | 41 ++-- .../layers/quantization/utils/fp8_utils.py | 80 +++++++- 5 files changed, 311 insertions(+), 12 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 4bb7857b15032..d302f04913266 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -299,6 +299,14 @@ void per_token_group_quant_int8(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double int8_min, double int8_max); + +// Fused activation quantisation + DeepGEMM-compatible UE8M0-packed scales. +void per_token_group_quant_8bit_packed(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s_packed, + int64_t group_size, double eps, + double min_8bit, double max_8bit); + #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu index e3ab0676b254e..f9ac874c43730 100644 --- a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu +++ b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu @@ -206,6 +206,191 @@ void per_token_group_quant_8bit(const torch::Tensor& input, #undef LAUNCH_KERNEL } +template +__global__ void per_token_group_quant_8bit_packed_kernel( + const T* __restrict__ input, void* __restrict__ output_q, + unsigned int* __restrict__ output_s_packed, const int group_size, + const int num_groups, const int groups_per_block, const int groups_per_row, + const int mn, const int tma_aligned_mn, const float eps, + const float min_8bit, const float max_8bit) { + const int threads_per_group = 16; + const int64_t local_group_id = threadIdx.x / threads_per_group; + const int lane_id = threadIdx.x % threads_per_group; + + const int64_t block_group_id = blockIdx.x * groups_per_block; + const int64_t global_group_id = block_group_id + local_group_id; + if (global_group_id >= num_groups) { + return; + } + + const int64_t block_group_offset = global_group_id * group_size; + + float local_absmax = eps; + + const T* group_input = input + block_group_offset; + DST_DTYPE* group_output = + static_cast(output_q) + block_group_offset; + + // shared memory to cache each group's data to avoid double DRAM reads. + extern __shared__ __align__(16) char smem_raw[]; + T* smem = reinterpret_cast(smem_raw); + T* smem_group = smem + local_group_id * group_size; + + constexpr int vec_size = 16 / sizeof(T); + using vec_t = vllm::vec_n_t; + + // copy global -> shared & compute absmax + auto scalar_op_cache = [&] __device__(T & dst, const T& src) { + float abs_v = fabsf(static_cast(src)); + local_absmax = fmaxf(local_absmax, abs_v); + dst = src; + }; + + vllm::vectorize_with_alignment( + group_input, // in + smem_group, // out (shared) + group_size, // elements per group + lane_id, // thread id + threads_per_group, // stride in group + scalar_op_cache); // scalar handler + + local_absmax = GroupReduceMax(local_absmax); + + float y_s = local_absmax / max_8bit; + y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f)))); + + // pack 4 scales into a uint32 + if (lane_id == 0) { + // map flat group id to 2D indices (mn_idx, sf_k_idx) + const int sf_k_idx = static_cast(global_group_id % groups_per_row); + const int mn_idx = static_cast(global_group_id / groups_per_row); + + if (mn_idx < mn) { + // each uint32 in output_s_packed stores 4 packed scales + const int sf_k_pack_idx = sf_k_idx / 4; + const int pos = sf_k_idx % 4; + + // reinterpret the UE8M0 scale y_s as IEEE bits, extract the 8-bit + // exponent, and place it into the correct byte of the 32-bit word. + const unsigned int bits = __float_as_uint(y_s); + const unsigned int exponent = (bits >> 23u) & 0xffu; + const unsigned int contrib = exponent << (pos * 8u); + + const int out_idx = sf_k_pack_idx * tma_aligned_mn + mn_idx; + // atomically OR 8-bit exponent into the packed scales buffer + atomicOr(output_s_packed + out_idx, contrib); + } + } + + __syncthreads(); + + // quantize shared -> global 8-bit + auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) { + float q = fminf(fmaxf(static_cast(src) / y_s, min_8bit), max_8bit); + dst = DST_DTYPE(q); + }; + + vllm::vectorize_with_alignment( + smem_group, // in (shared) + group_output, // out (global quant tensor) + group_size, // elements + lane_id, // tid + threads_per_group, // stride + scalar_op_quant); // scalar handler +} + +void per_token_group_quant_8bit_packed(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s_packed, + int64_t group_size, double eps, + double min_8bit, double max_8bit) { + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(output_q.is_contiguous()); + + const int64_t k = input.size(-1); + TORCH_CHECK(k % group_size == 0, "Last dimension (", k, + ") must be divisible by group_size (", group_size, ")."); + + const int64_t mn = input.numel() / k; + const int64_t groups_per_row = k / group_size; + const int64_t num_groups = mn * groups_per_row; + + TORCH_CHECK(output_s_packed.dim() == 2, + "output_s_packed must be 2D, got dim=", output_s_packed.dim(), + "."); + + const int64_t k_num_packed_sfk = (groups_per_row + 3) / 4; + const int64_t tma_aligned_mn = ((mn + 3) / 4) * 4; + + TORCH_CHECK(output_s_packed.scalar_type() == at::ScalarType::Int, + "output_s_packed must have dtype int32 for UE8M0-packed scales."); + // DeepGEMM expects SFA scales in MN-major form with shape + // [mn, ceil_div(K, 128 * 4)] and TMA-aligned stride on the last + // dimension. + TORCH_CHECK(output_s_packed.size(0) == mn && + output_s_packed.size(1) == k_num_packed_sfk, + "output_s_packed shape must be [", mn, ", ", k_num_packed_sfk, + "], but got [", output_s_packed.size(0), ", ", + output_s_packed.size(1), "]."); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + constexpr int THREADS_PER_GROUP = 16; + + int groups_per_block = 1; + + if (num_groups % 16 == 0) { + groups_per_block = 16; + } else if (num_groups % 8 == 0) { + groups_per_block = 8; + } else if (num_groups % 4 == 0) { + groups_per_block = 4; + } else if (num_groups % 2 == 0) { + groups_per_block = 2; + } + + auto dst_type = output_q.scalar_type(); + const int num_blocks = num_groups / groups_per_block; + const int num_threads = groups_per_block * THREADS_PER_GROUP; + + // zero-initialize packed scales, since we use atomicOr to accumulate + // exponents from different groups. + output_s_packed.zero_(); + +#define LAUNCH_PACKED_KERNEL(T, DST_DTYPE) \ + do { \ + dim3 grid(num_blocks); \ + dim3 block(num_threads); \ + size_t smem_bytes = \ + static_cast(groups_per_block) * group_size * sizeof(T); \ + per_token_group_quant_8bit_packed_kernel \ + <<>>( \ + static_cast(input.data_ptr()), output_q.data_ptr(), \ + reinterpret_cast(output_s_packed.data_ptr()), \ + static_cast(group_size), static_cast(num_groups), \ + groups_per_block, static_cast(groups_per_row), \ + static_cast(mn), static_cast(tma_aligned_mn), \ + static_cast(eps), static_cast(min_8bit), \ + static_cast(max_8bit)); \ + } while (0) + + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "per_token_group_quant_8bit_packed", ([&] { + if (dst_type == at::ScalarType::Float8_e4m3fn) { + LAUNCH_PACKED_KERNEL(scalar_t, __nv_fp8_e4m3); + } else if (dst_type == at::ScalarType::Char) { + LAUNCH_PACKED_KERNEL(scalar_t, int8_t); + } else { + TORCH_CHECK( + false, + "per_token_group_quant_8bit_packed only supports FP8/INT8 " + "outputs."); + } + })); + +#undef LAUNCH_PACKED_KERNEL +} + void per_token_group_quant_fp8(const torch::Tensor& input, torch::Tensor& output_q, torch::Tensor& output_s, int64_t group_size, double eps, double fp8_min, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 914227838558a..23ac1d9abeea9 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -617,6 +617,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("per_token_group_fp8_quant", torch::kCUDA, &per_token_group_quant_fp8); + // Compute per-token-group 8-bit quantized tensor and UE8M0-packed, + // TMA-aligned scales for DeepGEMM. + ops.def( + "per_token_group_fp8_quant_packed(Tensor input, Tensor! output_q, " + "Tensor! output_s_packed, int group_size, float eps, float fp8_min, " + "float fp8_max) -> ()"); + ops.impl("per_token_group_fp8_quant_packed", torch::kCUDA, + &per_token_group_quant_8bit_packed); + // Compute per-token-group INT8 quantized tensor and scaling factor. ops.def( "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! " diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 9f47e692d5ae2..4a64736ed767b 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -23,9 +23,11 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, + per_token_group_quant_fp8_packed_for_deepgemm, silu_mul_per_token_group_quant_fp8_colmajor, ) from vllm.utils.deep_gemm import ( + DeepGemmQuantScaleFMT, get_mk_alignment_for_contiguous_layout, m_grouped_fp8_gemm_nt_contiguous, ) @@ -157,23 +159,40 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): def _act_mul_quant( self, input: torch.Tensor, output: torch.Tensor, activation: str ) -> tuple[torch.Tensor, torch.Tensor]: - if activation == "silu": - return silu_mul_per_token_group_quant_fp8_colmajor( - input=input, output=output - ) - else: - # This is a fallback path. If we find ourselves using any activation other - # than silu, we should add that activation to - # silu_mul_per_token_group_quant_fp8_colmajor kernel as it is much faster. + assert self.block_shape is not None + block_k = self.block_shape[1] + scale_fmt = DeepGemmQuantScaleFMT.from_oracle() + + # 1. DeepGemm UE8M0: use packed per-token-group quant + if scale_fmt == DeepGemmQuantScaleFMT.UE8M0: M_sum, N = input.size() act_out = torch.empty( (M_sum, N // 2), dtype=input.dtype, device=input.device ) self.activation(activation, act_out, input) - assert self.block_shape is not None - return per_token_group_quant_fp8( - act_out, self.block_shape[1], column_major_scales=True, out_q=output + a2q, a2q_scale = per_token_group_quant_fp8_packed_for_deepgemm( + act_out, + block_k, + out_q=output, ) + return a2q, a2q_scale + + # 2. Hopper / non‑E8M0: prefer the fused SiLU+mul+quant kernel + if activation == "silu": + use_ue8m0 = scale_fmt == DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0 + return silu_mul_per_token_group_quant_fp8_colmajor( + input=input, + output=output, + use_ue8m0=use_ue8m0, + ) + + # 3. fallback path for non-SiLU activations in non‑UE8M0 cases. + M_sum, N = input.size() + act_out = torch.empty((M_sum, N // 2), dtype=input.dtype, device=input.device) + self.activation(activation, act_out, input) + return per_token_group_quant_fp8( + act_out, block_k, column_major_scales=True, out_q=output + ) def apply( self, diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 6e73833d1ae1c..7e1bda8639ac7 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -269,7 +269,11 @@ class W8A8BlockFp8LinearOp: weight_scale: torch.Tensor, ) -> torch.Tensor: assert self.deepgemm_input_quant_op is not None - q_input, input_scale = self.deepgemm_input_quant_op(input_2d) + q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm( + input_2d, + group_size=self.act_quant_group_shape.col, + use_ue8m0=True, + ) output = torch.empty( (q_input.shape[0], weight.shape[0]), dtype=torch.bfloat16, @@ -791,6 +795,80 @@ def per_token_group_quant_fp8( return x_q, x_s +def per_token_group_quant_fp8_packed_for_deepgemm( + x: torch.Tensor, + group_size: int, + eps: float = 1e-10, + use_ue8m0: bool | None = None, + out_q: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """FP8 per-token-group quantization for DeepGEMM. + + Returns: + (x_q, x_s_packed) + x_q: FP8 activations, same shape as `x`. + x_s_packed: Int32 tensor with logical shape + [mn, ceil(num_groups_per_row / 4)], laid out with + TMA-aligned stride along the packed-K dimension + """ + if use_ue8m0 is None: + use_ue8m0 = is_deep_gemm_e8m0_used() + # for DeepGEMM UE8M0-packed layout we *require* UE8M0 scales. + assert use_ue8m0, ( + "per_token_group_quant_fp8_packed_for_deepgemm requires UE8M0 scales." + ) + + dtype = current_platform.fp8_dtype() + assert x.shape[-1] % group_size == 0, ( + f"the last dimension of `x` {x.shape[-1]} must be divisible " + f"by `group_size` {group_size}" + ) + assert x.stride(-1) == 1, "`x` groups must be contiguous" + + finfo = torch.finfo(dtype) + fp8_min, fp8_max = finfo.min, finfo.max + + # compute DeepGEMM-style packed scale tensor shape. + hidden_dim = x.shape[-1] + mn = x.numel() // hidden_dim + num_groups_per_row = hidden_dim // group_size + k_num_packed_sf_k = (num_groups_per_row + 3) // 4 + tma_aligned_mn = ((mn + 3) // 4) * 4 + + x_s_packed = torch.empty_strided( + (mn, k_num_packed_sf_k), + (1, tma_aligned_mn), + device=x.device, + dtype=torch.int32, + ) + + # CUDA kernel path only (DeepGEMM + E8M0 is CUDA-specific). + assert current_platform.is_cuda(), ( + "per_token_group_quant_fp8_packed_for_deepgemm is only valid on CUDA " + "platforms using DeepGEMM." + ) + + x_contiguous = x.contiguous() + if out_q is not None: + x_q_local = out_q + else: + x_q_local = torch.empty_like(x_contiguous, device=x.device, dtype=dtype) + + torch.ops._C.per_token_group_fp8_quant_packed( + x_contiguous, + x_q_local, + x_s_packed, + group_size, + eps, + fp8_min, + fp8_max, + ) + + # return a tensor with the original logical shape. + x_q = x_q_local.view_as(x) + return x_q, x_s_packed + + @triton.jit def _w8a8_triton_block_scaled_mm( # Pointers to inputs and output From b952f4d3c31068de3a98d680261be9a5caa04caa Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 7 Dec 2025 23:51:36 +0800 Subject: [PATCH 052/133] [v1] Add PrefixLM support to FlexAttention backend (#27938) Signed-off-by: Isotr0py --- docs/models/supported_models.md | 20 ------ .../multimodal/generation/test_common.py | 1 - .../vllm_add_dummy_platform/dummy_platform.py | 1 + vllm/attention/backends/abstract.py | 9 +++ vllm/attention/layer.py | 5 ++ vllm/attention/selector.py | 5 ++ vllm/config/model.py | 14 +++++ vllm/multimodal/inputs.py | 25 ++++++++ vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 19 ++++++ vllm/platforms/interface.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/tpu.py | 5 +- vllm/platforms/xpu.py | 3 +- vllm/v1/attention/backends/flex_attention.py | 62 +++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 26 +++++++- 16 files changed, 173 insertions(+), 25 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 1089de87b994e..ec3ba4474c192 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -740,23 +740,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -!!! warning - Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs. - However, there are differences in how they handle text + image inputs: - - V0 correctly implements the model's attention pattern: - - Uses bidirectional attention between the image tokens corresponding to the same image - - Uses causal attention for other tokens - - Implemented via (naive) PyTorch SDPA with masking tensors - - Note: May use significant memory for long prompts with image - - V1 currently uses a simplified attention pattern: - - Uses causal attention for all tokens, including image tokens - - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}` - - Will be updated in the future to support the correct behavior - - This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. - !!! note `Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its MobileNet-v5 vision backbone. @@ -776,9 +759,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. For more details, please see: -!!! warning - Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. - !!! note For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported. diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index fd26b838ae209..c5a0b6748f797 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -382,7 +382,6 @@ VLM_TEST_SETTINGS = { auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, - num_logprobs=10, ), "glm4v": VLMTestInfo( models=["zai-org/glm-4v-9b"], diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index a80617a366cab..8448003e70531 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -30,5 +30,6 @@ class DummyPlatform(Platform): use_mla, has_sink, use_sparse, + use_mm_prefix, ): return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend" # noqa E501 diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 84cca8e686075..03f4c40302eb8 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -166,6 +166,10 @@ class AttentionBackend(ABC): def supports_sink(cls) -> bool: return False + @classmethod + def supports_mm_prefix(cls) -> bool: + return False + @classmethod def is_sparse(cls) -> bool: return False @@ -207,6 +211,7 @@ class AttentionBackend(ABC): use_mla: bool, has_sink: bool, use_sparse: bool, + use_mm_prefix: bool, device_capability: "DeviceCapability", attn_type: str, ) -> list[str]: @@ -219,6 +224,10 @@ class AttentionBackend(ABC): invalid_reasons.append("kv_cache_dtype not supported") if not cls.supports_block_size(block_size): invalid_reasons.append("block_size not supported") + if use_mm_prefix and not cls.supports_mm_prefix(): + invalid_reasons.append( + "partial multimodal token full attention not supported" + ) if use_mla != cls.is_mla(): if use_mla: invalid_reasons.append("MLA not supported") diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8a522deedf3ce..340b161ea1e15 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -230,6 +230,10 @@ class Attention(nn.Module, AttentionLayerBase): self.sliding_window = sliding_window self.has_sink = extra_impl_args.get("sinks") is not None + # NOTE: model_config may be None during certain tests + model_config = vllm_config.model_config + self.use_mm_prefix = model_config is not None and model_config.is_mm_prefix_lm + # During model initialization, the default dtype is set as the model # weight and activation dtype. dtype = torch.get_default_dtype() @@ -241,6 +245,7 @@ class Attention(nn.Module, AttentionLayerBase): block_size, use_mla=False, has_sink=self.has_sink, + use_mm_prefix=self.use_mm_prefix, attn_type=attn_type, ) else: diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index aeb130dfe8726..f6aba271d2e96 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -27,6 +27,7 @@ def get_attn_backend( use_mla: bool = False, has_sink: bool = False, use_sparse: bool = False, + use_mm_prefix: bool = False, attn_type: str | None = None, ) -> type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" @@ -52,6 +53,7 @@ def get_attn_backend( use_mla=use_mla, has_sink=has_sink, use_sparse=use_sparse, + use_mm_prefix=use_mm_prefix, attn_type=attn_type, ) @@ -66,6 +68,7 @@ def _cached_get_attn_backend( use_mla: bool = False, has_sink: bool = False, use_sparse: bool = False, + use_mm_prefix: bool = False, attn_type: str | None = None, ) -> type[AttentionBackend]: from vllm.platforms import current_platform @@ -87,6 +90,7 @@ def _cached_get_attn_backend( use_mla, has_sink, use_sparse, + use_mm_prefix, attn_type, ) else: @@ -99,6 +103,7 @@ def _cached_get_attn_backend( use_mla, has_sink, use_sparse, + use_mm_prefix, attn_type, ) if not attention_cls: diff --git a/vllm/config/model.py b/vllm/config/model.py index 509a9c5e162f7..583904a949ea1 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -4,6 +4,7 @@ import warnings from collections.abc import Callable from dataclasses import InitVar, field +from functools import cached_property from typing import TYPE_CHECKING, Any, Literal, cast, get_args import torch @@ -1217,6 +1218,19 @@ class ModelConfig: ) return False + @cached_property + def is_mm_prefix_lm(self) -> bool: + """Whether to use bidirectional attention for mm positions.""" + MM_PREFIX_LM_MODELS = ( + "gemma3", + # TODO(Isotr0py): Disable paligemma for now before + # we supports soft cap attention for FlexAttention + # "paligemma", + ) + if not hasattr(self.hf_config, "model_type"): + return False + return self.hf_config.model_type in MM_PREFIX_LM_MODELS + def get_head_size(self) -> int: # TODO remove hard code if self.is_deepseek_mla: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index d9118f5b9e9a5..2ed66554e358e 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -175,6 +175,31 @@ class PlaceholderRange: return int(self.is_embed.sum().item()) + def extract_embeds_range(self) -> list[tuple[int, int]]: + """Extract the start and end indices of the embedded region in prompt. + + For example, given `PlaceholderRange(offset=2, length=5)` and + `is_embed = [False, True, False, True, True]`, the output is + `[(1 + offset, 1 + offset), (3 + offset, 4 + offset)]`. + + Returns: + A tuple `(start, end)` representing the start and end + indices (inclusive) of the embedded region. + Returns full placeholder range if `is_embed` is `None`. + """ + if self.is_embed is None: + return [(self.offset, self.offset + self.length)] + + mask_i = self.is_embed.int() + starts = torch.nonzero( + torch.diff(mask_i, prepend=mask_i.new_zeros(1)) == 1 + ).flatten() + ends = torch.nonzero( + torch.diff(mask_i, append=mask_i.new_zeros(1)) == -1 + ).flatten() + ranges = torch.stack((starts, ends), dim=1) + self.offset + return [tuple(x) for x in ranges.tolist()] + def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index a2518d5fd3dc4..a49b6e92df00d 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -133,6 +133,7 @@ class CpuPlatform(Platform): use_mla: bool, has_sink: bool, use_sparse: bool, + use_mm_prefix: bool, attn_type: str | None = None, ) -> str: if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 37c95f48669f8..39101c43142f7 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -233,6 +233,20 @@ class CudaPlatformBase(Platform): "Forcing kv cache block size to 64 for FlashMLASparse backend." ) + scheduler_config = vllm_config.scheduler_config + # Note: model_config may be None during testing + if ( + model_config is not None + and model_config.is_mm_prefix_lm + and scheduler_config.is_multimodal_model + and not scheduler_config.disable_chunked_mm_input + ): + logger.warning( + "Forcing --disable_chunked_mm_input for models " + "with multimodal-bidirectional attention." + ) + scheduler_config.disable_chunked_mm_input = True + @classmethod def get_current_memory_usage( cls, device: torch.types.Device | None = None @@ -268,6 +282,7 @@ class CudaPlatformBase(Platform): use_mla, has_sink, use_sparse, + use_mm_prefix, device_capability, attn_type, ) -> tuple[ @@ -289,6 +304,7 @@ class CudaPlatformBase(Platform): use_mla, has_sink, use_sparse, + use_mm_prefix, device_capability, attn_type, ) @@ -312,6 +328,7 @@ class CudaPlatformBase(Platform): use_mla: bool, has_sink: bool, use_sparse: bool, + use_mm_prefix: bool, attn_type: str | None = None, ) -> str: if attn_type is None: @@ -332,6 +349,7 @@ class CudaPlatformBase(Platform): use_mla, has_sink, use_sparse, + use_mm_prefix, device_capability, attn_type, ) @@ -356,6 +374,7 @@ class CudaPlatformBase(Platform): use_mla, has_sink, use_sparse, + use_mm_prefix, device_capability, attn_type, ) diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 27c6fac09f498..f04e94e425257 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -239,6 +239,7 @@ class Platform: use_mla: bool, has_sink: bool, use_sparse: bool, + use_mm_prefix: bool, attn_type: str | None = None, ) -> str: """Get the attention backend class of a device.""" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 32c7f8e536639..ff0fc78517876 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -216,6 +216,7 @@ class RocmPlatform(Platform): use_mla, has_sink, use_sparse, + use_mm_prefix, attn_type: str | None = None, ) -> str: from vllm._aiter_ops import rocm_aiter_ops diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index cbc0a996f3661..d6998e7a308af 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -62,8 +62,9 @@ class TpuPlatform(Platform): kv_cache_dtype: str | None, block_size: int, use_mla: bool, - has_sink, - use_sparse, + has_sink: bool, + use_sparse: bool, + use_mm_prefix: bool, attn_type: str | None = None, ) -> str: if use_sparse: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 768714fb16726..0a05750764d8d 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -48,7 +48,8 @@ class XPUPlatform(Platform): block_size: int, use_mla: bool, has_sink: bool, - use_sparse, + use_sparse: bool, + use_mm_prefix: bool, attn_type: str | None = None, ) -> str: from vllm.v1.attention.backends.utils import set_kv_cache_layout diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index a2a6eeeb16b24..d8dbe4cbae013 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -17,6 +17,7 @@ from torch.nn.attention.flex_attention import ( and_masks, create_block_mask, flex_attention, + or_masks, ) from vllm.attention.backends.abstract import ( @@ -42,6 +43,7 @@ from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) +torch._dynamo.config.recompile_limit = 16 create_block_mask_compiled = torch.compile( create_block_mask, fullgraph=True, mode="reduce-overhead" ) @@ -91,6 +93,11 @@ class FlexAttentionBackend(AttentionBackend): """FlexAttention supports both decoder and encoder-only attention.""" return attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY) + @classmethod + def supports_mm_prefix(cls) -> bool: + """FlexAttention supports full attention for image tokens.""" + return True + @staticmethod def get_impl_cls() -> type["FlexAttentionImpl"]: return FlexAttentionImpl @@ -316,6 +323,7 @@ class FlexAttentionMetadata: kv_block_size: int = 16 transformed_score_mod: _score_mod_signature | None = None sliding_window: int | None = None + mm_prefix_range: dict[int, list[tuple[int, int]]] | None = None @cached_property def logical_block_ids(self): @@ -443,6 +451,45 @@ class FlexAttentionMetadata: return final_mask_mod if self.causal else sliding_window_mask_mod + def get_prefix_lm_mask_mod(self) -> _mask_mod_signature: + """Creates the prefix LM mask_mod function for FlexAttention.""" + + assert self.doc_ids is not None + request_lookup = self.doc_ids + + def prefix_lm_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + cu_q_idx: torch.Tensor, + q_idx: torch.Tensor, + kv_idx: torch.Tensor, + ): + mask = torch.zeros_like(q_idx, dtype=torch.bool) + for req, doc_range_lst in (self.mm_prefix_range or {}).items(): + req_mask = request_lookup[cu_q_idx] == req + for start, end in doc_range_lst: + doc_mask_q = (q_idx >= start) & (q_idx <= end) + doc_mask_kv = (kv_idx >= start) & (kv_idx <= end) + mask = mask | (req_mask & doc_mask_q & doc_mask_kv) + return mask + + def final_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + q_idx: torch.Tensor, + physical_kv_idx: torch.Tensor, + ) -> torch.Tensor: + (is_valid, logical_q_idx, logical_kv_idx) = ( + self._convert_physical_to_logical(self.doc_ids, q_idx, physical_kv_idx) + ) + return torch.where( + is_valid, + prefix_lm_mask_mod(b, h, q_idx, logical_q_idx, logical_kv_idx), + False, + ) + + return final_mask_mod + def get_mask_mod(self): # Stage-1: initialize the base mask_mod # (causal mask for decoder or bidirectional mask for encoder) @@ -456,6 +503,10 @@ class FlexAttentionMetadata: # Add sliding window mask for sliding window attention sliding_window_mask_mod = self.get_sliding_window_mask_mod() mask_mod = and_masks(mask_mod, sliding_window_mask_mod) + if self.mm_prefix_range: + # Add prefix LM mask for vision-language prefix LM attention + prefix_lm_mask_mod = self.get_prefix_lm_mask_mod() + mask_mod = or_masks(mask_mod, prefix_lm_mask_mod) return mask_mod def get_transformed_score_mod(self) -> _score_mod_signature | None: @@ -709,6 +760,7 @@ class FlexAttentionImpl(AttentionImpl): sliding_window: int | None alibi_slopes: torch.Tensor | None logits_soft_cap: float | None + mm_prefix_range: dict[int, list[tuple[int, int]]] | None = None def __init__( self, @@ -810,11 +862,21 @@ class FlexAttentionImpl(AttentionImpl): num_actual_tokens = attn_metadata.num_actual_tokens + needs_rebuild_block_mask = False if attn_metadata.sliding_window != self.sliding_window: attn_metadata.sliding_window = self.sliding_window if attn_metadata.direct_build: # update mask mod in attention metadata attn_metadata.mask_mod = attn_metadata.get_mask_mod() + needs_rebuild_block_mask = True + + if self.mm_prefix_range != getattr(attn_metadata, "mm_prefix_range", None): + self.mm_prefix_range = attn_metadata.mm_prefix_range + attn_metadata.mask_mod = attn_metadata.get_mask_mod() + needs_rebuild_block_mask = True + + if needs_rebuild_block_mask: + if attn_metadata.direct_build and attn_metadata.causal: attn_metadata.block_mask = attn_metadata._build_block_mask_direct() else: attn_metadata.block_mask = attn_metadata.build_block_mask() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a50360ab08694..22a3f9d8d2dda 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -48,7 +48,10 @@ from vllm.distributed.parallel_state import ( is_global_first_rank, prepare_communication_buffer_for_model, ) -from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.forward_context import ( + BatchDescriptor, + set_forward_context, +) from vllm.logger import init_logger from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.rotary_embedding import ( @@ -329,6 +332,7 @@ class GPUModelRunner( self.use_alibi = model_config.uses_alibi self.cascade_attn_enabled = not self.model_config.disable_cascade_attn + self.is_mm_prefix_lm = self.model_config.is_mm_prefix_lm # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY @@ -1700,6 +1704,26 @@ class GPUModelRunner( for layer_name in attn_group.layer_names: attn_metadata[layer_name] = attn_metadata_i + if self.is_mm_prefix_lm: + req_doc_ranges = {} + for req_id in self.input_batch.req_ids: + image_doc_ranges = [] + req_state = self.requests[req_id] + for mm_feature in req_state.mm_features: + pos_info = mm_feature.mm_position + img_doc_range = pos_info.extract_embeds_range() + image_doc_ranges.extend(img_doc_range) + req_idx = self.input_batch.req_id_to_index[req_id] + req_doc_ranges[req_idx] = image_doc_ranges + + if isinstance(attn_metadata, list): + for ub_metadata in attn_metadata: + for _metadata in ub_metadata.values(): + _metadata.mm_prefix_range = req_doc_ranges # type: ignore[attr-defined] + else: + for _metadata in attn_metadata.values(): + _metadata.mm_prefix_range = req_doc_ranges # type: ignore[attr-defined] + if spec_decode_common_attn_metadata is not None and ( num_reqs != num_reqs_padded or num_tokens != num_tokens_padded ): From 0044c4038c571e7fb38acb008256f925dfe515f1 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 7 Dec 2025 10:53:51 -0500 Subject: [PATCH 053/133] [BugFix][DeepSeek-V3.2] Fix backend selection logic for Blackwell (#30195) --- vllm/platforms/cuda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 39101c43142f7..915392a4125f9 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -182,8 +182,8 @@ class CudaPlatformBase(Platform): if vllm_config.attention_config.backend is None: # Default case - if cls.is_device_capability(100): - # Blackwell => Force CutlassMLA. + if cls.is_device_capability(100) and not use_sparse: + # Blackwell => Force CutlassMLA (unless sparse, i.e. DSv3.2). use_cutlass_mla = True # Set the backend in AttentionConfig so it's used during # backend selection From af0444bf40b7db2f3fb9fe1508d25ceba24cac87 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Sun, 7 Dec 2025 17:38:04 +0100 Subject: [PATCH 054/133] [Performance] Fused blockwise quant RMS norm (#27883) Signed-off-by: ElizaWszola Signed-off-by: yewentao256 Co-authored-by: yewentao256 --- .../fused_kernels/layernorm_rms_benchmarks.py | 89 +++- csrc/dispatch_utils.h | 18 + csrc/ops.h | 7 + ...fused_layernorm_dynamic_per_token_quant.cu | 144 ++++++- .../fused_kernels/layernorm_utils.cuh | 408 +++++++++++++----- csrc/torch_bindings.cpp | 8 + tests/compile/test_fusion.py | 69 ++- .../core/test_fused_quant_layernorm.py | 82 +++- vllm/_custom_ops.py | 40 ++ vllm/compilation/fusion.py | 172 +++++++- vllm/compilation/matcher_utils.py | 44 +- .../layers/quantization/utils/fp8_utils.py | 2 +- .../layers/quantization/utils/quant_utils.py | 6 + vllm/utils/deep_gemm.py | 17 + 14 files changed, 949 insertions(+), 157 deletions(-) diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index d809bf1db8cbc..fb3329975cee3 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -14,6 +14,9 @@ from tqdm import tqdm import vllm._custom_ops as ops from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, +) @dataclass @@ -22,6 +25,7 @@ class bench_params_t: hidden_size: int add_residual: bool dtype: torch.dtype + group_size: list[int] def description(self): return ( @@ -29,6 +33,7 @@ class bench_params_t: f"x D {self.hidden_size} " f"x R {self.add_residual} " f"x DT {self.dtype}" + f"x GS {self.group_size}" ) @@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]: HIDDEN_SIZES = list(range(1024, 8129, 1024)) ADD_RESIDUAL = [True, False] DTYPES = [torch.bfloat16, torch.float] + GROUP_SIZES = [[1, 64], [1, 128]] - combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES) + combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES) bench_params = list( - map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations) + map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations) ) return bench_params @@ -52,6 +58,7 @@ def unfused_int8_impl( x: torch.Tensor, residual: torch.Tensor | None, quant_dtype: torch.dtype, + group_size: list[int], ): # Norm torch_out = None @@ -69,6 +76,7 @@ def unfused_fp8_impl( x: torch.Tensor, residual: torch.Tensor | None, quant_dtype: torch.dtype, + group_size: list[int], ): # Norm torch_out = None @@ -81,23 +89,63 @@ def unfused_fp8_impl( torch_out, _ = ops.scaled_fp8_quant(torch_out) +def unfused_groupwise_fp8_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: torch.Tensor | None, + quant_dtype: torch.dtype, + group_size: list[int], +): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _ = per_token_group_quant_fp8( + torch_out, group_size=group_size[1], use_ue8m0=False + ) + + def fused_impl( rms_norm_layer: RMSNorm, # this stores the weights x: torch.Tensor, residual: torch.Tensor | None, quant_dtype: torch.dtype, + group_size: list[int], ): out, _ = ops.rms_norm_dynamic_per_token_quant( x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual ) +def fused_groupwise_impl( + rms_norm_layer: RMSNorm, # this stores the weights + x: torch.Tensor, + residual: torch.Tensor | None, + quant_dtype: torch.dtype, + group_size: list[int], +): + out, _ = ops.rms_norm_per_block_quant( + x, + rms_norm_layer.weight, + 1e-6, + quant_dtype, + group_size, + residual=residual, + is_scale_transposed=True, + ) + + # Bench functions def bench_fn( rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor, quant_dtype: torch.dtype, + group_size: list[int], label: str, sub_label: str, fn: Callable, @@ -110,10 +158,11 @@ def bench_fn( "x": x, "residual": residual, "quant_dtype": quant_dtype, + "group_size": group_size, "fn": fn, } return TBenchmark.Timer( - stmt="fn(rms_norm_layer, x, residual, quant_dtype)", + stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)", globals=globals, label=label, sub_label=sub_label, @@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu x, residual, torch.int8, + params.group_size, label, sub_label, unfused_int8_impl, @@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu x, residual, torch.float8_e4m3fn, + params.group_size, label, sub_label, unfused_fp8_impl, @@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu x, residual, torch.int8, + params.group_size, label, sub_label, fused_impl, @@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu x, residual, torch.float8_e4m3fn, + params.group_size, label, sub_label, fused_impl, @@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu ) ) + # unfused groupwise fp8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + params.group_size, + label, + sub_label, + unfused_groupwise_fp8_impl, + "unfused_groupwise_fp8_impl", + ) + ) + + # fused groupwise fp8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + params.group_size, + label, + sub_label, + fused_groupwise_impl, + "fused_groupwise_fp8_impl", + ) + ) + print_timers(timers) return timers diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index e1d131e4a7851..de0c505b7a62f 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -118,6 +118,24 @@ } \ } +#define VLLM_DISPATCH_BOOL(expr, const_expr, ...) \ + if (expr) { \ + constexpr bool const_expr = true; \ + __VA_ARGS__(); \ + } else { \ + constexpr bool const_expr = false; \ + __VA_ARGS__(); \ + } + +#define VLLM_DISPATCH_GROUP_SIZE(group_size, const_group_size, ...) \ + if (group_size == 128) { \ + constexpr int const_group_size = 128; \ + __VA_ARGS__(); \ + } else if (group_size == 64) { \ + constexpr int const_group_size = 64; \ + __VA_ARGS__(); \ + } + #define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \ switch (NUM_DIMS) { \ case 2: { \ diff --git a/csrc/ops.h b/csrc/ops.h index d302f04913266..9617d6358e18a 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -128,6 +128,13 @@ void rms_norm_dynamic_per_token_quant(torch::Tensor& out, std::optional scale_ub, std::optional residual); +void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor const& weight, + torch::Tensor& scales, double const epsilon, + std::optional scale_ub, + std::optional residual, + int64_t group_size, bool is_scale_transposed); + void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, std::optional key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); diff --git a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu index 92d6c2f402a24..2080ef3cd39b5 100644 --- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu +++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu @@ -31,14 +31,15 @@ __device__ void rms_norm_dynamic_per_token_quant_vec( // RMS Norm + Quant if constexpr (std::is_same_v) { + token_scale = 1.0f / token_scale; vllm::vectorized::norm_and_quant( - out, input, weight, rms, 1.0f / token_scale, hidden_size, residual); + out, input, weight, rms, &token_scale, hidden_size, residual); } else { // FP8 - Do not invert token_scale for exact match with FBGemm vllm::vectorized::norm_and_quant( - out, input, weight, rms, token_scale, hidden_size, residual); + out, input, weight, rms, &token_scale, hidden_size, residual); } } @@ -75,14 +76,52 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel( // RMS Norm + Quant if constexpr (std::is_same_v) { + token_scale = 1.0f / token_scale; vllm::norm_and_quant( - out, input, weight, rms, 1.0f / token_scale, hidden_size, residual); + out, input, weight, rms, &token_scale, hidden_size, residual); } else { // FP8 - Do not invert s_token_scale for exact match with FBGemm vllm::norm_and_quant( - out, input, weight, rms, token_scale, hidden_size, residual); + out, input, weight, rms, &token_scale, hidden_size, residual); } } + +// RMS norm + quant kernel +template +__global__ void rms_norm_per_block_quant_kernel( + scalar_out_t* __restrict__ out, // [..., hidden_size] + float* __restrict__ scales, // [num_tokens, hidden_size / group_size] + // or + // [hidden_size / group_size, num_tokens] + scalar_t const* __restrict__ input, // [..., hidden_size] + scalar_t const* __restrict__ weight, // [hidden_size] + float const* scale_ub, float const var_epsilon, int32_t const hidden_size, + scalar_t* __restrict__ residual = nullptr) { + float rms; + // Compute RMS + // Always able to vectorize due to constraints on hidden_size + vllm::vectorized::compute_rms( + &rms, input, hidden_size, var_epsilon, residual); + + // Compute Scale + // Always able to vectorize due to constraints on hidden_size and group_size + vllm::vectorized::compute_dynamic_per_token_scales< + scalar_t, scalar_out_t, has_residual, is_scale_transposed, group_size>( + nullptr, scales, input, weight, rms, scale_ub, hidden_size, residual); + + // RMS Norm + Quant + // Always able to vectorize due to constraints on hidden_size + // For int8, don't invert token_scale here: do it inside the norm_and_quant + // kernel. We do it because particular elements of token_scale can be shared + // between multiple threads, so this way, we avoid extra synchronization + // overhead. + vllm::vectorized::norm_and_quant< + scalar_t, scalar_out_t, std::is_same_v, + has_residual, is_scale_transposed, group_size>( + out, input, weight, rms, scales, hidden_size, residual); +} + } // namespace vllm // Residual add + RMS norm + dynamic per token @@ -103,30 +142,19 @@ void rms_norm_dynamic_per_token_quant_dispatch( const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (residual.has_value()) { + VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] { VLLM_DISPATCH_QUANT_TYPES( out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { vllm::rms_norm_dynamic_per_token_quant_kernel + has_residual> <<>>( out.data_ptr(), scales.data_ptr(), input.data_ptr(), weight.data_ptr(), scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, - var_epsilon, hidden_size, residual->data_ptr()); + var_epsilon, hidden_size, + has_residual ? residual->data_ptr() : nullptr); }); - - } else { - VLLM_DISPATCH_QUANT_TYPES( - out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] { - vllm::rms_norm_dynamic_per_token_quant_kernel - <<>>( - out.data_ptr(), scales.data_ptr(), - input.data_ptr(), weight.data_ptr(), - scale_ub.has_value() ? scale_ub->data_ptr() : nullptr, - var_epsilon, hidden_size, nullptr); - }); - } + }); } void rms_norm_dynamic_per_token_quant( @@ -157,3 +185,79 @@ void rms_norm_dynamic_per_token_quant( out, input, weight, scales, var_epsilon, scale_ub, residual); }); } + +// Residual add + RMS norm + dynamic per token +void rms_norm_per_block_quant_dispatch( + torch::Tensor& out, // [..., hidden_size] + torch::Tensor const& input, // [..., hidden_size] + torch::Tensor const& weight, // [hidden_size] + torch::Tensor& scales, // [num_tokens, hidden_size / group_size] or + // [hidden_size / group_size, num_tokens] + int32_t group_size, + double const var_epsilon, // Variance epsilon used in norm calculation + std::optional const& scale_ub, + std::optional& residual, bool is_scale_transposed) { + int32_t hidden_size = input.size(-1); + auto num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + const int max_block_size = (num_tokens <= 256) ? 512 : 256; + dim3 block(std::min(hidden_size, max_block_size)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "rms_norm_per_block_quant_fp_dispatch", [&] { + using scalar_in_t = scalar_t; + VLLM_DISPATCH_GROUP_SIZE(group_size, gs, [&] { + VLLM_DISPATCH_BOOL(residual.has_value(), has_residual, [&] { + VLLM_DISPATCH_BOOL(is_scale_transposed, transpose_scale, [&] { + VLLM_DISPATCH_QUANT_TYPES( + out.scalar_type(), "rms_norm_per_block_quant_kernel", [&] { + vllm::rms_norm_per_block_quant_kernel + <<>>( + out.data_ptr(), scales.data_ptr(), + input.data_ptr(), + weight.data_ptr(), + scale_ub.has_value() ? scale_ub->data_ptr() + : nullptr, + var_epsilon, hidden_size, + has_residual ? residual->data_ptr() + : nullptr); + }); + }); + }); + }); + }); +} + +void rms_norm_per_block_quant(torch::Tensor& out, torch::Tensor const& input, + torch::Tensor const& weight, + torch::Tensor& scales, double const var_epsilon, + std::optional scale_ub, + std::optional residual, + int64_t group_size, bool is_scale_transposed) { + static c10::ScalarType kFp8Type = is_fp8_ocp() + ? c10::ScalarType::Float8_e4m3fn + : c10::ScalarType::Float8_e4m3fnuz; + TORCH_CHECK(out.dtype() == kFp8Type || out.dtype() == torch::kInt8); + TORCH_CHECK(out.is_contiguous() && input.is_contiguous()); + + if (scale_ub.has_value()) { + TORCH_CHECK(out.dtype() == kFp8Type); + } + TORCH_CHECK(weight.dtype() == input.dtype()); + TORCH_CHECK(scales.dtype() == torch::kFloat32); + if (residual) { + TORCH_CHECK(residual->scalar_type() == input.scalar_type()); + } + + TORCH_CHECK(group_size == 128 || group_size == 64, + "Unsupported group size: ", group_size); + + rms_norm_per_block_quant_dispatch(out, input, weight, scales, group_size, + var_epsilon, scale_ub, residual, + is_scale_transposed); +} \ No newline at end of file diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh index 2d2fd771205c7..cb7adc3125734 100644 --- a/csrc/quantization/fused_kernels/layernorm_utils.cuh +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -9,6 +9,7 @@ #include "quant_conversions.cuh" #include "../../cub_helpers.h" +#include "../../cuda_compat.h" namespace vllm { @@ -43,62 +44,150 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, *rms = s_rms; } -template +__device__ float warpReduceMaxSpecialized(volatile float* val, int64_t tid, + int64_t thread_in_warp, + int64_t reduced_elems) { + static_assert(WARP_SIZE == 32 || WARP_SIZE == 64); + if constexpr (WARP_SIZE == 64) { + if (thread_in_warp + 64 < reduced_elems) + val[tid] = fmaxf(val[tid], val[tid + 64]); + } + if (thread_in_warp + 32 < reduced_elems) + val[tid] = fmaxf(val[tid], val[tid + 32]); + if (thread_in_warp + 16 < reduced_elems) + val[tid] = fmaxf(val[tid], val[tid + 16]); + if (thread_in_warp + 8 < reduced_elems) + val[tid] = fmaxf(val[tid], val[tid + 8]); + if (thread_in_warp + 4 < reduced_elems) + val[tid] = fmaxf(val[tid], val[tid + 4]); + if (thread_in_warp + 2 < reduced_elems) + val[tid] = fmaxf(val[tid], val[tid + 2]); + if (thread_in_warp + 1 < reduced_elems) + val[tid] = fmaxf(val[tid], val[tid + 1]); + return val[tid]; +} + +template __device__ void compute_dynamic_per_token_scales( float* __restrict__ token_scale, float* __restrict__ all_token_scales, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, float const rms, float const* __restrict__ scale_ub, - int32_t const hidden_size, - scalar_t const* __restrict__ residual = nullptr) { - int64_t const token_offset = blockIdx.x * static_cast(hidden_size); - ; - constexpr scalar_out_t qmax{quant_type_max_v}; - + int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr, + int32_t const group_size = 0) { float block_absmax_val_maybe = 0.0f; - for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { - float x = static_cast(input[token_offset + i]); - if constexpr (has_residual) { - x += static_cast(residual[token_offset + i]); - } - - x = static_cast(static_cast(x * rms) * weight[i]); - block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x)); - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage reduceStore; - block_absmax_val_maybe = - BlockReduce(reduceStore) - .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x); - - __shared__ float s_token_scale; - if (threadIdx.x == 0) { - float scale = 0.0f; - if (scale_ub) { - scale = min(block_absmax_val_maybe, *scale_ub); - } else { - scale = block_absmax_val_maybe; - } - // token scale computation - scale = max(scale / qmax, min_scaling_factor::val()); - s_token_scale = scale; // Shared memory store - all_token_scales[blockIdx.x] = scale; // Global output store - } + constexpr scalar_out_t qmax{quant_type_max_v}; __syncthreads(); + if (group_size > 0) { + __shared__ float s_max_vals[1024]; + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + int64_t num_groups = hidden_size / group_size; + int64_t const threads_per_group = blockDim.x / num_groups; + int64_t const thread_in_group = threadIdx.x % threads_per_group; + int64_t const group_offset = threadIdx.x / threads_per_group * group_size; + int64_t const thread_offset = group_offset + thread_in_group; + int64_t const thread_end = + min(group_offset + group_size, static_cast(hidden_size)); + for (auto i = thread_offset; i < thread_end; i += threads_per_group) { + float x = static_cast(input[token_offset + i]); + if constexpr (has_residual) { + x += static_cast(residual[token_offset + i]); + } + x = static_cast(static_cast(x * rms) * weight[i]); + block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x)); + } + s_max_vals[threadIdx.x] = block_absmax_val_maybe; + __syncthreads(); - *token_scale = s_token_scale; + int64_t const warp_size = WARP_SIZE; + int64_t const num_warps = blockDim.x / warp_size; + int64_t const warp_id = threadIdx.x / warp_size; + int64_t const thread_in_warp = threadIdx.x % warp_size; + int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps; + for (auto i = 0; i < groups_per_warp; ++i) { + int64_t const group_id = i * num_warps + warp_id; + if (group_id < num_groups) { + int64_t warp_start = group_id * threads_per_group; + int64_t const start = warp_start + thread_in_warp; + int64_t const warp_end = min(warp_start + threads_per_group, + static_cast(hidden_size)); + for (auto j = start; j + warp_size < warp_end; j += warp_size) { + s_max_vals[start] = + fmaxf(s_max_vals[start], s_max_vals[j + warp_size]); + } + warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp, + min(warp_end - warp_start, warp_size)); + } + } + __syncthreads(); + + if (thread_in_group == 0 && thread_offset < thread_end) { + block_absmax_val_maybe = s_max_vals[threadIdx.x]; + float scale = 0.0f; + if (scale_ub) { + scale = min(block_absmax_val_maybe, *scale_ub); + } else { + scale = block_absmax_val_maybe; + } + // token scale computation + scale = max(scale / qmax, min_scaling_factor::val()); + // Global output store + if constexpr (is_scale_transposed) { + all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x + + blockIdx.x] = scale; + } else { + all_token_scales[blockIdx.x * num_groups + + threadIdx.x / threads_per_group] = scale; + } + } + __syncthreads(); + } else { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + + for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { + float x = static_cast(input[token_offset + i]); + if constexpr (has_residual) { + x += static_cast(residual[token_offset + i]); + } + + x = static_cast(static_cast(x * rms) * weight[i]); + block_absmax_val_maybe = fmaxf(block_absmax_val_maybe, fabsf(x)); + } + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + block_absmax_val_maybe = + BlockReduce(reduceStore) + .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x); + + __shared__ float s_token_scale; + if (threadIdx.x == 0) { + float scale = 0.0f; + if (scale_ub) { + scale = min(block_absmax_val_maybe, *scale_ub); + } else { + scale = block_absmax_val_maybe; + } + // token scale computation + scale = max(scale / qmax, min_scaling_factor::val()); + s_token_scale = scale; // Shared memory store + all_token_scales[blockIdx.x] = scale; // Global output store + } + __syncthreads(); + + *token_scale = s_token_scale; + } } template + bool has_residual = false, bool is_scale_transposed = false> __device__ void norm_and_quant(scalar_out_t* __restrict__ output, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, - float const rms, float const scale, + float const rms, float* const scale, int32_t const hidden_size, - scalar_t* __restrict__ residual = nullptr) { + scalar_t* __restrict__ residual = nullptr, + int32_t const group_size = 0) { int64_t const token_offset = blockIdx.x * static_cast(hidden_size); - ; for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { float x = static_cast(input[token_offset + i]); @@ -109,8 +198,21 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output, // Norm x = static_cast(static_cast(x * rms) * weight[i]); // Quant + // If groupwise is_scale_inverted is true, so we invert the scale here. + int64_t scale_idx = 0; + if (group_size > 0) { + if constexpr (is_scale_transposed) { + scale_idx = (i / group_size) * gridDim.x + blockIdx.x; + } else { + scale_idx = blockIdx.x * (hidden_size / group_size) + i / group_size; + } + } + auto scale_val = + (group_size > 0 + ? (is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx]) + : *scale); output[token_offset + i] = - ScaledQuant::quant_fn(x, scale); + ScaledQuant::quant_fn(x, scale_val); } } @@ -178,95 +280,191 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, // Vectorized version of vllm::compute_dynamic_per_token_scales // hidden_size must be a multiple of 4 -template +template __device__ void compute_dynamic_per_token_scales( float* __restrict__ token_scale, float* __restrict__ all_token_scales, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, float const rms, float const* __restrict__ scale_ub, int32_t const hidden_size, scalar_t const* __restrict__ residual = nullptr) { - int64_t const token_offset = blockIdx.x * static_cast(hidden_size); - ; - - // Vectorized input/weight/residual to better utilize memory bandwidth. - vec4_t const* vec_input = - reinterpret_cast const*>(&input[token_offset]); - vec4_t const* vec_weight = - reinterpret_cast const*>(weight); - vec4_t const* vec_residual = nullptr; - if constexpr (has_residual) { - vec_residual = - reinterpret_cast const*>(&residual[token_offset]); - } - constexpr scalar_out_t qmax{quant_type_max_v}; const int VEC_SIZE = 4; - int32_t const num_vec_elems = hidden_size >> 2; float block_absmax_val_maybe = 0.0f; -#pragma unroll 4 - for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { - vec4_t in = vec_input[i]; - vec4_t const w = vec_weight[i]; + // Vectorized input/weight/residual to better utilize memory bandwidth. + vec4_t const* vec_input = nullptr; + vec4_t const* vec_weight = nullptr; + vec4_t const* vec_residual = nullptr; - vec4_t x; -#pragma unroll - for (int j = 0; j < VEC_SIZE; ++j) { - x.val[j] = static_cast(in.val[j]); - } + if constexpr (group_size > 0) { + __shared__ float s_max_vals[1024]; + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + int64_t const num_groups = hidden_size / group_size; + int64_t const threads_per_group = blockDim.x / num_groups; + int64_t const thread_in_group = threadIdx.x % threads_per_group; + int64_t const group_offset = + threadIdx.x / threads_per_group * (group_size >> 2); + int64_t const thread_offset = group_offset + thread_in_group; + int64_t const thread_end = min(group_offset + (group_size >> 2), + static_cast(hidden_size >> 2)); + vec_input = reinterpret_cast const*>(&input[token_offset]); + vec_weight = reinterpret_cast const*>(weight); if constexpr (has_residual) { - vec4_t r = vec_residual[i]; + vec_residual = + reinterpret_cast const*>(&residual[token_offset]); + } + int32_t const num_vec_elems = thread_end; + +#pragma unroll 4 + for (auto i = thread_offset; i < num_vec_elems; i += threads_per_group) { + vec4_t in = vec_input[i]; + vec4_t const w = vec_weight[i]; + + vec4_t x; #pragma unroll for (int j = 0; j < VEC_SIZE; ++j) { - x.val[j] += static_cast(r.val[j]); + x.val[j] = static_cast(in.val[j]); + } + + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] += static_cast(r.val[j]); + } + } + +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + block_absmax_val_maybe = + fmaxf(block_absmax_val_maybe, + fabs(static_cast(x.val[j] * rms) * w.val[j])); } } + s_max_vals[threadIdx.x] = block_absmax_val_maybe; + __syncthreads(); + + int64_t const warp_size = WARP_SIZE; + int64_t const num_warps = blockDim.x / warp_size; + int64_t const warp_id = threadIdx.x / warp_size; + int64_t const thread_in_warp = threadIdx.x % warp_size; + int64_t const groups_per_warp = (num_groups + num_warps - 1) / num_warps; + for (auto i = 0; i < groups_per_warp; ++i) { + int64_t const group_id = i * num_warps + warp_id; + if (group_id < num_groups) { + int64_t warp_start = group_id * threads_per_group; + int64_t const start = warp_start + thread_in_warp; + int64_t const warp_end = min(warp_start + threads_per_group, + static_cast(hidden_size)); + for (auto j = start; j + warp_size < warp_end; j += warp_size) { + s_max_vals[start] = + fmaxf(s_max_vals[start], s_max_vals[j + warp_size]); + } + warpReduceMaxSpecialized(s_max_vals, start, thread_in_warp, + min(warp_end - warp_start, warp_size)); + } + } + __syncthreads(); + + if (thread_in_group == 0 && thread_offset < thread_end) { + block_absmax_val_maybe = s_max_vals[threadIdx.x]; + float scale = 0.0f; + if (scale_ub) { + scale = min(block_absmax_val_maybe, *scale_ub); + } else { + scale = block_absmax_val_maybe; + } + // token scale computation + scale = max(scale / qmax, min_scaling_factor::val()); + // Global output store + if constexpr (is_scale_transposed) { + all_token_scales[(threadIdx.x / threads_per_group) * gridDim.x + + blockIdx.x] = scale; + } else { + all_token_scales[blockIdx.x * num_groups + + threadIdx.x / threads_per_group] = scale; + } + } + __syncthreads(); + + } else { + int64_t const token_offset = blockIdx.x * static_cast(hidden_size); + vec_input = reinterpret_cast const*>(&input[token_offset]); + vec_weight = reinterpret_cast const*>(weight); + if constexpr (has_residual) { + vec_residual = + reinterpret_cast const*>(&residual[token_offset]); + } + + int32_t const num_vec_elems = (hidden_size >> 2); + +#pragma unroll 4 + for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + vec4_t in = vec_input[i]; + vec4_t const w = vec_weight[i]; + + vec4_t x; #pragma unroll - for (int j = 0; j < VEC_SIZE; ++j) { - block_absmax_val_maybe = - fmaxf(block_absmax_val_maybe, - fabs(static_cast(x.val[j] * rms) * w.val[j])); + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] = static_cast(in.val[j]); + } + + if constexpr (has_residual) { + vec4_t r = vec_residual[i]; +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] += static_cast(r.val[j]); + } + } + +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + block_absmax_val_maybe = + fmaxf(block_absmax_val_maybe, + fabs(static_cast(x.val[j] * rms) * w.val[j])); + } } - } - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage reduceStore; - block_absmax_val_maybe = - BlockReduce(reduceStore) - .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x); + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + block_absmax_val_maybe = + BlockReduce(reduceStore) + .Reduce(block_absmax_val_maybe, CubMaxOp{}, blockDim.x); - __shared__ float s_token_scale; - if (threadIdx.x == 0) { - float scale = 0.0f; - if (scale_ub) { - scale = min(block_absmax_val_maybe, *scale_ub); - } else { - scale = block_absmax_val_maybe; + __shared__ float s_token_scale; + if (threadIdx.x == 0) { + float scale = 0.0f; + if (scale_ub) { + scale = min(block_absmax_val_maybe, *scale_ub); + } else { + scale = block_absmax_val_maybe; + } + // token scale computation + scale = max(scale / qmax, min_scaling_factor::val()); + s_token_scale = scale; // shared memory store + all_token_scales[blockIdx.x] = scale; // global output store } - // token scale computation - scale = max(scale / qmax, min_scaling_factor::val()); - s_token_scale = scale; // shared memory store - all_token_scales[blockIdx.x] = scale; // global output store - } - __syncthreads(); + __syncthreads(); - *token_scale = s_token_scale; + *token_scale = s_token_scale; + } } // hidden_size must be a multiple of 4 template + bool has_residual = false, bool is_scale_transposed = false, + int32_t group_size = 0> __device__ void norm_and_quant(scalar_out_t* __restrict__ output, scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight, - float const rms, float const scale, + float const rms, float* const scale, int32_t const hidden_size, scalar_t* __restrict__ residual = nullptr) { int64_t const token_offset = blockIdx.x * static_cast(hidden_size); - ; // Vectorized input/output/weight/residual to better utilize memory bandwidth. vec4_t const* vec_input = @@ -311,10 +509,26 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output, } q8x4_t out; + + float scale_val; + + if constexpr (group_size > 0) { + int64_t const num_groups = hidden_size / group_size; + int64_t scale_idx = 0; + if constexpr (is_scale_transposed) { + scale_idx = (i * VEC_SIZE / group_size) * gridDim.x + blockIdx.x; + } else { + scale_idx = blockIdx.x * num_groups + i * VEC_SIZE / group_size; + } + scale_val = + is_scale_inverted ? 1.0f / scale[scale_idx] : scale[scale_idx]; + } else { + scale_val = *scale; + } #pragma unroll for (int j = 0; j < VEC_SIZE; ++j) { out.val[j] = ScaledQuant::quant_fn( - static_cast(x.val[j] * rms) * w.val[j], scale); + static_cast(x.val[j] * rms) * w.val[j], scale_val); } vec_output[i] = out; } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 23ac1d9abeea9..db37a9b9b88e3 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -215,6 +215,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA, &rms_norm_dynamic_per_token_quant); + // Fused Layernorm + Block quant kernels + ops.def( + "rms_norm_per_block_quant(Tensor! result, Tensor input, " + "Tensor weight, Tensor! scale, float epsilon, " + "Tensor? scale_ub, Tensor!? residual, int group_size, " + "bool is_scale_transposed) -> ()"); + ops.impl("rms_norm_per_block_quant", torch::kCUDA, &rms_norm_per_block_quant); + // Rotary embedding // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. ops.def( diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index d0ba8385f4a01..2ad34a79859a3 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -18,6 +18,9 @@ from vllm.config import ( VllmConfig, ) from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + W8A8BlockFp8LinearOp, +) from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, QuantKey, @@ -25,10 +28,12 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, + cutlass_block_fp8_supported, cutlass_fp8_supported, maybe_create_device_identity, ) from vllm.platforms import current_platform +from vllm.utils.deep_gemm import is_deep_gemm_supported from ..utils import override_cutlass_fp8_supported from .backend import TestBackend @@ -44,7 +49,7 @@ class TestModel(torch.nn.Module): self, hidden_size: int, eps: float, - static: bool, + group_shape: GroupShape, cuda_force_torch: bool, *args, **kwargs, @@ -52,8 +57,17 @@ class TestModel(torch.nn.Module): super().__init__(*args, **kwargs) self.cuda_force_torch = cuda_force_torch self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)] - self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] - group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN + if group_shape.is_per_group(): + self.wscale = [ + torch.rand( + (hidden_size // group_shape[1], hidden_size // group_shape[1]), + dtype=torch.float32, + ) + for _ in range(3) + ] + else: + self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] + static = group_shape == GroupShape.PER_TENSOR quant_scale = ScaleDesc(torch.float32, static, group_shape) self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True) if static: @@ -61,18 +75,29 @@ class TestModel(torch.nn.Module): else: self.scale = [None for _ in range(3)] self.w = [ - torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() - for _ in range(3) + torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3) ] + if not group_shape.is_per_group(): + self.w = [self.w[0].t() for _ in range(3)] - with override_cutlass_fp8_supported(not cuda_force_torch): - self.fp8_linear = Fp8LinearOp( - act_quant_static=static, + if group_shape.is_per_group(): + self.fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(group_shape[1], group_shape[1]), act_quant_group_shape=group_shape, + cutlass_block_fp8_supported=cutlass_block_fp8_supported(), + use_aiter_and_is_supported=False, ) + self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled() + else: + with override_cutlass_fp8_supported(not cuda_force_torch): + self.fp8_linear = Fp8LinearOp( + act_quant_static=static, + act_quant_group_shape=group_shape, + ) + self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled() self.enable_rms_norm_custom_op = self.norm[0].enabled() - self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled() + self.group_shape = group_shape def forward(self, x): # avoid having graph input be an arg to a pattern directly @@ -119,11 +144,19 @@ class TestModel(torch.nn.Module): ) +GROUP_SHAPES = [ + GroupShape.PER_TOKEN, + GroupShape.PER_TENSOR, + GroupShape(1, 128), + GroupShape(1, 64), +] + + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -@pytest.mark.parametrize("hidden_size", [64]) +@pytest.mark.parametrize("hidden_size", [256]) @pytest.mark.parametrize("num_tokens", [257]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) -@pytest.mark.parametrize("static", [True, False]) +@pytest.mark.parametrize("group_shape", GROUP_SHAPES) @pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False]) @pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False]) # cuda_force_torch used to test torch code path on platforms that @@ -139,7 +172,7 @@ def test_fusion_rmsnorm_quant( hidden_size, num_tokens, eps, - static, + group_shape, enable_rms_norm_custom_op, enable_quant_fp8_custom_op, cuda_force_torch, @@ -149,6 +182,15 @@ def test_fusion_rmsnorm_quant( torch.manual_seed(1) maybe_create_device_identity() # needed for certain non-cutlass fp8 paths + if not enable_quant_fp8_custom_op and group_shape.is_per_group(): + pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization") + + # Skip test for 64-bit group shape when running with cutlass or deepgemm + if group_shape == GroupShape(1, 64) and ( + cutlass_block_fp8_supported() or is_deep_gemm_supported() + ): + pytest.skip("Unsupported group shape 64 for CUTLASS/DeepGemm") + custom_ops = [] if enable_rms_norm_custom_op: custom_ops.append("+rms_norm") @@ -172,8 +214,7 @@ def test_fusion_rmsnorm_quant( backend = TestBackend(noop_pass, fusion_pass, cleanup_pass) backend2 = TestBackend(noop_pass, cleanup_pass) - model = TestModel(hidden_size, eps, static, cuda_force_torch) - + model = TestModel(hidden_size, eps, group_shape, cuda_force_torch) # First dimension dynamic x = torch.rand(num_tokens, hidden_size) torch._dynamo.mark_dynamic(x, 0) diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index b5fc653ca7353..094073f5d3f92 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -8,6 +8,12 @@ import torch import vllm._custom_ops as ops from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, +) +from vllm.model_executor.layers.quantization.utils.int8_utils import ( + per_token_group_quant_int8, +) DTYPES = [torch.bfloat16, torch.float] QUANT_DTYPES = [torch.int8, torch.float8_e4m3fn] @@ -21,6 +27,7 @@ NUM_TOKENS_HIDDEN_SIZES = [ ADD_RESIDUAL = [False, True] SCALE_UBS = [True, False] +GROUP_SIZES = [None, [1, 64], [1, 128]] SEEDS = [0] CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] @@ -45,12 +52,13 @@ def ref_rms_norm( return out, residual -def ref_dynamic_per_token_quant( +def ref_dynamic_per_token_or_block_quant( rms_norm_layer: RMSNorm, x: torch.Tensor, quant_dtype: torch.dtype, residual: torch.Tensor | None, scale_ub: torch.Tensor | None, + group_size: list[int] | None, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: if scale_ub is not None: assert quant_dtype == torch.float8_e4m3fn @@ -59,13 +67,24 @@ def ref_dynamic_per_token_quant( torch_out, residual = ref_rms_norm(rms_norm_layer, x, residual) # Quant - if quant_dtype == torch.float8_e4m3fn: - torch_out, scales = ops.scaled_fp8_quant( - torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True - ) + if group_size is not None: + if quant_dtype == torch.float8_e4m3fn: + torch_out, scales = per_token_group_quant_fp8( + torch_out, group_size=group_size[1], use_ue8m0=False + ) + else: + assert quant_dtype == torch.int8 + torch_out, scales = per_token_group_quant_int8( + torch_out, group_size=group_size[1] + ) else: - assert quant_dtype == torch.int8 - torch_out, scales, _ = ops.scaled_int8_quant(torch_out) + if quant_dtype == torch.float8_e4m3fn: + torch_out, scales = ops.scaled_fp8_quant( + torch_out, scale_ub=scale_ub, use_per_token_if_dynamic=True + ) + else: + assert quant_dtype == torch.int8 + torch_out, scales, _ = ops.scaled_int8_quant(torch_out) return torch_out, scales, residual @@ -76,24 +95,32 @@ def ref_impl( quant_dtype: torch.dtype, residual: torch.Tensor | None, scale_ub: torch.Tensor | None, + group_size: list[int] | None, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: - return ref_dynamic_per_token_quant( - rms_norm_layer, x, quant_dtype, residual, scale_ub + return ref_dynamic_per_token_or_block_quant( + rms_norm_layer, x, quant_dtype, residual, scale_ub, group_size ) -def ops_dynamic_per_token_quant( +def ops_dynamic_per_token_or_block_quant( weight: torch.Tensor, x: torch.Tensor, quant_dtype: torch.dtype, residual: torch.Tensor | None, scale_ub: torch.Tensor | None, + group_size: list[int] | None, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: if residual is not None: residual = residual.clone() - out, scales = ops.rms_norm_dynamic_per_token_quant( - x, weight, EPS, quant_dtype, scale_ub, residual - ) + if group_size is not None: + out, scales = ops.rms_norm_per_block_quant( + x, weight, EPS, quant_dtype, group_size, scale_ub, residual, True + ) + scales = scales.contiguous() + else: + out, scales = ops.rms_norm_dynamic_per_token_quant( + x, weight, EPS, quant_dtype, scale_ub, residual + ) return out, scales, residual @@ -103,8 +130,11 @@ def ops_impl( quant_dtype: torch.dtype, residual: torch.Tensor | None, scale_ub: torch.Tensor | None, + group_size: list[int] | None, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: - return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub) + return ops_dynamic_per_token_or_block_quant( + weight, x, quant_dtype, residual, scale_ub, group_size + ) @pytest.mark.parametrize("num_tokens, hidden_size", NUM_TOKENS_HIDDEN_SIZES) @@ -112,6 +142,7 @@ def ops_impl( @pytest.mark.parametrize("has_scale_ub", SCALE_UBS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("quant_dtype", QUANT_DTYPES) +@pytest.mark.parametrize("group_size", GROUP_SIZES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() @@ -122,6 +153,7 @@ def test_rms_norm( has_scale_ub: bool, dtype: torch.dtype, quant_dtype: torch.dtype, + group_size: list[int] | None, seed: int, device: str, ) -> None: @@ -130,6 +162,14 @@ def test_rms_norm( torch.cuda.manual_seed(seed) torch.set_default_device(device) + if group_size is not None and hidden_size % group_size[1] != 0: + # skip + return + + if group_size is not None and has_scale_ub: + # blockwise baseline doesn't support scale_ub + return + if has_scale_ub and quant_dtype != torch.float8_e4m3fn: # skip return @@ -150,10 +190,10 @@ def test_rms_norm( scale_ub = None ref_out, ref_scales, ref_residual = ref_impl( - layer, x, quant_dtype, residual, scale_ub + layer, x, quant_dtype, residual, scale_ub, group_size ) ops_out, ops_scales, ops_residual = ops_impl( - layer.weight, x, quant_dtype, residual, scale_ub + layer.weight, x, quant_dtype, residual, scale_ub, group_size ) assert ref_out.dtype == quant_dtype @@ -166,11 +206,15 @@ def test_rms_norm( assert torch.allclose(ref_scales, ops_scales) a = ref_out.to(dtype=torch.float32) b = ops_out.to(dtype=torch.float32) - ok = torch.allclose(a, b) + ok = torch.allclose(a, b, atol=1e-6) if not ok: # fallback: compare dequantized values with relaxed tolerance - a_deq = a * ref_scales.view(-1, 1) - b_deq = b * ops_scales.view(-1, 1) + if group_size is None: + a_deq = a * ref_scales.view(-1, 1) + b_deq = b * ops_scales.view(-1, 1) + else: + a_deq = a * ref_scales.repeat_interleave(group_size[1], dim=1) + b_deq = b * ops_scales.repeat_interleave(group_size[1], dim=1) # NOTE: It is possible that some future test cases trigger this # max diff due to precision issues. If such an error is # encountered, it's recommended to inspect the differences between diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 94e27545239f4..77d5453291e3c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -436,6 +436,46 @@ def rms_norm_dynamic_per_token_quant( return output, scales +# fused quant layer norm ops blocked +def rms_norm_per_block_quant( + input: torch.Tensor, + weight: torch.Tensor, + epsilon: float, + quant_dtype: torch.dtype, + group_size: list[int], + scale_ub: torch.Tensor | None = None, + residual: torch.Tensor | None = None, + is_scale_transposed: bool = False, +) -> tuple[torch.Tensor, torch.Tensor]: + assert len(group_size) == 2 + output = torch.empty_like(input, dtype=quant_dtype) + if is_scale_transposed: + scales = torch.empty( + (input.shape[-1] // group_size[1], input.numel() // input.shape[-1]), + device=input.device, + dtype=torch.float32, + ).transpose(0, 1) + else: + scales = torch.empty( + (input.numel() // input.shape[-1], input.shape[-1] // group_size[1]), + device=input.device, + dtype=torch.float32, + ) + + torch.ops._C.rms_norm_per_block_quant( + output, + input, + weight, + scales, + epsilon, + scale_ub, + residual, + group_size[1], + is_scale_transposed, + ) + return output, scales + + # quantization ops # awq def awq_dequantize( diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 1d6e297b495eb..de083a2e5e3c7 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -15,13 +15,22 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, QuantKey, ScaleDesc, + kFp8Dynamic64Sym, + kFp8Dynamic128Sym, kFp8DynamicTensorSym, kFp8DynamicTokenSym, kFp8StaticTensorSym, kNvfp4Quant, kStaticTensorScale, ) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + cutlass_block_fp8_supported, +) from vllm.platforms import current_platform +from vllm.utils.deep_gemm import ( + is_deep_gemm_e8m0_used, + should_use_deepgemm_for_fp8_linear_for_nk, +) from .inductor_pass import enable_fake_mode from .matcher_utils import MatcherFusedAddRMSNorm, MatcherQuantFP8, MatcherRMSNorm @@ -58,6 +67,9 @@ QUANT_OPS: dict[QuantKey, OpOverload] = { } if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default +if current_platform.is_cuda(): + QUANT_OPS[kFp8Dynamic128Sym] = torch.ops._C.per_token_group_fp8_quant.default # noqa: E501 + QUANT_OPS[kFp8Dynamic64Sym] = torch.ops._C.per_token_group_fp8_quant.default # noqa: E501 class FusedRMSQuantKey(NamedTuple): @@ -90,6 +102,18 @@ FUSED_OPS: dict[FusedRMSQuantKey, OpOverload] = { FusedRMSQuantKey( kFp8DynamicTokenSym, True ): torch.ops._C.rms_norm_dynamic_per_token_quant.default, # noqa: E501 + FusedRMSQuantKey( + kFp8Dynamic128Sym, False + ): torch.ops._C.rms_norm_per_block_quant.default, # noqa: E501 + FusedRMSQuantKey( + kFp8Dynamic128Sym, True + ): torch.ops._C.rms_norm_per_block_quant.default, # noqa: E501 + FusedRMSQuantKey( + kFp8Dynamic64Sym, False + ): torch.ops._C.rms_norm_per_block_quant.default, # noqa: E501 + FusedRMSQuantKey( + kFp8Dynamic64Sym, True + ): torch.ops._C.rms_norm_per_block_quant.default, # noqa: E501 } @@ -100,6 +124,15 @@ class RMSNormQuantPattern: config = get_current_vllm_config() self.model_dtype = config.model_config.dtype if config.model_config else None + # groupwise FP8 linear uses col major scales if deepgemm and cutlass + using_deepgemm = should_use_deepgemm_for_fp8_linear_for_nk( + self.model_dtype, + config.model_config.hf_config.intermediate_size, + config.model_config.hf_config.hidden_size, + ) + use_col_major_scales = using_deepgemm or cutlass_block_fp8_supported() + use_e8m0 = is_deep_gemm_e8m0_used() if using_deepgemm else False + assert key in FUSED_OPS, f"unsupported fused rmsnorm+quant op for {key}" self.FUSED_OP = FUSED_OPS[key] @@ -108,7 +141,9 @@ class RMSNormQuantPattern: if not key.fused_add else MatcherFusedAddRMSNorm(epsilon) ) - self.quant_matcher = MatcherQuantFP8(key.quant) + self.quant_matcher = MatcherQuantFP8( + key.quant, use_col_major_scales=use_col_major_scales, use_e8m0=use_e8m0 + ) class RMSNormStaticQuantPattern(RMSNormQuantPattern): @@ -218,6 +253,120 @@ class FusedAddRMSNormStaticQuantPattern(RMSNormQuantPattern): ) +class FusedAddRMSNormGroupQuantPattern(RMSNormQuantPattern): + def __init__( + self, + epsilon: float, + quant_dtype: torch.dtype, + group_shape: GroupShape, + symmetric=True, + ): + scale = ScaleDesc(torch.float32, False, group_shape) + key = FusedRMSQuantKey( + fused_add=True, + quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), + ) + self.group_shape = group_shape + super().__init__(epsilon, key) + + def register(self, pm_pass: PatternMatcherPass): + def pattern(input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor): + result_rms, residual = self.rmsnorm_matcher(input, weight, residual) + result, scale = self.quant_matcher(result_rms) + return result, residual, scale + + def replacement( + input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor + ): + # In case we're matching native rms-norm, conversions might be + # optimized out. We convert here just to be safe. + input = input.to(dtype=self.model_dtype) + + result = torch.empty_like(input, dtype=self.quant_dtype) + scale = self.quant_matcher.make_scale( + input, transposed=self.quant_matcher.use_col_major_scales + ) + at = auto_functionalized( + self.FUSED_OP, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=self.epsilon, + scale_ub=None, + residual=residual, + group_size=self.group_shape[1], + is_scale_transposed=self.quant_matcher.use_col_major_scales, + ) + + # result, residual, scale + return at[1], at[3], at[2] + + pm.register_replacement( + pattern, + replacement, + self.rmsnorm_matcher.inputs(), + pm.fwd_only, + pm_pass, + ) + + +class RMSNormGroupQuantPattern(RMSNormQuantPattern): + def __init__( + self, + epsilon: float, + quant_dtype: torch.dtype, + group_shape: GroupShape, + symmetric=True, + ): + scale = ScaleDesc(torch.float32, False, group_shape) + key = FusedRMSQuantKey( + fused_add=False, + quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), + ) + self.group_shape = group_shape + super().__init__(epsilon, key) + + def register(self, pm_pass: PatternMatcherPass): + def pattern(input: torch.Tensor, weight: torch.Tensor): + result_rms = self.rmsnorm_matcher(input, weight) + result, scale = self.quant_matcher(result_rms) + return result, scale + + def replacement(input: torch.Tensor, weight: torch.Tensor): + # In case we're matching native rms-norm, conversions might be + # optimized out. We convert here just to be safe. + input = input.to(dtype=self.model_dtype) + + result = torch.empty_like(input, dtype=self.quant_dtype) + scale = self.quant_matcher.make_scale( + input, transposed=self.quant_matcher.use_col_major_scales + ) + at = auto_functionalized( + self.FUSED_OP, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=self.epsilon, + scale_ub=None, + residual=None, + group_size=self.group_shape[1], + is_scale_transposed=self.quant_matcher.use_col_major_scales, + ) + + # result, scale + return at[1], at[2] + + pm.register_replacement( + pattern, + replacement, + self.rmsnorm_matcher.inputs(), + pm.fwd_only, + pm_pass, + ) + + class RMSNormDynamicQuantPattern(RMSNormQuantPattern): def __init__( self, @@ -340,6 +489,25 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass): # Make sure fused add patterns are before simple rms norm, # as the latter is a subset of the former in torch ops for epsilon in [1e-5, 1e-6]: + # Fuse fused_add_rms_norm + fp8 group quant + FusedAddRMSNormGroupQuantPattern( + epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128) + ).register(self.patterns) + + # Fuse rms_norm + fp8 group quant + RMSNormGroupQuantPattern( + epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128) + ).register(self.patterns) + + FusedAddRMSNormGroupQuantPattern( + epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64) + ).register(self.patterns) + + # Fuse rms_norm + fp8 group quant + RMSNormGroupQuantPattern( + epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64) + ).register(self.patterns) + # Fuse fused_add_rms_norm + static fp8 quant FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register( self.patterns @@ -366,9 +534,11 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass): def uuid(self) -> Any: return self.hash_source( self, + RMSNormGroupQuantPattern, RMSNormQuantPattern, RMSNormStaticQuantPattern, RMSNormDynamicQuantPattern, FusedAddRMSNormStaticQuantPattern, FusedAddRMSNormDynamicQuantPattern, + FusedAddRMSNormGroupQuantPattern, ) diff --git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py index e4cd063d2aee1..0c0bece9b3fda 100644 --- a/vllm/compilation/matcher_utils.py +++ b/vllm/compilation/matcher_utils.py @@ -13,6 +13,8 @@ from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, _normalize_quant_group_shape, + kFp8Dynamic64Sym, + kFp8Dynamic128Sym, kFp8DynamicTensorSym, kFp8DynamicTokenSym, kFp8StaticTensorSym, @@ -35,6 +37,10 @@ QUANT_OPS: dict[QuantKey, OpOverload] = { if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): QUANT_OPS[kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default # noqa: E501 +if current_platform.is_cuda(): + QUANT_OPS[kFp8Dynamic128Sym] = torch.ops._C.per_token_group_fp8_quant.default # noqa: E501 + QUANT_OPS[kFp8Dynamic64Sym] = torch.ops._C.per_token_group_fp8_quant.default # noqa: E501 + SILU_MUL_OP = torch.ops._C.silu_and_mul.default @@ -224,12 +230,20 @@ class MatcherFusedAddRMSNorm(MatcherCustomOp): class MatcherQuantFP8(MatcherCustomOp): - def __init__(self, quant_key: QuantKey, enabled: bool | None = None): + def __init__( + self, + quant_key: QuantKey, + enabled: bool | None = None, + use_col_major_scales: bool = False, + use_e8m0: bool = False, + ): if enabled is None: enabled = QuantFP8.enabled() super().__init__(enabled) self.quant_key = quant_key + self.use_col_major_scales = use_col_major_scales + self.use_e8m0 = use_e8m0 assert quant_key in QUANT_OPS, f"unsupported quantization scheme {quant_key}" self.QUANT_OP = QUANT_OPS[quant_key] @@ -248,6 +262,27 @@ class MatcherQuantFP8(MatcherCustomOp): input.shape, device=input.device, dtype=self.quant_key.dtype ) + if self.quant_key.scale.group_shape.is_per_group(): + assert scale is None + scale = self.make_scale(input, transposed=self.use_col_major_scales) + + finfo = torch.finfo(self.quant_key.dtype) + fp8_min = finfo.min + fp8_max = finfo.max + + _, result, scale = auto_functionalized( + self.QUANT_OP, + input=input, + output_q=result, + output_s=scale, + group_size=self.quant_key.scale.group_shape[1], + eps=1e-10, + fp8_min=fp8_min, + fp8_max=fp8_max, + scale_ue8m0=self.use_e8m0, + ) + return result, scale + if self.quant_key.scale.static: assert scale is not None _, result = auto_functionalized( @@ -269,7 +304,7 @@ class MatcherQuantFP8(MatcherCustomOp): ) -> tuple[torch.Tensor, torch.Tensor]: return self.quant_fp8(input, scale) - def make_scale(self, input: torch.Tensor): + def make_scale(self, input: torch.Tensor, transposed: bool = False): normalized_group_shape = _normalize_quant_group_shape( input, self.quant_key.scale.group_shape ) @@ -277,6 +312,11 @@ class MatcherQuantFP8(MatcherCustomOp): input.shape[0] // normalized_group_shape[0], input.shape[1] // normalized_group_shape[1], ) + if transposed: + scale_shape = tuple(reversed(scale_shape)) + return torch.empty( + scale_shape, device=input.device, dtype=torch.float32 + ).permute(-1, -2) return torch.empty(scale_shape, device=input.device, dtype=torch.float32) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 7e1bda8639ac7..ad92f4ec63c34 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -733,7 +733,7 @@ def per_token_group_quant_fp8( assert out_q is None or out_q.shape == x.shape x_q = out_q if x_q is None: - x_q = torch.empty_like(x, device=x.device, dtype=dtype) + x_q = torch.empty(x.shape, device=x.device, dtype=dtype) # Allocate the scale tensor in either row- or column-major format. if column_major_scales: diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index d056d3404385a..92ee8c498e01f 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -115,6 +115,12 @@ kFp8DynamicTokenSym = QuantKey(FP8_DTYPE, kDynamicTokenScale, symmetric=True) kNvfp4GroupScale = ScaleDesc(FP8_DTYPE, False, GroupShape(1, 16)) kNvfp4Quant = QuantKey(FP4_DTYPE, scale=kNvfp4GroupScale, scale2=kStaticTensorScale) +kDynamic128Scale = ScaleDesc(torch.float32, False, GroupShape(1, 128)) +kFp8Dynamic128Sym = QuantKey(FP8_DTYPE, kDynamic128Scale, symmetric=True) + +kDynamic64Scale = ScaleDesc(torch.float32, False, GroupShape(1, 64)) +kFp8Dynamic64Sym = QuantKey(FP8_DTYPE, kDynamic64Scale, symmetric=True) + # Normalize the group_shape to the full extent for any dims that are -1 def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape): diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index b25c1e3e1ece3..8545108a02666 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -381,6 +381,22 @@ def should_use_deepgemm_for_fp8_linear( ) +def should_use_deepgemm_for_fp8_linear_for_nk( + output_dtype: torch.dtype, + shape0: int, + shape1: int, + supports_deep_gemm: bool | None = None, +): + if supports_deep_gemm is None: + supports_deep_gemm = is_deep_gemm_supported() + return ( + supports_deep_gemm + and output_dtype == torch.bfloat16 + and shape0 % 128 == 0 + and shape1 % 128 == 0 + ) + + __all__ = [ "calc_diff", "fp8_gemm_nt", @@ -394,6 +410,7 @@ __all__ = [ "is_deep_gemm_supported", "get_num_sms", "should_use_deepgemm_for_fp8_linear", + "should_use_deepgemm_for_fp8_linear_for_nk", "get_col_major_tma_aligned_tensor", "get_mk_alignment_for_contiguous_layout", ] From 444f0e3f339caba85f84c6628e1df50605b241a0 Mon Sep 17 00:00:00 2001 From: daniel-salib Date: Sun, 7 Dec 2025 18:02:52 -0800 Subject: [PATCH 055/133] [Frontend] Add MCP type support infrastructure to Responses API (#30054) Signed-off-by: Daniel Salib --- .../openai/parser/test_harmony_utils.py | 185 +++++++++++++++++- .../openai/parser/harmony_utils.py | 163 +++++++++++---- vllm/entrypoints/openai/protocol.py | 8 + 3 files changed, 309 insertions(+), 47 deletions(-) diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index ae6f558f22b03..a3fd80938de6a 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem +from openai.types.responses.response_output_item import McpCall from openai_harmony import Author, Message, Role, TextContent from vllm.entrypoints.openai.parser.harmony_utils import ( @@ -400,17 +401,19 @@ class TestParseOutputMessage: assert output_items[0].arguments == '{"location": "San Francisco"}' assert output_items[1].arguments == '{"location": "New York"}' - def test_commentary_with_unknown_recipient_raises_error(self): - """Test that commentary with unknown recipient raises ValueError.""" - message = Message.from_role_and_content(Role.ASSISTANT, "some content") + def test_commentary_with_unknown_recipient_creates_mcp_call(self): + """Test that commentary with unknown recipient creates MCP call.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') message = message.with_channel("commentary") - message = message.with_recipient("unknown_recipient") + message = message.with_recipient("custom_tool") - try: - parse_output_message(message) - raise AssertionError("Expected ValueError to be raised") - except ValueError as e: - assert "Unknown recipient: unknown_recipient" in str(e) + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].type == "mcp_call" + assert output_items[0].name == "custom_tool" + assert output_items[0].server_label == "custom_tool" def test_analysis_channel_creates_reasoning(self): """Test that analysis channel creates reasoning items.""" @@ -451,3 +454,167 @@ def test_has_custom_tools() -> None: assert has_custom_tools( {"web_search_preview", "code_interpreter", "container", "others"} ) + + +def test_parse_mcp_call_basic() -> None: + """Test that MCP calls are parsed with correct type and server_label.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"path": "/tmp"}') + message = message.with_recipient("filesystem") + message = message.with_channel("commentary") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].type == "mcp_call" + assert output_items[0].name == "filesystem" + assert output_items[0].server_label == "filesystem" + assert output_items[0].arguments == '{"path": "/tmp"}' + assert output_items[0].status == "completed" + + +def test_parse_mcp_call_dotted_recipient() -> None: + """Test that dotted recipients extract the tool name correctly.""" + message = Message.from_role_and_content(Role.ASSISTANT, '{"cmd": "ls"}') + message = message.with_recipient("repo_browser.list") + message = message.with_channel("commentary") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], McpCall) + assert output_items[0].name == "list" + assert output_items[0].server_label == "repo_browser" + + +def test_mcp_vs_function_call() -> None: + """Test that function calls are not parsed as MCP calls.""" + func_message = Message.from_role_and_content(Role.ASSISTANT, '{"arg": "value"}') + func_message = func_message.with_recipient("functions.my_tool") + func_message = func_message.with_channel("commentary") + + func_items = parse_output_message(func_message) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + + +def test_mcp_vs_builtin_tools() -> None: + """Test that built-in tools (python, container) are not parsed as MCP calls.""" + # Test python (built-in tool) - should be reasoning, not MCP + python_message = Message.from_role_and_content(Role.ASSISTANT, "print('hello')") + python_message = python_message.with_recipient("python") + python_message = python_message.with_channel("commentary") + + python_items = parse_output_message(python_message) + + assert len(python_items) == 1 + assert not isinstance(python_items[0], McpCall) + assert python_items[0].type == "reasoning" + + +def test_parse_remaining_state_commentary_channel() -> None: + """Test parse_remaining_state with commentary channel and various recipients.""" + from unittest.mock import Mock + + from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state + + # Test 1: functions.* recipient → should return function tool call + parser_func = Mock() + parser_func.current_content = '{"arg": "value"}' + parser_func.current_role = Role.ASSISTANT + parser_func.current_channel = "commentary" + parser_func.current_recipient = "functions.my_tool" + + func_items = parse_remaining_state(parser_func) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + assert func_items[0].name == "my_tool" + assert func_items[0].status == "in_progress" + + # Test 2: MCP tool (not builtin) → should return MCP call + parser_mcp = Mock() + parser_mcp.current_content = '{"path": "/tmp"}' + parser_mcp.current_role = Role.ASSISTANT + parser_mcp.current_channel = "commentary" + parser_mcp.current_recipient = "filesystem" + + mcp_items = parse_remaining_state(parser_mcp) + + assert len(mcp_items) == 1 + assert isinstance(mcp_items[0], McpCall) + assert mcp_items[0].type == "mcp_call" + assert mcp_items[0].name == "filesystem" + assert mcp_items[0].server_label == "filesystem" + assert mcp_items[0].status == "in_progress" + + # Test 3: Built-in tool (python) + # should NOT return MCP call, falls through to reasoning + parser_builtin = Mock() + parser_builtin.current_content = "print('hello')" + parser_builtin.current_role = Role.ASSISTANT + parser_builtin.current_channel = "commentary" + parser_builtin.current_recipient = "python" + + builtin_items = parse_remaining_state(parser_builtin) + + # Should fall through to reasoning logic + assert len(builtin_items) == 1 + assert not isinstance(builtin_items[0], McpCall) + assert builtin_items[0].type == "reasoning" + + +def test_parse_remaining_state_analysis_channel() -> None: + """Test parse_remaining_state with analysis channel and various recipients.""" + from unittest.mock import Mock + + from vllm.entrypoints.openai.parser.harmony_utils import parse_remaining_state + + # Test 1: functions.* recipient → should return function tool call + parser_func = Mock() + parser_func.current_content = '{"arg": "value"}' + parser_func.current_role = Role.ASSISTANT + parser_func.current_channel = "analysis" + parser_func.current_recipient = "functions.my_tool" + + func_items = parse_remaining_state(parser_func) + + assert len(func_items) == 1 + assert not isinstance(func_items[0], McpCall) + assert func_items[0].type == "function_call" + assert func_items[0].name == "my_tool" + assert func_items[0].status == "in_progress" + + # Test 2: MCP tool (not builtin) → should return MCP call + parser_mcp = Mock() + parser_mcp.current_content = '{"query": "test"}' + parser_mcp.current_role = Role.ASSISTANT + parser_mcp.current_channel = "analysis" + parser_mcp.current_recipient = "database" + + mcp_items = parse_remaining_state(parser_mcp) + + assert len(mcp_items) == 1 + assert isinstance(mcp_items[0], McpCall) + assert mcp_items[0].type == "mcp_call" + assert mcp_items[0].name == "database" + assert mcp_items[0].server_label == "database" + assert mcp_items[0].status == "in_progress" + + # Test 3: Built-in tool (container) + # should NOT return MCP call, falls through to reasoning + parser_builtin = Mock() + parser_builtin.current_content = "docker run" + parser_builtin.current_role = Role.ASSISTANT + parser_builtin.current_channel = "analysis" + parser_builtin.current_recipient = "container" + + builtin_items = parse_remaining_state(parser_builtin) + + # Should fall through to reasoning logic + assert len(builtin_items) == 1 + assert not isinstance(builtin_items[0], McpCall) + assert builtin_items[0].type == "reasoning" diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 7da0914ce3d3e..2260e9604c3ed 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -19,6 +19,7 @@ from openai.types.responses.response_function_web_search import ( ActionSearch, ResponseFunctionWebSearch, ) +from openai.types.responses.response_output_item import McpCall from openai.types.responses.response_reasoning_item import ( Content as ResponseReasoningTextContent, ) @@ -155,11 +156,7 @@ def get_developer_message( "web_search_preview", "code_interpreter", "container", - "mcp", ): - # These are built-in tools that are added to the system message. - # Adding in MCP for now until we support MCP tools executed - # server side pass elif tool.type == "function": @@ -427,6 +424,44 @@ def _parse_final_message(message: Message) -> ResponseOutputItem: ) +def _parse_mcp_recipient(recipient: str) -> tuple[str, str]: + """ + Parse MCP recipient into (server_label, tool_name). + + For dotted recipients like "repo_browser.list": + - server_label: "repo_browser" (namespace/server) + - tool_name: "list" (specific tool) + + For simple recipients like "filesystem": + - server_label: "filesystem" + - tool_name: "filesystem" + """ + if "." in recipient: + server_label = recipient.split(".")[0] + tool_name = recipient.split(".")[-1] + else: + server_label = recipient + tool_name = recipient + return server_label, tool_name + + +def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem]: + """Parse MCP calls into MCP call items.""" + server_label, tool_name = _parse_mcp_recipient(recipient) + output_items = [] + for content in message.content: + response_item = McpCall( + arguments=content.text, + type="mcp_call", + name=tool_name, + server_label=server_label, + id=f"mcp_{random_uuid()}", + status="completed", + ) + output_items.append(response_item) + return output_items + + def parse_output_message(message: Message) -> list[ResponseOutputItem]: """ Parse a Harmony message into a list of output response items. @@ -440,33 +475,34 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]: output_items: list[ResponseOutputItem] = [] recipient = message.recipient - # Browser tool calls - if recipient is not None and recipient.startswith("browser."): - output_items.append(_parse_browser_tool_call(message, recipient)) + if recipient is not None: + # Browser tool calls + if recipient.startswith("browser."): + output_items.append(_parse_browser_tool_call(message, recipient)) - # Analysis channel (reasoning/chain-of-thought) + # Function calls (should only happen on commentary channel) + elif message.channel == "commentary" and recipient.startswith("functions."): + output_items.extend(_parse_function_call(message, recipient)) + + # Built-in tools are treated as reasoning + elif recipient.startswith(("python", "browser", "container")): + # Built-in tool recipients (python/browser/container) + # generate reasoning output + output_items.extend(_parse_reasoning_content(message)) + + # All other recipients are MCP calls + else: + output_items.extend(_parse_mcp_call(message, recipient)) + + # No recipient - handle based on channel for non-tool messages elif message.channel == "analysis": output_items.extend(_parse_reasoning_content(message)) - # Commentary channel elif message.channel == "commentary": - # Function calls - if recipient is not None and recipient.startswith("functions."): - output_items.extend(_parse_function_call(message, recipient)) + # Per Harmony format, commentary channel can contain preambles to calling + # multiple functions - explanatory text with no recipient + output_items.extend(_parse_reasoning_content(message)) - # Built-in tools on commentary channel are treated as reasoning for now - elif ( - recipient is None # Preambles: explanatory text before tool calls - or recipient.startswith(("python", "browser", "container")) - ): - # Per Harmony format, commentary channel can contain preambles to calling - # multiple functions - explanatory text with no recipient. Built-in tool - # recipients (python/browser/container) also generate reasoning output. - output_items.extend(_parse_reasoning_content(message)) - else: - raise ValueError(f"Unknown recipient: {recipient}") - - # Final output message elif message.channel == "final": output_items.append(_parse_final_message(message)) @@ -485,20 +521,70 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]: if current_recipient is not None and current_recipient.startswith("browser."): return [] - if parser.current_channel == "analysis": - reasoning_item = ResponseReasoningItem( - id=f"rs_{random_uuid()}", - summary=[], - type="reasoning", - content=[ - ResponseReasoningTextContent( - text=parser.current_content, type="reasoning_text" + if current_recipient and parser.current_channel in ("commentary", "analysis"): + if current_recipient.startswith("functions."): + rid = random_uuid() + return [ + ResponseFunctionToolCall( + arguments=parser.current_content, + call_id=f"call_{rid}", + type="function_call", + name=current_recipient.split(".")[-1], + id=f"fc_{rid}", + status="in_progress", ) - ], - status=None, - ) - return [reasoning_item] - elif parser.current_channel == "final": + ] + # Built-in tools (python, browser, container) should be treated as reasoning + elif not ( + current_recipient.startswith("python") + or current_recipient.startswith("browser") + or current_recipient.startswith("container") + ): + # All other recipients are MCP calls + rid = random_uuid() + server_label, tool_name = _parse_mcp_recipient(current_recipient) + return [ + McpCall( + arguments=parser.current_content, + type="mcp_call", + name=tool_name, + server_label=server_label, + id=f"mcp_{rid}", + status="in_progress", + ) + ] + + if parser.current_channel == "commentary": + return [ + ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=parser.current_content, type="reasoning_text" + ) + ], + status=None, + ) + ] + + if parser.current_channel == "analysis": + return [ + ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=parser.current_content, type="reasoning_text" + ) + ], + status=None, + ) + ] + + if parser.current_channel == "final": output_text = ResponseOutputText( text=parser.current_content, annotations=[], # TODO @@ -515,6 +601,7 @@ def parse_remaining_state(parser: StreamableParser) -> list[ResponseOutputItem]: type="message", ) return [text_item] + return [] diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2d34a6a0cd5ad..aeff6bded7f00 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -25,6 +25,10 @@ from openai.types.responses import ( ResponseContentPartDoneEvent, ResponseFunctionToolCall, ResponseInputItemParam, + ResponseMcpCallArgumentsDeltaEvent, + ResponseMcpCallArgumentsDoneEvent, + ResponseMcpCallCompletedEvent, + ResponseMcpCallInProgressEvent, ResponseOutputItem, ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent, @@ -1790,6 +1794,10 @@ StreamingResponsesResponse: TypeAlias = ( | ResponseCodeInterpreterCallCodeDoneEvent | ResponseCodeInterpreterCallInterpretingEvent | ResponseCodeInterpreterCallCompletedEvent + | ResponseMcpCallArgumentsDeltaEvent + | ResponseMcpCallArgumentsDoneEvent + | ResponseMcpCallInProgressEvent + | ResponseMcpCallCompletedEvent ) From 735284ed865e0c863cbd084527f0ae77a560fac1 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Sun, 7 Dec 2025 18:04:03 -0800 Subject: [PATCH 056/133] [responsesAPI][7] Browser, Container MCP tools for non harmony models (#29989) Signed-off-by: Andrew Xia Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia Co-authored-by: Cyrus Leung --- vllm/entrypoints/context.py | 89 ++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index a484a437c853e..01ddab473723b 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -278,12 +278,14 @@ class ParsableContext(ConversationContext): def need_builtin_tool_call(self) -> bool: """Return true if the last message is a MCP tool call""" last_message = self.parser.response_messages[-1] - # TODO: figure out which tools are MCP tools - if ( # noqa: SIM103 - last_message.type == "function_call" - and last_message.name in ("code_interpreter", "python") - ): - return True + # TODO(qandrew): figure out which tools are MCP tools + if last_message.type == "function_call": # noqa: SIM102 + if last_message.name in ( + "code_interpreter", + "python", + "web_search_preview", + ) or last_message.name.startswith("container"): + return True return False @@ -310,12 +312,87 @@ class ParsableContext(ConversationContext): return [message] + async def call_search_tool( + self, tool_session: Union["ClientSession", Tool], last_msg: FunctionCall + ) -> list[ResponseInputOutputItem]: + self.called_tools.add("browser") + if isinstance(tool_session, Tool): + return await tool_session.get_result_parsable_context(self) + if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: + try: + args = json.loads(last_msg.arguments) + except json.JSONDecodeError as e: + return _create_json_parse_error_messages(last_msg, e) + else: + args = json.loads(last_msg.arguments) + result = await tool_session.call_tool("search", args) + result_str = result.content[0].text + + message = ResponseFunctionToolCallOutputItem( + id=f"fco_{random_uuid()}", + type="function_call_output", + call_id=f"call_{random_uuid()}", + output=result_str, + status="completed", + ) + + return [message] + + async def call_container_tool( + self, tool_session: Union["ClientSession", Tool], last_msg: Message + ) -> list[Message]: + """ + Call container tool. Expect this to be run in a stateful docker + with command line terminal. + The official container tool would at least + expect the following format: + - for tool name: exec + - args: + { + "cmd":List[str] "command to execute", + "workdir":optional[str] "current working directory", + "env":optional[object/dict] "environment variables", + "session_name":optional[str] "session name", + "timeout":optional[int] "timeout in seconds", + "user":optional[str] "user name", + } + """ + self.called_tools.add("container") + if isinstance(tool_session, Tool): + return await tool_session.get_result_parsable_context(self) + # tool_name = last_msg.recipient.split(".")[1].split(" ")[0] + if envs.VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: + try: + args = json.loads(last_msg.arguments) + except json.JSONDecodeError as e: + return _create_json_parse_error_messages(last_msg, e) + else: + args = json.loads(last_msg.arguments) + result = await tool_session.call_tool("exec", args) + result_str = result.content[0].text + + message = ResponseFunctionToolCallOutputItem( + id=f"fco_{random_uuid()}", + type="function_call_output", + call_id=f"call_{random_uuid()}", + output=result_str, + status="completed", + ) + + return [message] + async def call_tool(self) -> list[ResponseInputOutputItem]: if not self.parser.response_messages: return [] last_msg = self.parser.response_messages[-1] if last_msg.name == "code_interpreter": return await self.call_python_tool(self._tool_sessions["python"], last_msg) + elif last_msg.name == "web_search_preview": + return await self.call_search_tool(self._tool_sessions["browser"], last_msg) + elif last_msg.name.startswith("container"): + return await self.call_container_tool( + self._tool_sessions["container"], last_msg + ) return [] def render_for_completion(self): From 344b50d5258d7cf3f136416e1dbcd9b5ee99bb00 Mon Sep 17 00:00:00 2001 From: Zhijian Jiang Date: Sun, 7 Dec 2025 19:26:25 -0800 Subject: [PATCH 057/133] Address comment to mergify.yml in #30117 (#30219) Signed-off-by: Zhijian Jiang --- .github/mergify.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 5cb9fcdf9f86a..58a5d77867c95 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -172,7 +172,7 @@ pull_request_rules: - files~=^tests/entrypoints/test_context.py - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py - - files~=^vllm/entrypoints/harmony_utils.py + - files~=^vllm/entrypoints/openai/parser/harmony_utils.py - files~=^vllm/entrypoints/tool_server.py - files~=^vllm/entrypoints/tool.py - files~=^vllm/entrypoints/context.py @@ -390,4 +390,4 @@ pull_request_rules: actions: label: add: - - kv-connector \ No newline at end of file + - kv-connector From d726a7b0ed0b58b7703709dc3e486c5fe2f55db3 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 7 Dec 2025 20:21:05 -0800 Subject: [PATCH 058/133] [BugFix] Unblock use of LoRA with data parallel mode (#30220) Signed-off-by: Nick Hill --- vllm/v1/metrics/loggers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6961e15c2d0c5..882e0ce0b2e03 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -952,7 +952,10 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.gauge_lora_info: Gauge | None = None if vllm_config.lora_config is not None: if len(self.engine_indexes) > 1: - raise NotImplementedError("LoRA in DP mode is not supported yet.") + logger.warning( + "vllm:lora_requests_info prometheus metrics may be " + "incorrect/misleading with data parallel deployments." + ) self.labelname_max_lora = "max_lora" self.labelname_waiting_lora_adapters = "waiting_lora_adapters" self.labelname_running_lora_adapters = "running_lora_adapters" From c6df05ebb49901510890aea7c7411a17a38c0a8f Mon Sep 17 00:00:00 2001 From: Zhiwei <532707544@qq.com> Date: Mon, 8 Dec 2025 13:23:46 +0800 Subject: [PATCH 059/133] [ROCm] [Fused Moe EP] Use binary expert mask for aiter fused moe kernel (#29773) Signed-off-by: ZhiweiYan-96 --- vllm/model_executor/layers/fused_moe/layer.py | 4 ++++ vllm/model_executor/layers/quantization/quark/quark_moe.py | 1 + 2 files changed, 5 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 6001b6d83c398..9b4d77a060c29 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -520,6 +520,10 @@ class FusedMoE(CustomOp): self._init_aiter_shared_experts_topK_buffer( vllm_config=vllm_config, dp_size=dp_size_ ) + if self.use_ep and self.rocm_aiter_fmoe_enabled: + assert self.expert_mask is None or torch.all( + (expert_mask == 0) | (expert_mask == 1) + ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." assert intermediate_size % self.tp_size == 0 self.hidden_size = hidden_size diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 8be0299eaa66f..9e2b2134310fc 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -633,6 +633,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): topk_ids=topk_ids, activation=activation, quant_config=self.moe_quant_config, + expert_map=expert_map, ) else: from vllm.model_executor.layers.fused_moe import fused_experts From d143271234454026454c5ee6a55fc516dd298dac Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Mon, 8 Dec 2025 14:43:47 +0800 Subject: [PATCH 060/133] [Bugfix] fix fuse_allreduce_rms when tp =1 (#30178) Signed-off-by: zjy0516 --- vllm/compilation/collective_fusion.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 2717738dd7c29..57bd94c7e8ad6 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -1076,11 +1076,15 @@ class AllReduceFusionPass(VllmPatternMatcherPass): self.disabled = True self.tp_size = get_tensor_model_parallel_world_size() if self.tp_size <= 1: + logger.warning_once("AllReduce fusion pass is disabled for tp_size <= 1.") return self.patterns: PatternMatcherPass = PatternMatcherPass( pass_name="all_reduce_fusion_pass" ) if config.model_config is None: + logger.warning_once( + "AllReduce fusion pass is disabled for missing model_config." + ) return self.hidden_dim = config.model_config.get_hidden_size() self.group = get_tp_group().device_group @@ -1188,6 +1192,9 @@ class AllReduceFusionPass(VllmPatternMatcherPass): self.disabled = False def is_applicable_for_range(self, compile_range: Range) -> bool: + if self.disabled: + logger.warning_once("AllReduce fusion pass is disabled.") + return False return compile_range.end <= self.max_token_num @VllmInductorPass.time_and_log From cd00c443d2272e5325dc6d730616f0a68ad533d1 Mon Sep 17 00:00:00 2001 From: Zhiyu Date: Sun, 7 Dec 2025 23:05:27 -0800 Subject: [PATCH 061/133] [Misc] Rename TensorRT Model Optimizer to Model Optimizer (#30091) Signed-off-by: Zhiyu Cheng --- docs/features/quantization/README.md | 2 +- docs/features/quantization/modelopt.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 7b5287bad3bb8..8b4dcf01969ae 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -14,7 +14,7 @@ Contents: - [INT4 W4A16](int4.md) - [INT8 W8A8](int8.md) - [FP8 W8A8](fp8.md) -- [NVIDIA TensorRT Model Optimizer](modelopt.md) +- [NVIDIA Model Optimizer](modelopt.md) - [AMD Quark](quark.md) - [Quantized KV Cache](quantized_kvcache.md) - [TorchAO](torchao.md) diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index c48ccb719a79d..b02d5ba9e89a2 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -1,6 +1,6 @@ -# NVIDIA TensorRT Model Optimizer +# NVIDIA Model Optimizer -The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models. +The [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models. We recommend installing the library with: @@ -10,7 +10,7 @@ pip install nvidia-modelopt ## Quantizing HuggingFace Models with PTQ -You can quantize HuggingFace models using the example scripts provided in the TensorRT Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory. +You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory. Below is an example showing how to quantize a model using modelopt's PTQ API: From bcb6f5947f8acac22f5d1d5fc92a91b06fd57e77 Mon Sep 17 00:00:00 2001 From: Dazhi Jiang Date: Mon, 8 Dec 2025 15:12:42 +0800 Subject: [PATCH 062/133] [Perf] Remove sync point in vit torch sdpa attn backend (#30232) Signed-off-by: Dazhi Jiang --- vllm/attention/ops/vit_attn_wrappers.py | 12 ++++++------ vllm/model_executor/models/ernie45_vl.py | 12 ++++++------ vllm/model_executor/models/glm4_1v.py | 12 ++++++------ vllm/model_executor/models/qwen2_vl.py | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index d9f15f1e42858..9036c2b801949 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -93,12 +93,12 @@ def torch_sdpa_wrapper( cu_seqlens: torch.Tensor, ) -> torch.Tensor: outputs = [] - for i in range(1, len(cu_seqlens)): - start_idx = cu_seqlens[i - 1] - end_idx = cu_seqlens[i] - q_i = q[:, start_idx:end_idx] - k_i = k[:, start_idx:end_idx] - v_i = v[:, start_idx:end_idx] + + lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + q_chunks = torch.split(q, lens, dim=1) + k_chunks = torch.split(k, lens, dim=1) + v_chunks = torch.split(v, lens, dim=1) + for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks): q_i, k_i, v_i = ( einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] ) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 3305b6a0e58f0..053d260cc09b2 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -289,12 +289,12 @@ class Ernie4_5_VisionAttention(nn.Module): elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. outputs = [] - for i in range(1, len(cu_seqlens)): - start_idx = cu_seqlens[i - 1] - end_idx = cu_seqlens[i] - q_i = q[:, start_idx:end_idx] - k_i = k[:, start_idx:end_idx] - v_i = v[:, start_idx:end_idx] + + lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + q_chunks = torch.split(q, lens, dim=1) + k_chunks = torch.split(k, lens, dim=1) + v_chunks = torch.split(v, lens, dim=1) + for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks): q_i, k_i, v_i = ( rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] ) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 39a837b789bed..741edfdda3e2c 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -377,12 +377,12 @@ class Glm4vVisionAttention(nn.Module): elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. outputs = [] - for i in range(1, len(cu_seqlens)): - start_idx = cu_seqlens[i - 1] - end_idx = cu_seqlens[i] - q_i = q[:, start_idx:end_idx] - k_i = k[:, start_idx:end_idx] - v_i = v[:, start_idx:end_idx] + + lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + q_chunks = torch.split(q, lens, dim=1) + k_chunks = torch.split(k, lens, dim=1) + v_chunks = torch.split(v, lens, dim=1) + for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks): q_i, k_i, v_i = ( rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 885e172d1e81a..608e90337f452 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -424,12 +424,12 @@ class Qwen2VisionAttention(nn.Module): k = k.contiguous() v = v.contiguous() outputs = [] - for i in range(1, len(cu_seqlens)): - start_idx = cu_seqlens[i - 1] - end_idx = cu_seqlens[i] - q_i = q[:, start_idx:end_idx] - k_i = k[:, start_idx:end_idx] - v_i = v[:, start_idx:end_idx] + + lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + q_chunks = torch.split(q, lens, dim=1) + k_chunks = torch.split(k, lens, dim=1) + v_chunks = torch.split(v, lens, dim=1) + for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks): q_i, k_i, v_i = ( rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] ) From 9e77ffca3f41e0e73879098f1686a4c82b8619d9 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 8 Dec 2025 16:10:09 +0800 Subject: [PATCH 063/133] [Model][7/N] Improve all pooling task | Deprecation as_reward_model. Extract hidden states prefer using new multi-vector retrieval API (#26686) Signed-off-by: wang.yuqi --- docs/models/pooling_models.md | 10 ++- docs/models/supported_models.md | 9 +-- .../pooling/token_embed/jina_embeddings_v4.py | 71 +++++++++++++++++++ tests/models/test_registry.py | 2 - tests/test_config.py | 2 +- vllm/config/model.py | 10 ++- vllm/model_executor/model_loader/utils.py | 4 -- vllm/model_executor/models/adapters.py | 38 ---------- 8 files changed, 88 insertions(+), 58 deletions(-) create mode 100644 examples/pooling/token_embed/jina_embeddings_v4.py diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index e2d427e8a4590..32ffcf96fabef 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -33,8 +33,8 @@ shown in the table below. | Architecture | `--convert` | Supported pooling tasks | |-------------------------------------------------|-------------|---------------------------------------| | `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | +| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` | | `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | -| `*ForRewardModeling`, `*RewardModel` | `reward` | `token_classify` | !!! tip You can explicitly set `--convert ` to specify how to convert the model. @@ -70,7 +70,6 @@ the pooler assigned to each task has the following attributes by default: | Task | Pooling Type | Normalization | Softmax | |------------|--------------|---------------|---------| -| `reward` | `ALL` | ❌ | ❌ | | `embed` | `LAST` | ✅︎ | ❌ | | `classify` | `LAST` | ❌ | ✅︎ | @@ -318,3 +317,10 @@ We have split the `encode` task into two more specific token-wise tasks: `token_ ### Remove softmax from PoolingParams We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function. + +### as_reward_model + +Pooling models now default support all pooling, you can use it without any settings. + +- Extracting hidden states prefers using `token_embed` task. +- Reward models prefers using `token_classify` task. diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ec3ba4474c192..d0166060c267a 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -581,16 +581,9 @@ These models primarily support the [`LLM.reward`](./pooling_models.md#llmreward) | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|-------------------|----------------------|---------------------------| | `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | -| `LlamaForCausalLM`C | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | +| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | | `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | | `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | - -C Automatically converted into a reward model via `--convert reward`. ([details](./pooling_models.md#model-conversion)) -\* Feature support is the same as that of the original model. - -If your model is not in the above list, we will try to automatically convert the model using -[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly. !!! important For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, diff --git a/examples/pooling/token_embed/jina_embeddings_v4.py b/examples/pooling/token_embed/jina_embeddings_v4.py new file mode 100644 index 0000000000000..83d4c446d426c --- /dev/null +++ b/examples/pooling/token_embed/jina_embeddings_v4.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm import LLM +from vllm.inputs.data import TextPrompt +from vllm.multimodal.utils import fetch_image + +# Initialize model +model = LLM( + model="jinaai/jina-embeddings-v4-vllm-text-matching", + runner="pooling", + max_model_len=1024, + gpu_memory_utilization=0.8, +) + +# Create text prompts +text1 = "Ein wunderschöner Sonnenuntergang am Strand" +text1_prompt = TextPrompt(prompt=f"Query: {text1}") + +text2 = "浜辺に沈む美しい夕日" +text2_prompt = TextPrompt(prompt=f"Query: {text2}") + +# Create image prompt +image = fetch_image( + "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg" # noqa: E501 +) +image_prompt = TextPrompt( + prompt="<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n", # noqa: E501 + multi_modal_data={"image": image}, +) + +# Encode all prompts +prompts = [text1_prompt, text2_prompt, image_prompt] +outputs = model.encode(prompts, pooling_task="token_embed") + + +def get_embeddings(outputs): + VISION_START_TOKEN_ID, VISION_END_TOKEN_ID = 151652, 151653 + + embeddings = [] + for output in outputs: + if VISION_START_TOKEN_ID in output.prompt_token_ids: + # Gather only vision tokens + img_start_pos = torch.where( + torch.tensor(output.prompt_token_ids) == VISION_START_TOKEN_ID + )[0][0] + img_end_pos = torch.where( + torch.tensor(output.prompt_token_ids) == VISION_END_TOKEN_ID + )[0][0] + embeddings_tensor = output.outputs.data.detach().clone()[ + img_start_pos : img_end_pos + 1 + ] + else: + # Use all tokens for text-only prompts + embeddings_tensor = output.outputs.data.detach().clone() + + # Pool and normalize embeddings + pooled_output = ( + embeddings_tensor.sum(dim=0, dtype=torch.float32) + / embeddings_tensor.shape[0] + ) + embeddings.append(torch.nn.functional.normalize(pooled_output, dim=-1)) + return embeddings + + +embeddings = get_embeddings(outputs) + +for embedding in embeddings: + print(embedding.shape) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 9017a0fd91407..a089696e10ffc 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -13,7 +13,6 @@ from vllm.model_executor.models import ( ) from vllm.model_executor.models.adapters import ( as_embedding_model, - as_reward_model, as_seq_cls_model, ) from vllm.model_executor.models.registry import ( @@ -46,7 +45,6 @@ def test_registry_imports(model_arch): # All vLLM models should be convertible to a pooling model assert is_pooling_model(as_seq_cls_model(model_cls)) assert is_pooling_model(as_embedding_model(model_cls)) - assert is_pooling_model(as_reward_model(model_cls)) if model_arch in _MULTIMODAL_MODELS: assert supports_multimodal(model_cls) diff --git a/tests/test_config.py b/tests/test_config.py index 203447cd531fb..77d3a7115978e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -97,7 +97,7 @@ def test_update_config(): ("intfloat/multilingual-e5-small", "pooling", "none", "embed"), ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "reward"), + ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"), ("openai/whisper-small", "generate", "none", "transcription"), ], ) diff --git a/vllm/config/model.py b/vllm/config/model.py index 583904a949ea1..764bdf7000561 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -516,7 +516,11 @@ class ModelConfig: if task == "classify": return "classify" if task == "reward": - return "reward" + logger.warning( + "Pooling models now default support all pooling; " + "you can use it without any settings." + ) + return "embed" if task == "score": new_task = self._get_default_pooling_task(architectures) return "classify" if new_task == "classify" else "embed" @@ -1899,8 +1903,8 @@ _SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [ ("ForImageClassification", ("pooling", "classify")), ("ForVideoClassification", ("pooling", "classify")), ("ClassificationModel", ("pooling", "classify")), - ("ForRewardModeling", ("pooling", "reward")), - ("RewardModel", ("pooling", "reward")), + ("ForRewardModeling", ("pooling", "embed")), + ("RewardModel", ("pooling", "embed")), # Let other `*Model`s take priority ("Model", ("pooling", "embed")), ] diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index eeb2444150eef..74b02e4c62583 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -167,7 +167,6 @@ _MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]() def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]: from vllm.model_executor.models.adapters import ( as_embedding_model, - as_reward_model, as_seq_cls_model, try_create_mm_pooling_model_cls, ) @@ -207,9 +206,6 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], elif convert_type == "classify": logger.debug_once("Converting to sequence classification model.") model_cls = as_seq_cls_model(model_cls) - elif convert_type == "reward": - logger.debug_once("Converting to reward model.") - model_cls = as_reward_model(model_cls) else: assert_never(convert_type) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 007d847ac3b7b..70f203b9f7c64 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -346,44 +346,6 @@ def as_seq_cls_model(cls: _T) -> _T: return ModelForSequenceClassification # type: ignore -def as_reward_model(cls: _T) -> _T: - """ - Subclass an existing vLLM model to support reward modeling. - - By default, we return the hidden states of each token directly. - - Note: - We assume that no extra layers are added to the original model; - please implement your own model if this is not the case. - """ - # Avoid modifying existing reward models - if is_pooling_model(cls): - return cls - - # Lazy import - from vllm.model_executor.layers.pooler import DispatchPooler, Pooler - - from .interfaces_base import default_pooling_type - - @default_pooling_type("ALL") - class ModelForReward(_create_pooling_model_cls(cls)): - def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): - pooler_config = vllm_config.model_config.pooler_config - assert pooler_config is not None - - self.pooler = DispatchPooler( - { - "token_classify": Pooler.for_token_classify( - pooler_config=pooler_config - ) - } - ) - - ModelForReward.__name__ = _get_pooling_model_name(cls.__name__, "ForReward") - - return ModelForReward # type: ignore - - class SequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: From 408cf42f67dbcd50027fcd0f6ba35df83ced9107 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Mon, 8 Dec 2025 18:29:14 +0800 Subject: [PATCH 064/133] [CI] Prevents triggering of an inactive issue/PR check for forked repository. (#29654) Signed-off-by: Shiming Zhang --- .github/workflows/stale.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index dca3089f496c9..c8a52f1a63269 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -7,6 +7,8 @@ on: jobs: close-issues-and-pull-requests: + # Prevents triggering on forks or other repos + if: github.repository == 'vllm-project/vllm' permissions: issues: write pull-requests: write From 2e660c24349944ac78abcf2c35d17e3caf6cdd08 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 8 Dec 2025 20:01:21 +0800 Subject: [PATCH 065/133] [Frontend] Binary embedding response does not return metadata by setting encoding_format to bytes_only. (#30249) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../embed/embedding_requests_bytes_client.py | 37 +++++++++++- .../entrypoints/pooling/embed/test_online.py | 50 +++++++++++++++++ .../pooling/pooling/test_online.py | 56 +++++++++++++++++++ vllm/entrypoints/pooling/embed/api_router.py | 4 +- vllm/entrypoints/pooling/embed/protocol.py | 4 +- vllm/entrypoints/pooling/embed/serving.py | 34 ++++++----- .../entrypoints/pooling/pooling/api_router.py | 4 +- vllm/entrypoints/pooling/pooling/protocol.py | 4 +- vllm/entrypoints/pooling/pooling/serving.py | 35 +++++++----- vllm/utils/serial_utils.py | 43 ++++++++++++-- 10 files changed, 230 insertions(+), 41 deletions(-) diff --git a/examples/pooling/embed/embedding_requests_bytes_client.py b/examples/pooling/embed/embedding_requests_bytes_client.py index c2832f1b54ce7..5ea4525241497 100644 --- a/examples/pooling/embed/embedding_requests_bytes_client.py +++ b/examples/pooling/embed/embedding_requests_bytes_client.py @@ -16,6 +16,7 @@ from vllm.utils.serial_utils import ( EMBED_DTYPE_TO_TORCH_DTYPE, ENDIANNESS, MetadataItem, + build_metadata_items, decode_pooling_output, ) @@ -38,6 +39,11 @@ def parse_args(): def main(args): api_url = f"http://{args.host}:{args.port}/v1/embeddings" model_name = args.model + embedding_size = 0 + + input_texts = [ + "The best thing about vLLM is that it supports many different models", + ] * 2 # The OpenAI client does not support the bytes encoding_format. # The OpenAI client does not support the embed_dtype and endianness parameters. @@ -45,7 +51,7 @@ def main(args): for endianness in ENDIANNESS: prompt = { "model": model_name, - "input": "vLLM is great!", + "input": input_texts, "encoding_format": "bytes", "embed_dtype": embed_dtype, "endianness": endianness, @@ -57,7 +63,34 @@ def main(args): embedding = decode_pooling_output(items=items, body=body) embedding = [x.to(torch.float32) for x in embedding] - embedding = torch.cat(embedding) + embedding = torch.stack(embedding) + embedding_size = embedding.shape[-1] + print(embed_dtype, endianness, embedding.shape) + + # The vllm server always sorts the returned embeddings in the order of input. So + # returning metadata is not necessary. You can set encoding_format to bytes_only + # to let the server not return metadata. + for embed_dtype in EMBED_DTYPE_TO_TORCH_DTYPE: + for endianness in ENDIANNESS: + prompt = { + "model": model_name, + "input": input_texts, + "encoding_format": "bytes_only", + "embed_dtype": embed_dtype, + "endianness": endianness, + } + response = post_http_request(prompt=prompt, api_url=api_url) + body = response.content + + items = build_metadata_items( + embed_dtype=embed_dtype, + endianness=endianness, + shape=(embedding_size,), + n_request=len(input_texts), + ) + embedding = decode_pooling_output(items=items, body=body) + embedding = [x.to(torch.float32) for x in embedding] + embedding = torch.stack(embedding) print(embed_dtype, endianness, embedding.shape) diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py index ddba1c790ba8c..f96338c47f0be 100644 --- a/tests/entrypoints/pooling/embed/test_online.py +++ b/tests/entrypoints/pooling/embed/test_online.py @@ -24,6 +24,7 @@ from vllm.utils.serial_utils import ( ENDIANNESS, MetadataItem, binary2tensor, + build_metadata_items, decode_pooling_output, ) @@ -344,6 +345,55 @@ async def test_bytes_embed_dtype_and_endianness( ) +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_bytes_only_embed_dtype_and_endianness( + server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str +): + input_texts = [ + "The best thing about vLLM is that it supports many different models", + ] * 2 + + responses_float = await client.embeddings.create( + input=input_texts, model=model_name, encoding_format="float" + ) + float_data = [d.embedding for d in responses_float.data] + embedding_size = len(float_data[0]) + + for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()): + for endianness in ENDIANNESS: + responses_bytes = requests.post( + server.url_for("/v1/embeddings"), + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "bytes_only", + "embed_dtype": embed_dtype, + "endianness": endianness, + }, + ) + + assert "metadata" not in responses_bytes.headers + body = responses_bytes.content + items = build_metadata_items( + embed_dtype=embed_dtype, + endianness=endianness, + shape=(embedding_size,), + n_request=len(input_texts), + ) + + bytes_data = decode_pooling_output(items=items, body=body) + bytes_data = [x.to(torch.float32).tolist() for x in bytes_data] + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=bytes_data, + name_0="float_data", + name_1="bytes_data", + tol=1e-2, + ) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) diff --git a/tests/entrypoints/pooling/pooling/test_online.py b/tests/entrypoints/pooling/pooling/test_online.py index cc5c2f26f80fb..33add5bdaef49 100644 --- a/tests/entrypoints/pooling/pooling/test_online.py +++ b/tests/entrypoints/pooling/pooling/test_online.py @@ -18,6 +18,7 @@ from vllm.utils.serial_utils import ( ENDIANNESS, MetadataItem, binary2tensor, + build_metadata_items, decode_pooling_output, ) @@ -352,6 +353,61 @@ async def test_bytes_embed_dtype_and_endianness( ) +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_bytes_only_embed_dtype_and_endianness( + server: RemoteOpenAIServer, model_name: str +): + input_texts = [ + "The best thing about vLLM is that it supports many different models", + ] * 2 + + url = server.url_for("pooling") + float_response = requests.post( + url, + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "float", + }, + ) + responses_float = PoolingResponse.model_validate(float_response.json()) + float_data = [np.array(d.data).squeeze(-1).tolist() for d in responses_float.data] + n_tokens = responses_float.usage.prompt_tokens // len(input_texts) + + for embed_dtype in list(EMBED_DTYPE_TO_TORCH_DTYPE.keys()): + for endianness in ENDIANNESS: + responses_bytes = requests.post( + url, + json={ + "model": model_name, + "input": input_texts, + "encoding_format": "bytes_only", + "embed_dtype": embed_dtype, + "endianness": endianness, + }, + ) + + assert "metadata" not in responses_bytes.headers + body = responses_bytes.content + items = build_metadata_items( + embed_dtype=embed_dtype, + endianness=endianness, + shape=(n_tokens, 1), + n_request=len(input_texts), + ) + bytes_data = decode_pooling_output(items=items, body=body) + bytes_data = [x.to(torch.float32).view(-1).tolist() for x in bytes_data] + + check_embeddings_close( + embeddings_0_lst=float_data, + embeddings_1_lst=bytes_data, + name_0="float_data", + name_1="bytes_data", + tol=1e-2, + ) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("param_name", ["encoding_format", "embed_dtype", "endianness"]) diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py index 5b10a32e79f81..24b0c8c2b3cf6 100644 --- a/vllm/entrypoints/pooling/embed/api_router.py +++ b/vllm/entrypoints/pooling/embed/api_router.py @@ -59,8 +59,8 @@ async def create_embedding( return JSONResponse(content=generator.model_dump()) elif isinstance(generator, EmbeddingBytesResponse): return StreamingResponse( - content=generator.body, - headers={"metadata": generator.metadata}, + content=generator.content, + headers=generator.headers, media_type=generator.media_type, ) diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py index 7eb53e14d5d8a..6a8f8c4434e55 100644 --- a/vllm/entrypoints/pooling/embed/protocol.py +++ b/vllm/entrypoints/pooling/embed/protocol.py @@ -203,6 +203,6 @@ class EmbeddingResponse(OpenAIBaseModel): class EmbeddingBytesResponse(OpenAIBaseModel): - body: list[bytes] - metadata: str + content: list[bytes] + headers: dict[str, str] | None = None media_type: str = "application/octet-stream" diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py index 868a3cb017a6b..aafc354897105 100644 --- a/vllm/entrypoints/pooling/embed/serving.py +++ b/vllm/entrypoints/pooling/embed/serving.py @@ -163,29 +163,35 @@ class EmbeddingMixin(OpenAIServing): usage=usage, ) - def encode_bytes(): - body, items, usage = encode_pooling_bytes( + def encode_bytes(bytes_only: bool) -> EmbeddingBytesResponse: + content, items, usage = encode_pooling_bytes( pooling_outputs=final_res_batch_checked, embed_dtype=embed_dtype, endianness=endianness, ) - metadata = { - "id": ctx.request_id, - "created": ctx.created_time, - "model": ctx.model_name, - "data": items, - "usage": usage, - } - return EmbeddingBytesResponse( - body=body, - metadata=json.dumps(metadata), + headers = ( + None + if bytes_only + else { + "metadata": json.dumps( + { + "id": ctx.request_id, + "created": ctx.created_time, + "model": ctx.model_name, + "data": items, + "usage": usage, + } + ) + } ) + return EmbeddingBytesResponse(content=content, headers=headers) + if encoding_format == "float" or encoding_format == "base64": return encode_float_base64() - elif encoding_format == "bytes": - return encode_bytes() + elif encoding_format == "bytes" or encoding_format == "bytes_only": + return encode_bytes(bytes_only=encoding_format == "bytes_only") else: assert_never(encoding_format) diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py index 674da94d126cf..4baaf8f30f6bb 100644 --- a/vllm/entrypoints/pooling/pooling/api_router.py +++ b/vllm/entrypoints/pooling/pooling/api_router.py @@ -55,8 +55,8 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): return JSONResponse(content=generator.model_dump()) elif isinstance(generator, PoolingBytesResponse): return StreamingResponse( - content=generator.body, - headers={"metadata": generator.metadata}, + content=generator.content, + headers=generator.headers, media_type=generator.media_type, ) diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py index 364cd93738b84..76b361b49b668 100644 --- a/vllm/entrypoints/pooling/pooling/protocol.py +++ b/vllm/entrypoints/pooling/pooling/protocol.py @@ -143,6 +143,6 @@ class PoolingResponse(OpenAIBaseModel): class PoolingBytesResponse(OpenAIBaseModel): - body: list[bytes] - metadata: str + content: list[bytes] + headers: dict[str, str] | None = None media_type: str = "application/octet-stream" diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 7fb767e26d019..57f1a6440cf76 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -314,29 +314,38 @@ class OpenAIServingPooling(OpenAIServing): usage=usage, ) - def encode_bytes(): - body, items, usage = encode_pooling_bytes( + def encode_bytes(bytes_only: bool) -> PoolingBytesResponse: + content, items, usage = encode_pooling_bytes( pooling_outputs=final_res_batch, embed_dtype=embed_dtype, endianness=endianness, ) - metadata = { - "id": request_id, - "created": created_time, - "model": model_name, - "data": items, - "usage": usage, - } + headers = ( + None + if bytes_only + else { + "metadata": json.dumps( + { + "id": request_id, + "created": created_time, + "model": model_name, + "data": items, + "usage": usage, + } + ) + } + ) + return PoolingBytesResponse( - body=body, - metadata=json.dumps(metadata), + content=content, + headers=headers, ) if encoding_format == "float" or encoding_format == "base64": return encode_float_base64() - elif encoding_format == "bytes": - return encode_bytes() + elif encoding_format == "bytes" or encoding_format == "bytes_only": + return encode_bytes(bytes_only=encoding_format == "bytes_only") else: assert_never(encoding_format) diff --git a/vllm/utils/serial_utils.py b/vllm/utils/serial_utils.py index a6d717e03d37d..07db5eaf74c8d 100644 --- a/vllm/utils/serial_utils.py +++ b/vllm/utils/serial_utils.py @@ -2,15 +2,19 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 import io +import math import sys from dataclasses import dataclass -from typing import Literal +from typing import TYPE_CHECKING, Any, Literal import numpy as np import torch from typing_extensions import assert_never -from vllm import PoolingRequestOutput +if TYPE_CHECKING: + from vllm import PoolingRequestOutput +else: + PoolingRequestOutput = Any sys_byteorder = sys.byteorder @@ -27,6 +31,14 @@ EMBED_DTYPE_TO_TORCH_DTYPE = { "fp8_e5m2": torch.float8_e5m2, } +EMBED_DTYPE_TO_N_BYTES = { + "float32": 4, + "float16": 2, + "bfloat16": 2, + "fp8_e4m3": 1, + "fp8_e5m2": 1, +} + EMBED_DTYPE_TO_TORCH_DTYPE_VIEW = { "float32": torch.float32, @@ -50,7 +62,7 @@ ENDIANNESS = ["native", "big", "little"] EmbedDType = Literal["float32", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] Endianness = Literal["native", "big", "little"] -EncodingFormat = Literal["float", "base64", "bytes"] +EncodingFormat = Literal["float", "base64", "bytes", "bytes_only"] def tensor2base64(x: torch.Tensor) -> str: @@ -114,7 +126,7 @@ def encode_pooling_output( elif encoding_format == "base64": embedding_bytes = tensor2binary(output.outputs.data, embed_dtype, endianness) return base64.b64encode(embedding_bytes).decode("utf-8") - elif encoding_format == "bytes": + elif encoding_format == "bytes" or encoding_format == "bytes_only": return tensor2binary(output.outputs.data, embed_dtype, endianness) assert_never(encoding_format) @@ -129,6 +141,29 @@ class MetadataItem: shape: tuple[int, ...] +def build_metadata_items( + embed_dtype: EmbedDType, + endianness: Endianness, + shape: tuple[int, ...], + n_request: int, +): + n_bytes = EMBED_DTYPE_TO_N_BYTES[embed_dtype] + size = math.prod(shape) + items = [ + MetadataItem( + index=i, + embed_dtype=embed_dtype, + endianness=endianness, + start=i * size * n_bytes, + end=(i + 1) * size * n_bytes, + shape=shape, + ) + for i in range(n_request) + ] + + return items + + def encode_pooling_bytes( pooling_outputs: list[PoolingRequestOutput], embed_dtype: EmbedDType, From 77072e93b327fd391482f5facadf93a804ced0cc Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 8 Dec 2025 03:06:20 -0900 Subject: [PATCH 066/133] [docs] governance documents (#24801) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: simon-mo Signed-off-by: Michael Goin Signed-off-by: Simon Mo Co-authored-by: Mark McLoughlin Co-authored-by: Russell Bryant Co-authored-by: Michael Goin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Luka Govedič Co-authored-by: Cyrus Leung --- docs/.nav.yml | 1 + docs/governance/collaboration.md | 43 ++++++++ docs/governance/committers.md | 183 +++++++++++++++++++++++++++++++ docs/governance/process.md | 125 +++++++++++++++++++++ 4 files changed, 352 insertions(+) create mode 100644 docs/governance/collaboration.md create mode 100644 docs/governance/committers.md create mode 100644 docs/governance/process.md diff --git a/docs/.nav.yml b/docs/.nav.yml index aa98ad52be215..835cc773e7599 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -59,6 +59,7 @@ nav: - CLI Reference: cli - Community: - community/* + - Governance: governance - Blog: https://blog.vllm.ai - Forum: https://discuss.vllm.ai - Slack: https://slack.vllm.ai diff --git a/docs/governance/collaboration.md b/docs/governance/collaboration.md new file mode 100644 index 0000000000000..5b3d2beffe5b9 --- /dev/null +++ b/docs/governance/collaboration.md @@ -0,0 +1,43 @@ +# Collaboration Policy + +This page outlines how vLLM collaborates with model providers, hardware vendors, and other stakeholders. + +## Adding New Major Features + +Anyone can contribute to vLLM. For major features, submit an RFC (request for comments) first. To submit an RFC, create an [issue](https://github.com/vllm-project/vllm/issues/new/choose) and select the `RFC` template. +RFCs are similar to design docs that discuss the motivation, problem solved, alternatives considered, and proposed change. + +Once you submit the RFC, please post it in the #contributors channel in vLLM Slack, and loop in area owners and committers for feedback. +For high-interest features, the committers nominate a person to help with the RFC process and PR review. This makes sure someone is guiding you through the process. It is reflected as the "assignee" field in the RFC issue. +If the assignee and lead maintainers find the feature to be contentious, the maintainer team aims to make decisions quickly after learning the details from everyone. This involves assigning a committer as the DRI (Directly Responsible Individual) to make the decision and shepherd the code contribution process. + +For features that you intend to maintain, please feel free to add yourself in [`mergify.yml`](https://github.com/vllm-project/vllm/blob/main/.github/mergify.yml) to receive notifications and auto-assignment when the PRs touching the feature you are maintaining. Over time, the ownership will be evaluated and updated through the committers nomination and voting process. + +## Adding New Models + +If you use vLLM, we recommend you making the model work with vLLM by following the [model registration](../contributing/model/registration.md) process before you release it publicly. + +The vLLM team helps with new model architectures not supported by vLLM, especially models pushing architectural frontiers. +Here's how the vLLM team works with model providers. The vLLM team includes all [committers](./committers.md) of the project. model providers can exclude certain members but shouldn't, as this may harm release timelines due to missing expertise. Contact [project leads](./process.md) if you want to collaborate. + +Once we establish the connection between the vLLM team and model provider: + +- The vLLM team learns the model architecture and relevant changes, then plans which area owners to involve and what features to include. +- The vLLM team creates a private communication channel (currently a Slack channel in the vLLM workspace) and a private fork within the vllm-project organization. The model provider team can invite others to the channel and repo. +- Third parties like compute providers, hosted inference providers, hardware vendors, and other organizations often work with both the model provider and vLLM on model releases. We establish direct communication (with permission) or three-way communication as needed. + +The vLLM team works with model providers on features, integrations, and release timelines. We work to meet release timelines, but engineering challenges like feature development, model accuracy alignment, and optimizations can cause delays. + +The vLLM maintainers will not publicly share details about model architecture, release timelines, or upcoming releases. We maintain model weights on secure servers with security measures (though we can work with security reviews and testing without certification). We delete pre-release weights or artifacts upon request. + +The vLLM team collaborates on marketing and promotional efforts for model releases. model providers can use vLLM's trademark and logo in publications and materials. + +## Adding New Hardware + +vLLM is designed as a platform for frontier model architectures and high-performance accelerators. +For new hardware, follow the [hardware plugin](../design/plugin_system.md) system to add support. +Use the platform plugin system to add hardware support. +As hardware gains popularity, we help endorse it in our documentation and marketing materials. +The vLLM GitHub organization can host hardware plugin repositories, especially for collaborative efforts among companies. + +We rarely add new hardware to vLLM directly. Instead, we make existing hardware platforms modular to keep the vLLM core hardware-agnostic. diff --git a/docs/governance/committers.md b/docs/governance/committers.md new file mode 100644 index 0000000000000..c9428027da953 --- /dev/null +++ b/docs/governance/committers.md @@ -0,0 +1,183 @@ +# Committers + +This document lists the current committers of the vLLM project and the core areas they maintain. +Committers have write access to the vLLM repository and are responsible for reviewing and merging PRs. +You can also refer to the [CODEOWNERS](https://github.com/vllm-project/vllm/blob/main/.github/CODEOWNERS) file for concrete file-level ownership and reviewers. Both this documents and the CODEOWNERS file are living documents and they complement each other. + +## Active Committers + +We try to summarize each committer's role in vLLM in a few words. In general, vLLM committers cover a wide range of areas and help each other in the maintenance process. +Please refer to the later section about Area Owners for exact component ownership details. +Sorted alphabetically by GitHub handle: + +- [@22quinn](https://github.com/22quinn): RL API +- [@aarnphm](https://github.com/aarnphm): Structured output +- [@alexm-redhat](https://github.com/alexm-redhat): Performance +- [@ApostaC](https://github.com/ApostaC): Connectors, offloading +- [@benchislett](https://github.com/benchislett): Engine core and spec decode +- [@bigPYJ1151](https://github.com/bigPYJ1151): Intel CPU/XPU integration +- [@chaunceyjiang](https://github.com/chaunceyjiang): Tool use and reasoning parser +- [@DarkLight1337](https://github.com/DarkLight1337): Multimodality, API server +- [@esmeetu](https://github.com/esmeetu): developer marketing, community +- [@gshtras](https://github.com/gshtras): AMD integration +- [@heheda12345](https://github.com/heheda12345): Hybrid memory allocator +- [@hmellor](https://github.com/hmellor): Hugging Face integration, documentation +- [@houseroad](https://github.com/houseroad): Engine core and Llama models +- [@Isotr0py](https://github.com/Isotr0py): Multimodality, new model support +- [@jeejeelee](https://github.com/jeejeelee): LoRA, new model support +- [@jikunshang](https://github.com/jikunshang): Intel CPU/XPU integration +- [@khluu](https://github.com/khluu): CI infrastructure +- [@KuntaiDu](https://github.com/KuntaiDu): KV Connector +- [@LucasWilkinson](https://github.com/LucasWilkinson): Kernels and performance +- [@luccafong](https://github.com/luccafong): Llama models, speculative decoding, distributed +- [@markmc](https://github.com/markmc): Observability +- [@mgoin](https://github.com/mgoin): Quantization and performance +- [@NickLucche](https://github.com/NickLucche): KV connector +- [@njhill](https://github.com/njhill): Distributed, API server, engine core +- [@noooop](https://github.com/noooop): Pooling models +- [@patrickvonplaten](https://github.com/patrickvonplaten): Mistral models, new model support +- [@pavanimajety](https://github.com/pavanimajety): NVIDIA GPU integration +- [@ProExpertProg](https://github.com/ProExpertProg): Compilation, startup UX +- [@robertgshaw2-redhat](https://github.com/robertgshaw2-redhat): Core, distributed, disagg +- [@ruisearch42](https://github.com/ruisearch42): Pipeline parallelism, Ray Support +- [@russellb](https://github.com/russellb): Structured output, engine core, security +- [@sighingnow](https://github.com/sighingnow): Qwen models, new model support +- [@simon-mo](https://github.com/simon-mo): Project lead, API entrypoints, community +- [@tdoublep](https://github.com/tdoublep): State space models +- [@tjtanaa](https://github.com/tjtanaa): AMD GPU integration +- [@tlrmchlsmth](https://github.com/tlrmchlsmth): Kernels and performance, distributed, disagg +- [@WoosukKwon](https://github.com/WoosukKwon): Project lead, engine core +- [@yaochengji](https://github.com/yaochengji): TPU integration +- [@yeqcharlotte](https://github.com/yeqcharlotte): Benchmark, Llama models +- [@yewentao256](https://github.com/yewentao256): Kernels and performance +- [@Yikun](https://github.com/Yikun): Pluggable hardware interface +- [@youkaichao](https://github.com/youkaichao): Project lead, distributed, compile, community +- [@ywang96](https://github.com/ywang96): Multimodality, benchmarks +- [@zhuohan123](https://github.com/zhuohan123): Project lead, RL integration, numerics +- [@zou3519](https://github.com/zou3519): Compilation + +### Emeritus Committers + +Committers who have contributed to vLLM significantly in the past (thank you!) but no longer active: + +- [@andoorve](https://github.com/andoorve): Pipeline parallelism +- [@cadedaniel](https://github.com/cadedaniel): Speculative decoding +- [@comaniac](https://github.com/comaniac): KV cache management, pipeline parallelism +- [@LiuXiaoxuanPKU](https://github.com/LiuXiaoxuanPKU): Speculative decoding +- [@pcmoritz](https://github.com/pcmoritz): MoE +- [@rkooo567](https://github.com/rkooo567): Chunked prefill +- [@sroy745](https://github.com/sroy745): Speculative decoding +- [@Yard1](https://github.com/Yard1): kernels and performance +- [@zhisbug](https://github.com/zhisbug): Arctic models, distributed + +## Area Owners + +This section breaks down the active committers by vLLM components and lists the area owners. +If you have PRs touching the area, please feel free to ping the area owner for review. + +### Engine Core + +- Scheduler: the core vLLM engine loop scheduling requests to next batch + - @WoosukKwon, @robertgshaw2-redhat, @njhill, @heheda12345 +- KV Cache Manager: memory management layer within scheduler maintaining KV cache logical block data + - @heheda12345, @WoosukKwon +- AsyncLLM: the zmq based protocol hosting engine core and making it accessible for entrypoints + - @robertgshaw2-redhat, @njhill, @russellb +- ModelRunner, Executor, Worker: the abstractions for engine wrapping model implementation + - @WoosukKwon, @tlrmchlsmth, @heheda12345, @LucasWilkinson, @ProExpertProg +- KV Connector: Connector interface and implementation for KV cache offload and transfer + - @robertgshaw2-redhat, @njhill, @KuntaiDu, @NickLucche, @ApostaC +- Distributed, Parallelism, Process Management: Process launchers managing each worker, and assign them to the right DP/TP/PP/EP ranks + - @youkaichao, @njhill, @WoosukKwon, @ruisearch42 +- Collectives: the usage of nccl and other communication libraries/kernels + - @tlrmchlsmth, @youkaichao +- Multimodality engine and memory management: core scheduling and memory management concerning vision, audio, and video inputs. + - @ywang96, @DarkLight1337 + +### Model Implementations + +- Model Interface: The `nn.Module` interface and implementation for various models + - @zhuohan123, @mgoin, @simon-mo, @houseroad, @ywang96 (multimodality), @jeejeelee (lora) +- Logits Processors / Sampler: The provided sampler class and pluggable logits processors + - @njhill, @houseroad, @22quinn +- Custom Layers: Utility layers in vLLM such as rotary embedding and rms norms + - @ProExpertProg +- Attention: Attention interface for paged attention + - @WoosukKwon, @LucasWilkinson, @heheda12345 +- FusedMoE: FusedMoE kernel, Modular kernel framework, EPLB + - @tlrmchlsmth +- Quantization: Various quantization config, weight loading, and kernel. + - @mgoin, @Isotr0py, @yewentao256 +- Custom quantized GEMM kernels (cutlass_scaled_mm, marlin, machete) + - @tlrmchlsmth, @LucasWilkinson +- Multi-modal Input Processing: Components that load and process image/video/audio data into feature tensors + - @DarkLight1337, @ywang96, @Isotr0py +- torch compile: The torch.compile integration in vLLM, custom passes & transformations + - @ProExpertProg, @zou3519, @youkaichao +- State space models: The state space models implementation in vLLM + - @tdoublep, @tlrmchlsmth +- Reasoning and tool calling parsers + - @chaunceyjiang, @aarnphm + +### Entrypoints + +- LLM Class: The LLM class for offline inference + - @DarkLight1337 +- API Server: The OpenAI-compatible API server + - @DarkLight1337, @njhill, @aarnphm, @simon-mo, @heheda12345 (Responses API) +- Batch Runner: The OpenAI-compatible batch runner + - @simon-mo + +### Features + +- Spec Decode: Covers model definition, attention, sampler, and scheduler related to n-grams, EAGLE, and MTP. + - @WoosukKwon, @benchislett, @luccafong +- Structured Output: The structured output implementation + - @russellb, @aarnphm +- RL: The RL related features such as collective rpc, sleep mode, etc. + - @youkaichao, @zhuohan123, @22quinn +- LoRA: @jeejeelee +- Observability: Metrics and Logging + - @markmc, @robertgshaw2-redhat, @simon-mo + +### Code Base + +- Config: Configuration registration and parsing + - @hmellor +- Documentation: @hmellor, @DarkLight1337, @simon-mo +- Benchmarks: @ywang96, @simon-mo +- CI, Build, Release Process: @khluu, @njhill, @simon-mo +- Security: @russellb + +### External Kernels Integration + +- FlashAttention: @LucasWilkinson +- FlashInfer: @LucasWilkinson, @mgoin, @WoosukKwon +- Blackwell Kernels: @mgoin, @yewentao256 +- DeepEP/DeepGEMM/pplx: @mgoin, @yewentao256 + +### Integrations + +- Hugging Face: @hmellor, @Isotr0py +- Ray: @ruisearch42 +- NIXL: @robertgshaw2-redhat, @NickLucche + +### Collaboration with Model Vendors + +- gpt-oss: @heheda12345, @simon-mo, @zhuohan123 +- Llama: @luccafong +- Qwen: @sighingnow +- Mistral: @patrickvonplaten + +### Hardware + +- Plugin Interface: @youkaichao, @Yikun +- NVIDIA GPU: @pavanimajety +- AMD GPU: @gshtras, @tjtanaa +- Intel CPU/GPU: @jikunshang, @bigPYJ1151 +- Google TPU: @yaochengji + +### Ecosystem Projects + +- Ascend NPU: [@wangxiyuan](https://github.com/wangxiyuan) and [see more details](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html#maintainers) +- Intel Gaudi HPU [@xuechendi](https://github.com/xuechendi) and [@kzawora-intel](https://github.com/kzawora-intel) diff --git a/docs/governance/process.md b/docs/governance/process.md new file mode 100644 index 0000000000000..1e088dd3c1e64 --- /dev/null +++ b/docs/governance/process.md @@ -0,0 +1,125 @@ +# Governance Process + +vLLM's success comes from our strong open source community. We favor informal, meritocratic norms over formal policies. This document clarifies our governance philosophy and practices. + +## Values + +vLLM aims to be the fastest and easiest-to-use LLM inference and serving engine. We stay current with advances, enable innovation, and support diverse models, modalities, and hardware. + +### Design Values + +1. **Top performance**: System performance is our top priority. We monitor overheads, optimize kernels, and publish benchmarks. We never leave performance on the table. +2. **Ease of use**: vLLM must be simple to install, configure, and operate. We provide clear documentation, fast startup, clean logs, helpful error messages, and monitoring guides. Many users fork our code or study it deeply, so we keep it readable and modular. +3. **Wide coverage**: vLLM supports frontier models and high-performance accelerators. We make it easy to add new models and hardware. vLLM + PyTorch form a simple interface that avoids complexity. +4. **Production ready**: vLLM runs 24/7 in production. It must be easy to operate and monitor for health issues. +5. **Extensibility**: vLLM serves as fundamental LLM infrastructure. Our codebase cannot cover every use case, so we design for easy forking and customization. + +### Collaboration Values + +1. **Tightly Knit and Fast-Moving**: Our maintainer team is aligned on vision, philosophy, and roadmap. We work closely to unblock each other and move quickly. +2. **Individual Merit**: No one buys their way into governance. Committer status belongs to individuals, not companies. We reward contribution, maintenance, and project stewardship. + +## Project Maintainers + +Maintainers form a hierarchy based on sustained, high-quality contributions and alignment with our design philosophy. + +### Core Maintainers + +Core Maintainers function like a project planning and decision making committee. In other convention, they might be called a Technical Steering Committee (TSC). In vLLM vocabulary, they are often known as "Project Leads". They meet weekly to coordinate roadmap priorities and allocate engineering resources. Current active leads: @WoosukKwon, @zhuohan123, @simon-mo, @youkaichao, @robertshaw2-redhat, @tlrmchlsmth, @mgoin, @njhill, @ywang96, @houseroad, @yeqcharlotte, @ApostaC + +The responsibilities of the core maintainers are: + +* Author quarterly roadmap and responsible for each development effort. +* Making major changes to the technical direction or scope of vLLM and vLLM projects. +* Defining the project's release strategy. +* Work with model providers, hardware vendors, and key users of vLLM to ensure the project is on the right track. + +### Lead Maintainers + +While Core maintainers assume the day-to-day responsibilities of the project, Lead maintainers are responsible for the overall direction and strategy of the project. A committee of @WoosukKwon, @zhuohan123, @simon-mo, and @youkaichao currently shares this role with divided responsibilities. + +The responsibilities of the lead maintainers are: + +* Making decisions where consensus among core maintainers cannot be reached. +* Adopting changes to the project's technical governance. +* Organizing the voting process for new committers. + +### Committers and Area Owners + +Committers have write access and merge rights. They typically have deep expertise in specific areas and help the community. + +The responsibilities of the committers are: + +* Reviewing PRs and providing feedback. +* Addressing issues and questions from the community. +* Own specific areas of the codebase and development efforts: reviewing PRs, addressing issues, answering questions, improving documentation. + +Specially, committers are almost all area owners. They author subsystems, review PRs, refactor code, monitor tests, and ensure compatibility with other areas. All area owners are committers with deep expertise in that area, but not all committers own areas. + +For a full list of committers and their respective areas, see the [committers](./committers.md) page. + +#### Nomination Process + +Any committer can nominate candidates via our private mailing list: + +1. **Nominate**: Any committer may nominate a candidate by email to the private maintainers’ list, citing evidence mapped to the pre‑existing standards with links to PRs, reviews, RFCs, issues, benchmarks, and adoption evidence. +2. **Vote**: The lead maintainers will group voices support or concerns. Shared concerns can stop the process. The vote typically last 3 working days. For concerns, committers group discuss the clear criteria for such person to be nominated again. The lead maintainers will make the final decision. +3. **Confirm**: The lead maintainers send invitation, update CODEOWNERS, assign permissions, add to communications channels (mailing list and Slack). + +Committership is highly selective and merit based. The selection criteria requires: + +* **Area expertise**: leading design/implementation of core subsystems, material performance or reliability improvements adopted project‑wide, or accepted RFCs that shape technical direction. +* **Sustained contributions**: high‑quality merged contributions and reviews across releases, responsiveness to feedback, and stewardship of code health. +* **Community leadership**: mentoring contributors, triaging issues, improving docs, and elevating project standards. + +To further illustrate, a committer typically satisfies at least two of the following accomplishment patterns: + +* Author of an accepted RFC or design that materially shaped project direction +* Measurable, widely adopted performance or reliability improvement in core paths +* Long‑term ownership of a subsystem with demonstrable quality and stability gains +* Significant cross‑project compatibility or ecosystem enablement work (models, hardware, tooling) + +While there isn't a quantitative bar, past committers have: + +* Submitted approximately 30+ PRs of substantial quality and scope +* Provided high-quality reviews of approximately 10+ substantial external contributor PRs +* Addressed multiple issues and questions from the community in issues/forums/Slack +* Led concentrated efforts on RFCs and their implementation, or significant performance or reliability improvements adopted project‑wide + +### Working Groups + +vLLM runs informal working groups such as CI, CI infrastructure, torch compile, and startup UX. These can be loosely tracked via `#sig-` (or `#feat-`) channels in vLLM Slack. Some groups have regular sync meetings. + +### Advisory Board + +vLLM project leads consult with an informal advisory board that is composed of model providers, hardware vendors, and ecosystem partners. This manifests as a collaboration channel in Slack and frequent communications. + +## Process + +### Project Roadmap + +Project Leads publish quarterly roadmaps as GitHub issues. These clarify current priorities. Unlisted topics aren't excluded but may get less review attention. See [https://roadmap.vllm.ai/](https://roadmap.vllm.ai/). + +### Decision Making + +We make technical decisions in Slack and GitHub using RFCs and design docs. Discussion may happen elsewhere, but we maintain public records of significant changes: problem statements, rationale, and alternatives considered. + +### Merging Code + +Contributors and maintainers often collaborate closely on code changes, especially within organizations or specific areas. Maintainers should give others appropriate review opportunities based on change significance. + +PRs requires at least one committer review and approval. If the code is covered by CODEOWNERS, the PR should be reviewed by the CODEOWNERS. There are cases where the code is trivial or hotfix, the PR can be merged by the lead maintainers directly. + +In case where CI didn't pass due to the failure is not related to the PR, the PR can be merged by the lead maintainers using "force merge" option that overrides the CI checks. + +### Slack + +Contributors are encouraged to join `#pr-reviews` and `#contributors` channels. + +There are `#sig-` and `#feat-` channels for discussion and coordination around specific topics. + +The project maintainer group also uses a private channel for high-bandwidth collaboration. + +### Meetings + +We hold weekly contributor syncs with standup-style updates on progress, blockers, and plans. You can refer to the notes [standup.vllm.ai](https://standup.vllm.ai) for joining instructions. From 5c2433a6f35a84741f26aa546432649891b896bb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 8 Dec 2025 13:11:51 +0000 Subject: [PATCH 067/133] Add tip for `mypy` and `markdownlint` to the pre-commit comment (#30259) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/mergify.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 58a5d77867c95..3ad79f93bc7ad 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -35,6 +35,20 @@ pull_request_rules: For future commits, `pre-commit` will run automatically on changed files before each commit. + > [!TIP] + >
+ > Is mypy or markdownlint failing? + >
+ > mypy and markdownlint are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally: + > + > ```bash + > # For mypy (substitute "3.10" with the failing version if needed) + > pre-commit run --hook-stage manual mypy-3.10 + > # For markdownlint + > pre-commit run --hook-stage manual markdownlint + > ``` + >
+ - name: comment-dco-failure description: Comment on PR when DCO check fails conditions: From 80433e225ee0f91a7f6a082bf4e136df75ab4746 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 8 Dec 2025 21:29:47 +0800 Subject: [PATCH 068/133] [LoRA] Reduce the loading time of MoE LoRA (#30243) Signed-off-by: Jee Jee Li --- vllm/lora/models.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index ada30da600440..567ffce4e75fc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -115,7 +115,7 @@ class LoRAModel: weights_mapper: WeightsMapper | None = None, ) -> "LoRAModel": """Create a LoRAModel from a dictionary of tensors.""" - pin_memory = str(device) == "cpu" and is_pin_memory_available() + loras: dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): if is_base_embeddding_weights(tensor_name): @@ -139,14 +139,8 @@ class LoRAModel: f" with the base model's vocabulary size({model_vocab_size})." ) loras[module_name].lora_a = tensor.to(device=device, dtype=dtype) - if pin_memory: - loras[module_name].lora_a = loras[module_name].lora_a.pin_memory() else: loras[module_name].lora_b = tensor.to(device=device, dtype=dtype) - - if pin_memory: - loras[module_name].lora_b = loras[module_name].lora_b.pin_memory() - return cls(lora_model_id, peft_helper.r, loras) @classmethod @@ -742,6 +736,32 @@ class LoRAModelManager: for lora in lora_model.loras.values(): lora.optimize() + first_lora: LoRALayerWeights = next(iter(lora_model.loras.values())) + assert first_lora.lora_a is not None + if isinstance(first_lora.lora_a, list): + lora_device = next(iter(first_lora.lora_a)) + else: + lora_device = first_lora.lora_a.device + # Execute pin_memory after LoRA weight merging, mainly because: + # 1. Some MoE models have a large number of LoRA weights. If we + # perform # pin_memory immediately after loading weights, the + # overhead is significant. + # 2. The weight packing above (e.g., pack_moe) may invalidate the + # pin_memory allocation, so we execute it after packing. + + pin_memory = str(lora_device) == "cpu" and is_pin_memory_available() + if pin_memory: + for lora in lora_model.loras.values(): + if isinstance(lora.lora_a, list): + for index in range(len(lora.lora_a)): + if lora.lora_a[index] is None: + continue + lora.lora_a[index] = lora.lora_a[index].pin_memory() + lora.lora_b[index] = lora.lora_b[index].pin_memory() + else: + lora.lora_a = lora.lora_a.pin_memory() + lora.lora_b = lora.lora_b.pin_memory() + def _get_lora_layer_weights( self, lora_model: LoRAModel, module_name: str ) -> LoRALayerWeights | None: From eb1051fb95323493d6d950c03dabac8ee56cb33e Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Mon, 8 Dec 2025 06:44:48 -0800 Subject: [PATCH 069/133] [ROCm] Guard group quant RMS norm fusion patterns (#30239) --- vllm/compilation/fusion.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index de083a2e5e3c7..a7e6a69e64c91 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -490,23 +490,25 @@ class RMSNormQuantFusionPass(VllmPatternMatcherPass): # as the latter is a subset of the former in torch ops for epsilon in [1e-5, 1e-6]: # Fuse fused_add_rms_norm + fp8 group quant - FusedAddRMSNormGroupQuantPattern( - epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128) - ).register(self.patterns) + # Only register group quant patterns on CUDA where the C++ op exists + if current_platform.is_cuda(): + FusedAddRMSNormGroupQuantPattern( + epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128) + ).register(self.patterns) - # Fuse rms_norm + fp8 group quant - RMSNormGroupQuantPattern( - epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128) - ).register(self.patterns) + # Fuse rms_norm + fp8 group quant + RMSNormGroupQuantPattern( + epsilon, FP8_DTYPE, group_shape=GroupShape(1, 128) + ).register(self.patterns) - FusedAddRMSNormGroupQuantPattern( - epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64) - ).register(self.patterns) + FusedAddRMSNormGroupQuantPattern( + epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64) + ).register(self.patterns) - # Fuse rms_norm + fp8 group quant - RMSNormGroupQuantPattern( - epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64) - ).register(self.patterns) + # Fuse rms_norm + fp8 group quant + RMSNormGroupQuantPattern( + epsilon, FP8_DTYPE, group_shape=GroupShape(1, 64) + ).register(self.patterns) # Fuse fused_add_rms_norm + static fp8 quant FusedAddRMSNormStaticQuantPattern(epsilon, FP8_DTYPE).register( From 184076c3fecf8c322648f36f69a4de836d7f519b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20C=C3=A1mpora?= <961215+dcampora@users.noreply.github.com> Date: Mon, 8 Dec 2025 15:55:58 +0100 Subject: [PATCH 070/133] [DeepSeek v3.2] Make top-k work for any logit values. (#27568) Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: Cyrus Leung --- csrc/ops.h | 13 +- csrc/sampler.cu | 743 ++++++++++++++++------ csrc/torch_bindings.cpp | 10 +- tests/kernels/test_top_k_per_row.py | 95 ++- vllm/model_executor/models/deepseek_v2.py | 6 +- 5 files changed, 643 insertions(+), 224 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index 9617d6358e18a..5fce3a1a3fea3 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -102,13 +102,16 @@ void apply_repetition_penalties_(torch::Tensor& logits, const torch::Tensor& output_mask, const torch::Tensor& repetition_penalties); -void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts, - const torch::Tensor& rowEnds, torch::Tensor& indices, - int64_t numRows, int64_t stride0, int64_t stride1); +void top_k_per_row_prefill(const torch::Tensor& logits, + const torch::Tensor& rowStarts, + const torch::Tensor& rowEnds, torch::Tensor& indices, + int64_t numRows, int64_t stride0, int64_t stride1, + int64_t topK); void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n, - const torch::Tensor& seq_lens, torch::Tensor& indices, - int64_t numRows, int64_t stride0, int64_t stride1); + const torch::Tensor& seqLens, torch::Tensor& indices, + int64_t numRows, int64_t stride0, int64_t stride1, + int64_t topK); void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, torch::Tensor& scale, diff --git a/csrc/sampler.cu b/csrc/sampler.cu index 410b8988f4939..fc2154beff9e0 100644 --- a/csrc/sampler.cu +++ b/csrc/sampler.cu @@ -44,41 +44,300 @@ __global__ void apply_repetition_penalties_kernel( } } -static inline __device__ uint16_t extractBinIdx(float x) { - union { - __half h; - uint16_t u16; - } tmp; - tmp.h = __float2half_rn(x); - tmp.u16 = (x < 0.f) ? (~tmp.u16 & 0xffff) : (tmp.u16 | 0x8000); - return 511 - (tmp.u16 >> 7); +__device__ __forceinline__ auto convert_to_uint32(float x) -> uint32_t { + uint32_t bits = __float_as_uint(x); + return (bits & 0x80000000) ? bits : ~bits & 0x7fffffff; } -template -__device__ void topKPerRowJob(const float* logits, const int rowStart, - const int rowEnd, const int rowIdx, - int* outIndices, int stride0, int stride1) { - // The number of elements per thread for the final top-k sort. - static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock; - // The class to sort the elements during the final top-k sort. - using TopKSort = cub::BlockRadixSort; +template +static inline __device__ uint32_t extractBinIdx(float x) { + if constexpr (step == 0) { + __half hx = __float2half(x); + uint16_t bits = __half_as_ushort(hx); + bits = (bits & 0x8000) ? bits : ~bits & 0x7fff; + return bits >> 5; + } else { + uint32_t bits = __float_as_uint(x); + bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff; + if constexpr (step == 1) { + return bits >> 21; + } else if constexpr (step == 2) { + return (bits >> 10) & 0x7ff; + } else if constexpr (step == 3) { + return bits & 0x3ff; + } + } +} + +template +static inline __device__ bool isPartialMatch(float x, uint32_t pattern) { + if constexpr (shift == 0) { + return true; + } + uint32_t bits = __float_as_uint(x); + bits = (bits & 0x80000000) ? bits : ~bits & 0x7fffffff; + return (bits ^ pattern) >> shift == 0; +} + +/** + * Map a Func over the input data, using vectorized load instructions if + * possible. + * + * @tparam T element type + * @tparam IdxT indexing type + * @tparam Func void (T x, IdxT idx) + * + * @param thread_rank rank of the calling thread among all participating threads + * @param num_threads number of the threads that participate in processing + * @param in the input data + * @param len the number of elements to read + * @param f the lambda taking two arguments (T x, IdxT idx) + */ +template +__device__ void vectorized_process(size_t thread_rank, size_t num_threads, + const T* in, idxT len, Func f) { + constexpr int WARP_SIZE = 32; + using WideT = float4; + if constexpr (sizeof(T) >= sizeof(WideT)) { + for (idxT i = thread_rank; i < len; i += num_threads) { + f(in[i], i); + } + } else { + static_assert(sizeof(WideT) % sizeof(T) == 0); + constexpr int items_per_scalar = sizeof(WideT) / sizeof(T); + // TODO: it's UB + union { + WideT scalar; + T array[items_per_scalar]; + } wide; + + int skip_cnt = + (reinterpret_cast(in) % sizeof(WideT)) + ? ((sizeof(WideT) - reinterpret_cast(in) % sizeof(WideT)) / + sizeof(T)) + : 0; + if (skip_cnt > len) { + skip_cnt = len; + } + const WideT* in_cast = reinterpret_cast(in + skip_cnt); + const idxT len_cast = (len - skip_cnt) / items_per_scalar; + + for (idxT i = thread_rank; i < len_cast; i += num_threads) { + wide.scalar = in_cast[i]; + const idxT real_i = skip_cnt + i * items_per_scalar; +#pragma unroll + for (int j = 0; j < items_per_scalar; ++j) { + f(wide.array[j], real_i + j); + } + } + + static_assert(WARP_SIZE >= items_per_scalar); + // and because items_per_scalar > skip_cnt, WARP_SIZE > skip_cnt + // no need to use loop + if (thread_rank < skip_cnt) { + f(in[thread_rank], thread_rank); + } + // because len_cast = (len - skip_cnt) / items_per_scalar, + // len_cast * items_per_scalar + items_per_scalar > len - skip_cnt; + // and so + // len - (skip_cnt + len_cast * items_per_scalar) < items_per_scalar <= + // WARP_SIZE no need to use loop + const idxT remain_i = skip_cnt + len_cast * items_per_scalar + thread_rank; + if (remain_i < len) { + f(in[remain_i], remain_i); + } + } +} + +template +__device__ bool processHistogramStep( + const int* indices, const float* logits, int rowEnd, uint32_t& logitPattern, + int& thresholdBinIdx, SmemOutputType& smemOutput, int* smemThresholdBinIdx, + int* smemFinalDstIdx, int* smemFinalBinSize, int* smemFoundTopKValues, + SmemFinalType& smemFinal, int stride1, int rowStart, int topK) { + // Clear the histogram. +#pragma unroll + for (int idx = threadIdx.x; idx < kNumBins; idx += kNumThreadsPerBlock) { + smemFinal.histo.data[idx] = 0; + } + + // Make sure the histogram is ready. + __syncthreads(); + + // Update pattern + constexpr auto patternShift = step < 2 ? 0 : step == 2 ? 21 : 10; + if constexpr (step == 2) { + logitPattern = static_cast(thresholdBinIdx & 0x7ff) + << patternShift; + } else if constexpr (step == 3) { + logitPattern |= static_cast(thresholdBinIdx & 0x7ff) + << patternShift; + } + + auto distributeToBins = [&](float logit, int /* idx */ = 0) { + if (isPartialMatch(logit, logitPattern)) { + uint32_t binIdx = extractBinIdx(logit); + atomicAdd(&smemFinal.histo.data[binIdx], 1); + } + }; + + // Distribute the elements to the histogram bins. + if (stride1 == 1) { + vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart, + rowEnd - rowStart, distributeToBins); + } else { + for (int idx = rowStart + threadIdx.x; idx < rowEnd; + idx += kNumThreadsPerBlock) { + float logit = logits[idx * stride1]; + distributeToBins(logit, idx); + } + } + // Make sure the histogram is ready. + __syncthreads(); + + // Reads the value of the starting position in the smemOutput array + int lastValue = smemFoundTopKValues[0]; + + for (int round = 0; round < kNumBins / kNumThreadsPerBlock; round++) { + // Read the values from SMEM. + int idx = threadIdx.x + kNumThreadsPerBlock * round; + int binCount{0}; + binCount = smemFinal.histo.data[idx]; + + // Make sure each thread has read its value. + __syncthreads(); + + // Compute the prefix sum. + int prefixSum{0}, totalSum{0}; + using Scan = cub::BlockScan; + Scan(smemFinal.histo.scan).ExclusiveSum(binCount, prefixSum, totalSum); + + // Update the histogram with the prefix sums. + prefixSum += lastValue; + totalSum += lastValue; + smemFinal.histo.data[idx] = prefixSum; + + // Make sure the data is in shared memory. + __syncthreads(); + + // Find the last valid bin. + bool foundThreshold = false; + if (prefixSum < topK) { + int nextPrefixSum = threadIdx.x == kNumThreadsPerBlock - 1 + ? totalSum + : smemFinal.histo.data[idx + 1]; + + if (nextPrefixSum >= topK) { + smemThresholdBinIdx[0] = idx; + smemFinalBinSize[0] = nextPrefixSum - prefixSum; + foundThreshold = true; + } + } + + // Early exit: if any thread found the threshold, we can skip remaining + // rounds + if (__syncthreads_or(foundThreshold)) { + break; + } + + lastValue = totalSum; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The threshold bin. + thresholdBinIdx = smemThresholdBinIdx[0]; + + auto processBins = [&](float logit, int idx) { + if (isPartialMatch(logit, logitPattern)) { + uint32_t binIdx = extractBinIdx(logit); + if (binIdx < thresholdBinIdx) { + // The element is part of the top-k selection + int dstIdx = atomicAdd(&smemFoundTopKValues[0], 1); + + if constexpr (mergeBlocks) { + smemOutput[dstIdx] = indices[idx]; + } else if constexpr (multipleBlocksPerRow) { + smemOutput[dstIdx] = idx + rowStart; + reinterpret_cast(smemOutput + topK)[dstIdx] = logit; + } else { + smemOutput[dstIdx] = idx; + } + } + if constexpr (step < 3) { + // Only fill the final items for sorting if the threshold bin fits + if (binIdx == thresholdBinIdx && + smemFinalBinSize[0] <= kNumFinalItems) { + int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1); + smemFinal.items.logits[dstIdx] = logit; + if constexpr (mergeBlocks) { + smemFinal.items.indices[dstIdx] = indices[idx]; + } else if constexpr (multipleBlocksPerRow) { + smemFinal.items.indices[dstIdx] = idx + rowStart; + } else { + smemFinal.items.indices[dstIdx] = idx; + } + } + } else { + if (binIdx == thresholdBinIdx) { + // The elements in the threshold bin share the same 32 bits at step 3 + int dstIdx = atomicAdd(&smemFinal.histo.data[binIdx], 1); + if (dstIdx < topK) { + if constexpr (mergeBlocks) { + smemOutput[dstIdx] = indices[idx]; + } else if constexpr (multipleBlocksPerRow) { + smemOutput[dstIdx] = idx + rowStart; + reinterpret_cast(smemOutput + topK)[dstIdx] = logit; + } else { + smemOutput[dstIdx] = idx; + } + } + } + } + } + }; + + if (stride1 == 1) { + vectorized_process(threadIdx.x, kNumThreadsPerBlock, logits + rowStart, + rowEnd - rowStart, processBins); + } else { + for (int idx = rowStart + threadIdx.x; idx < rowEnd; + idx += kNumThreadsPerBlock) { + float logit = logits[idx * stride1]; + processBins(logit, idx); + } + } + + // Make sure the elements are in shared memory. + __syncthreads(); + + // Check if we should continue to next step + return smemFinalBinSize[0] > kNumFinalItems; +} + +// Follows half - 11 - 11 - 10 bit iterations +template +static __device__ void topKPerRowJob(const int* indices, const float* logits, + int rowStart, int rowEnd, int* outIndices, + float* outLogits, int stride1, int topK) { // The number of slots for the final pass. - static constexpr int kNumFinalItems = 3072; + static constexpr int kNumFinalItems = 2048; // The number of elements per thread for the final sort. static constexpr int kNumFinalItemsPerThread = kNumFinalItems / kNumThreadsPerBlock; // The class to sort the elements during the final pass. using FinalSort = cub::BlockRadixSort; - + using FinalSortTempStorage = + std::conditional_t; // The class to compute the inclusive prefix-sum over the histogram. using Scan = cub::BlockScan; - // Shared memory to compute the block scan. - __shared__ typename Scan::TempStorage smemScan; - // The structure to store the final items (for the final pass). struct FinalItems { // Shared memory to store the indices for the final pass. @@ -87,200 +346,225 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart, float logits[kNumFinalItems]; }; + struct Histogram { + typename Scan::TempStorage scan; + int data[kNumBins]; + }; + // Shared memory to compute the block sort. __shared__ union { FinalItems items; - typename FinalSort::TempStorage finalSort; - typename TopKSort::TempStorage topKSort; + FinalSortTempStorage finalSort; + Histogram histo; } smemFinal; - // Shared memory to store the histogram. - __shared__ int smemHistogram[kNumBins]; // Shared memory to store the selected indices. - __shared__ int smemIndices[kTopK]; + // If we are processing using multiple blocks, we need to store the logits and + // indices. + extern __shared__ int32_t smemOutput[]; + // Shared memory to store the threshold bin. __shared__ int smemThresholdBinIdx[1]; // Shared memory counter to register the candidates for the final phase. __shared__ int smemFinalDstIdx[1]; + // Shared memory to determine if the threshold bin fits in the final items. + __shared__ int smemFinalBinSize[1]; + // Shared memory to keep track of the top-k values found so far by the + // previous iterations + __shared__ int smemFoundTopKValues[1]; // The length of the row. int rowLen = rowEnd - rowStart; // Shortcut if the length of the row is smaller than Top-K. Indices are not // sorted by their corresponding logit. - if (rowLen <= kTopK) { + if (rowLen <= topK) { for (int rowIt = threadIdx.x; rowIt < rowLen; rowIt += kNumThreadsPerBlock) { - int idx = rowStart + rowIt; - outIndices[rowIdx * kTopK + rowIt] = idx - rowStart; + if constexpr (multipleBlocksPerRow) { + outIndices[rowIt] = rowIt + rowStart; + outLogits[rowIt] = logits[rowIt + rowStart]; + } else { + outIndices[rowIt] = rowIt; + } } - for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK; + for (int rowIt = rowLen + threadIdx.x; rowIt < topK; rowIt += kNumThreadsPerBlock) { - outIndices[rowIdx * kTopK + rowIt] = -1; + outIndices[rowIt] = -1; + if constexpr (multipleBlocksPerRow) { + outLogits[rowIt] = -FLT_MAX; + } } + return; } - - // Clear the histogram. - if (threadIdx.x < kNumBins) { - smemHistogram[threadIdx.x] = 0; - } - - // Make sure the histogram is ready. - __syncthreads(); - - // Fetch elements one-by-one. - for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd; - rowIt += kNumThreadsPerBlock) { - uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]); - atomicAdd(&smemHistogram[idx], 1); - } - - // Make sure the histogram is ready. - __syncthreads(); - - // Read the values from SMEM. - int binCount{0}; - if (threadIdx.x < kNumBins) { - binCount = smemHistogram[threadIdx.x]; - } - - // Make sure each thread has read its value. - __syncthreads(); - - // Compute the prefix sum. - int prefixSum{0}, totalSum{0}; - Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum); - - // Update the histogram with the prefix sums. - if (threadIdx.x < kNumBins) { - smemHistogram[threadIdx.x] = prefixSum; - } - - // Make sure the data is in shared memory. - __syncthreads(); - - // Find the last valid bin. - if (threadIdx.x < kNumBins) { - int nextPrefixSum = - threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1]; - if (prefixSum < kTopK && nextPrefixSum >= kTopK) { - smemThresholdBinIdx[0] = threadIdx.x; - } - } - - // Clear the counter to store the items for the final phase. + // Initialize values if (threadIdx.x == 0) { smemFinalDstIdx[0] = 0; + smemFoundTopKValues[0] = 0; + } + __syncthreads(); + int thresholdBinIdx = -1; + uint32_t logitPattern = 0; + + // Step 0: Process first 11 bits of half representation + bool continueToNextStep = + processHistogramStep<0, kNumThreadsPerBlock, kNumBins, kNumFinalItems, + multipleBlocksPerRow, mergeBlocks>( + indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput, + smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize, + smemFoundTopKValues, smemFinal, stride1, rowStart, topK); + + if (continueToNextStep) { + // Step 1: Process next 11 bits + continueToNextStep = + processHistogramStep<1, kNumThreadsPerBlock, kNumBins, kNumFinalItems, + multipleBlocksPerRow, mergeBlocks>( + indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput, + smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize, + smemFoundTopKValues, smemFinal, stride1, rowStart, topK); } - // Make sure the data is in shared memory. - __syncthreads(); + if (continueToNextStep) { + // Step 2: Process next 11 bits + continueToNextStep = + processHistogramStep<2, kNumThreadsPerBlock, kNumBins, kNumFinalItems, + multipleBlocksPerRow, mergeBlocks>( + indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput, + smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize, + smemFoundTopKValues, smemFinal, stride1, rowStart, topK); + } - // The threshold bin. - int thresholdBinIdx = smemThresholdBinIdx[0]; + if (continueToNextStep) { + // Step 3: Process last 10 bits + processHistogramStep<3, kNumThreadsPerBlock, kNumBins, kNumFinalItems, + multipleBlocksPerRow, mergeBlocks>( + indices, logits, rowEnd, logitPattern, thresholdBinIdx, smemOutput, + smemThresholdBinIdx, smemFinalDstIdx, smemFinalBinSize, + smemFoundTopKValues, smemFinal, stride1, rowStart, topK); + } - // Fetch elements one-by-one and populate the shared memory buffers. - for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd; - rowIt += kNumThreadsPerBlock) { - float logit = logits[rowIdx * stride0 + rowIt * stride1]; - uint16_t idx = extractBinIdx(logit); - if (idx < thresholdBinIdx) { - int dstIdx = atomicAdd(&smemHistogram[idx], 1); - smemIndices[dstIdx] = rowIt; - } else if (idx == thresholdBinIdx) { - int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1); - if (dstIdx < kNumFinalItems) { - smemFinal.items.logits[dstIdx] = logit; - smemFinal.items.indices[dstIdx] = rowIt; + if (!continueToNextStep) { + // The histogram did not proceed to the final 10 bits, therefore we need to + // sort the final items The logits of the elements to be sorted in the final + // pass. + if constexpr (useRadixSort) { + // Sorting with radix sort + float finalLogits[kNumFinalItemsPerThread]; + // The indices of the elements to be sorted in the final pass. + int finalIndices[kNumFinalItemsPerThread]; + +#pragma unroll + for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) { + finalLogits[ii] = -FLT_MAX; + } + + // Read the elements from SMEM. +#pragma unroll + for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) { + int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x; + if (srcIdx < smemFinalDstIdx[0]) { + finalLogits[ii] = smemFinal.items.logits[srcIdx]; + finalIndices[ii] = smemFinal.items.indices[srcIdx]; + } + } + // Make sure the shared memory has been read. + __syncthreads(); + + // Sort the elements. + FinalSort(smemFinal.finalSort) + .SortDescendingBlockedToStriped(finalLogits, finalIndices); + + // Copy the data back to the shared memory storage. + int baseIdx = smemFoundTopKValues[0]; + +#pragma unroll + for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) { + int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x; + int dstIdx = baseIdx + srcIdx; + + if (dstIdx < topK) { + smemOutput[dstIdx] = finalIndices[ii]; + if constexpr (multipleBlocksPerRow) { + reinterpret_cast(smemOutput + topK)[dstIdx] = + finalLogits[ii]; + } + } + } + } else { + // Sorting with insertion sort + auto baseIdx = smemFoundTopKValues[0]; + for (int i = threadIdx.x; i < smemFinalDstIdx[0]; + i += kNumThreadsPerBlock) { + int outIndex = 0; + auto logit = smemFinal.items.logits[i]; + for (int j = 0; j < smemFinalDstIdx[0]; j++) { + auto otherLogit = smemFinal.items.logits[j]; + if (logit < otherLogit || (logit == otherLogit && i < j)) { + outIndex++; + } + } + // Store if outIndex is in bounds + if (outIndex + baseIdx < topK) { + smemOutput[outIndex + baseIdx] = smemFinal.items.indices[i]; + if constexpr (multipleBlocksPerRow) { + reinterpret_cast(smemOutput + topK)[outIndex + baseIdx] = + smemFinal.items.logits[i]; + } + } + } + } + __syncthreads(); + } + + // Store to global memory. + for (int i = threadIdx.x; i < topK; i += kNumThreadsPerBlock) { + if constexpr (multipleBlocksPerRow) { + outIndices[i] = smemOutput[i]; + outLogits[i] = reinterpret_cast(smemOutput + topK)[i]; + } else { + if (stride1 == 1) { + // stride1 == 1 will use vectorized_process, which indexes already skip + // the rowStart. + outIndices[i] = smemOutput[i]; + } else { + outIndices[i] = smemOutput[i] - rowStart; } } } - - // Make sure the elements are in shared memory. - __syncthreads(); - - // The logits of the elements to be sorted in the final pass. - float finalLogits[kNumFinalItemsPerThread]; - // The indices of the elements to be sorted in the final pass. - int finalIndices[kNumFinalItemsPerThread]; - -// Init. -#pragma unroll - for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) { - finalLogits[ii] = -FLT_MAX; - } - -// Read the elements from SMEM. -#pragma unroll - for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) { - int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x; - if (srcIdx < smemFinalDstIdx[0]) { - finalLogits[ii] = smemFinal.items.logits[srcIdx]; - finalIndices[ii] = smemFinal.items.indices[srcIdx]; - } - } - - // Make sure the shared memory has been read. - __syncthreads(); - - // Sort the elements. - FinalSort(smemFinal.finalSort) - .SortDescendingBlockedToStriped(finalLogits, finalIndices); - - // Copy the data back to the shared memory storage. - int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0; -#pragma unroll - for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) { - int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x; - int dstIdx = baseIdx + srcIdx; - if (dstIdx < kTopK) { - smemIndices[dstIdx] = finalIndices[ii]; - } - } - - // Make sure the data is in shared memory. - __syncthreads(); - -// Store to global memory. -#pragma unroll - for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) { - int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x; - outIndices[offset] = - smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart; - } } -template -static __global__ void topKPerRow(const float* logits, const int* rowStarts, - const int* rowEnds, int* outIndices, - int stride0, int stride1) { +template +static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill( + const float* logits, const int* rowStarts, const int* rowEnds, + int* outIndices, int stride0, int stride1, const int topK, + const int offsetIndex) { // The number of bins in the histogram. - static constexpr int kNumBins = 512; - - // The top-k width. - static constexpr int kTopK = 2048; + static constexpr int kNumBins = 2048; // The row computed by this block. - int rowIdx = blockIdx.x; + int rowIdx = blockIdx.x + offsetIndex; // The range of logits within the row. int rowStart = rowStarts[rowIdx]; int rowEnd = rowEnds[rowIdx]; - topKPerRowJob( - logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1); + // Local pointers to this block + outIndices += rowIdx * topK; + logits += rowIdx * stride0; + + topKPerRowJob( + nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK); } -template -static __global__ void topKPerRowDecode(const float* logits, const int* seqLens, - int* outIndices, int stride0, - int stride1, int next_n) { +template +static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode( + const float* logits, const int* seqLens, int* outIndices, int stride0, + int stride1, const int topK, int next_n, float* outLogits = nullptr, + const int numBlocksToMerge = 0, const int* indices = nullptr) { // The number of bins in the histogram. - static constexpr int kNumBins = 512; - - // The top-k width. - static constexpr int kTopK = 2048; + static constexpr int kNumBins = 2048; // The row computed by this block. int rowIdx = blockIdx.x; @@ -290,8 +574,25 @@ static __global__ void topKPerRowDecode(const float* logits, const int* seqLens, int seq_len = seqLens[rowIdx / next_n]; int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1; - topKPerRowJob( - logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1); + // Local pointers to this block + if constexpr (!multipleBlocksPerRow && !mergeBlocks) { + outIndices += rowIdx * topK; + } else if constexpr (multipleBlocksPerRow) { + const auto blockSize = rowEnd / gridDim.y; // 16384 / 2 = 8192 + rowStart = blockSize * blockIdx.y; // 8192 * 1 = 8192 + rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize; + outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK; + outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK; + } else if constexpr (mergeBlocks) { + rowEnd = numBlocksToMerge * topK; + indices += rowIdx * numBlocksToMerge * topK; + outIndices += rowIdx * topK; + } + logits += rowIdx * stride0; + + topKPerRowJob( + indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK); } } // namespace vllm @@ -339,28 +640,84 @@ void apply_repetition_penalties_( void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n, const torch::Tensor& seqLens, torch::Tensor& indices, - int64_t numRows, int64_t stride0, int64_t stride1) { - // Compute the results on the device. + int64_t numRows, int64_t stride0, int64_t stride1, + int64_t topK) { + constexpr int kSortingAlgorithmThreshold = 12288; + constexpr int kSplitWorkThreshold = 200 * 1000; + constexpr int kNumThreadsPerBlock = 512; + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const auto numColumns = logits.size(1); + + if (numColumns < kSortingAlgorithmThreshold) { + // Use insertion sort + vllm::topKPerRowDecode + <<>>( + logits.data_ptr(), seqLens.data_ptr(), + indices.data_ptr(), static_cast(stride0), + static_cast(stride1), static_cast(topK), + static_cast(next_n)); + } else if (numColumns < kSplitWorkThreshold) { + // From this threshold, use radix sort instead + vllm::topKPerRowDecode + <<>>( + logits.data_ptr(), seqLens.data_ptr(), + indices.data_ptr(), static_cast(stride0), + static_cast(stride1), static_cast(topK), + static_cast(next_n)); + } else { + // Long sequences are run in two steps + constexpr auto multipleBlocksPerRowConfig = 10; + + const auto outIndicesAux = + torch::empty({numRows, multipleBlocksPerRowConfig, topK}, + torch::dtype(torch::kInt32).device(logits.device())); + const auto outLogitsAux = + torch::empty({numRows, multipleBlocksPerRowConfig, topK}, + torch::dtype(torch::kFloat).device(logits.device())); + + vllm::topKPerRowDecode + <<>>( + logits.data_ptr(), seqLens.data_ptr(), + outIndicesAux.data_ptr(), static_cast(stride0), + static_cast(stride1), static_cast(topK), + static_cast(next_n), outLogitsAux.data_ptr()); + + constexpr int kNumThreadsPerBlockMerge = 1024; + vllm::topKPerRowDecode + <<>>( + outLogitsAux.data_ptr(), seqLens.data_ptr(), + indices.data_ptr(), multipleBlocksPerRowConfig * topK, 1, + static_cast(topK), static_cast(next_n), nullptr, + multipleBlocksPerRowConfig, outIndicesAux.data_ptr()); + } +} + +void top_k_per_row_prefill(const torch::Tensor& logits, + const torch::Tensor& rowStarts, + const torch::Tensor& rowEnds, torch::Tensor& indices, + int64_t numRows, int64_t stride0, int64_t stride1, + int64_t topK) { + constexpr int kSortingAlgorithmThreshold = 12288; constexpr int kNumThreadsPerBlock = 512; const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - vllm::topKPerRowDecode - <<>>( - logits.data_ptr(), seqLens.data_ptr(), - indices.data_ptr(), static_cast(stride0), - static_cast(stride1), static_cast(next_n)); -} + int numInsertionBlocks = + std::min(static_cast(numRows), kSortingAlgorithmThreshold); + vllm::topKPerRowPrefill + <<>>(logits.data_ptr(), rowStarts.data_ptr(), + rowEnds.data_ptr(), indices.data_ptr(), + static_cast(stride0), static_cast(stride1), + static_cast(topK), 0); -void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts, - const torch::Tensor& rowEnds, torch::Tensor& indices, - int64_t numRows, int64_t stride0, int64_t stride1) { - // Compute the results on the device. - constexpr int kNumThreadsPerBlock = 512; - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - vllm::topKPerRow - <<>>( - logits.data_ptr(), rowStarts.data_ptr(), - rowEnds.data_ptr(), indices.data_ptr(), - static_cast(stride0), static_cast(stride1)); + if (numRows > kSortingAlgorithmThreshold) { + int numRadixBlocks = numRows - kSortingAlgorithmThreshold; + vllm::topKPerRowPrefill + <<>>(logits.data_ptr(), rowStarts.data_ptr(), + rowEnds.data_ptr(), indices.data_ptr(), + static_cast(stride0), static_cast(stride1), + static_cast(topK), kSortingAlgorithmThreshold); + } } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index db37a9b9b88e3..62212f98b4766 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -179,15 +179,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Optimized top-k per row operation ops.def( - "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, " + "top_k_per_row_prefill(Tensor logits, Tensor rowStarts, Tensor rowEnds, " "Tensor! indices, int numRows, int stride0, " - "int stride1) -> ()"); - ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row); + "int stride1, int topK) -> ()"); + ops.impl("top_k_per_row_prefill", torch::kCUDA, &top_k_per_row_prefill); ops.def( "top_k_per_row_decode(Tensor logits, int next_n, " - "Tensor seq_lens, Tensor! indices, int numRows, " - "int stride0, int stride1) -> ()"); + "Tensor seq_lens, Tensor! indices, " + "int numRows, int stride0, int stride1, int topK) -> ()"); ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode); // Layernorm-quant diff --git a/tests/kernels/test_top_k_per_row.py b/tests/kernels/test_top_k_per_row.py index cadda27b49e9c..3bf69389753e3 100644 --- a/tests/kernels/test_top_k_per_row.py +++ b/tests/kernels/test_top_k_per_row.py @@ -9,23 +9,45 @@ from vllm.platforms import current_platform # Test parameters NUM_ROWS = [1, 32, 2050] -TOP_K_VALUES = [2048] -BATCH_SIZE = [1, 2, 4, 2048, 4096] -NEXT_N = [1, 2, 4, 8] +TOP_K_VALUES = [2048, 3000] +BATCH_SIZE = [1, 2, 2048] +NEXT_N = [1, 8] +DATA_GENERATION = ["random", "10LSBits"] def create_random_logits( row_starts: torch.Tensor, row_ends: torch.Tensor, - vocab_size: int, dtype: torch.dtype, seed: int, + data_generation: str, ) -> torch.Tensor: """Create random logits tensor for testing.""" torch.manual_seed(seed) np.random.seed(seed) # Generate logits with some structure to make testing more meaningful - logits = torch.randn(row_starts.shape[0], max(row_ends), dtype=dtype, device="cuda") + if data_generation == "random": + logits = torch.randn( + row_starts.shape[0], max(row_ends), dtype=dtype, device="cuda" + ) + elif data_generation == "10LSBits": + top_22_bits_mask = 0xFFFFFC00 + last_10_bits_mask = 0x000003FF + fixed_top_22_bits = 0x3F900000 + # Generate random bits for the last 10 bits + random_bottom_bits = torch.randint( + 0, + 2**10, + (row_starts.shape[0], max(row_ends)), + dtype=torch.int32, + device="cuda", + ) + # Combine: fixed top 22 bits with random last 10 bits + logits_bits = (fixed_top_22_bits & top_22_bits_mask) | ( + random_bottom_bits & last_10_bits_mask + ) + logits = logits_bits.view(dtype) + for i, end in enumerate(row_ends): logits[i, end:] = float("-inf") return logits @@ -113,13 +135,13 @@ def test_top_k_per_row( # Create test data vocab_size = 20000 row_starts, row_ends = create_row_boundaries(num_rows, vocab_size) - logits = create_random_logits(row_starts, row_ends, vocab_size, torch.float32, 42) + logits = create_random_logits(row_starts, row_ends, torch.float32, 42, "random") # Create output tensors indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda") # Run CUDA implementation - torch.ops._C.top_k_per_row( + torch.ops._C.top_k_per_row_prefill( logits, row_starts, row_ends, @@ -127,6 +149,7 @@ def test_top_k_per_row( num_rows, logits.stride(0), logits.stride(1), + top_k, ) # Run reference implementation @@ -139,27 +162,23 @@ def test_top_k_per_row( # Compare results assert compare_top_k_results( logits, indices, torch_indices, row_starts, row_ends, top_k - ), "CUDA top_k_per_row results don't match torch.topk" + ), "CUDA top_k_per_row_prefill results don't match torch.topk" -@pytest.mark.parametrize("top_k", TOP_K_VALUES) -@pytest.mark.parametrize("batch_size", BATCH_SIZE) -@pytest.mark.parametrize("next_n", NEXT_N) -@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA") -@torch.inference_mode() -def test_top_k_per_row_decode( +def _run_top_k_per_row_decode_test( top_k: int, batch_size: int, next_n: int, + vocab_size: int, + data_generation: str, ) -> None: """ - Test top_k_per_row with seq_lens tensor. + Helper function to run top_k_per_row_decode test with given parameters. """ torch.set_default_device("cuda:0") # Create test data num_rows = batch_size * next_n - vocab_size = 20000 seq_lens = torch.randint( vocab_size, (batch_size,), dtype=torch.int32, device="cuda" ) @@ -167,7 +186,9 @@ def test_top_k_per_row_decode( row_indices = torch.arange(num_rows, device="cuda") // next_n next_n_offset = torch.arange(num_rows, device="cuda") % next_n row_ends = seq_lens[row_indices] - next_n + next_n_offset + 1 - logits = create_random_logits(row_starts, row_ends, vocab_size, torch.float32, 42) + logits = create_random_logits( + row_starts, row_ends, torch.float32, 42, data_generation + ) # Create output tensors indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda") @@ -181,6 +202,7 @@ def test_top_k_per_row_decode( num_rows, logits.stride(0), logits.stride(1), + top_k, ) torch.cuda.synchronize() @@ -195,4 +217,41 @@ def test_top_k_per_row_decode( # Compare results assert compare_top_k_results( logits, indices, torch_indices, row_starts, row_ends, top_k - ), "CUDA top_k_per_row results don't match torch.topk" + ), "CUDA top_k_per_row_decode results don't match torch.topk" + + +@pytest.mark.parametrize("top_k", TOP_K_VALUES) +@pytest.mark.parametrize("batch_size", BATCH_SIZE) +@pytest.mark.parametrize("next_n", NEXT_N) +@pytest.mark.parametrize("data_generation", DATA_GENERATION) +@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA") +@torch.inference_mode() +def test_top_k_per_row_decode( + top_k: int, + batch_size: int, + next_n: int, + data_generation: str, +) -> None: + """ + Test top_k_per_row with seq_lens tensor. + """ + vocab_size = 20000 + _run_top_k_per_row_decode_test( + top_k, batch_size, next_n, vocab_size, data_generation + ) + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA") +@torch.inference_mode() +def test_top_k_per_row_decode_large_vocab_size() -> None: + """ + Test top_k_per_row_decode with large vocabulary size. + """ + top_k = 2048 + batch_size = 2 + next_n = 2 + vocab_size = 300000 + data_generation = "random" + _run_top_k_per_row_decode_test( + top_k, batch_size, next_n, vocab_size, data_generation + ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index a8eb4a69b6f2b..0b6513789aea8 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -684,11 +684,10 @@ def sparse_attn_indexer( chunk.cu_seqlen_ke, ) num_rows = logits.shape[0] - assert topk_tokens == 2048, "top_k_per_row assumes size 2048" topk_indices = topk_indices_buffer[ chunk.token_start : chunk.token_end, :topk_tokens ] - torch.ops._C.top_k_per_row( + torch.ops._C.top_k_per_row_prefill( logits, chunk.cu_seqlen_ks, chunk.cu_seqlen_ke, @@ -696,6 +695,7 @@ def sparse_attn_indexer( num_rows, logits.stride(0), logits.stride(1), + topk_tokens, ) if has_decode: @@ -738,7 +738,6 @@ def sparse_attn_indexer( max_model_len=max_model_len, ) num_rows = logits.shape[0] - assert topk_tokens == 2048, "top_k_per_row assumes size 2048" topk_indices = topk_indices_buffer[:num_decode_tokens, :topk_tokens] torch.ops._C.top_k_per_row_decode( @@ -749,6 +748,7 @@ def sparse_attn_indexer( num_rows, logits.stride(0), logits.stride(1), + topk_tokens, ) if decode_metadata.requires_padding: # if padded, we need to unpack From 87aee9ed2b65b9ef7cdb83cc69dc1fc15f39ff36 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Mon, 8 Dec 2025 07:46:15 -0800 Subject: [PATCH 071/133] Add evaluate_guards option to DynamicShapesConfig (#27432) Signed-off-by: Laith Sakka --- .../test_dynamic_shapes_compilation.py | 139 +++++++++++++++++- vllm/compilation/backends.py | 26 +++- vllm/compilation/decorators.py | 4 +- vllm/compilation/wrapper.py | 65 ++++++-- vllm/config/compilation.py | 17 ++- vllm/config/vllm.py | 2 +- 6 files changed, 222 insertions(+), 31 deletions(-) diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py index 1966b03cd9c89..bc3dbf5533312 100644 --- a/tests/compile/test_dynamic_shapes_compilation.py +++ b/tests/compile/test_dynamic_shapes_compilation.py @@ -2,12 +2,21 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc +import tempfile +from contextlib import contextmanager import pytest import torch from vllm import LLM, SamplingParams -from vllm.config.compilation import CompilationMode, DynamicShapesType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config +from vllm.config.compilation import ( + CompilationMode, + DynamicShapesConfig, + DynamicShapesType, +) +from vllm.forward_context import set_forward_context from vllm.tokenizers import get_tokenizer from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -29,18 +38,19 @@ def get_test_models(): ) @pytest.mark.parametrize("use_aot_compile", ["0"]) @pytest.mark.parametrize("use_bytecode_hook", [True, False]) +@pytest.mark.parametrize("evaluate_guards", [False, True]) @pytest.mark.skipif( not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" ) def test_dynamic_shapes_compilation( - monkeypatch, model_name, shapes_type, use_aot_compile, use_bytecode_hook + monkeypatch, + model_name, + shapes_type, + use_aot_compile, + use_bytecode_hook, + evaluate_guards, ): """Test that all dynamic shapes types compile successfully""" - print( - f"\nTesting model: {model_name} with {shapes_type.name}, " - f"AOT compile: {use_aot_compile}, " - f"Bytecode hook: {use_bytecode_hook}" - ) if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED: pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0") @@ -58,6 +68,7 @@ def test_dynamic_shapes_compilation( "mode": CompilationMode.VLLM_COMPILE, "dynamic_shapes_config": { "type": shapes_type.value, + "evaluate_guards": evaluate_guards, }, }, ) @@ -86,3 +97,117 @@ def test_dynamic_shapes_compilation( torch.cuda.empty_cache() torch.cuda.synchronize() print("GPU memory cleared") + + +@pytest.mark.parametrize("use_aot_compile", ["0", "1"]) +@pytest.mark.parametrize( + "dynamic_shapes_type", + [ + DynamicShapesType.BACKED, + DynamicShapesType.BACKED_SIZE_OBLIVIOUS, + ], +) +@pytest.mark.parametrize("evaluate_guards", [False, True]) +def test_model_specialization_with_evaluate_guards( + monkeypatch, use_aot_compile, dynamic_shapes_type, evaluate_guards +): + """Test that evaluate_guards correctly detects shape specialization + violations. + """ + + if ( + use_aot_compile == "1" + and dynamic_shapes_type == DynamicShapesType.BACKED + and evaluate_guards + ): + pytest.skip("evaluate_guards for backed does not work with aot_compile =1") + + @support_torch_compile + class ModelWithSizeCheck(torch.nn.Module): + def __init__(self, **kwargs): + super().__init__() + + def forward(self, x: torch.Tensor): + # This will cause specialization - torch.compile will guard on + # sx.shape[0] + if x.shape[0] >= 10: + return x * 10 + else: + return x * 10 + + @support_torch_compile + class ModelWithOneSizeCheck(torch.nn.Module): + def __init__(self, **kwargs): + super().__init__() + + def forward(self, x: torch.Tensor): + # This will cause 0/1 specializations. + if x.shape[0] == 0: + return x * 10 + if x.shape[0] == 1: + return x * 10 + else: + return x * 10 + + @contextmanager + def use_vllm_config(vllm_config: VllmConfig): + with set_forward_context({}, vllm_config), set_current_vllm_config(vllm_config): + yield + + monkeypatch.setenv("TOKENIZERS_PARALLELISM", "true") + monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile) + monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "0") + + # Create vllm config with the desired settings + from vllm.config import CompilationMode + + vllm_config = VllmConfig( + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + dynamic_shapes_config=DynamicShapesConfig( + type=dynamic_shapes_type, + evaluate_guards=evaluate_guards, + ), + ) + ) + + def test(model_class, input1, input2, is_01_specialization=False): + with ( + torch.no_grad(), + use_vllm_config(vllm_config), + tempfile.TemporaryDirectory() as tmpdirname, + ): + monkeypatch.setenv("VLLM_CACHE_ROOT", tmpdirname) + + model = model_class(vllm_config=vllm_config).cuda() + + model(input1) + + if evaluate_guards and ( + not ( + is_01_specialization + and dynamic_shapes_type == DynamicShapesType.BACKED + ) + ): + # This should fail because guards were added. + with pytest.raises(RuntimeError) as excinfo: + model(input2) + + # Expected failure - guard was violated + error_msg = str(excinfo.value) + assert ( + "GuardManager check failed" in error_msg + or "Detected recompile when torch.compile stance" in error_msg + ), error_msg + + else: + model(input2) + + test(ModelWithSizeCheck, torch.randn(20, 10).cuda(), torch.randn(5, 10).cuda()) + test(ModelWithSizeCheck, torch.randn(5, 10).cuda(), torch.randn(20, 10).cuda()) + test( + ModelWithOneSizeCheck, + torch.randn(20, 10).cuda(), + torch.randn(1, 10).cuda(), + is_01_specialization=True, + ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 26f4f16a845a2..dd2233522263d 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -26,6 +26,7 @@ from vllm.compilation.partition_rules import ( should_split, ) from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig +from vllm.config.compilation import DynamicShapesType from vllm.config.utils import Range, hash_factors from vllm.logger import init_logger from vllm.logging_utils import lazy @@ -722,6 +723,29 @@ class VllmBackend: self.split_gm, submod_names_to_compile, self.vllm_config, self ).run(*fake_args) + from torch._guards import detect_fake_mode + + fake_mode = detect_fake_mode() + + if ( + self.compilation_config.dynamic_shapes_config.evaluate_guards + and self.compilation_config.dynamic_shapes_config.type + == DynamicShapesType.BACKED + ): + from torch.utils._sympy.value_ranges import ValueRanges + + # Drop counter-0/1 specializations guards; for backed dynamic shapes, + # torch.compile will specialize for 0/1 inputs or otherwise guards that + # shape is >= 2. This is because it's really hard not to hit a check + # against 0/1. When we evaluate shape guards, we exclude checking those + # guards (We would fail always otherwise). + + # We avoid that by updating the ranges of backed sizes when the min is + # 2 for any, we assume it's 0. + for s, r in fake_mode.shape_env.var_to_range.items(): + if r.lower == 2: + fake_mode.shape_env.var_to_range[s] = ValueRanges(0, r.upper) + graph_path = os.path.join(local_cache_dir, "computation_graph.py") if not os.path.exists(graph_path): # code adapted from @@ -749,8 +773,6 @@ class VllmBackend: graph, example_inputs, self.prefix, self.split_gm ) - # if we need to copy input buffers for cudagraph - # # index of tensors that have symbolic shapes (batch size) # for weights and static buffers, they will have concrete shapes. # symbolic shape only happens for input tensors. diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 6bb66ce3eec0c..31f5e78408460 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -392,7 +392,6 @@ def _support_torch_compile( factors.append(_model_hash_key(self.forward)) hash_key = hashlib.sha256(str(factors).encode()).hexdigest() - cache_dir = os.path.join( envs.VLLM_CACHE_ROOT, "torch_aot_compile", @@ -413,7 +412,8 @@ def _support_torch_compile( f, f_globals=self.forward.__globals__ ) _verify_source_unchanged(loaded_fn.source_info(), self.vllm_config) - loaded_fn.disable_guard_check() + if not self.compilation_config.dynamic_shapes_config.evaluate_guards: + loaded_fn.disable_guard_check() self.aot_compiled_fn = loaded_fn except Exception as e: if os.path.exists(aot_compilation_path): diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 69e1ed37a5beb..b59a4a9dd1527 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -4,7 +4,7 @@ import os import sys from abc import abstractmethod -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from types import CodeType from typing import Any @@ -13,6 +13,7 @@ import torch._C._dynamo.guards import vllm.envs as envs from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config +from vllm.config.compilation import DynamicShapesType from vllm.logger import init_logger from vllm.utils.nvtx_pytorch_hooks import layerwise_nvtx_marker_context @@ -125,23 +126,49 @@ class TorchCompileWithNoGuardsWrapper: if isinstance(backend, str) and backend == "inductor": options = vllm_config.compilation_config.inductor_compile_config - if mode != CompilationMode.STOCK_TORCH_COMPILE: - # Drop all the guards. - options["guard_filter_fn"] = lambda x: [False for _ in x] - - # Validate that unbacked dynamic shapes require VLLM_USE_BYTECODE_HOOK=False - from vllm.compilation.decorators import DynamicShapesType + self.first_compile = True + self.evaluate_guards = ( + vllm_config.compilation_config.dynamic_shapes_config.evaluate_guards + ) ds_type = vllm_config.compilation_config.dynamic_shapes_config.type - compiled_ptr: Any = self.forward - if ds_type == DynamicShapesType.UNBACKED: - if envs.VLLM_USE_BYTECODE_HOOK: - # reason is that bytecode does this hack torch._dynamo.eval_frame. - # remove_from_cache(self.original_code_object()) to force a new - # re-compilation. - raise ValueError( - "UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0. " + + if mode != CompilationMode.STOCK_TORCH_COMPILE: + # Drop all the guards. + if self.evaluate_guards: + assert not envs.VLLM_USE_BYTECODE_HOOK, ( + "compilation_config.dynamic_shapes_config.evaluate_guards " + "requires VLLM_USE_BYTECODE_HOOK=0. " ) + + if envs.VLLM_USE_AOT_COMPILE: + # disabled until https://github.com/pytorch/pytorch/pull/169239 + # is picked up. + assert ds_type != DynamicShapesType.BACKED, ( + "evaluate_guards for backed shapes requires " + "VLLM_USE_AOT_COMPILE=False. " + ) + + options["guard_filter_fn"] = lambda x: [ + entry.guard_type == "SHAPE_ENV" for entry in x + ] + else: + options["guard_filter_fn"] = lambda x: [False for _ in x] + + compiled_ptr: Any = self.forward + # Validate that unbacked dynamic shapes require VLLM_USE_BYTECODE_HOOK=False + + if ds_type == DynamicShapesType.UNBACKED: + # reason is that bytecode does torch._dynamo.eval_frame. + # remove_from_cache(self.original_code_object()) to force a new + # re-compilation. And if we use + # compiled_ptr = self.check_invariants_and_forward + # it will reset all entries. + assert not envs.VLLM_USE_BYTECODE_HOOK, ( + "UNBACKED dynamic shapes requires VLLM_USE_BYTECODE_HOOK=0. " + ) + assert not self.evaluate_guards, "UNBACKED dynamic shapes do not add guards" + compiled_ptr = self.check_invariants_and_forward if envs.VLLM_USE_AOT_COMPILE: @@ -195,7 +222,13 @@ class TorchCompileWithNoGuardsWrapper: self.forward, *args, **kwargs ) else: - with _compilation_context(): + ctx = ( + nullcontext() + if self.first_compile or not self.evaluate_guards + else torch.compiler.set_stance("fail_on_recompile") + ) + self.first_compile = False + with _compilation_context(), ctx: return self._call_with_optional_nvtx_range( self._compiled_callable, *args, **kwargs ) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index b79200f0e4779..51e4912aad9db 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -344,7 +344,18 @@ class DynamicShapesConfig: backed/unbacked. """ - # TODO add a debug mode to fail + evaluate_guards: bool = False + """ + A debug mode to detect and fail if Dynamo ever specializes a dynamic shape by + guarding on it. When True, dynamic shape guards are not dropped from dynamo. + And a failure will be triggered if a recompilation ever happens due to that. + This mode requires VLLM_USE_BYTECODE_HOOK to be 0. + Enabling this allow observing the dynamic shapes guards in the tlparse + artifacts also. + When type is backed, aot_compile must be disabled for this mode to work. + until this change picked up https://github.com/pytorch/pytorch/pull/169239. + + """ def compute_hash(self) -> str: """ @@ -455,8 +466,8 @@ class CompilationConfig: We use string to avoid serialization issues when using compilation in a distributed setting. When the compilation mode is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the - compilation mode is 3, the backend supports both whole graph and piecewise - compilation, available backends include eager, inductor, and custom backends, + compilation mode is 3, the backend supports both whole graph and piecewise + compilation, available backends include eager, inductor, and custom backends, the latter of which can be defined via `get_compile_backend`. Furthermore, compilation is only piecewise if splitting ops is set accordingly and use_inductor_graph_partition is off. Note that the default options for diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 36e4bd159dc72..a74413536407b 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -66,7 +66,7 @@ class OptimizationLevel(IntEnum): """O0 : No optimization. no compilation, no cudagraphs, no other optimization, just starting up immediately""" O1 = 1 - """O1: Quick optimizations. Dynamo+Inductor compilation and Piecewise + """O1: Quick optimizations. Dynamo+Inductor compilation and Piecewise cudagraphs""" O2 = 2 """O2: Full optimizations. -O1 as well as Full and Piecewise cudagraphs.""" From 67312cad11835bd75ca55fda83708d4806b82436 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 9 Dec 2025 00:59:31 +0800 Subject: [PATCH 072/133] [Misc] Split the LoRA code (#30253) Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 2 +- tests/lora/test_lora_checkpoints.py | 2 +- tests/lora/test_lora_huggingface.py | 2 +- tests/lora/test_lora_manager.py | 4 +- tests/lora/test_worker.py | 2 +- vllm/lora/lora_model.py | 246 ++++++++++++++++++++++ vllm/lora/{models.py => model_manager.py} | 237 +-------------------- vllm/lora/utils.py | 9 + vllm/lora/worker_manager.py | 4 +- 9 files changed, 265 insertions(+), 243 deletions(-) create mode 100644 vllm/lora/lora_model.py rename vllm/lora/{models.py => model_manager.py} (74%) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 9df3a07a9e5e9..47d1fcfe9a0c7 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -28,7 +28,7 @@ from vllm.lora.layers import ( RowParallelLinearWithShardedLoRA, VocabParallelEmbeddingWithLoRA, ) -from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index e9653a2fedfaf..e6816e83da001 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -3,7 +3,7 @@ import pytest -from vllm.lora.models import LoRAModel +from vllm.lora.lora_model import LoRAModel from vllm.lora.peft_helper import PEFTHelper from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.utils import WeightsMapper diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 3348d2f8ce654..7c7f4eb4b626b 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -3,7 +3,7 @@ import pytest -from vllm.lora.models import LoRAModel +from vllm.lora.lora_model import LoRAModel from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 081f14d6fabfb..50f17ced5dd74 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -15,10 +15,10 @@ from vllm.lora.layers import ( MergedColumnParallelLinearWithLoRA, RowParallelLinearWithLoRA, ) +from vllm.lora.lora_model import LoRAModel from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.models import ( +from vllm.lora.model_manager import ( LoRAMapping, - LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, ) diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 54059ec561907..445aaf9cb7d1e 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -16,7 +16,7 @@ from vllm.config import ( ) from vllm.config.load import LoadConfig from vllm.config.lora import LoRAConfig -from vllm.lora.models import LoRAMapping +from vllm.lora.model_manager import LoRAMapping from vllm.lora.request import LoRARequest from vllm.v1.worker.gpu_worker import Worker diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py new file mode 100644 index 0000000000000..db170f13ae1c7 --- /dev/null +++ b/vllm/lora/lora_model.py @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os + +import safetensors.torch +import torch + +from vllm.logger import init_logger +from vllm.lora.lora_weights import LoRALayerWeights +from vllm.lora.peft_helper import PEFTHelper +from vllm.lora.utils import ( + get_lora_id, + is_base_embeddding_weights, + is_regex_target_modules, + parse_fine_tuned_lora_name, +) +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +from vllm.model_executor.models.utils import WeightsMapper +from vllm.utils.platform_utils import is_pin_memory_available + +logger = init_logger(__name__) + + +class LoRAModel: + """A LoRA fine-tuned model.""" + + def __init__( + self, + lora_model_id: int, + rank: int, + loras: dict[str, LoRALayerWeights], + ) -> None: + """ + Args: + lora_model_id: The integer id for the lora model. + rank: lora rank. + loras: module name -> weights for lora-replaced layers. + + """ + self.id = lora_model_id + + assert lora_model_id > 0, ( + f"a valid lora id should be greater than 0, got {self.id}" + ) + self.rank = rank + self.loras: dict[str, LoRALayerWeights] = loras + + def clone(self, lora_model_id: int) -> "LoRAModel": + """Return a copy of the object with different ids. + + Will share the underlying tensors.""" + return self.__class__( + lora_model_id, + rank=self.rank, + loras=self.loras.copy(), + ) + + def get_lora(self, module_name: str) -> LoRALayerWeights | None: + """Get LoRA for a given module by name""" + return self.loras.get(module_name, None) + + def check_lora_name(self, lora_name: str) -> bool: + return lora_name in self.loras + + @classmethod + def from_lora_tensors( + cls, + lora_model_id: int, + tensors: dict[str, torch.Tensor], + peft_helper: PEFTHelper, + device: str = "cuda", + dtype: torch.dtype | None = None, + model_vocab_size: int | None = None, + weights_mapper: WeightsMapper | None = None, + ) -> "LoRAModel": + """Create a LoRAModel from a dictionary of tensors.""" + pin_memory = str(device) == "cpu" and is_pin_memory_available() + loras: dict[str, LoRALayerWeights] = {} + for tensor_name, tensor in tensors.items(): + if is_base_embeddding_weights(tensor_name): + continue + module_name, is_lora_a = parse_fine_tuned_lora_name( + tensor_name, weights_mapper + ) + if module_name not in loras: + loras[module_name] = LoRALayerWeights.from_config( + module_name, peft_helper + ) + + if is_lora_a: + if ( + "lora_embedding_A" in tensor_name + and model_vocab_size is not None + and model_vocab_size != tensor.shape[1] + ): + raise RuntimeError( + f"The embedding LoRA size({tensor.shape[1]}) must be consistent" + f" with the base model's vocabulary size({model_vocab_size})." + ) + loras[module_name].lora_a = tensor.to(device=device, dtype=dtype) + if pin_memory: + loras[module_name].lora_a = loras[module_name].lora_a.pin_memory() + else: + loras[module_name].lora_b = tensor.to(device=device, dtype=dtype) + + if pin_memory: + loras[module_name].lora_b = loras[module_name].lora_b.pin_memory() + + return cls(lora_model_id, peft_helper.r, loras) + + @classmethod + def from_local_checkpoint( + cls, + lora_dir: str, + expected_lora_modules: set[str], + peft_helper: PEFTHelper, + *, + lora_model_id: int | None = None, + device: str = "cuda", + dtype: torch.dtype | None = None, + model_vocab_size: int | None = None, + weights_mapper: WeightsMapper | None = None, + tensorizer_config_dict: dict | None = None, + ) -> "LoRAModel": + """Create a LoRAModel from a local checkpoint. + + Args: + lora_dir: The local path that has lora data. + expected_lora_modules: Name of modules that are expected to be + replaced by lora. + peft_helper: Loaded lora configuration information. + lora_model_id: LoRA model id. If not given, automatically set by + a global counter. + device: Device where the lora model is loaded. + dtype: dtype of the lora model weights. + + Returns: + Loaded LoRA Model. + """ + lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") + lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") + lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") + + tensors: dict[str, torch.Tensor] = {} + unexpected_modules: list[list[str] | str] = [] + + def check_unexpected_modules(modules: dict): + for lora_module in modules.keys(): # noqa + if is_base_embeddding_weights(lora_module): + continue + # Handle PEFT file format where experts.base_layer is the + # gate_up_proj and experts is the down_proj + if "base_layer" in lora_module: + continue + module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) + # Case for expert lora weights + if ".experts" in module_name: + expert_idx = module_name.find(".experts") + expert_suffix = module_name[expert_idx + 1 :] + if expert_suffix not in expected_lora_modules: + unexpected_modules.append(module_name) + + elif module_name.rsplit(".", 1)[-1] not in expected_lora_modules: + unexpected_modules.append(module_name) + + if unexpected_modules: + raise ValueError( + f"While loading {lora_dir}, expected" + f" target modules in {expected_lora_modules}" + f" but received {unexpected_modules}." + f" Please verify that the loaded LoRA module is correct" + ) + + if tensorizer_config_dict: + from tensorizer import TensorDeserializer + + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + lora_tensor_path = os.path.join( + tensorizer_config.tensorizer_dir, "adapter_model.tensors" + ) + tensorizer_args = tensorizer_config._construct_tensorizer_args() + tensors = TensorDeserializer( + lora_tensor_path, + dtype=tensorizer_config.dtype, + **tensorizer_args.deserialization_kwargs, + ) + check_unexpected_modules(tensors) + + elif os.path.isfile(lora_tensor_path): + # Find unexpected modules. + # Use safetensor key as a source of truth to find expected modules. + # in peft if you have target_modules A, B, C and C does not exist + # in the model it won’t error and model will be trained with A, B + # loraified. C won’t exist in the safetensor but it will exist in + # the target_modules of the adapter_config.json. + unexpected_modules = [] + with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore + # Load tensors if there are only expected modules. + check_unexpected_modules(f) + for module in f.keys(): # noqa + tensors[module] = f.get_tensor(module) + elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path): + # When a bin/pt file is provided, we rely on config to find + # unexpected modules. + unexpected_modules = [] + target_modules = peft_helper.target_modules + if not isinstance(target_modules, list): + target_modules = [target_modules] + for module in target_modules: + # Compatible with more modules, + # such as:layers.11.self_attn.k_proj + part_name = module.split(".")[-1] + if part_name not in expected_lora_modules: + unexpected_modules.append(module) + # loaded lora's target modules must be a subset of + # expected_lora_modules. It is not reliable. See + # https://github.com/vllm-project/vllm/pull/5909. But there's no + # other better mechanism. + if unexpected_modules and not is_regex_target_modules( + peft_helper.target_modules, expected_lora_modules + ): + raise ValueError( + f"While loading {lora_dir}, expected" + f" target modules in {expected_lora_modules}" + f" but received {unexpected_modules}." + f" Please verify that the loaded LoRA module is correct" + ) + lora_file_path = ( + lora_bin_file_path + if os.path.isfile(lora_bin_file_path) + else lora_pt_file_path + ) + tensors = torch.load(lora_file_path, map_location=device, weights_only=True) + else: + raise ValueError(f"{lora_dir} doesn't contain tensors") + + return cls.from_lora_tensors( + lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id, + tensors=tensors, + peft_helper=peft_helper, + device=device, + dtype=dtype, + model_vocab_size=model_vocab_size, + weights_mapper=weights_mapper, + ) diff --git a/vllm/lora/models.py b/vllm/lora/model_manager.py similarity index 74% rename from vllm/lora/models.py rename to vllm/lora/model_manager.py index 567ffce4e75fc..44e0448d92de0 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/model_manager.py @@ -2,38 +2,32 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math -import os from collections.abc import Callable from typing import TypeVar import regex as re -import safetensors.torch import torch from torch import nn from vllm.config.lora import LoRAConfig from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA, FusedMoE3DWithLoRA, LoRAMapping +from vllm.lora.lora_model import LoRAModel from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.utils import ( from_layer, from_layer_logits_processor, get_supported_lora_modules, - is_base_embeddding_weights, is_moe_model, - is_regex_target_modules, - parse_fine_tuned_lora_name, process_packed_modules_mapping, replace_submodule, ) from vllm.model_executor.layers.fused_moe import FusedMoE -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper +from vllm.model_executor.models.utils import PPMissingLayer from vllm.utils.cache import LRUCache from vllm.utils.platform_utils import is_pin_memory_available @@ -53,233 +47,6 @@ class AdapterLRUCache(LRUCache[int, T]): return super()._on_remove(key, value) -_GLOBAL_LORA_ID = 0 - - -def get_lora_id(): - global _GLOBAL_LORA_ID - _GLOBAL_LORA_ID += 1 - return _GLOBAL_LORA_ID - - -class LoRAModel: - """A LoRA fine-tuned model.""" - - def __init__( - self, - lora_model_id: int, - rank: int, - loras: dict[str, LoRALayerWeights], - ) -> None: - """ - Args: - lora_model_id: The integer id for the lora model. - rank: lora rank. - loras: module name -> weights for lora-replaced layers. - - """ - self.id = lora_model_id - - assert lora_model_id > 0, ( - f"a valid lora id should be greater than 0, got {self.id}" - ) - self.rank = rank - self.loras: dict[str, LoRALayerWeights] = loras - - def clone(self, lora_model_id: int) -> "LoRAModel": - """Return a copy of the object with different ids. - - Will share the underlying tensors.""" - return self.__class__( - lora_model_id, - rank=self.rank, - loras=self.loras.copy(), - ) - - def get_lora(self, module_name: str) -> LoRALayerWeights | None: - """Get LoRA for a given module by name""" - return self.loras.get(module_name, None) - - def check_lora_name(self, lora_name: str) -> bool: - return lora_name in self.loras - - @classmethod - def from_lora_tensors( - cls, - lora_model_id: int, - tensors: dict[str, torch.Tensor], - peft_helper: PEFTHelper, - device: str = "cuda", - dtype: torch.dtype | None = None, - model_vocab_size: int | None = None, - weights_mapper: WeightsMapper | None = None, - ) -> "LoRAModel": - """Create a LoRAModel from a dictionary of tensors.""" - - loras: dict[str, LoRALayerWeights] = {} - for tensor_name, tensor in tensors.items(): - if is_base_embeddding_weights(tensor_name): - continue - module_name, is_lora_a = parse_fine_tuned_lora_name( - tensor_name, weights_mapper - ) - if module_name not in loras: - loras[module_name] = LoRALayerWeights.from_config( - module_name, peft_helper - ) - - if is_lora_a: - if ( - "lora_embedding_A" in tensor_name - and model_vocab_size is not None - and model_vocab_size != tensor.shape[1] - ): - raise RuntimeError( - f"The embedding LoRA size({tensor.shape[1]}) must be consistent" - f" with the base model's vocabulary size({model_vocab_size})." - ) - loras[module_name].lora_a = tensor.to(device=device, dtype=dtype) - else: - loras[module_name].lora_b = tensor.to(device=device, dtype=dtype) - return cls(lora_model_id, peft_helper.r, loras) - - @classmethod - def from_local_checkpoint( - cls, - lora_dir: str, - expected_lora_modules: set[str], - peft_helper: PEFTHelper, - *, - lora_model_id: int | None = None, - device: str = "cuda", - dtype: torch.dtype | None = None, - model_vocab_size: int | None = None, - weights_mapper: WeightsMapper | None = None, - tensorizer_config_dict: dict | None = None, - ) -> "LoRAModel": - """Create a LoRAModel from a local checkpoint. - - Args: - lora_dir: The local path that has lora data. - expected_lora_modules: Name of modules that are expected to be - replaced by lora. - peft_helper: Loaded lora configuration information. - lora_model_id: LoRA model id. If not given, automatically set by - a global counter. - device: Device where the lora model is loaded. - dtype: dtype of the lora model weights. - - Returns: - Loaded LoRA Model. - """ - lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") - lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") - lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") - - tensors: dict[str, torch.Tensor] = {} - unexpected_modules: list[list[str] | str] = [] - - def check_unexpected_modules(modules: dict): - for lora_module in modules.keys(): # noqa - if is_base_embeddding_weights(lora_module): - continue - # Handle PEFT file format where experts.base_layer is the - # gate_up_proj and experts is the down_proj - if "base_layer" in lora_module: - continue - module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) - # Case for expert lora weights - if ".experts" in module_name: - expert_idx = module_name.find(".experts") - expert_suffix = module_name[expert_idx + 1 :] - if expert_suffix not in expected_lora_modules: - unexpected_modules.append(module_name) - - elif module_name.rsplit(".", 1)[-1] not in expected_lora_modules: - unexpected_modules.append(module_name) - - if unexpected_modules: - raise ValueError( - f"While loading {lora_dir}, expected" - f" target modules in {expected_lora_modules}" - f" but received {unexpected_modules}." - f" Please verify that the loaded LoRA module is correct" - ) - - if tensorizer_config_dict: - from tensorizer import TensorDeserializer - - tensorizer_config = TensorizerConfig(**tensorizer_config_dict) - lora_tensor_path = os.path.join( - tensorizer_config.tensorizer_dir, "adapter_model.tensors" - ) - tensorizer_args = tensorizer_config._construct_tensorizer_args() - tensors = TensorDeserializer( - lora_tensor_path, - dtype=tensorizer_config.dtype, - **tensorizer_args.deserialization_kwargs, - ) - check_unexpected_modules(tensors) - - elif os.path.isfile(lora_tensor_path): - # Find unexpected modules. - # Use safetensor key as a source of truth to find expected modules. - # in peft if you have target_modules A, B, C and C does not exist - # in the model it won’t error and model will be trained with A, B - # loraified. C won’t exist in the safetensor but it will exist in - # the target_modules of the adapter_config.json. - unexpected_modules = [] - with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore - # Load tensors if there are only expected modules. - check_unexpected_modules(f) - for module in f.keys(): # noqa - tensors[module] = f.get_tensor(module) - elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path): - # When a bin/pt file is provided, we rely on config to find - # unexpected modules. - unexpected_modules = [] - target_modules = peft_helper.target_modules - if not isinstance(target_modules, list): - target_modules = [target_modules] - for module in target_modules: - # Compatible with more modules, - # such as:layers.11.self_attn.k_proj - part_name = module.split(".")[-1] - if part_name not in expected_lora_modules: - unexpected_modules.append(module) - # loaded lora's target modules must be a subset of - # expected_lora_modules. It is not reliable. See - # https://github.com/vllm-project/vllm/pull/5909. But there's no - # other better mechanism. - if unexpected_modules and not is_regex_target_modules( - peft_helper.target_modules, expected_lora_modules - ): - raise ValueError( - f"While loading {lora_dir}, expected" - f" target modules in {expected_lora_modules}" - f" but received {unexpected_modules}." - f" Please verify that the loaded LoRA module is correct" - ) - lora_file_path = ( - lora_bin_file_path - if os.path.isfile(lora_bin_file_path) - else lora_pt_file_path - ) - tensors = torch.load(lora_file_path, map_location=device, weights_only=True) - else: - raise ValueError(f"{lora_dir} doesn't contain tensors") - - return cls.from_lora_tensors( - lora_model_id=get_lora_id() if lora_model_id is None else lora_model_id, - tensors=tensors, - peft_helper=peft_helper, - device=device, - dtype=dtype, - model_vocab_size=model_vocab_size, - weights_mapper=weights_mapper, - ) - - class LoRAModelManager: """A manager that manages multiple LoRA-fine-tuned models.""" diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 47484b2b984df..4d264c06826b8 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -48,6 +48,15 @@ if TYPE_CHECKING: logger = init_logger(__name__) +_GLOBAL_LORA_ID = 0 + + +def get_lora_id(): + global _GLOBAL_LORA_ID + _GLOBAL_LORA_ID += 1 + return _GLOBAL_LORA_ID + + _all_lora_classes: set[type[BaseLayerWithLoRA]] = { VocabParallelEmbeddingWithLoRA, ColumnParallelLinearWithLoRA, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7d77ba7247ef0..28c2a53d84e42 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -8,8 +8,8 @@ import torch from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.lora.models import ( - LoRAModel, +from vllm.lora.lora_model import LoRAModel +from vllm.lora.model_manager import ( LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager, From 398a596ed249c14b76806900894304a62d653603 Mon Sep 17 00:00:00 2001 From: weiguihua2 Date: Tue, 9 Dec 2025 01:33:48 +0800 Subject: [PATCH 073/133] [MP executor] fix get device count for multi node of mp executor feature (#30042) Signed-off-by: weiguihua2 --- vllm/distributed/device_communicators/shm_broadcast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 052df19e34d72..114516ff07a1f 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -27,6 +27,7 @@ from zmq import ( # type: ignore import vllm.envs as envs from vllm.distributed.utils import StatelessProcessGroup, sched_yield from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils.network_utils import ( get_ip, get_open_port, @@ -632,7 +633,7 @@ class MessageQueue: The MessageQueue instance for the calling process, and a list of handles (only non-empty for the reader process). """ - local_size = torch.cuda.device_count() + local_size = current_platform.device_count() rank = dist.get_rank() same_node = rank // local_size == reader_rank // local_size buffer_io = MessageQueue( From fcd5306f65876f72122d7e5852b6000738498d7e Mon Sep 17 00:00:00 2001 From: shaharmor98 <17088876+shaharmor98@users.noreply.github.com> Date: Mon, 8 Dec 2025 19:35:01 +0200 Subject: [PATCH 074/133] Add latent MoE support (#30203) Signed-off-by: Shahar Mor Co-authored-by: Tyler Michael Smith --- vllm/model_executor/models/nemotron_h.py | 62 +++++++++++++++++-- vllm/transformers_utils/configs/nemotron_h.py | 2 + 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index baeb901bbb05a..2d9dfbd3e7688 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -83,6 +83,7 @@ class NemotronHMLP(nn.Module): def __init__( self, config: NemotronHConfig, + hidden_size: int, intermediate_size: int, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -93,7 +94,7 @@ class NemotronHMLP(nn.Module): super().__init__() self.up_proj = ColumnParallelLinear( - input_size=config.hidden_size, + input_size=hidden_size, output_size=intermediate_size, bias=bias, quant_config=quant_config, @@ -102,7 +103,7 @@ class NemotronHMLP(nn.Module): ) self.down_proj = RowParallelLinear( input_size=intermediate_size, - output_size=config.hidden_size, + output_size=hidden_size, bias=bias, quant_config=quant_config, reduce_results=reduce_results, @@ -135,6 +136,10 @@ class NemotronHMoE(nn.Module): self.ep_size = self.ep_group.size() self.n_routed_experts: int = config.n_routed_experts self.n_shared_experts: int = config.n_shared_experts + self.use_latent_moe: bool = getattr(config, "moe_latent_size", None) is not None + self.moe_hidden_size: int = ( + config.moe_latent_size if self.use_latent_moe else config.hidden_size + ) self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe @@ -172,6 +177,7 @@ class NemotronHMoE(nn.Module): self.shared_experts = NemotronHMLP( config=config, + hidden_size=config.hidden_size, intermediate_size=intermediate_size, quant_config=quant_config, reduce_results=False, @@ -180,10 +186,12 @@ class NemotronHMoE(nn.Module): ) self.experts = SharedFusedMoE( - shared_experts=self.shared_experts, + # TODO: make it possible for shared experts to have + # different input in SharedFusedMoE + shared_experts=self.shared_experts if not self.use_latent_moe else None, num_experts=config.n_routed_experts, top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, + hidden_size=self.moe_hidden_size, intermediate_size=config.moe_intermediate_size, reduce_results=False, renormalize=config.norm_topk_prob, @@ -201,6 +209,32 @@ class NemotronHMoE(nn.Module): is_sequence_parallel=self.is_sequence_parallel, ) + if self.use_latent_moe: + # TODO: check if using ReplicatedLinear is better than + # ColumnParallelLinear + all_gather + self.fc1_latent_proj = ColumnParallelLinear( + input_size=config.hidden_size, + output_size=self.moe_hidden_size, + bias=config.mlp_bias, + quant_config=quant_config, + disable_tp=self.is_sequence_parallel, + # We need to gather the output to prepare input for moe + gather_output=True, + prefix=f"{prefix}.fc1_latent_proj", + ) + self.fc2_latent_proj = ReplicatedLinear( + input_size=self.moe_hidden_size, + output_size=config.hidden_size, + bias=config.mlp_bias, + quant_config=quant_config, + disable_tp=self.is_sequence_parallel, + prefix=f"{prefix}.fc2_latent_proj", + ) + + else: + self.fc1_latent_proj = None + self.fc2_latent_proj = None + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) @@ -210,12 +244,20 @@ class NemotronHMoE(nn.Module): # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32)) + shared_output = None + if self.use_latent_moe: + if self.shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + hidden_states, _ = self.fc1_latent_proj(hidden_states) fused_moe_out = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - shared_output, final_hidden_states = fused_moe_out + if self.use_latent_moe: + _, final_hidden_states = fused_moe_out + else: + shared_output, final_hidden_states = fused_moe_out # Fix FP16 overflow # See DeepseekV2DecoderLayer for more details. @@ -225,6 +267,13 @@ class NemotronHMoE(nn.Module): assert shared_output is not None shared_output *= 1.0 / self.routed_scaling_factor + # TODO: currently latent up_proj is done before all-reduce for simplicity. + # if and when shared experts will be part of SharedFusedMoE, + # we should do the up_proj after all-reduce, + # to have the all-reduce in the smaller latent dimension. + if self.use_latent_moe: + final_hidden_states, _ = self.fc2_latent_proj(final_hidden_states) + if self.shared_experts is not None: assert shared_output is not None final_hidden_states += shared_output @@ -268,6 +317,7 @@ class NemotronHMLPDecoderLayer(nn.Module): self.mixer = NemotronHMLP( config, + hidden_size=config.hidden_size, intermediate_size=intermediate_size, quant_config=quant_config, bias=config.mlp_bias, @@ -846,5 +896,5 @@ class NemotronHForCausalLM( return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self) + loader = AutoWeightsLoader(self, skip_prefixes=["mtp"]) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py index 68c40002098c8..86c117fd9d59f 100644 --- a/vllm/transformers_utils/configs/nemotron_h.py +++ b/vllm/transformers_utils/configs/nemotron_h.py @@ -189,6 +189,7 @@ class NemotronHConfig(PretrainedConfig): n_shared_experts=1, moe_intermediate_size=7688, moe_shared_expert_intermediate_size=7688, + moe_latent_size=None, num_experts_per_tok=2, routed_scaling_factor=1.0, n_group=1, @@ -254,6 +255,7 @@ class NemotronHConfig(PretrainedConfig): self.n_shared_experts = n_shared_experts self.moe_intermediate_size = moe_intermediate_size self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size # noqa: E501 + self.moe_latent_size = moe_latent_size self.num_experts_per_tok = num_experts_per_tok self.routed_scaling_factor = routed_scaling_factor self.n_group = n_group From d1b5e7afbf8a4ff45904c191748594167dd34e78 Mon Sep 17 00:00:00 2001 From: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Date: Mon, 8 Dec 2025 12:10:10 -0800 Subject: [PATCH 075/133] [TPU] Bump tpu-inference to 0.12.0 (#30221) Signed-off-by: Johnny Yang --- requirements/tpu.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index e6fff58f7b794..7695b4ba2f4cb 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -11,5 +11,4 @@ ray[default] ray[data] setuptools==78.1.0 nixl==0.3.0 -tpu_info==0.4.0 -tpu-inference==0.11.1 +tpu-inference==0.12.0 From 0d402d2600490bac17bc5d079e89b1136fe37eda Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Mon, 8 Dec 2025 15:15:10 -0500 Subject: [PATCH 076/133] online fp8 quant with streaming weight post-processing (#29196) Signed-off-by: vasiliy --- .../model_executor/layers/quantization/fp8.py | 67 ++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 0e3e13f5945ea..419ddd91b64e0 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -465,6 +465,30 @@ class Fp8LinearMethod(LinearMethodBase): output_size_per_partition, input_size_per_partition, weight_loader ) else: + + def patched_weight_loader(param, loaded_weight, *args, **kwargs): + # load the current weight chunk + res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc] + + # track how many elements we have updated + if not hasattr(layer, "_loaded_numel"): + layer._loaded_numel = 0 + layer._loaded_numel += loaded_weight.numel() + + # if we have loaded all of the elements, call + # process_weights_after_loading + target_loaded_numel = layer.weight.numel() + if layer._loaded_numel == target_loaded_numel: + self.process_weights_after_loading(layer) + + # Delete the bookkeeping + del layer._loaded_numel + # Prevent the usual `process_weights_after_loading` call from doing + # anything + layer._already_called_process_weights_after_loading = True + + return res + # For non-serialized checkpoints, use original dtype weight = ModelWeightParameter( data=torch.empty( @@ -474,7 +498,7 @@ class Fp8LinearMethod(LinearMethodBase): ), input_dim=1, output_dim=0, - weight_loader=weight_loader, + weight_loader=patched_weight_loader, ) layer.register_parameter("weight", weight) @@ -515,6 +539,9 @@ class Fp8LinearMethod(LinearMethodBase): layer.register_parameter("input_scale", None) def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return + size_k_first = True input_scale = None # TODO(rob): refactor block quant into separate class. @@ -738,6 +765,41 @@ class Fp8MoEMethod(FusedMoEMethodBase): f"weight quantization block_k = {block_k}." ) + # if we are doing online quantization, patch the weight + # loaded to call `process_weights_after_loading` in a streaming fashion + # as soon as the last weight chunk is loaded + if not self.quant_config.is_checkpoint_fp8_serialized: + weight_loader = extra_weight_attrs["weight_loader"] + # create a new holder to prevent modifying behavior of any other + # objects which might depend on the old one + new_extra_weight_attrs = extra_weight_attrs + + def patched_weight_loader(param, loaded_weight, *args, **kwargs): + # load the current weight chunk + res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc] + + # add a counter to track how many elements we have updated + if not hasattr(layer, "_loaded_numel"): + layer._loaded_numel = 0 + layer._loaded_numel += loaded_weight.numel() + + # if we have loaded all of the elements, call + # process_weights_after_loading + target_loaded_numel = layer.w13_weight.numel() + layer.w2_weight.numel() + if layer._loaded_numel == target_loaded_numel: + self.process_weights_after_loading(layer) + + # Delete the bookkeeping + del layer._loaded_numel + # Prevent the usual `process_weights_after_loading` call + # from doing anything + layer._already_called_process_weights_after_loading = True + + return res + + new_extra_weight_attrs["weight_loader"] = patched_weight_loader + extra_weight_attrs = new_extra_weight_attrs + # WEIGHTS w13_weight = torch.nn.Parameter( torch.empty( @@ -839,6 +901,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): self.rocm_aiter_moe_enabled = False def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return + # Lazy import to avoid importing triton too early. self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() From 799804d140fc99ce3964648ba91aaa810cf28fef Mon Sep 17 00:00:00 2001 From: Dmitry Tokarev Date: Mon, 8 Dec 2025 15:24:34 -0500 Subject: [PATCH 077/133] Bump nvshmem to 3.3.24 and fix CUDA 13 installation (#30149) Signed-off-by: Dmitry Tokarev Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tools/ep_kernels/install_python_libraries.sh | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index 88be5cd778fff..1bb7fd8345238 100755 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -10,9 +10,10 @@ set -ex CUDA_HOME=${CUDA_HOME:-/usr/local/cuda} PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"} DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"} -NVSHMEM_VER=3.3.9 +NVSHMEM_VER=3.3.24 # Suppports both CUDA 12 and 13 WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace} MODE=${MODE:-install} +CUDA_VERSION_MAJOR=$(${CUDA_HOME}/bin/nvcc --version | egrep -o "release [0-9]+" | cut -d ' ' -f 2) # Parse arguments while [[ $# -gt 0 ]]; do @@ -75,11 +76,9 @@ ARCH=$(uname -m) case "${ARCH,,}" in x86_64|amd64) NVSHMEM_SUBDIR="linux-x86_64" - NVSHMEM_FILE="libnvshmem-linux-x86_64-${NVSHMEM_VER}_cuda12-archive.tar.xz" ;; aarch64|arm64) NVSHMEM_SUBDIR="linux-sbsa" - NVSHMEM_FILE="libnvshmem-linux-sbsa-${NVSHMEM_VER}_cuda12-archive.tar.xz" ;; *) echo "Unsupported architecture: ${ARCH}" >&2 @@ -87,6 +86,7 @@ case "${ARCH,,}" in ;; esac +NVSHMEM_FILE="libnvshmem-${NVSHMEM_SUBDIR}-${NVSHMEM_VER}_cuda${CUDA_VERSION_MAJOR}-archive.tar.xz" NVSHMEM_URL="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/${NVSHMEM_SUBDIR}/${NVSHMEM_FILE}" pushd "$WORKSPACE" @@ -142,13 +142,6 @@ clone_repo() { fi } -deepep_cuda13_patch() { - cuda_version_major=$(${CUDA_HOME}/bin/nvcc --version | egrep -o "release [0-9]+" | cut -d ' ' -f 2) - if [ ${cuda_version_major} -ge 13 ]; then - sed -i "s|f'{nvshmem_dir}/include']|f'{nvshmem_dir}/include', '${CUDA_HOME}/include/cccl']|" "setup.py" - fi -} - do_build() { local repo=$1 local name=$2 @@ -160,8 +153,9 @@ do_build() { clone_repo "$repo" "$name" "$key" "$commit" cd "$name" - if [ "$name" == "DeepEP" ]; then - deepep_cuda13_patch + # DeepEP CUDA 13 patch + if [[ "$name" == "DeepEP" && "${CUDA_VERSION_MAJOR}" -ge 13 ]]; then + sed -i "s|f'{nvshmem_dir}/include']|f'{nvshmem_dir}/include', '${CUDA_HOME}/include/cccl']|" "setup.py" fi if [ "$MODE" = "install" ]; then From ae0f69b16aaa54197257267a8262555beb7d5ae3 Mon Sep 17 00:00:00 2001 From: roikoren755 <26850796+roikoren755@users.noreply.github.com> Date: Mon, 8 Dec 2025 23:45:18 +0200 Subject: [PATCH 078/133] Add SpecDec support to `selective_state_update` (#29488) Signed-off-by: Roi Koren --- tests/kernels/mamba/test_mamba_ssm.py | 325 ++++++++++++++++++ .../layers/mamba/ops/mamba_ssm.py | 252 ++++++++++---- 2 files changed, 505 insertions(+), 72 deletions(-) diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 98edc959957d0..50e48aad6ebaa 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -425,6 +425,80 @@ def test_selective_state_update(dim, dstate, has_z, itype): assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) +@pytest.mark.parametrize("has_z", [False, True]) +@pytest.mark.parametrize("dstate", [16, 64]) +@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) +@pytest.mark.parametrize("max_seq_len", [1, 2, 4]) +def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len): + device = "cuda" + rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) + if itype == torch.bfloat16: + rtol, atol = 5e-2, 1.5e-1 + if torch.version.hip: + atol *= 2 + # set seed + current_platform.seed_everything(0) + batch_size = 4 + token_counts = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) + total_tokens = int(token_counts.sum().item()) + cu_seqlens = torch.tensor( + [0] + torch.cumsum(token_counts, dim=0).tolist(), + dtype=torch.int32, + device=device, + ) + state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) + x = torch.randn(total_tokens, dim, device=device, dtype=itype) + out = torch.empty_like(x) + dt = torch.randn(total_tokens, dim, device=device, dtype=itype) + dt_bias = torch.rand(dim, device=device) - 4.0 + A = -torch.rand(dim, dstate, device=device) - 1.0 + B = torch.randn(total_tokens, dstate, device=device) + C = torch.randn(total_tokens, dstate, device=device) + D = torch.randn(dim, device=device) + z = torch.randn_like(x) if has_z else None + state_ref = state.detach().clone() + selective_state_update( + state, + x, + dt, + A, + B, + C, + D=D, + z=z, + dt_bias=dt_bias, + dt_softplus=True, + out=out, + cu_seqlens=cu_seqlens, + ) + + out_ref_list = [] + for seq_idx in range(batch_size): + start_idx = cu_seqlens[seq_idx].item() + end_idx = cu_seqlens[seq_idx + 1].item() + num_tokens = end_idx - start_idx + for token_idx in range(num_tokens): + idx = start_idx + token_idx + out_ref_list.append( + selective_state_update_ref( + state_ref[seq_idx : seq_idx + 1], + x[idx : idx + 1], + dt[idx : idx + 1], + A, + B[idx : idx + 1], + C[idx : idx + 1], + D=D, + z=z[idx : idx + 1] if has_z else None, + dt_bias=dt_bias, + dt_softplus=True, + ) + ) + out_ref = torch.cat(out_ref_list, dim=0) + assert torch.allclose(state, state_ref, rtol=rtol, atol=atol) + assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + + @pytest.mark.parametrize("wtype", [torch.float32]) @pytest.mark.parametrize("itype", [torch.float32]) @pytest.mark.parametrize("seqlen", [1, 256, 1024, 4096]) @@ -766,3 +840,254 @@ def test_selective_state_update_with_heads_with_batch_indices( print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol) assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) +@pytest.mark.parametrize("has_z", [False, True]) +@pytest.mark.parametrize("dstate", [16, 64]) +@pytest.mark.parametrize("dim", [2048, 4096]) +@pytest.mark.parametrize("max_seq_len", [2, 4]) +def test_selective_state_update_with_num_accepted_tokens( + dim, dstate, has_z, itype, max_seq_len +): + device = "cuda" + rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) + if itype == torch.bfloat16: + rtol, atol = 5e-2, 1.5e-1 + if torch.version.hip: + atol *= 2 + + current_platform.seed_everything(0) + batch_size = 4 + + tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) + total_tokens = int(tokens_per_seq.sum().item()) + + num_accepted_tokens = torch.randint(0, max_seq_len, (batch_size,), device=device) + num_accepted_tokens[0] = 0 # Add edge-case of no accepted tokens + num_accepted_tokens[1] = max_seq_len # Add edge-case of all tokens accepted + + cu_seqlens = torch.tensor( + [0] + torch.cumsum(tokens_per_seq, dim=0).tolist(), + dtype=torch.int32, + device=device, + ) + + total_state_slots = 50 + state = torch.randn(total_state_slots, dim, dstate, dtype=itype, device=device) + + state_batch_indices = torch.full( + (batch_size, max_seq_len), PAD_SLOT_ID, dtype=torch.int32, device=device + ) + initial_state_slots = torch.randint( + 0, 15, (batch_size,), device=device, dtype=torch.int32 + ) + for seq_idx in range(batch_size): + token_pos = max(num_accepted_tokens[seq_idx].item() - 1, 0) + state_batch_indices[seq_idx, token_pos] = initial_state_slots[seq_idx] + + dst_state_batch_indices = torch.full( + (batch_size, max_seq_len), PAD_SLOT_ID, dtype=torch.int32, device=device + ) + slot_offset = 15 + dst_slots_map = {} + for seq_idx in range(batch_size): + for token_idx in range(tokens_per_seq[seq_idx].item()): + dst_state_batch_indices[seq_idx, token_idx] = slot_offset + dst_slots_map[(seq_idx, token_idx)] = slot_offset + slot_offset += 1 + + x = torch.randn(total_tokens, dim, device=device, dtype=itype) + out = torch.empty_like(x) + dt = torch.randn(total_tokens, dim, device=device, dtype=itype) + dt_bias = torch.rand(dim, device=device) - 4.0 + A = -torch.rand(dim, dstate, device=device) - 1.0 + B = torch.randn(total_tokens, dstate, device=device) + C = torch.randn(total_tokens, dstate, device=device) + D = torch.randn(dim, device=device) + z = torch.randn_like(x) if has_z else None + + state_ref_intermediate = {} + out_ref_list = [] + + for seq_idx in range(batch_size): + seq_start = cu_seqlens[seq_idx].item() + seq_end = cu_seqlens[seq_idx + 1].item() + num_tokens = seq_end - seq_start + + token_pos = max(num_accepted_tokens[seq_idx].item() - 1, 0) + initial_slot = state_batch_indices[seq_idx, token_pos].item() + state_seq = state[initial_slot : initial_slot + 1].clone() + + for token_idx in range(num_tokens): + global_idx = seq_start + token_idx + + out_token = selective_state_update_ref( + state_seq, + x[global_idx : global_idx + 1], + dt[global_idx : global_idx + 1], + A, + B[global_idx : global_idx + 1], + C[global_idx : global_idx + 1], + D=D, + z=z[global_idx : global_idx + 1] if has_z else None, + dt_bias=dt_bias, + dt_softplus=True, + ) + out_ref_list.append(out_token) + state_ref_intermediate[(seq_idx, token_idx)] = state_seq.clone() + + out_ref = torch.cat(out_ref_list, dim=0) + + selective_state_update( + state, + x, + dt, + A, + B, + C, + D=D, + z=z, + dt_bias=dt_bias, + dt_softplus=True, + out=out, + cu_seqlens=cu_seqlens, + state_batch_indices=state_batch_indices, + dst_state_batch_indices=dst_state_batch_indices, + num_accepted_tokens=num_accepted_tokens, + pad_slot_id=PAD_SLOT_ID, + ) + + assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + + for seq_idx in range(batch_size): + num_tokens = tokens_per_seq[seq_idx].item() + for token_idx in range(num_tokens): + dst_slot = dst_slots_map[(seq_idx, token_idx)] + state_ref = state_ref_intermediate[(seq_idx, token_idx)].squeeze(0) + assert torch.allclose(state[dst_slot], state_ref, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("itype", [torch.float32, torch.bfloat16]) +@pytest.mark.parametrize("has_z", [False, True]) +@pytest.mark.parametrize("dstate", [16, 64]) +@pytest.mark.parametrize("dim", [2048, 4096]) +@pytest.mark.parametrize("max_seq_len", [2, 4]) +def test_selective_state_update_varlen_with_num_accepted( + dim, dstate, has_z, itype, max_seq_len +): + device = "cuda" + rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) + if itype == torch.bfloat16: + rtol, atol = 5e-2, 1.5e-1 + if torch.version.hip: + atol *= 2 + + current_platform.seed_everything(0) + batch_size = 4 + + tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) + total_tokens = int(tokens_per_seq.sum().item()) + + num_accepted_tokens = torch.randint(0, max_seq_len, (batch_size,), device=device) + num_accepted_tokens[0] = 0 # Add edge-case of no accepted tokens + num_accepted_tokens[1] = max_seq_len # Add edge-case of all tokens accepted + + cu_seqlens = torch.tensor( + [0] + torch.cumsum(tokens_per_seq, dim=0).tolist(), + dtype=torch.int32, + device=device, + ) + + total_state_slots = 50 + state = torch.randn(total_state_slots, dim, dstate, dtype=itype, device=device) + + state_batch_indices = torch.full( + (batch_size, max_seq_len), PAD_SLOT_ID, dtype=torch.int32, device=device + ) + + initial_state_slots = torch.randint( + 0, 15, (batch_size,), device=device, dtype=torch.int32 + ) + for seq_idx in range(batch_size): + token_pos = max(num_accepted_tokens[seq_idx].item() - 1, 0) + state_batch_indices[seq_idx, token_pos] = initial_state_slots[seq_idx] + + dst_state_batch_indices = torch.full( + (batch_size, max_seq_len), PAD_SLOT_ID, dtype=torch.int32, device=device + ) + + slot_offset = 15 + dst_slots_map = {} + for seq_idx in range(batch_size): + for token_idx in range(tokens_per_seq[seq_idx].item()): + dst_state_batch_indices[seq_idx, token_idx] = slot_offset + dst_slots_map[(seq_idx, token_idx)] = slot_offset + slot_offset += 1 + + x = torch.randn(total_tokens, dim, device=device, dtype=itype) + out = torch.empty_like(x) + dt = torch.randn(total_tokens, dim, device=device, dtype=itype) + dt_bias = torch.rand(dim, device=device) - 4.0 + A = -torch.rand(dim, dstate, device=device) - 1.0 + B = torch.randn(total_tokens, dstate, device=device) + C = torch.randn(total_tokens, dstate, device=device) + D = torch.randn(dim, device=device) + z = torch.randn_like(x) if has_z else None + + state_ref_intermediate = {} + + for seq_idx in range(batch_size): + seq_start = cu_seqlens[seq_idx].item() + seq_end = cu_seqlens[seq_idx + 1].item() + num_tokens = seq_end - seq_start + + token_pos = max(num_accepted_tokens[seq_idx].item() - 1, 0) + initial_slot = state_batch_indices[seq_idx, token_pos].item() + state_seq = state[initial_slot : initial_slot + 1].clone() + + for token_idx in range(num_tokens): + global_idx = seq_start + token_idx + + selective_state_update_ref( + state_seq, + x[global_idx : global_idx + 1], + dt[global_idx : global_idx + 1], + A, + B[global_idx : global_idx + 1], + C[global_idx : global_idx + 1], + D=D, + z=z[global_idx : global_idx + 1] if has_z else None, + dt_bias=dt_bias, + dt_softplus=True, + ) + + state_ref_intermediate[(seq_idx, token_idx)] = state_seq.clone() + + selective_state_update( + state, + x, + dt, + A, + B, + C, + D=D, + z=z, + dt_bias=dt_bias, + dt_softplus=True, + out=out, + cu_seqlens=cu_seqlens, + state_batch_indices=state_batch_indices, + dst_state_batch_indices=dst_state_batch_indices, + num_accepted_tokens=num_accepted_tokens, + pad_slot_id=PAD_SLOT_ID, + ) + + for seq_idx in range(batch_size): + num_tokens = tokens_per_seq[seq_idx].item() + + for token_idx in range(num_tokens): + dst_slot = dst_slots_map[(seq_idx, token_idx)] + state_ref = state_ref_intermediate[(seq_idx, token_idx)].squeeze(0) + + assert torch.allclose(state[dst_slot], state_ref, rtol=rtol, atol=atol) diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index 53fd5d5458b09..800f8bd840792 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -36,10 +36,14 @@ else: is not None } ) +@triton.heuristics( + {"IS_SPEC_DECODING": lambda args: args["num_accepted_tokens_ptr"] is not None} +) +@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens_ptr"] is not None}) @triton.heuristics( {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])} ) -@triton.jit +@triton.jit(do_not_specialize=["N"]) def _selective_scan_update_kernel( # Pointers to matrices state_ptr, @@ -55,8 +59,10 @@ def _selective_scan_update_kernel( state_batch_indices_ptr, dst_state_batch_indices_ptr, pad_slot_id, + num_accepted_tokens_ptr, + cu_seqlens_ptr, # Matrix dimensions - batch, + N, nheads, dim, dstate, @@ -91,6 +97,10 @@ def _selective_scan_update_kernel( stride_out_batch, stride_out_head, stride_out_dim, + stride_state_indices_batch, + stride_state_indices_T, + stride_dst_state_indices_batch, + stride_dst_state_indices_T, # Meta-parameters DT_SOFTPLUS: tl.constexpr, TIE_HDIM: tl.constexpr, @@ -99,22 +109,50 @@ def _selective_scan_update_kernel( HAS_D: tl.constexpr, HAS_Z: tl.constexpr, HAS_STATE_BATCH_INDICES: tl.constexpr, + IS_SPEC_DECODING: tl.constexpr, + IS_VARLEN: tl.constexpr, BLOCK_SIZE_DSTATE: tl.constexpr, ): pid_m = tl.program_id(axis=0) pid_b = tl.program_id(axis=1) pid_h = tl.program_id(axis=2) + if IS_VARLEN: + bos = tl.load(cu_seqlens_ptr + pid_b).to(tl.int64) + eos = tl.load(cu_seqlens_ptr + pid_b + 1).to(tl.int64) + seq_len = eos - bos + + if seq_len == 0: + return + else: + bos = pid_b + seq_len = 1 + + state_ptr_base = state_ptr + # If HAS_STATE_BATCH_INDICES is true, then the ssm state's batch coordinate # is taken from the state_batch_indices_ptr Otherwise, the state coordinate # is the same as the batch id. if HAS_STATE_BATCH_INDICES: - dst_state_batch_indices_ptr += pid_b - dst_state_batch_idx = tl.load(dst_state_batch_indices_ptr).to(tl.int64) - dst_state_ptr = state_ptr + ( - dst_state_batch_idx * stride_state_batch + pid_h * stride_state_head + if IS_SPEC_DECODING: + num_accepted = tl.load(num_accepted_tokens_ptr + pid_b).to(tl.int64) + init_token_idx = tl.maximum(num_accepted - 1, 0) + else: + init_token_idx = 0 + + dst_state_batch_indices_ptr += pid_b * stride_dst_state_indices_batch + if not IS_SPEC_DECODING: + dst_state_batch_idx = tl.load( + dst_state_batch_indices_ptr + + init_token_idx * stride_dst_state_indices_T + ).to(tl.int64) + dst_state_ptr = state_ptr + ( + dst_state_batch_idx * stride_state_batch + pid_h * stride_state_head + ) + + state_batch_indices_ptr += ( + pid_b * stride_state_indices_batch + init_token_idx * stride_state_indices_T ) - state_batch_indices_ptr += pid_b state_batch_idx = tl.load(state_batch_indices_ptr).to(tl.int64) state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head else: @@ -123,86 +161,112 @@ def _selective_scan_update_kernel( ) state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head - x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head - dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head + x_ptr += bos * stride_x_batch + pid_h * stride_x_head + dt_ptr += bos * stride_dt_batch + pid_h * stride_dt_head if HAS_DT_BIAS: dt_bias_ptr += pid_h * stride_dt_bias_head A_ptr += pid_h * stride_A_head - B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group - C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group + B_ptr += bos * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group + C_ptr += bos * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group if HAS_Z: - z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head - out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head + z_ptr += bos * stride_z_batch + pid_h * stride_z_head + out_ptr += bos * stride_out_batch + pid_h * stride_out_head offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_n = tl.arange(0, BLOCK_SIZE_DSTATE) state_ptrs = state_ptr + ( offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate ) - dst_state_ptrs = dst_state_ptr + ( - offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate - ) - x_ptrs = x_ptr + offs_m * stride_x_dim - dt_ptrs = dt_ptr + offs_m * stride_dt_dim + if not IS_SPEC_DECODING: + dst_state_ptrs = dst_state_ptr + ( + offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate + ) + + mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate) + if HAS_STATE_BATCH_INDICES: + mask &= state_batch_idx != pad_slot_id + state = tl.load(state_ptrs, mask=mask, other=0.0).to(tl.float32) + if HAS_DT_BIAS: dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim if HAS_D: D_ptr += pid_h * stride_D_head - A_ptrs = A_ptr + ( - offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate - ) - B_ptrs = B_ptr + offs_n * stride_B_dstate - C_ptrs = C_ptr + offs_n * stride_C_dstate - if HAS_D: D_ptrs = D_ptr + offs_m * stride_D_dim - if HAS_Z: - z_ptrs = z_ptr + offs_m * stride_z_dim - out_ptrs = out_ptr + offs_m * stride_out_dim - mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate) - if HAS_STATE_BATCH_INDICES: - mask &= state_batch_idx != pad_slot_id - state = tl.load(state_ptrs, mask=mask, other=0.0) + A_ptrs = A_ptr + offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate - x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) - if not TIE_HDIM: - dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) - if HAS_DT_BIAS: - dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) - if DT_SOFTPLUS: - dt = softplus(dt) - A = tl.load( - A_ptrs, mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), other=0.0 - ).to(tl.float32) - dA = tl.exp(A * dt[:, None]) - else: - dt = tl.load(dt_ptr).to(tl.float32) - if HAS_DT_BIAS: - dt += tl.load(dt_bias_ptr).to(tl.float32) - if DT_SOFTPLUS: - dt = softplus(dt) - A = tl.load(A_ptr).to(tl.float32) - dA = tl.exp(A * dt) # scalar, not a matrix + for i_t in range(seq_len): + x_ptrs = x_ptr + offs_m * stride_x_dim + dt_ptrs = dt_ptr + offs_m * stride_dt_dim + B_ptrs = B_ptr + offs_n * stride_B_dstate + C_ptrs = C_ptr + offs_n * stride_C_dstate + if HAS_Z: + z_ptrs = z_ptr + offs_m * stride_z_dim + out_ptrs = out_ptr + offs_m * stride_out_dim - B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32) - C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32) - if HAS_D: - D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) - if HAS_Z: - z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + if not TIE_HDIM: + dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + if HAS_DT_BIAS: + dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + if DT_SOFTPLUS: + dt = softplus(dt) + A = tl.load( + A_ptrs, + mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate), + other=0.0, + ).to(tl.float32) + dA = tl.exp(A * dt[:, None]) + else: + dt = tl.load(dt_ptr).to(tl.float32) + if HAS_DT_BIAS: + dt += tl.load(dt_bias_ptr).to(tl.float32) + if DT_SOFTPLUS: + dt = softplus(dt) + A = tl.load(A_ptr).to(tl.float32) + dA = tl.exp(A * dt) # scalar, not a matrix - dB = B[None, :] * dt[:, None] if not TIE_HDIM else B * dt - state = state * dA + dB * x[:, None] + B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32) + C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32) + if HAS_D: + D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) + if HAS_Z: + z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) - mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate) - if HAS_STATE_BATCH_INDICES: - mask &= state_batch_idx != pad_slot_id - tl.store(dst_state_ptrs, state, mask=mask) - out = tl.sum(state * C[None, :], axis=1) - if HAS_D: - out += x * D - if HAS_Z: - out *= z * tl.sigmoid(z) - tl.store(out_ptrs, out, mask=offs_m < dim) + dB = B[None, :] * dt[:, None] if not TIE_HDIM else B * dt + state = state * dA + dB * x[:, None] + + if IS_SPEC_DECODING: + dst_idx_ptr = dst_state_batch_indices_ptr + i_t * stride_dst_state_indices_T + token_dst_idx = tl.load(dst_idx_ptr).to(tl.int64) + if token_dst_idx != pad_slot_id: + token_dst_ptrs = ( + state_ptr_base + + token_dst_idx * stride_state_batch + + pid_h * stride_state_head + + offs_m[:, None] * stride_state_dim + + offs_n[None, :] * stride_state_dstate + ) + tl.store( + token_dst_ptrs, state.to(token_dst_ptrs.dtype.element_ty), mask=mask + ) + + out = tl.sum(state * C[None, :], axis=1) + if HAS_D: + out += x * D + if HAS_Z: + out *= z * tl.sigmoid(z) + tl.store(out_ptrs, out, mask=offs_m < dim) + + x_ptr += stride_x_batch + dt_ptr += stride_dt_batch + B_ptr += stride_B_batch + C_ptr += stride_C_batch + out_ptr += stride_out_batch + if HAS_Z: + z_ptr += stride_z_batch + + if not IS_SPEC_DECODING: + tl.store(dst_state_ptrs, state.to(dst_state_ptrs.dtype.element_ty), mask=mask) def selective_state_update( @@ -220,6 +284,8 @@ def selective_state_update( dst_state_batch_indices=None, pad_slot_id=PAD_SLOT_ID, out=None, + num_accepted_tokens=None, + cu_seqlens=None, ): """ Argument: @@ -240,6 +306,11 @@ def selective_state_update( indices 0 and 3 out: Preallocated ssm output tensor. Assume same shape as x. In-place updated. + num_accepted_tokens: (batch,) + number of accepted tokens from previous verification step, + tells the kernel which initial state to use + cu_seqlens: (batch,) + length per sequence, for variable length in speculative decoding cases """ if state.dim() == 3: state = state.unsqueeze(1) @@ -261,9 +332,26 @@ def selective_state_update( dt_bias = dt_bias.unsqueeze(0) if out.dim() == 2: out = out.unsqueeze(1) + if num_accepted_tokens is not None: + assert state_batch_indices is not None and state_batch_indices.dim() == 2 + assert dst_state_batch_indices is None or dst_state_batch_indices.dim() == 2 + if state_batch_indices is not None and state_batch_indices.dim() == 1: + state_batch_indices = state_batch_indices.unsqueeze(1) + if dst_state_batch_indices is not None and dst_state_batch_indices.dim() == 1: + dst_state_batch_indices = dst_state_batch_indices.unsqueeze(1) _, nheads, dim, dstate = state.shape batch = x.shape[0] + if cu_seqlens is not None: + N = len(cu_seqlens) - 1 + # Only used to verify the shape of + # state_batch_indices and dst_state_batch_indices + max_seqlen = ( + state_batch_indices.size(-1) if state_batch_indices is not None else 1 + ) + else: + N = batch + max_seqlen = 1 assert x.shape == (batch, nheads, dim) assert dt.shape == x.shape @@ -279,16 +367,30 @@ def selective_state_update( if dt_bias is not None: assert dt_bias.shape == (nheads, dim) if state_batch_indices is not None: - assert state_batch_indices.shape == (batch,) + assert state_batch_indices.shape[0] >= N + assert state_batch_indices.shape[1] >= max_seqlen if dst_state_batch_indices is not None: - assert dst_state_batch_indices.shape == (batch,) + assert dst_state_batch_indices.shape[0] >= N + assert dst_state_batch_indices.shape[1] >= max_seqlen else: # revert to the default behavior of in-place state updates dst_state_batch_indices = state_batch_indices assert out.shape == x.shape + if num_accepted_tokens is not None: + assert num_accepted_tokens.shape == (N,) - grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads) + grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), N, nheads) z_strides = (z.stride(0), z.stride(1), z.stride(2)) if z is not None else (0, 0, 0) + state_batch_indices_strides = ( + (state_batch_indices.stride(0), state_batch_indices.stride(1)) + if state_batch_indices is not None + else (0, 0) + ) + dst_state_batch_indices_strides = ( + (dst_state_batch_indices.stride(0), dst_state_batch_indices.stride(1)) + if dst_state_batch_indices is not None + else (0, 0) + ) # We don't want autotune since it will overwrite the state # We instead tune by hand. BLOCK_SIZE_M, num_warps = ( @@ -321,7 +423,9 @@ def selective_state_update( state_batch_indices, dst_state_batch_indices, pad_slot_id, - batch, + num_accepted_tokens, + cu_seqlens, + N, nheads, dim, dstate, @@ -353,6 +457,10 @@ def selective_state_update( out.stride(0), out.stride(1), out.stride(2), + state_batch_indices_strides[0], + state_batch_indices_strides[1], + dst_state_batch_indices_strides[0], + dst_state_batch_indices_strides[1], dt_softplus, tie_hdim, BLOCK_SIZE_M, From 6af70e11a0a3cb7109ce904e23ff90c73f573ef0 Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Mon, 8 Dec 2025 15:58:30 -0600 Subject: [PATCH 079/133] [ROCm][CI] Fix test_max_len.py for Rocm (#29916) Signed-off-by: charlifu Signed-off-by: Charlie Fu --- tests/basic_correctness/test_basic_correctness.py | 5 ++++- tests/utils.py | 4 ++-- tests/v1/e2e/test_spec_decode.py | 4 ++-- tests/v1/spec_decode/test_eagle.py | 8 ++++++-- tests/v1/spec_decode/test_max_len.py | 2 +- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 521d6c33dd390..9e1cc309edd1d 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -13,12 +13,15 @@ import pytest import torch from vllm import LLM +from vllm.platforms import current_platform from vllm.v1.engine.llm_engine import LLMEngine from ..conftest import HfRunner, VllmRunner from ..models.utils import check_outputs_equal from ..utils import multi_gpu_test +ATTN_BACKEND = ["ROCM_ATTN"] if current_platform.is_rocm() else ["FLASH_ATTN"] + MODELS = [ "hmellor/tiny-random-Gemma2ForCausalLM", "meta-llama/Llama-3.2-1B-Instruct", @@ -57,7 +60,7 @@ def _fix_prompt_embed_outputs( @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("backend", ["FLASH_ATTN"]) +@pytest.mark.parametrize("backend", ATTN_BACKEND) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("async_scheduling", [True, False]) diff --git a/tests/utils.py b/tests/utils.py index 539f67c47ac1d..ea3675b1461b8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1225,9 +1225,9 @@ def get_attn_backend_list_based_on_platform() -> list[str]: try: import aiter # noqa: F401 - attn_backend_list.append("FLASH_ATTN") + attn_backend_list.append("ROCM_AITER_FA") except Exception: - print("Skip FLASH_ATTN on ROCm as aiter is not installed") + print("Skip ROCM_AITER_FA on ROCm as aiter is not installed") return attn_backend_list elif current_platform.is_xpu(): diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 575a6a151f579..416b582dfaa63 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -417,9 +417,9 @@ def test_eagle_correctness( "multi-token eagle spec decode on current platform" ) - if attn_backend == "FLASH_ATTN" and current_platform.is_rocm(): + if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): if "deepseek" in model_setup[1].lower(): - pytest.skip("FLASH_ATTN for deepseek not supported on ROCm platform") + pytest.skip("ROCM_AITER_FA for deepseek not supported on ROCm platform") else: m.setenv("VLLM_ROCM_USE_AITER", "1") diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 616e57de339e2..55e9b4d0660f5 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -339,7 +339,7 @@ def test_load_model( "multi-token eagle spec decode on current platform" ) - if attn_backend == "FLASH_ATTN" and current_platform.is_rocm(): + if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # Setup draft model mock @@ -434,7 +434,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): "because it requires special input mocking." ) - if attn_backend == "FLASH_ATTN" and current_platform.is_rocm(): + if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # Use GPU device @@ -541,6 +541,10 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): attn_metadata_builder_cls, _ = try_get_attention_backend( AttentionBackendEnum.TREE_ATTN ) + elif attn_backend == "ROCM_AITER_FA": + attn_metadata_builder_cls, _ = try_get_attention_backend( + AttentionBackendEnum.ROCM_AITER_FA + ) else: raise ValueError(f"Unsupported attention backend: {attn_backend}") diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index fa1d0437f7c71..81da8609aa6cf 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -47,7 +47,7 @@ def test_eagle_max_len( "multi-token eagle spec decode on current platform" ) - if attn_backend == "FLASH_ATTN" and current_platform.is_rocm(): + if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): m.setenv("VLLM_ROCM_USE_AITER", "1") llm = LLM( From 1fb632fdb6b6391cd1fe3146d05cc2858c2446ab Mon Sep 17 00:00:00 2001 From: Lain Date: Mon, 8 Dec 2025 15:02:34 -0800 Subject: [PATCH 080/133] [Perf] Improve fp8 quant in mla; replace ReduceSum with ReduceScatterSum (#29795) Signed-off-by: Siyuan Fu --- .../device_communicators/cuda_communicator.py | 2 +- vllm/v1/attention/backends/mla/common.py | 33 ++++++++++++------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 2e878eef908ac..cd9c267beb5b5 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -225,7 +225,7 @@ class CudaCommunicator(DeviceCommunicatorBase): output_shape, dtype=input_tensor.dtype, device=input_tensor.device ) - if sizes is not None: + if sizes is not None and sizes.count(sizes[0]) != len(sizes): pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes) else: pynccl_comm.reduce_scatter(output, input_tensor) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 309ddee4fc2f0..0a5257a1d87d8 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -2037,21 +2037,30 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): if fp8_attention: ql_nope_shape = decode_ql_nope.shape - decode_ql_nope, _ = ops.scaled_fp8_quant( - decode_ql_nope.reshape( - [ql_nope_shape[0], ql_nope_shape[1] * ql_nope_shape[2]] - ), - layer._q_scale, - ) - decode_ql_nope = decode_ql_nope.reshape(ql_nope_shape) q_pe_shape = decode_q_pe.shape - decode_q_pe, _ = ops.scaled_fp8_quant( - decode_q_pe.reshape([q_pe_shape[0], q_pe_shape[1] * q_pe_shape[2]]), + assert decode_ql_nope.shape[0] == decode_q_pe.shape[0] + assert decode_ql_nope.shape[1] == decode_q_pe.shape[1] + decode_q_shape = ( + ql_nope_shape[0], + ql_nope_shape[1], + ql_nope_shape[2] + q_pe_shape[2], + ) + # Using empty and copy since torch.cat introduces significant overhead. + decode_q0 = torch.empty( + decode_q_shape, + device=decode_ql_nope.device, + dtype=decode_ql_nope.dtype, + ) + decode_q0[..., : ql_nope_shape[2]].copy_(decode_ql_nope) + decode_q0[..., ql_nope_shape[2] :].copy_(decode_q_pe) + + decode_q, _ = ops.scaled_fp8_quant( + decode_q0.view(decode_q_shape[0], -1), layer._q_scale, ) - decode_q_pe = decode_q_pe.reshape(q_pe_shape) - - decode_q = (decode_ql_nope, decode_q_pe) + decode_q = decode_q.view(decode_q_shape) + else: + decode_q = (decode_ql_nope, decode_q_pe) if self.dcp_world_size > 1: assert not fp8_attention, "DCP not support fp8 kvcache now." # concatenate decode_ql_nope and decode_q_pe -> (B, N, L + P) From 60d17251c920ae3c9d02e4b4101b738e4905aee4 Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Mon, 8 Dec 2025 16:01:08 -0800 Subject: [PATCH 081/133] [Disagg] Support large batch size in proxy server and update NixlConnector doc for DP (#28782) Signed-off-by: Ming Yang --- docs/features/nixl_connector_usage.md | 2 ++ .../disagg_proxy_server.py | 21 +++++++++++++++-- .../nixl_integration/toy_proxy_server.py | 23 +++++++++++++++++-- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index f0e25e31aa0b3..84c8f9e77d6d3 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -146,6 +146,8 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ --decoder-ports 8000 8000 ``` +For multi-host DP deployment, only need to provide the host/port of the head instances. + ### KV Role Options - **kv_producer**: For prefiller instances that generate KV caches diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py index 5d8e38c73b89a..c8965e050ff0b 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py @@ -26,9 +26,21 @@ async def lifespan(app: FastAPI): ) app.state.prefill_client = httpx.AsyncClient( - timeout=None, base_url=prefiller_base_url + timeout=None, + base_url=prefiller_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), + ) + app.state.decode_client = httpx.AsyncClient( + timeout=None, + base_url=decoder_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), ) - app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url) yield @@ -105,6 +117,11 @@ async def send_request_to_service( headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} response = await client.post(endpoint, json=req_data, headers=headers) response.raise_for_status() + + # read/consume the response body to release the connection + # otherwise, it would http.ReadError + await response.aread() + return response diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py index 5768fcdb57ceb..b92d3fcd6fb8b 100644 --- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py +++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py @@ -30,7 +30,14 @@ async def lifespan(app: FastAPI): prefiller_base_url = f"http://{host}:{port}/v1" app.state.prefill_clients.append( { - "client": httpx.AsyncClient(timeout=None, base_url=prefiller_base_url), + "client": httpx.AsyncClient( + timeout=None, + base_url=prefiller_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), + ), "host": host, "port": port, "id": i, @@ -42,7 +49,14 @@ async def lifespan(app: FastAPI): decoder_base_url = f"http://{host}:{port}/v1" app.state.decode_clients.append( { - "client": httpx.AsyncClient(timeout=None, base_url=decoder_base_url), + "client": httpx.AsyncClient( + timeout=None, + base_url=decoder_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), + ), "host": host, "port": port, "id": i, @@ -169,6 +183,10 @@ async def send_request_to_service( ) response.raise_for_status() + # read/consume the response body to release the connection + # otherwise, it would http.ReadError + await response.aread() + return response @@ -206,6 +224,7 @@ async def _handle_completions(api: str, request: Request): # Extract the needed fields response_json = response.json() + await response.aclose() # CRITICAL: Release connection back to pool kv_transfer_params = response_json.get("kv_transfer_params", {}) if kv_transfer_params: req_data["kv_transfer_params"] = kv_transfer_params From f1599ca55d79cb686cb94dc3ff2f65d82db94940 Mon Sep 17 00:00:00 2001 From: Victor Ziliang Peng Date: Mon, 8 Dec 2025 16:08:48 -0800 Subject: [PATCH 082/133] feat(metrics): Add prefill KV compute metric excluding cached tokens (#30189) Signed-off-by: Ziliang Peng --- tests/v1/metrics/test_stats.py | 103 ++++++++++++++++++++++++++++- vllm/v1/engine/output_processor.py | 1 + vllm/v1/metrics/loggers.py | 20 ++++++ vllm/v1/metrics/stats.py | 3 + 4 files changed, 126 insertions(+), 1 deletion(-) diff --git a/tests/v1/metrics/test_stats.py b/tests/v1/metrics/test_stats.py index 48067def8357e..7d902bbc6fc24 100644 --- a/tests/v1/metrics/test_stats.py +++ b/tests/v1/metrics/test_stats.py @@ -1,8 +1,109 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.v1.metrics.stats import IterationStats +from vllm.v1.engine import FinishReason +from vllm.v1.metrics.stats import IterationStats, RequestStateStats def test_iteration_stats_repr(): iteration_stats = IterationStats() assert repr(iteration_stats).startswith("IterationStats(") + + +def test_prefill_kv_computed_with_cache(): + """Test that prefill KV compute correctly excludes cached tokens.""" + iteration_stats = IterationStats() + req_stats = RequestStateStats(arrival_time=0.0) + req_stats.scheduled_ts = 0.1 + req_stats.first_token_ts = 0.5 + req_stats.last_token_ts = 5.0 + req_stats.num_generation_tokens = 50 + + # Case 1: With prefix cache (1200 tokens cached) + iteration_stats.update_from_finished_request( + finish_reason=FinishReason.STOP, + num_prompt_tokens=10000, + max_tokens_param=100, + req_stats=req_stats, + num_cached_tokens=1200, + ) + + finished_req = iteration_stats.finished_requests[0] + assert finished_req.num_prompt_tokens == 10000 + assert finished_req.num_cached_tokens == 1200 + + # Verify calculation: prefill KV = prompt tokens - cached tokens + prefill_kv_computed = finished_req.num_prompt_tokens - max( + finished_req.num_cached_tokens, 0 + ) + assert prefill_kv_computed == 8800 # 10000 - 1200 + + +def test_prefill_kv_computed_no_cache(): + """Test prefill KV compute without prefix caching.""" + iteration_stats = IterationStats() + req_stats = RequestStateStats(arrival_time=0.0) + req_stats.scheduled_ts = 0.1 + req_stats.first_token_ts = 0.5 + req_stats.last_token_ts = 2.0 + req_stats.num_generation_tokens = 10 + + # Case 2: No prefix cache + iteration_stats.update_from_finished_request( + finish_reason=FinishReason.STOP, + num_prompt_tokens=2000, + max_tokens_param=100, + req_stats=req_stats, + num_cached_tokens=0, + ) + + finished_req = iteration_stats.finished_requests[0] + assert finished_req.num_prompt_tokens == 2000 + assert finished_req.num_cached_tokens == 0 + + # Verify calculation: prefill KV = full prompt when no cache + prefill_kv_computed = finished_req.num_prompt_tokens - max( + finished_req.num_cached_tokens, 0 + ) + assert prefill_kv_computed == 2000 + + +def test_prefill_kv_computed_edge_cases(): + """Test edge cases for prefill KV compute calculation.""" + iteration_stats = IterationStats() + req_stats = RequestStateStats(arrival_time=0.0) + req_stats.scheduled_ts = 0.1 + req_stats.first_token_ts = 0.5 + req_stats.last_token_ts = 1.0 + req_stats.num_generation_tokens = 1 + + # Case 3: Negative num_cached_tokens (shouldn't happen, but handle gracefully) + iteration_stats.update_from_finished_request( + finish_reason=FinishReason.STOP, + num_prompt_tokens=100, + max_tokens_param=10, + req_stats=req_stats, + num_cached_tokens=-1, + ) + + finished_req = iteration_stats.finished_requests[0] + # max() should handle negative values + prefill_kv_computed = finished_req.num_prompt_tokens - max( + finished_req.num_cached_tokens, 0 + ) + assert prefill_kv_computed == 100 # Should treat negative as 0 + + # Case 4: All tokens cached (shouldn't happen in practice) + iteration_stats2 = IterationStats() + iteration_stats2.update_from_finished_request( + finish_reason=FinishReason.STOP, + num_prompt_tokens=100, + max_tokens_param=10, + req_stats=req_stats, + num_cached_tokens=100, + ) + + finished_req2 = iteration_stats2.finished_requests[0] + prefill_kv_computed2 = finished_req2.num_prompt_tokens - max( + finished_req2.num_cached_tokens, 0 + ) + assert prefill_kv_computed2 == 0 # All cached, nothing computed diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index e85fbb4ee0fb0..9be3f4da7352d 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -650,6 +650,7 @@ class OutputProcessor: ), max_tokens_param=req_state.max_tokens_param, req_stats=req_state.stats, + num_cached_tokens=req_state.num_cached_tokens, ) self.lora_states.request_finished(req_state.request_id, req_state.lora_name) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 882e0ce0b2e03..9eaee1bb97bb9 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -870,6 +870,19 @@ class PrometheusStatLogger(AggregateStatLoggerBase): histogram_decode_time_request, engine_indexes, model_name ) + histogram_prefill_kv_computed_request = self._histogram_cls( + name="vllm:request_prefill_kv_computed_tokens", + documentation=( + "Histogram of new KV tokens computed during prefill " + "(excluding cached tokens)." + ), + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames, + ) + self.histogram_prefill_kv_computed_request = make_per_engine( + histogram_prefill_kv_computed_request, engine_indexes, model_name + ) + # # KV Cache residency metrics # @@ -1118,6 +1131,13 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.histogram_decode_time_request[engine_idx].observe( finished_request.decode_time ) + # Calculate prefill KV compute (excludes cached tokens) + prefill_kv_computed = finished_request.num_prompt_tokens - max( + finished_request.num_cached_tokens, 0 + ) + self.histogram_prefill_kv_computed_request[engine_idx].observe( + prefill_kv_computed + ) self.histogram_num_prompt_tokens_request[engine_idx].observe( finished_request.num_prompt_tokens ) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 733d3ae12e67f..a0cc58d0a64e8 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -224,6 +224,7 @@ class FinishedRequestStats: decode_time: float = 0.0 mean_time_per_output_token: float = 0.0 is_corrupted: bool = False + num_cached_tokens: int = 0 class IterationStats: @@ -330,6 +331,7 @@ class IterationStats: num_prompt_tokens: int, max_tokens_param: int | None, req_stats: RequestStateStats, + num_cached_tokens: int = 0, ): e2e_latency = self._time_since(req_stats.arrival_time) @@ -367,6 +369,7 @@ class IterationStats: decode_time=decode_time, mean_time_per_output_token=mean_time_per_output_token, is_corrupted=req_stats.is_corrupted, + num_cached_tokens=num_cached_tokens, ) self.finished_requests.append(finished_req) From 9d6235ca9a36e76911045999ed72e3c8aad66b8a Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Mon, 8 Dec 2025 16:29:36 -0800 Subject: [PATCH 083/133] [moe] Allow disabling DP chunking (#29936) Signed-off-by: Ming Yang --- vllm/envs.py | 4 ++++ vllm/model_executor/layers/fused_moe/layer.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 37711dece9abc..91d1b01076b11 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -144,6 +144,7 @@ if TYPE_CHECKING: VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_PORT: int = 0 VLLM_MOE_DP_CHUNK_SIZE: int = 256 + VLLM_ENABLE_MOE_DP_CHUNK: bool = True VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict" VLLM_MARLIN_USE_ATOMIC_ADD: bool = False @@ -1101,6 +1102,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE # units. "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")), + "VLLM_ENABLE_MOE_DP_CHUNK": lambda: bool( + int(os.getenv("VLLM_ENABLE_MOE_DP_CHUNK", "1")) + ), # Randomize inputs during dummy runs when using Data Parallel "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: os.environ.get( "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0" diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9b4d77a060c29..5df3486093cd9 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -753,7 +753,7 @@ class FusedMoE(CustomOp): self.moe_parallel_config.use_pplx_kernels or self.moe_parallel_config.use_deepep_ll_kernels or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels) - ) + ) and envs.VLLM_ENABLE_MOE_DP_CHUNK @property def is_internal_router(self) -> bool: From d9417096d1347ead26b7c0cc1afb22fc0028e6e8 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 8 Dec 2025 19:31:57 -0500 Subject: [PATCH 084/133] [Feature] Batch invariant: Enable `TRITON_MLA` without prefix-caching (#29125) Signed-off-by: yewentao256 --- tests/v1/determinism/test_batch_invariance.py | 6 +--- .../test_online_batch_invariance.py | 5 ++- tests/v1/determinism/utils.py | 1 + vllm/attention/layer.py | 36 +++++++++++++++++++ vllm/model_executor/layers/batch_invariant.py | 2 +- 5 files changed, 43 insertions(+), 7 deletions(-) diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index 4311547baccf1..fc953a66f0820 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -185,7 +185,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( llm = LLM( model=model_name, tensor_parallel_size=tp_size, - enable_prefix_caching=False, + # enable_prefix_caching=False, max_num_seqs=32, max_model_len=8192, dtype="bfloat16", # not everything is supported @@ -393,7 +393,6 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): gpu_memory_utilization=0.9, max_model_len=2048, dtype="bfloat16", - enable_prefix_caching=False, ) prompt = "the capital of france is" @@ -457,7 +456,6 @@ def test_logprobs_without_batch_invariance_should_fail( llm = LLM( model=model_name, tensor_parallel_size=tp_size, - enable_prefix_caching=False, max_num_seqs=32, max_model_len=8192, dtype="bfloat16", @@ -681,7 +679,6 @@ def test_decode_logprobs_match_prefill_logprobs( llm = LLM( model=model_name, tensor_parallel_size=tp_size, - enable_prefix_caching=False, max_num_seqs=32, max_model_len=8192, dtype="bfloat16", @@ -928,7 +925,6 @@ def LLM_with_max_seqs( max_model_len=max_model_len, dtype="bfloat16", tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), - enable_prefix_caching=False, # Enable for MOE models # enable_expert_parallel=True, ) diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py index d74b435797f8f..5e3b997364949 100644 --- a/tests/v1/determinism/test_online_batch_invariance.py +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -153,7 +153,10 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( } tp_size = os.getenv("VLLM_TP_SIZE", "1") - server_args: list[str] = [] + server_args: list[str] = [ + "--max-model-len=8192", + "--max-num-seqs=32", + ] if tp_size: server_args += ["-tp", tp_size] diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index 0d7da107728b4..6aab50cf84ab6 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -17,6 +17,7 @@ skip_unsupported = pytest.mark.skipif( BACKENDS: list[str] = [ "FLASH_ATTN", + "TRITON_MLA", ] if has_flashinfer(): diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 340b161ea1e15..7e5adfe0742d3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -25,6 +25,7 @@ from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.model_executor.layers.linear import ( ColumnParallelLinear, UnquantizedLinearMethod, @@ -251,6 +252,24 @@ class Attention(nn.Module, AttentionLayerBase): else: self.attn_backend = attn_backend + # prefix caching + batch invariance is currently not supported for + # FLASHINFER and TRITON_MLA. + if ( + cache_config is not None + and cache_config.enable_prefix_caching + and vllm_is_batch_invariant() + and ( + self.attn_backend.get_name() == "FLASHINFER" + or self.attn_backend.get_name() == "TRITON_MLA" + ) + ): + logger.warning_once( + "Disabling prefix caching for FLASHINFER/TRITON_MLA " + "with batch invariance, as it is not yet supported.", + scope="local", + ) + cache_config.enable_prefix_caching = False + impl_cls = self.attn_backend.get_impl_cls() self.impl = impl_cls( num_heads, @@ -628,6 +647,23 @@ class MLAAttention(nn.Module, AttentionLayerBase): use_mla=True, use_sparse=use_sparse, ) + + if ( + cache_config is not None + and cache_config.enable_prefix_caching + and vllm_is_batch_invariant() + and ( + self.attn_backend.get_name() == "TRITON_MLA" + or self.attn_backend.get_name() == "FLASHINFER" + ) + ): + logger.warning_once( + "Disabling prefix caching for TRITON_MLA / FLASHINFER " + "with batch invariance, as it is not yet supported.", + scope="local", + ) + cache_config.enable_prefix_caching = False + impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls()) self.impl = impl_cls( num_heads=self.num_heads, diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 4154122636dcf..4cab47f4192a2 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -1006,11 +1006,11 @@ def override_envs_for_invariance(): "FLASH_ATTN", # best supported backend "FLASHINFER", "FLASH_ATTN_MLA", + "TRITON_MLA", # Not yet supported MLA backends # "FLASHMLA", # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967 - # "TRITON_MLA", ] if curr_attn_backend not in supported_backends: error = ( From 0ee6416f678de761f827f4bd387b36f0cbc43a0a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 8 Dec 2025 19:44:01 -0500 Subject: [PATCH 085/133] [Perf] Optimize `group_topk` kernel, 1.9% Throughput improvement, 2.1% TPOT improvemnt (#30159) Signed-off-by: yewentao256 --- csrc/moe/grouped_topk_kernels.cu | 175 ++++++++++++++++++++++--------- 1 file changed, 128 insertions(+), 47 deletions(-) diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index 69b4c1fb11d1a..47ee5f021eb4a 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) { return cuda_cast(sigmoid_accurate(f)); } -template +template +__device__ inline T apply_scoring(T val) { + if constexpr (SF == SCORING_SIGMOID) { + return apply_sigmoid(val); + } else { + return val; + } +} + +template __device__ void topk_with_k2(T* output, T const* input, T const* bias, cg::thread_block_tile<32> const& tile, int32_t const lane_id, - int const num_experts_per_group, - int const scoring_func) { + int const num_experts_per_group) { // Get the top2 per thread T largest = neg_inf(); T second_largest = neg_inf(); if (num_experts_per_group > WARP_SIZE) { for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { - T value = input[i]; - // Apply scoring function if needed - if (scoring_func == SCORING_SIGMOID) { - value = apply_sigmoid(value); - } + T value = apply_scoring(input[i]); value = value + bias[i]; if (value > largest) { @@ -472,11 +476,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias, } } else { for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { - T value = input[i]; - // Apply scoring function if needed - if (scoring_func == SCORING_SIGMOID) { - value = apply_sigmoid(value); - } + T value = apply_scoring(input[i]); value = value + bias[i]; largest = value; } @@ -501,13 +501,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias, } } -template +template __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias, int64_t const num_tokens, int64_t const num_cases, int64_t const n_group, - int64_t const num_experts_per_group, - int const scoring_func) { + int64_t const num_experts_per_group) { int32_t warp_id = threadIdx.x / WARP_SIZE; int32_t lane_id = threadIdx.x % WARP_SIZE; @@ -525,21 +524,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias, #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) asm volatile("griddepcontrol.wait;"); #endif - topk_with_k2(output, input, group_bias, tile, lane_id, - num_experts_per_group, scoring_func); + topk_with_k2(output, input, group_bias, tile, lane_id, + num_experts_per_group); } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) asm volatile("griddepcontrol.launch_dependents;"); #endif } -template +template __global__ void group_idx_and_topk_idx_kernel( T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices, T const* bias, int64_t const num_tokens, int64_t const n_group, int64_t const topk_group, int64_t const topk, int64_t const num_experts, int64_t const num_experts_per_group, bool renormalize, - double routed_scaling_factor, int scoring_func) { + double routed_scaling_factor) { int32_t warp_id = threadIdx.x / WARP_SIZE; int32_t lane_id = threadIdx.x % WARP_SIZE; int32_t case_id = @@ -549,6 +548,11 @@ __global__ void group_idx_and_topk_idx_kernel( topk_values += case_id * topk; topk_indices += case_id * topk; + constexpr bool kUseStaticNGroup = (NGroup > 0); + // use int32 to avoid implicit conversion + int32_t const n_group_i32 = + kUseStaticNGroup ? NGroup : static_cast(n_group); + int32_t align_num_experts_per_group = warp_topk::round_up_to_multiple_of(num_experts_per_group); @@ -574,13 +578,14 @@ __global__ void group_idx_and_topk_idx_kernel( if (case_id < num_tokens) { // calculate group_idx - int32_t target_num_min = WARP_SIZE - n_group + topk_group; + int32_t target_num_min = + WARP_SIZE - n_group_i32 + static_cast(topk_group); // The check is necessary to avoid abnormal input - if (lane_id < n_group && is_finite(group_scores[lane_id])) { + if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) { value = group_scores[lane_id]; } - int count_equal_to_top_value = WARP_SIZE - n_group; + int count_equal_to_top_value = WARP_SIZE - n_group_i32; int pre_count_equal_to_top_value = 0; // Use loop to find the largset top_group while (count_equal_to_top_value < target_num_min) { @@ -604,7 +609,7 @@ __global__ void group_idx_and_topk_idx_kernel( int count_equalto_topkth_group = 0; bool if_proceed_next_topk = topk_group_value != neg_inf(); if (case_id < num_tokens && if_proceed_next_topk) { - for (int i_group = 0; i_group < n_group; i_group++) { + auto process_group = [&](int i_group) { if ((group_scores[i_group] > topk_group_value) || ((group_scores[i_group] == topk_group_value) && (count_equalto_topkth_group < num_equalto_topkth_group))) { @@ -613,11 +618,10 @@ __global__ void group_idx_and_topk_idx_kernel( i += WARP_SIZE) { T candidates = neg_inf(); if (i < num_experts_per_group) { - // Apply scoring function (if any) and add bias + // apply scoring function (if any) and add bias T input = scores[offset + i]; if (is_finite(input)) { - T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) - : input; + T score = apply_scoring(input); candidates = score + bias[offset + i]; } } @@ -627,6 +631,17 @@ __global__ void group_idx_and_topk_idx_kernel( count_equalto_topkth_group++; } } + }; + + if constexpr (kUseStaticNGroup) { +#pragma unroll + for (int i_group = 0; i_group < NGroup; ++i_group) { + process_group(i_group); + } + } else { + for (int i_group = 0; i_group < n_group_i32; ++i_group) { + process_group(i_group); + } } queue.done(); __syncwarp(); @@ -646,12 +661,13 @@ __global__ void group_idx_and_topk_idx_kernel( if (i < topk) { // Load the score value (without bias) for normalization T input = scores[s_topk_idx[i]]; - value = - (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input; + value = apply_scoring(input); s_topk_value[i] = value; } - topk_sum += - cg::reduce(tile, cuda_cast(value), cg::plus()); + if (renormalize) { + topk_sum += + cg::reduce(tile, cuda_cast(value), cg::plus()); + } } } @@ -660,13 +676,9 @@ __global__ void group_idx_and_topk_idx_kernel( if (case_id < num_tokens) { if (if_proceed_next_topk) { for (int i = lane_id; i < topk; i += WARP_SIZE) { - float value; - if (renormalize) { - value = cuda_cast(s_topk_value[i]) / topk_sum * - routed_scaling_factor; - } else { - value = cuda_cast(s_topk_value[i]) * routed_scaling_factor; - } + float base = cuda_cast(s_topk_value[i]); + float value = renormalize ? (base / topk_sum * routed_scaling_factor) + : (base * routed_scaling_factor); topk_indices[i] = s_topk_idx[i]; topk_values[i] = value; } @@ -684,6 +696,45 @@ __global__ void group_idx_and_topk_idx_kernel( #endif } +template +inline void launch_group_idx_and_topk_kernel( + cudaLaunchConfig_t const& config, T* scores, T* group_scores, + float* topk_values, IdxT* topk_indices, T const* bias, + int64_t const num_tokens, int64_t const n_group, int64_t const topk_group, + int64_t const topk, int64_t const num_experts, + int64_t const num_experts_per_group, bool const renormalize, + double const routed_scaling_factor) { + auto launch = [&](auto* kernel_instance2) { + cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores, + topk_values, topk_indices, bias, num_tokens, n_group, + topk_group, topk, num_experts, num_experts_per_group, + renormalize, routed_scaling_factor); + }; + + switch (n_group) { + case 4: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + case 8: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + case 16: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + case 32: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + default: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + } +} + template void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values, IdxT* topk_indices, T const* bias, int64_t const num_tokens, @@ -694,7 +745,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values, cudaStream_t const stream = 0) { int64_t num_cases = num_tokens * n_group; int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1; - auto* kernel_instance1 = &topk_with_k2_kernel; cudaLaunchConfig_t config; config.gridDim = topk_with_k2_num_blocks; config.blockDim = BLOCK_SIZE; @@ -705,16 +755,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values, attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; config.numAttrs = 1; config.attrs = attrs; - cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias, - num_tokens, num_cases, n_group, num_experts / n_group, - scoring_func); + auto const sf = static_cast(scoring_func); + int64_t const num_experts_per_group = num_experts / n_group; + auto launch_topk_with_k2 = [&](auto* kernel_instance1) { + cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias, + num_tokens, num_cases, n_group, num_experts_per_group); + }; + switch (sf) { + case SCORING_NONE: { + auto* kernel_instance1 = &topk_with_k2_kernel; + launch_topk_with_k2(kernel_instance1); + break; + } + case SCORING_SIGMOID: { + auto* kernel_instance1 = &topk_with_k2_kernel; + launch_topk_with_k2(kernel_instance1); + break; + } + default: + // should be guarded by higher level checks. + TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc"); + } int64_t topk_with_k_group_num_blocks = (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1; size_t dynamic_smem_in_bytes = warp_topk::calc_smem_size_for_block_wide(NUM_WARPS_PER_BLOCK, topk); - auto* kernel_instance2 = &group_idx_and_topk_idx_kernel; config.gridDim = topk_with_k_group_num_blocks; config.blockDim = BLOCK_SIZE; config.dynamicSmemBytes = dynamic_smem_in_bytes; @@ -723,10 +790,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values, attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; config.numAttrs = 1; config.attrs = attrs; - cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores, - topk_values, topk_indices, bias, num_tokens, n_group, - topk_group, topk, num_experts, num_experts / n_group, - renormalize, routed_scaling_factor, scoring_func); + switch (sf) { + case SCORING_NONE: { + launch_group_idx_and_topk_kernel( + config, scores, group_scores, topk_values, topk_indices, bias, + num_tokens, n_group, topk_group, topk, num_experts, + num_experts_per_group, renormalize, routed_scaling_factor); + break; + } + case SCORING_SIGMOID: { + launch_group_idx_and_topk_kernel( + config, scores, group_scores, topk_values, topk_indices, bias, + num_tokens, n_group, topk_group, topk, num_experts, + num_experts_per_group, renormalize, routed_scaling_factor); + break; + } + default: + TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc"); + } } #define INSTANTIATE_NOAUX_TC(T, IdxT) \ From ae339b1a67ac6651dae926aae2afa0c168c1ddb5 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Mon, 8 Dec 2025 17:05:27 -0800 Subject: [PATCH 086/133] [Bugfix] Fix DeepGEMM after #29546 (#30267) Signed-off-by: zhewenli Signed-off-by: Zhewen Li --- .../layers/quantization/utils/fp8_utils.py | 16 ++++++++++------ vllm/utils/deep_gemm.py | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index ad92f4ec63c34..366c5778fa5d9 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -30,6 +30,7 @@ from vllm.model_executor.parameter import ( from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import ( + DeepGemmQuantScaleFMT, fp8_gemm_nt, is_deep_gemm_e8m0_used, is_deep_gemm_supported, @@ -268,12 +269,15 @@ class W8A8BlockFp8LinearOp: weight: torch.Tensor, weight_scale: torch.Tensor, ) -> torch.Tensor: - assert self.deepgemm_input_quant_op is not None - q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm( - input_2d, - group_size=self.act_quant_group_shape.col, - use_ue8m0=True, - ) + if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0: + q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm( + input_2d, + group_size=self.act_quant_group_shape.col, + use_ue8m0=True, + ) + else: + assert self.deepgemm_input_quant_op is not None + q_input, input_scale = self.deepgemm_input_quant_op(input_2d) output = torch.empty( (q_input.shape[0], weight.shape[0]), dtype=torch.bfloat16, diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 8545108a02666..a099fde1bdc45 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -399,6 +399,7 @@ def should_use_deepgemm_for_fp8_linear_for_nk( __all__ = [ "calc_diff", + "DeepGemmQuantScaleFMT", "fp8_gemm_nt", "m_grouped_fp8_gemm_nt_contiguous", "fp8_m_grouped_gemm_nt_masked", From 7b35011ad136748b3d18d069103feda9a239459f Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Mon, 8 Dec 2025 17:14:10 -0800 Subject: [PATCH 087/133] Mark qwen2_5_vl as xfail (#30283) Signed-off-by: Yanan Cao --- tests/compile/fullgraph/test_multimodal_compile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/compile/fullgraph/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py index 621f6a51a918f..e2897b2271b8f 100644 --- a/tests/compile/fullgraph/test_multimodal_compile.py +++ b/tests/compile/fullgraph/test_multimodal_compile.py @@ -17,6 +17,7 @@ def test_compile(): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") +@pytest.mark.xfail def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch): """Test that Qwen2.5-VL vision submodules are compiled. From e41312a2f5086cc9199e024c5d451e65a303a4d1 Mon Sep 17 00:00:00 2001 From: Christina Norman Date: Mon, 8 Dec 2025 19:52:43 -0600 Subject: [PATCH 088/133] [Bugfix] Skip generation config fallback for GGUF to prevent multi-process hang (#30209) Co-authored-by: Claude Opus 4.5 --- vllm/transformers_utils/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 773fc05a52ef3..d761802da9403 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -954,6 +954,13 @@ def try_get_generation_config( revision: str | None = None, config_format: str | ConfigFormat = "auto", ) -> GenerationConfig | None: + # GGUF files don't have generation_config.json - their config is embedded + # in the file header. Skip all filesystem lookups to avoid re-reading the + # memory-mapped file, which can hang in multi-process scenarios when the + # EngineCore process already has the file mapped. + if is_gguf(model): + return None + try: return GenerationConfig.from_pretrained( model, From 78c75033642b8c2832fcfb8d483871b65673326b Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Mon, 8 Dec 2025 20:14:02 -0600 Subject: [PATCH 089/133] [ROCm][CI] Skip NVIDIA-Only Prime-RL Test in AMD CI (#29420) Signed-off-by: Micah Williamson --- .buildkite/scripts/run-prime-rl-test.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh index 5b25c358fc4aa..3fb7c82c8d333 100755 --- a/.buildkite/scripts/run-prime-rl-test.sh +++ b/.buildkite/scripts/run-prime-rl-test.sh @@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" PRIME_RL_DIR="${REPO_ROOT}/prime-rl" +if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then + echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..." + exit 0 +fi + echo "Setting up Prime-RL integration test environment..." # Clean up any existing Prime-RL directory From db14f61f2d6d33e38d382f2e3e7de514dac8e218 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 8 Dec 2025 18:25:43 -0800 Subject: [PATCH 090/133] [ci] Refactor CI file structure (#29343) --- .buildkite/ci_config.yaml | 24 +++ .buildkite/image_build/image_build.sh | 56 +++++ .buildkite/image_build/image_build.yaml | 57 +++++ .buildkite/image_build/image_build_cpu.sh | 36 ++++ .../image_build/image_build_cpu_arm64.sh | 33 +++ .buildkite/image_build/image_build_hpu.sh | 34 +++ .buildkite/test_areas/attention.yaml | 21 ++ .buildkite/test_areas/basic_correctness.yaml | 16 ++ .buildkite/test_areas/benchmarks.yaml | 19 ++ .buildkite/test_areas/compile.yaml | 57 +++++ .buildkite/test_areas/cuda.yaml | 22 ++ .buildkite/test_areas/distributed.yaml | 199 ++++++++++++++++++ .buildkite/test_areas/e2e_integration.yaml | 59 ++++++ .buildkite/test_areas/engine.yaml | 26 +++ .buildkite/test_areas/entrypoints.yaml | 68 ++++++ .buildkite/test_areas/expert_parallelism.yaml | 23 ++ .buildkite/test_areas/kernels.yaml | 117 ++++++++++ .buildkite/test_areas/lm_eval.yaml | 46 ++++ .buildkite/test_areas/lora.yaml | 31 +++ .buildkite/test_areas/misc.yaml | 163 ++++++++++++++ .buildkite/test_areas/model_executor.yaml | 17 ++ .buildkite/test_areas/models_basic.yaml | 62 ++++++ .buildkite/test_areas/models_distributed.yaml | 22 ++ .buildkite/test_areas/models_language.yaml | 91 ++++++++ .buildkite/test_areas/models_multimodal.yaml | 79 +++++++ .buildkite/test_areas/plugins.yaml | 34 +++ .buildkite/test_areas/pytorch.yaml | 50 +++++ .buildkite/test_areas/quantization.yaml | 46 ++++ .buildkite/test_areas/samplers.yaml | 14 ++ .buildkite/test_areas/tool_use.yaml | 23 ++ .buildkite/test_areas/weight_loading.yaml | 25 +++ 31 files changed, 1570 insertions(+) create mode 100644 .buildkite/ci_config.yaml create mode 100755 .buildkite/image_build/image_build.sh create mode 100644 .buildkite/image_build/image_build.yaml create mode 100755 .buildkite/image_build/image_build_cpu.sh create mode 100755 .buildkite/image_build/image_build_cpu_arm64.sh create mode 100755 .buildkite/image_build/image_build_hpu.sh create mode 100644 .buildkite/test_areas/attention.yaml create mode 100644 .buildkite/test_areas/basic_correctness.yaml create mode 100644 .buildkite/test_areas/benchmarks.yaml create mode 100644 .buildkite/test_areas/compile.yaml create mode 100644 .buildkite/test_areas/cuda.yaml create mode 100644 .buildkite/test_areas/distributed.yaml create mode 100644 .buildkite/test_areas/e2e_integration.yaml create mode 100644 .buildkite/test_areas/engine.yaml create mode 100644 .buildkite/test_areas/entrypoints.yaml create mode 100644 .buildkite/test_areas/expert_parallelism.yaml create mode 100644 .buildkite/test_areas/kernels.yaml create mode 100644 .buildkite/test_areas/lm_eval.yaml create mode 100644 .buildkite/test_areas/lora.yaml create mode 100644 .buildkite/test_areas/misc.yaml create mode 100644 .buildkite/test_areas/model_executor.yaml create mode 100644 .buildkite/test_areas/models_basic.yaml create mode 100644 .buildkite/test_areas/models_distributed.yaml create mode 100644 .buildkite/test_areas/models_language.yaml create mode 100644 .buildkite/test_areas/models_multimodal.yaml create mode 100644 .buildkite/test_areas/plugins.yaml create mode 100644 .buildkite/test_areas/pytorch.yaml create mode 100644 .buildkite/test_areas/quantization.yaml create mode 100644 .buildkite/test_areas/samplers.yaml create mode 100644 .buildkite/test_areas/tool_use.yaml create mode 100644 .buildkite/test_areas/weight_loading.yaml diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml new file mode 100644 index 0000000000000..199c33159fde3 --- /dev/null +++ b/.buildkite/ci_config.yaml @@ -0,0 +1,24 @@ +name: vllm_ci +job_dirs: + - ".buildkite/test_areas" + - ".buildkite/image_build" +run_all_patterns: + - "docker/Dockerfile" + - "CMakeLists.txt" + - "requirements/common.txt" + - "requirements/cuda.txt" + - "requirements/build.txt" + - "requirements/test.txt" + - "setup.py" + - "csrc/" + - "cmake/" +run_all_exclude_patterns: + - "docker/Dockerfile." + - "csrc/cpu/" + - "csrc/rocm/" + - "cmake/hipify.py" + - "cmake/cpu_extension.cmake" +registries: public.ecr.aws/q9t5s3a7 +repositories: + main: "vllm-ci-postmerge-repo" + premerge: "vllm-ci-test-repo" diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh new file mode 100755 index 0000000000000..9a2384e524b63 --- /dev/null +++ b/.buildkite/image_build/image_build.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -e + +if [[ $# -lt 8 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 +BRANCH=$4 +VLLM_USE_PRECOMPILED=$5 +VLLM_MERGE_BASE_COMMIT=$6 +CACHE_FROM=$7 +CACHE_TO=$8 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY +aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com + +# docker buildx +docker buildx create --name vllm-builder --driver docker-container --use +docker buildx inspect --bootstrap +docker buildx ls + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then + merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}" +else + merge_base_commit_build_args="" +fi + +# build +docker buildx build --file docker/Dockerfile \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg USE_SCCACHE=1 \ + --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \ + --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \ + --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \ + ${merge_base_commit_build_args} \ + --cache-from type=registry,ref=${CACHE_FROM},mode=max \ + --cache-to type=registry,ref=${CACHE_TO},mode=max \ + --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \ + $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \ + --push \ + --target test \ + --progress plain . diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml new file mode 100644 index 0000000000000..d01c71dd9becf --- /dev/null +++ b/.buildkite/image_build/image_build.yaml @@ -0,0 +1,57 @@ +group: Abuild +steps: + - label: ":docker: Build image" + key: image-build + depends_on: [] + commands: + - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CPU image" + key: image-build-cpu + depends_on: [] + commands: + - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build HPU image" + soft_fail: true + depends_on: [] + key: image-build-hpu + commands: + - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CPU arm64 image" + key: cpu-arm64-image-build + depends_on: [] + optional: true + commands: + - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh new file mode 100755 index 0000000000000..a69732f430985 --- /dev/null +++ b/.buildkite/image_build/image_build_cpu.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile.cpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg VLLM_CPU_AVX512BF16=true \ + --build-arg VLLM_CPU_AVX512VNNI=true \ + --build-arg VLLM_CPU_AMXBF16=true \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --target vllm-test \ + --progress plain . + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh new file mode 100755 index 0000000000000..615298b6555bd --- /dev/null +++ b/.buildkite/image_build/image_build_cpu_arm64.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile.cpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --target vllm-test \ + --progress plain . + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh new file mode 100755 index 0000000000000..192447ef4577e --- /dev/null +++ b/.buildkite/image_build/image_build_hpu.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build \ + --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \ + --progress plain \ + https://github.com/vllm-project/vllm-gaudi.git + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml new file mode 100644 index 0000000000000..6e444eae14c74 --- /dev/null +++ b/.buildkite/test_areas/attention.yaml @@ -0,0 +1,21 @@ +group: Attention +depends_on: + - image-build +steps: +- label: V1 attention (H100) + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: V1 attention (B200) + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml new file mode 100644 index 0000000000000..759d2b5358714 --- /dev/null +++ b/.buildkite/test_areas/basic_correctness.yaml @@ -0,0 +1,16 @@ +group: Basic Correctness +depends_on: + - image-build +steps: +- label: Basic Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml new file mode 100644 index 0000000000000..574b642d407b0 --- /dev/null +++ b/.buildkite/test_areas/benchmarks.yaml @@ -0,0 +1,19 @@ +group: Benchmarks +depends_on: + - image-build +steps: +- label: Benchmarks + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml new file mode 100644 index 0000000000000..0ba00925a4838 --- /dev/null +++ b/.buildkite/test_areas/compile.yaml @@ -0,0 +1,57 @@ +group: Compile +depends_on: + - image-build +steps: +- label: Fusion and Compile Tests (B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +- label: Fusion E2E (2 GPUs)(B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/distributed/test_fusions_e2e.py + commands: + - nvidia-smi + # Run all e2e fusion tests + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml new file mode 100644 index 0000000000000..50c0c338c2434 --- /dev/null +++ b/.buildkite/test_areas/cuda.yaml @@ -0,0 +1,22 @@ +group: CUDA +depends_on: + - image-build +steps: +- label: Platform Tests (CUDA) + timeout_in_minutes: 15 + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Cudagraph + timeout_in_minutes: 20 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py \ No newline at end of file diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml new file mode 100644 index 0000000000000..2cc90698d916a --- /dev/null +++ b/.buildkite/test_areas/distributed.yaml @@ -0,0 +1,199 @@ +group: Distributed +depends_on: + - image-build +steps: +- label: Distributed Comm Ops + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: Distributed (2 GPUs) + timeout_in_minutes: 90 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Tests (4 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - cd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + +- label: Distributed Tests (8 GPUs)(H100) + timeout_in_minutes: 10 + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: Distributed Tests (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: Distributed Tests (2 GPUs)(H200) + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: Distributed Tests (2 GPUs)(B200) + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: 2 Node Test (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" + +- label: Distributed NixlConnector PD accuracy (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + +- label: Pipeline + Context Parallelism (4 GPUs)) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py \ No newline at end of file diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml new file mode 100644 index 0000000000000..93d389815edac --- /dev/null +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -0,0 +1,59 @@ +group: E2E Integration +depends_on: + - image-build +steps: +- label: DeepSeek V2-Lite Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) + timeout_in_minutes: 60 + gpu: b200 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + +- label: Prime-RL Integration (2 GPUs) + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: DeepSeek V2-Lite Async EPLB Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml new file mode 100644 index 0000000000000..a028e0e4af4c1 --- /dev/null +++ b/.buildkite/test_areas/engine.yaml @@ -0,0 +1,26 @@ +group: Engine +depends_on: + - image-build +steps: +- label: Engine + timeout_in_minutes: 15 + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + +- label: V1 e2e + engine + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml new file mode 100644 index 0000000000000..0a789be943f37 --- /dev/null +++ b/.buildkite/test_areas/entrypoints.yaml @@ -0,0 +1,68 @@ +group: Entrypoints +depends_on: + - image-build +steps: +- label: Entrypoints Unit Tests + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration (LLM) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration (API Server) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + + +- label: Entrypoints Integration (Pooling) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + + +- label: Entrypoints V1 + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: OpenAI API Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml new file mode 100644 index 0000000000000..feb8252148c7f --- /dev/null +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -0,0 +1,23 @@ +group: Expert Parallelism +depends_on: + - image-build +steps: +- label: EPLB Algorithm + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py \ No newline at end of file diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml new file mode 100644 index 0000000000000..7ca099516d641 --- /dev/null +++ b/.buildkite/test_areas/kernels.yaml @@ -0,0 +1,117 @@ +group: Kernels +depends_on: + - image-build +steps: +- label: Kernels Core Operation Test + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N + timeout_in_minutes: 35 + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N + timeout_in_minutes: 60 + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test + timeout_in_minutes: 45 + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + +- label: Kernels (B200) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py \ No newline at end of file diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml new file mode 100644 index 0000000000000..9af43e0c375a8 --- /dev/null +++ b/.buildkite/test_areas/lm_eval.yaml @@ -0,0 +1,46 @@ +group: LM Eval +depends_on: + - image-build +steps: +- label: LM Eval Small Models + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: LM Eval Large Models (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (4 GPUs)(H100) + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + +- label: LM Eval Small Models (B200) + timeout_in_minutes: 120 + gpu: b200 + optional: true + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml new file mode 100644 index 0000000000000..809b4138f44ba --- /dev/null +++ b/.buildkite/test_areas/lora.yaml @@ -0,0 +1,31 @@ +group: LoRA +depends_on: + - image-build +steps: +- label: LoRA %N + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py + parallelism: 4 + + +- label: LoRA TP (Distributed) + timeout_in_minutes: 30 + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py \ No newline at end of file diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml new file mode 100644 index 0000000000000..072bccadb726a --- /dev/null +++ b/.buildkite/test_areas/misc.yaml @@ -0,0 +1,163 @@ +group: Miscellaneous +depends_on: + - image-build +steps: +- label: V1 Others + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + # Integration test for streaming correctness (requires special branch). + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Others (CPU) + depends_on: ~ + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + +- label: Regression + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Examples + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/chat.py # for basic + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + # for multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +- label: Metrics, Tracing (2 GPUs) + timeout_in_minutes: 20 + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +- label: Python-only Installation + depends_on: ~ + timeout_in_minutes: 20 + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Async Engine, Inputs, Utils, Worker + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker, Config (CPU) + depends_on: ~ + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ + - tests/transformers_utils + - tests/config + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s tokenizers_ + - pytest -v -s transformers_utils + - pytest -v -s config + +- label: GPT-OSS Eval (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Batch Invariance (H100) + timeout_in_minutes: 25 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py \ No newline at end of file diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml new file mode 100644 index 0000000000000..996c8bb8b780a --- /dev/null +++ b/.buildkite/test_areas/model_executor.yaml @@ -0,0 +1,17 @@ +group: Model Executor +depends_on: + - image-build +steps: +- label: Model Executor + timeout_in_minutes: 35 + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml new file mode 100644 index 0000000000000..39a5d51c48833 --- /dev/null +++ b/.buildkite/test_areas/models_basic.yaml @@ -0,0 +1,62 @@ +group: Models - Basic +depends_on: + - image-build +steps: +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Transformers Nightly Models + working_dir: "/vllm-workspace/" + optional: true + soft_fail: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml new file mode 100644 index 0000000000000..b6bfbf2ddab47 --- /dev/null +++ b/.buildkite/test_areas/models_distributed.yaml @@ -0,0 +1,22 @@ +group: Models - Distributed +depends_on: + - image-build +steps: +- label: Distributed Model Tests (2 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml new file mode 100644 index 0000000000000..f70192c4ebc0a --- /dev/null +++ b/.buildkite/test_areas/models_language.yaml @@ -0,0 +1,91 @@ +group: Models - Language +depends_on: + - image-build +steps: +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml new file mode 100644 index 0000000000000..fc24068c20a46 --- /dev/null +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -0,0 +1,79 @@ +group: Models - Multimodal +depends_on: + - image-build +steps: +- label: Multi-Modal Models (Standard) # 60min + timeout_in_minutes: 80 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + no_gpu: true + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Processor # 44min + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + +- label: Multi-Modal Models (Extended) 1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models (Extended) 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models (Extended) 3 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml new file mode 100644 index 0000000000000..60c179aa098e1 --- /dev/null +++ b/.buildkite/test_areas/plugins.yaml @@ -0,0 +1,34 @@ +group: Plugins +depends_on: + - image-build +steps: +- label: Plugin Tests (2 GPUs) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml new file mode 100644 index 0000000000000..703c82eb1a91b --- /dev/null +++ b/.buildkite/test_areas/pytorch.yaml @@ -0,0 +1,50 @@ +group: PyTorch +depends_on: + - image-build +steps: +- label: PyTorch Compilation Unit Tests + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;" + +- label: PyTorch Fullgraph Smoke Test + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;" + +- label: PyTorch Fullgraph + timeout_in_minutes: 40 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # fp8 kv scales not supported on sm89, tested on Blackwell instead + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies. Please check the error message and add the package to whitelist + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh \ No newline at end of file diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml new file mode 100644 index 0000000000000..6e89d6af3b8d1 --- /dev/null +++ b/.buildkite/test_areas/quantization.yaml @@ -0,0 +1,46 @@ +group: Quantization +depends_on: + - image-build +steps: +- label: Quantization + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: Quantized MoE Test (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Quantized Models Test + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml new file mode 100644 index 0000000000000..ad377148fd073 --- /dev/null +++ b/.buildkite/test_areas/samplers.yaml @@ -0,0 +1,14 @@ +group: Samplers +depends_on: + - image-build +steps: +- label: Samplers Test + timeout_in_minutes: 75 + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml new file mode 100644 index 0000000000000..7040cd1d253b3 --- /dev/null +++ b/.buildkite/test_areas/tool_use.yaml @@ -0,0 +1,23 @@ +group: Tool use +depends_on: + - image-build +steps: +- label: OpenAI-Compatible Tool Use + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) + depends_on: ~ + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml new file mode 100644 index 0000000000000..cfc5bb20fe7ad --- /dev/null +++ b/.buildkite/test_areas/weight_loading.yaml @@ -0,0 +1,25 @@ +group: Weight Loading +depends_on: + - image-build +steps: +- label: Weight Loading Multiple GPU # 33min + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU - Large Models # optional + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt From ea657f2078072541f68fbc11ab83f81a585a9dc4 Mon Sep 17 00:00:00 2001 From: gnovack Date: Mon, 8 Dec 2025 18:35:16 -0800 Subject: [PATCH 091/133] Lora MoE Align Improvements (#29257) Signed-off-by: gnovack --- CMakeLists.txt | 1 - csrc/moe/moe_align_sum_kernels.cu | 425 ++++++++++++++++++++----- csrc/moe/moe_lora_align_sum_kernels.cu | 174 ---------- csrc/moe/moe_ops.h | 2 +- csrc/moe/torch_bindings.cpp | 3 +- tests/lora/test_moe_lora_align_sum.py | 2 +- vllm/_custom_ops.py | 2 + 7 files changed, 360 insertions(+), 249 deletions(-) delete mode 100644 csrc/moe/moe_lora_align_sum_kernels.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index e09972fe71995..69a538b06cba3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -944,7 +944,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" "csrc/moe/moe_align_sum_kernels.cu" - "csrc/moe/moe_lora_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index ddcdcc38b4fea..5c9e474024082 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -14,7 +14,6 @@ namespace vllm { namespace moe { - namespace batched_moe_align_block_size { // Note num_threads needs to be 1024 for BlockScan Reduction in the kernel. @@ -80,23 +79,30 @@ __global__ void batched_moe_align_block_size_kernel( } // namespace batched_moe_align_block_size template -__global__ void moe_align_block_size_kernel( +__device__ void _moe_align_block_size( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, int32_t* __restrict__ total_tokens_post_pad, int32_t* __restrict__ expert_map, int32_t num_experts, int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded, - bool has_expert_map) { + int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id, + int32_t topk_num, int32_t* token_mask, bool has_expert_map) { extern __shared__ int32_t shared_counts[]; - // Use a separate threadblock to fill sorted_token_ids. + // Compute input buffer offsets. Typically these will all be 0, except when + // using Multi LoRA. + int sorted_token_ids_offset = max_num_tokens_padded * model_offset; + int expert_ids_offset = max_num_m_blocks * model_offset; + int cumsum_offset = (num_experts + 1) * model_offset; + + // Use separate threadblocks to fill sorted_token_ids. // This is safe since the current kernel does not use sorted_token_ids. - if (blockIdx.x == 1) { + if (blockIdx.x % 2) { // Initialize sorted_token_ids with numel for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { - sorted_token_ids[it] = numel; + sorted_token_ids[sorted_token_ids_offset + it] = numel; } return; } @@ -127,7 +133,9 @@ __global__ void moe_align_block_size_kernel( } int warp_idx = expert_id / experts_per_warp; int expert_offset = expert_id % experts_per_warp; - atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1); + int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num]; + atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], + mask); } __syncthreads(); @@ -148,77 +156,44 @@ __global__ void moe_align_block_size_kernel( int cumsum_val; BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val); if (expert_id <= num_experts) { - cumsum[expert_id] = cumsum_val; + cumsum[cumsum_offset + expert_id] = cumsum_val; } if (expert_id == num_experts) { - *total_tokens_post_pad = cumsum_val; + total_tokens_post_pad[model_offset] = cumsum_val; } __syncthreads(); if (threadIdx.x < num_experts) { - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[i / block_size] = threadIdx.x; + for (int i = cumsum[cumsum_offset + threadIdx.x]; + i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) { + expert_ids[expert_ids_offset + i / block_size] = threadIdx.x; } } // Fill remaining expert_ids with 0 - const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x; - const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); - for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) { - expert_ids[i] = 0; - } -} - -template -__global__ void count_and_sort_expert_tokens_kernel( - const scalar_t* __restrict__ topk_ids, - int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, - int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, - bool has_expert_map) { - const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const size_t stride = blockDim.x * gridDim.x; - - for (size_t i = tid; i < numel; i += stride) { - int32_t expert_id = topk_ids[i]; - if (expert_id >= num_experts) { - continue; - } - if (has_expert_map) { - expert_id = expert_map[expert_id]; - // filter invalid experts - if (expert_id == -1) continue; - } - int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); - sorted_token_ids[rank_post_pad] = i; - } -} - -template -__global__ void moe_sum_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., topk, d] - const int d) { - const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - scalar_t x = 0.0; -#pragma unroll - for (int k = 0; k < TOPK; ++k) { - x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]); - } - out[token_idx * d + idx] = x; + const size_t fill_start_idx = + cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x; + for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) { + expert_ids[expert_ids_offset + i] = inactive_expert_id; } } template -__global__ void moe_align_block_size_small_batch_expert_kernel( +__device__ void _moe_align_block_size_small_batch_expert( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, int32_t* __restrict__ total_tokens_post_pad, int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size, - size_t numel, int32_t max_num_tokens_padded, bool has_expert_map) { + size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks, + int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num, + int32_t* token_mask, bool has_expert_map) { + // Compute input buffer offsets. Typically these will all be 0, except when + // using Multi LoRA. + int sorted_token_ids_offset = max_num_tokens_padded * model_offset; + int expert_ids_offset = max_num_m_blocks * model_offset; + // Use an additional group of threads to fill sorted_token_ids. // Since the current kernel will use sorted_token_ids afterward, // we fill sorted_token_ids within the same threadblock to make @@ -227,7 +202,7 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( // Initialize sorted_token_ids with numel for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += fill_threads) { - sorted_token_ids[it] = numel; + sorted_token_ids[sorted_token_ids_offset + it] = numel; } // Three __syncthreads() corresponding to the other threads __syncthreads(); @@ -254,7 +229,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( // filter invalid expert if (expert_id == -1) continue; } - ++tokens_cnts[(tid + 1) * num_experts + expert_id]; + int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num]; + tokens_cnts[(tid + 1) * num_experts + expert_id] += mask; } __syncthreads(); @@ -277,22 +253,22 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) * block_size; } - *total_tokens_post_pad = static_cast(cumsum[num_experts]); + total_tokens_post_pad[model_offset] = + static_cast(cumsum[num_experts]); } __syncthreads(); if (tid < num_experts) { for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) { - expert_ids[i / block_size] = tid; + expert_ids[expert_ids_offset + i / block_size] = tid; } } // Fill remaining expert_ids with 0 const size_t fill_start_idx = cumsum[num_experts] / block_size + tid; - const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); - for (size_t i = fill_start_idx; i < expert_ids_size; i += stride) { - expert_ids[i] = 0; + for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) { + expert_ids[expert_ids_offset + i] = inactive_expert_id; } for (size_t i = tid; i < numel; i += stride) { @@ -304,11 +280,195 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( } int32_t rank_post_pad = tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id]; - sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[tid * num_experts + expert_id]; + + if (token_mask == nullptr || token_mask[i / topk_num]) { + sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i; + ++tokens_cnts[tid * num_experts + expert_id]; + } } } +template +__device__ void _count_and_sort_expert_tokens( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, + int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, + int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask, + int32_t model_offset, int32_t topk_num, bool has_expert_map) { + const size_t tid = blockIdx.y * blockDim.x + threadIdx.x; + const size_t stride = blockDim.x * gridDim.y; + + for (size_t i = tid; i < numel; i += stride) { + int32_t expert_id = topk_ids[i]; + if (expert_id >= num_experts) { + continue; + } + + if (has_expert_map) { + expert_id = expert_map[expert_id]; + // filter invalid experts + if (expert_id == -1) continue; + } + + if (token_mask == nullptr || token_mask[i / topk_num]) { + int32_t rank_post_pad = atomicAdd( + &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1); + sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] = + i; + } + } +} + +template +__global__ void moe_align_block_size_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int32_t* __restrict__ total_tokens_post_pad, + int32_t* __restrict__ expert_map, int32_t num_experts, + int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, + size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded, + int32_t topk_num, bool has_expert_map) { + _moe_align_block_size( + topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map, + num_experts, padded_num_experts, experts_per_warp, block_size, numel, + cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size), + 0, 0, topk_num, nullptr, has_expert_map); +} + +template +__global__ void count_and_sort_expert_tokens_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, + int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, + int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) { + _count_and_sort_expert_tokens( + topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts, + max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map); +} + +template +__global__ void moe_sum_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., topk, d] + const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + scalar_t x = 0.0; +#pragma unroll + for (int k = 0; k < TOPK; ++k) { + x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]); + } + out[token_idx * d + idx] = x; + } +} + +template +__global__ void moe_align_block_size_small_batch_expert_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int32_t* __restrict__ total_tokens_post_pad, + int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size, + size_t numel, int32_t max_num_tokens_padded, int32_t topk_num, + bool has_expert_map) { + _moe_align_block_size_small_batch_expert( + topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map, + num_experts, block_size, numel, max_num_tokens_padded, + CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr, + has_expert_map); +} + +template +__global__ void moe_lora_align_block_size_kernel( + scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping, + int64_t block_size, int32_t* __restrict__ expert_map, int num_experts, + int max_loras, size_t numel, int max_num_tokens_padded, + int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids, + int32_t* __restrict__ expert_ids, int32_t topk_num, + int32_t* total_tokens_post_pad, int32_t* adapter_enabled, + int32_t* __restrict__ cumsum, int32_t experts_per_warp, + int32_t padded_num_experts, int32_t* lora_ids, + int32_t* __restrict__ token_mask, bool has_expert_map) { + int lora_idx = blockIdx.x / 2; + int lora_id = lora_ids[lora_idx]; + if (lora_id == -1 || adapter_enabled[lora_id] == 0) { + return; + } + + // Populate the token_mask based on the token-LoRA mapping + int num_tokens = numel / topk_num; + if (threadIdx.x == 0) { + total_tokens_post_pad[lora_id] = 0; + + for (int i = 0; i < num_tokens; i++) { + token_mask[(lora_id * num_tokens) + i] = + (int)token_lora_mapping[i] == lora_id; + } + } + + __syncthreads(); + + _moe_align_block_size( + topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map, + num_experts, padded_num_experts, experts_per_warp, block_size, numel, + cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num, + &token_mask[(lora_id * num_tokens)], has_expert_map); +} + +template +__global__ void lora_count_and_sort_expert_tokens_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, + int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, + int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask, + int32_t* lora_ids, bool has_expert_map) { + int lora_idx = blockIdx.x; + int lora_id = lora_ids[lora_idx]; + if (lora_id == -1) { + return; + } + + int num_tokens = numel / topk_num; + + _count_and_sort_expert_tokens( + topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts, + max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id, + topk_num, has_expert_map); +} + +template +__global__ void moe_lora_align_block_size_small_batch_expert_kernel( + scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping, + int64_t block_size, int32_t* __restrict__ expert_map, int num_experts, + int max_loras, size_t numel, int max_num_tokens_padded, + int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids, + int32_t* __restrict__ expert_ids, int topk_num, + int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids, + int32_t* token_mask, bool has_expert_map) { + int lora_idx = blockIdx.x; + int lora_id = lora_ids[lora_idx]; + if (lora_id == -1 || adapter_enabled[lora_id] == 0) { + return; + } + + int num_tokens = numel / topk_num; + if (threadIdx.x == 0) { + total_tokens_post_pad[lora_id] = 0; + + for (int i = 0; i < num_tokens; i++) { + token_mask[(lora_id * num_tokens) + i] = + (int)token_lora_mapping[i] == lora_id; + } + } + + __syncthreads(); + + _moe_align_block_size_small_batch_expert( + topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map, + num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks, + -1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)], + has_expert_map); +} + } // namespace moe } // namespace vllm @@ -365,7 +525,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), expert_map.data_ptr(), num_experts, block_size, - topk_ids.numel(), sorted_token_ids.size(0), has_expert_map); + topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1), + has_expert_map); } else { torch::Tensor cumsum_buffer = torch::empty({num_experts + 1}, options_int); @@ -386,21 +547,23 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, expert_map.data_ptr(), num_experts, padded_num_experts, experts_per_warp, block_size, topk_ids.numel(), cumsum_buffer.data_ptr(), sorted_token_ids.size(0), - has_expert_map); + topk_ids.size(1), has_expert_map); const int block_threads = std::min(256, (int)threads); const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads; const int max_blocks = 65535; const int actual_blocks = std::min(num_blocks, max_blocks); + dim3 gridDims(1, actual_blocks); auto sort_kernel = vllm::moe::count_and_sort_expert_tokens_kernel; - sort_kernel<<>>( + sort_kernel<<>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), cumsum_buffer.data_ptr(), expert_map.data_ptr(), - topk_ids.numel(), num_experts, has_expert_map); + topk_ids.numel(), num_experts, sorted_token_ids.size(0), + topk_ids.size(1), has_expert_map); } }); } @@ -474,3 +637,123 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] break; } } + +void moe_lora_align_block_size( + torch::Tensor topk_ids, torch::Tensor token_lora_mapping, + int64_t num_experts, int64_t block_size, int64_t max_loras, + int64_t max_num_tokens_padded, int64_t max_num_m_blocks, + torch::Tensor sorted_token_ids, torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled, + torch::Tensor lora_ids, std::optional maybe_expert_map) { + const int topk_num = topk_ids.size(1); + + TORCH_CHECK(block_size > 0, "block_size should be greater than 0. "); + + int device_max_shared_mem; + auto dev = topk_ids.get_device(); + cudaDeviceGetAttribute(&device_max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int64_t padded_num_experts = + ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; + + // BlockScan uses 1024 threads and assigns one thread per expert. + TORCH_CHECK(padded_num_experts < 1024, + "padded_num_experts must be less than 1024"); + + auto options_int = + torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); + torch::Tensor token_mask = + torch::empty({max_loras * topk_ids.size(0)}, options_int); + bool has_expert_map = maybe_expert_map.has_value(); + torch::Tensor expert_map; + if (has_expert_map) { + expert_map = maybe_expert_map.value(); + } else { + expert_map = torch::empty({0}, options_int); + } + + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] { + bool small_batch_expert_mode = + (topk_ids.numel() < 1024) && (num_experts <= 64); + + if (small_batch_expert_mode) { + const int32_t num_thread = max((int32_t)num_experts, 128); + const int32_t shared_mem = + (num_thread + 1) * num_experts * sizeof(int32_t) + + (num_experts + 1) * sizeof(int32_t); + if (shared_mem > device_max_shared_mem) { + TORCH_CHECK(false, "Shared memory usage exceeds device limit."); + } + + // threadIdx.x >= fill_threads: counting experts and aligning + // threadIdx.x < fill_threads: filling sorted_token_ids + constexpr int32_t fill_threads = 256; + + dim3 blockDim(num_thread + fill_threads); + auto kernel = + vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel< + scalar_t, fill_threads>; + AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( + (void*)kernel, shared_mem)); + kernel<<>>( + topk_ids.data_ptr(), + token_lora_mapping.data_ptr(), block_size, + expert_map.data_ptr(), num_experts, max_loras, + topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks, + sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), topk_num, + num_tokens_post_pad.data_ptr(), + adapter_enabled.data_ptr(), lora_ids.data_ptr(), + token_mask.data_ptr(), has_expert_map); + } else { + int num_thread = 1024; + dim3 blockDim(num_thread); + size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE); + + size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t); + + // cumsum buffer + torch::Tensor cumsum = + torch::zeros({max_loras * (num_experts + 1)}, options_int); + + auto align_kernel = + vllm::moe::moe_lora_align_block_size_kernel; + + // launch two threadblocks for each lora + // blockIdx.x % 2 == 0: counting experts and aligning + // blockIdx.x % 2 == 1: filling sorted_token_ids + align_kernel<<>>( + topk_ids.data_ptr(), + token_lora_mapping.data_ptr(), block_size, + expert_map.data_ptr(), num_experts, max_loras, + topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks, + sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), topk_num, + num_tokens_post_pad.data_ptr(), + adapter_enabled.data_ptr(), cumsum.data_ptr(), + WARP_SIZE, padded_num_experts, lora_ids.data_ptr(), + token_mask.data_ptr(), has_expert_map); + + const int block_threads = std::min(256, (int)num_thread); + const int num_blocks = + (topk_ids.numel() + block_threads - 1) / block_threads; + + const int max_blocks = 65535; + const int actual_blocks = std::min(num_blocks, max_blocks); + + dim3 gridDims(max_loras, actual_blocks); + auto sort_kernel = + vllm::moe::lora_count_and_sort_expert_tokens_kernel; + + sort_kernel<<>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), cumsum.data_ptr(), + expert_map.data_ptr(), topk_ids.numel(), num_experts, + max_num_tokens_padded, topk_num, token_mask.data_ptr(), + lora_ids.data_ptr(), has_expert_map); + } + }); +} \ No newline at end of file diff --git a/csrc/moe/moe_lora_align_sum_kernels.cu b/csrc/moe/moe_lora_align_sum_kernels.cu deleted file mode 100644 index 360f1312cf579..0000000000000 --- a/csrc/moe/moe_lora_align_sum_kernels.cu +++ /dev/null @@ -1,174 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "../cuda_compat.h" -#include "../dispatch_utils.h" -#include "core/math.hpp" - -namespace { - -__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, - int32_t col) { - return row * total_col + col; -} - -} // namespace - -// TODO: Refactor common parts with moe_align_sum_kernels -template -__global__ void moe_lora_align_sum_kernel( - scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping, - int64_t block_size, int num_experts, int max_loras, size_t numel, - int max_num_tokens_padded, int max_num_m_blocks, - int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, - int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled, - int32_t* lora_ids) { - const size_t tokens_per_thread = div_ceil(numel, blockDim.x); - const size_t start_idx = threadIdx.x * tokens_per_thread; - - int lora_idx = blockIdx.x; - int lora_id = lora_ids[lora_idx]; - if (lora_id == -1 || adapter_enabled[lora_id] == 0) { - return; - } - extern __shared__ int32_t shared_mem[]; - int32_t* cumsum = shared_mem; - token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1); - - // Initialize sorted_token_ids with numel - for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { - sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel; - } - - // Initialize expert_ids with -1 - for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) { - expert_ids[lora_id * max_num_m_blocks + it] = -1; - } - - // Initialize total_tokens_post_pad with 0 - if (threadIdx.x == 0) { - total_tokens_post_pad[lora_id] = 0; - } - - for (int i = 0; i < num_experts; ++i) { - tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; - } - - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - int mask = token_lora_mapping[i / topk_num] == lora_id; - int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]); - tokens_cnts[idx] += mask; - } - - __syncthreads(); - - // For each expert we accumulate the token counts from the different threads. - if (threadIdx.x < num_experts) { - tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[index(num_experts, i, threadIdx.x)] += - tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; - } - } - - __syncthreads(); - - // We accumulate the token counts of all experts in thread 0. - if (threadIdx.x == 0) { - cumsum[0] = 0; - for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i - 1] + - div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)], - block_size) * - block_size; - } - total_tokens_post_pad[lora_id] = static_cast(cumsum[num_experts]); - } - - __syncthreads(); - - /** - * For each expert, each thread processes the tokens of the corresponding - * blocks and stores the corresponding expert_id for each block. - */ - if (threadIdx.x < num_experts) { - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] = - threadIdx.x; - } - } - - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - int32_t expert_id = topk_ids[i]; - /** The cumsum[expert_id] stores the starting index of the tokens that the - * expert with expert_id needs to process, and - * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens - * processed by the expert with expert_id within the current thread's token - * shard. - */ - int32_t rank_post_pad = - tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + - cumsum[expert_id]; - - int mask = (int)token_lora_mapping[i / topk_num] == lora_id; - atomicAdd( - &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)], - (i - numel) * mask); - tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask; - } -} - -void moe_lora_align_block_size( - torch::Tensor topk_ids, torch::Tensor token_lora_mapping, - int64_t num_experts, int64_t block_size, int64_t max_loras, - int64_t max_num_tokens_padded, int64_t max_num_m_blocks, - torch::Tensor sorted_token_ids, torch::Tensor expert_ids, - torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled, - torch::Tensor lora_ids) { - const int topk_num = topk_ids.size(1); - - TORCH_CHECK(block_size > 0, "block_size should be greater than 0. "); - - int device_max_shared_mem; - auto dev = topk_ids.get_device(); - cudaDeviceGetAttribute(&device_max_shared_mem, - cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - const int32_t num_thread = max((int32_t)num_experts, 128); // WARP_SIZE, - TORCH_CHECK(num_thread <= 1024, - "num_thread must be less than 1024, " - "and fallback is not implemented yet."); - const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) + - (num_experts + 1) * sizeof(int32_t); - - if (shared_mem > device_max_shared_mem) { - TORCH_CHECK(false, - "Shared memory usage exceeds device limit, and global memory " - "fallback is not implemented yet."); - } - - VLLM_DISPATCH_INTEGRAL_TYPES( - topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] { - dim3 blockDim(num_thread); - auto kernel = moe_lora_align_sum_kernel; - AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem)); - kernel<<>>( - topk_ids.data_ptr(), - token_lora_mapping.data_ptr(), block_size, num_experts, - max_loras, topk_ids.numel(), max_num_tokens_padded, - max_num_m_blocks, sorted_token_ids.data_ptr(), - expert_ids.data_ptr(), topk_num, - num_tokens_post_pad.data_ptr(), - adapter_enabled.data_ptr(), lora_ids.data_ptr()); - }); -} \ No newline at end of file diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 4c7accf03440a..337dcc50b079e 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -27,7 +27,7 @@ void moe_lora_align_block_size( int64_t max_num_tokens_padded, int64_t max_num_m_blocks, torch::Tensor sorted_token_ids, torch::Tensor expert_ids, torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled, - torch::Tensor lora_ids); + torch::Tensor lora_ids, std::optional maybe_expert_map); #ifndef USE_ROCM torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor b_qweight, torch::Tensor b_scales, diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index fca57c31caf8e..779ad70ad1e09 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -47,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " Tensor !experts_ids," " Tensor !num_tokens_post_pad," " Tensor !adapter_enabled," - " Tensor !lora_ids) -> () "); + " Tensor !lora_ids," + " Tensor? maybe_expert_map) -> () "); m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size); #ifndef USE_ROCM diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py index 72f1d759f1e7a..3a17f3eba6e8b 100644 --- a/tests/lora/test_moe_lora_align_sum.py +++ b/tests/lora/test_moe_lora_align_sum.py @@ -32,7 +32,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num): @pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920 @pytest.mark.parametrize("topk_num", [6]) -@pytest.mark.parametrize("num_experts", [64, 128]) +@pytest.mark.parametrize("num_experts", [64, 128, 256, 512]) @pytest.mark.parametrize("max_loras", [2, 32]) @pytest.mark.parametrize("block_size", [16]) def test_moe_lora_align_block_size( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 77d5453291e3c..56c780ceb1cb5 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1961,6 +1961,7 @@ def moe_lora_align_block_size( num_tokens_post_pad: torch.Tensor, adapter_enabled: torch.Tensor, lora_ids: torch.Tensor, + expert_map: torch.Tensor | None = None, ) -> None: torch.ops._moe_C.moe_lora_align_block_size( topk_ids, @@ -1975,6 +1976,7 @@ def moe_lora_align_block_size( num_tokens_post_pad, adapter_enabled, lora_ids, + expert_map, ) From f6227c22ab8976a24913122874c24624102da1b4 Mon Sep 17 00:00:00 2001 From: czhu-cohere Date: Mon, 8 Dec 2025 22:29:06 -0500 Subject: [PATCH 092/133] [Kernel]Support W4A8 Grouped GEMM on Hopper (#29691) Signed-off-by: czhu-cohere --- CMakeLists.txt | 5 +- csrc/ops.h | 3 +- .../cutlass_w4a8/get_group_starts.cuh | 104 ++++ .../cutlass_w4a8/w4a8_grouped_mm_entry.cu | 483 ++++++++++++++++++ .../cutlass_w4a8/w4a8_mm_entry.cu | 70 +-- csrc/quantization/cutlass_w4a8/w4a8_utils.cu | 90 ++++ csrc/quantization/cutlass_w4a8/w4a8_utils.cuh | 11 + .../quantization/w8a8/cutlass/moe/moe_data.cu | 8 +- .../w8a8/cutlass/scaled_mm_entry.cu | 8 +- csrc/torch_bindings.cpp | 26 +- .../kernels/quantization/test_cutlass_w4a8.py | 46 +- .../quantization/test_cutlass_w4a8_moe.py | 340 ++++++++++++ vllm/_custom_ops.py | 90 +++- .../layers/fused_moe/__init__.py | 4 + .../model_executor/layers/fused_moe/config.py | 29 ++ .../layers/fused_moe/cutlass_moe.py | 401 +++++++++++++++ .../layers/fused_moe/modular_kernel.py | 2 +- .../compressed_tensors/compressed_tensors.py | 2 +- .../compressed_tensors_moe.py | 340 +++++++++++- .../schemes/compressed_tensors_w4a8_fp8.py | 15 +- .../kernels/mixed_precision/cutlass.py | 19 +- .../layers/quantization/utils/quant_utils.py | 50 +- 22 files changed, 2045 insertions(+), 101 deletions(-) create mode 100644 csrc/quantization/cutlass_w4a8/get_group_starts.cuh create mode 100644 csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu create mode 100644 csrc/quantization/cutlass_w4a8/w4a8_utils.cu create mode 100644 csrc/quantization/cutlass_w4a8/w4a8_utils.cuh create mode 100644 tests/kernels/quantization/test_cutlass_w4a8_moe.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 69a538b06cba3..6b93e3fe91603 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS) set(SRCS - "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu") + "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu" + "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu" + "csrc/quantization/cutlass_w4a8/w4a8_utils.cu" + ) set_gencode_flags_for_srcs( SRCS "${SRCS}" diff --git a/csrc/ops.h b/csrc/ops.h index 5fce3a1a3fea3..37e3aaf7499d5 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -262,7 +262,8 @@ void get_cutlass_moe_mm_data( void get_cutlass_moe_mm_problem_sizes( const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets); + const int64_t k, const std::optional& blockscale_offsets, + std::optional force_swap_ab = std::nullopt); void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, diff --git a/csrc/quantization/cutlass_w4a8/get_group_starts.cuh b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh new file mode 100644 index 0000000000000..fec142d0d87a1 --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh @@ -0,0 +1,104 @@ +// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh +#pragma once + +#include +#include +#include + +#include "core/scalar_type.hpp" +#include "cutlass/bfloat16.h" +#include "cutlass/float8.h" + +// ElementB is int32 (packed int4) +// ElementGroupScale is cutlass::Array (packed fp8) +template +__global__ void get_group_gemm_starts( + int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets, + ElementC** out_offsets, ElementAccumulator** a_scales_offsets, + ElementAccumulator** b_scales_offsets, + ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int, + ElementB* b_base_as_int, ElementC* out_base_as_int, + ElementAccumulator* a_scales_base_as_int, + ElementAccumulator* b_scales_base_as_int, + ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k, + int64_t scale_k) { + int expert_id = threadIdx.x; + + int64_t expert_offset = expert_offsets[expert_id]; + + // same as w8a8 + a_offsets[expert_id] = a_base_as_int + expert_offset * k; + out_offsets[expert_id] = out_base_as_int + expert_offset * n; + a_scales_offsets[expert_id] = a_scales_base_as_int + expert_offset; + b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id); + + // w4a8 specific + constexpr int pack_factor = 8; // pack 8 int4 into int32 + b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor); + b_group_scales_offsets[expert_id] = + b_group_scales_base_as_int + (expert_id * scale_k * n); +} + +#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE) \ + else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ + get_group_gemm_starts> \ + <<<1, num_experts, 0, stream>>>( \ + static_cast(expert_offsets.data_ptr()), \ + static_cast(a_ptrs.data_ptr()), \ + static_cast(b_ptrs.data_ptr()), \ + static_cast(out_ptrs.data_ptr()), \ + static_cast(a_scales_ptrs.data_ptr()), \ + static_cast(b_scales_ptrs.data_ptr()), \ + static_cast**>( \ + b_group_scales_ptrs.data_ptr()), \ + static_cast(a_tensors.data_ptr()), \ + static_cast(b_tensors.data_ptr()), \ + static_cast(out_tensors.data_ptr()), \ + static_cast(a_scales.data_ptr()), \ + static_cast(b_scales.data_ptr()), \ + static_cast*>( \ + b_group_scales.data_ptr()), \ + n, k, scale_k); \ + } + +namespace { + +void run_get_group_gemm_starts( + torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, + torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, + torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, + torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor& out_tensors, + torch::Tensor const& a_scales, torch::Tensor const& b_scales, + torch::Tensor const& b_group_scales, const int64_t b_group_size) { + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kInt32); // int4 8x packed into int32 + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_group_scales.dtype() == + torch::kFloat8_e4m3fn); // the underlying torch type is e4m3 + TORCH_CHECK(out_tensors.dtype() == + torch::kBFloat16); // only support bf16 for now + // expect int64_t to avoid overflow during offset calculations + TORCH_CHECK(expert_offsets.dtype() == torch::kInt64); + + int num_experts = static_cast(expert_offsets.size(0)); + // logical k, n + int64_t n = out_tensors.size(1); + int64_t k = a_tensors.size(1); + int64_t scale_k = cutlass::ceil_div(k, b_group_size); + + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + if (false) { + } + __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t) + __CALL_GET_STARTS_KERNEL(torch::kFloat16, half) + else { + TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)"); + } +} + +} // namespace \ No newline at end of file diff --git a/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu new file mode 100644 index 0000000000000..4b425790dbac7 --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu @@ -0,0 +1,483 @@ +#include +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/group_array_problem_shape.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/mixed_dtype_utils.hpp" + +// vllm includes +#include +#include +#include +#include "cutlass_extensions/torch_utils.hpp" +#include "cutlass_extensions/common.hpp" + +#include "core/registration.h" +#include "get_group_starts.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" +#include "w4a8_utils.cuh" + +namespace vllm::cutlass_w4a8_moe { + +using namespace cute; + +// ------------------------------------------------------------------------------------- +// Static configuration shared across all instantiations +// ------------------------------------------------------------------------------------- +using ProblemShape = + cutlass::gemm::GroupProblemShape>; // per + // group +using MmaType = cutlass::float_e4m3_t; +using QuantType = cutlass::int4b_t; + +constexpr int TileShapeK = 128 * 8 / sizeof_bits::value; +static int constexpr PackFactor = 8; // 8 int4 packed into int32 + +// A matrix configuration +using ElementA = MmaType; +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = + 128 / + cutlass::sizeof_bits::value; // Alignment of A matrix in units of + // elements (up to 16 bytes) + +// B matrix configuration +using ElementB = QuantType; // Element type for B matrix operand +using LayoutB = + cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = + 128 / cutlass::sizeof_bits< + ElementB>::value; // Memory access granularity/alignment of B + // matrix in units of elements (up to 16 bytes) + +// This example manually swaps and transposes, so keep transpose of input +// layouts +using LayoutA_Transpose = + typename cutlass::layout::LayoutTranspose::type; +using LayoutB_Transpose = + typename cutlass::layout::LayoutTranspose::type; + +// Need to pass a pointer type to make the 3rd dimension of Stride be _0 +using StrideA = + cute::remove_pointer_t>; +using StrideB = + cute::remove_pointer_t>; + +// Define the CuTe layout for reoredered quantized tensor B +// LayoutAtomQuant places values that will be read by the same thread in +// contiguous locations in global memory. It specifies the reordering within a +// single warp's fragment +using LayoutAtomQuant = + decltype(cutlass::compute_memory_reordering_atom()); +using LayoutB_Reordered = decltype(cute::tile_to_shape( + LayoutAtomQuant{}, Layout>, StrideB>{})); + +using ElementScale = cutlass::float_e4m3_t; +using LayoutScale = cutlass::layout::RowMajor; + +// C/D matrix configuration +using ElementC = + cutlass::bfloat16_t; // Element type for C and D matrix operands +using LayoutC = + cutlass::layout::RowMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = + 128 / cutlass::sizeof_bits< + ElementC>::value; // Memory access granularity/alignment of C + // matrix in units of elements (up to 16 bytes) + +// D matrix configuration +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that + // supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using StageCountType = + cutlass::gemm::collective::StageCountAuto; // Stage count maximized based + // on the tile size + +// per-channel and per-token scales for epilogue +using ElementSChannel = float; + +template +struct W4A8GroupedGemmKernel { + using TileShape = + decltype(cute::append(TileShape_MN{}, cute::Int{})); + using ClusterShape = ClusterShape_MNK; + + // per-channel, per-token scales epilogue + using ChTokScalesEpilogue = + typename vllm::c3x::ScaledEpilogueArray; + using EVTCompute = typename ChTokScalesEpilogue::EVTCompute; + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementSChannel, ElementC, + typename cutlass::layout::LayoutTranspose::type*, AlignmentC, + ElementD, typename cutlass::layout::LayoutTranspose::type*, + AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp; + + // =========================================================== MIXED INPUT + // WITH SCALES + // =========================================================================== + // The Scale information must get paired with the operand that will be scaled. + // In this example, B is scaled so we make a tuple of B's information and the + // scale information. + using CollectiveMainloopShuffled = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple>, + LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*, + AlignmentA, ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>; + + using GemmShuffled = + cutlass::gemm::device::GemmUniversalAdapter; + + using StrideC = typename GemmKernelShuffled::InternalStrideC; + using StrideD = typename GemmKernelShuffled::InternalStrideD; + + using StrideC_ref = cutlass::detail::TagToStrideC_t; + using StrideD_ref = cutlass::detail::TagToStrideC_t; + using StrideS = typename CollectiveMainloopShuffled::StrideScale; + using StrideS_ref = cutlass::detail::TagToStrideB_t; + + // static asserts for passing in strides/layouts + // pack to 2x int64 + static_assert(sizeof(StrideS) == 2 * sizeof(int64_t)); + // pack to 3xint32, + static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0, + "LayoutB_Reordered size must be divisible by 4 bytes"); + + static void grouped_mm( + torch::Tensor& out_tensors, const torch::Tensor& a_tensors, + const torch::Tensor& b_tensors, const torch::Tensor& a_scales, + const torch::Tensor& b_scales, const torch::Tensor& b_group_scales, + const int64_t b_group_size, const torch::Tensor& expert_offsets, + const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides, + const torch::Tensor& b_strides, const torch::Tensor& c_strides, + const torch::Tensor& group_scale_strides) { + auto device = a_tensors.device(); + auto device_id = device.index(); + const at::cuda::OptionalCUDAGuard device_guard(device); + auto stream = at::cuda::getCurrentCUDAStream(device_id); + + int num_experts = static_cast(expert_offsets.size(0)); + int n = static_cast(b_tensors.size(1)); + int k = static_cast(b_tensors.size(2)) * PackFactor; + + auto options_int = + torch::TensorOptions().dtype(torch::kInt64).device(device); + torch::Tensor a_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_ptrs = torch::empty(num_experts, options_int); + torch::Tensor out_ptrs = torch::empty(num_experts, options_int); + torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int); + + // get the correct offsets to pass to gemm + run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs, + a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs, + a_tensors, b_tensors, out_tensors, a_scales, + b_scales, b_group_scales, b_group_size); + + // construct args + using Args = typename GemmShuffled::Arguments; + using MainloopArguments = typename GemmKernelShuffled::MainloopArguments; + using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments; + Args arguments; + + ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes = + static_cast( + problem_sizes_torch.data_ptr()); + ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr}; + + // SwapAB so B operands come first + MainloopArguments mainloop_arguments{ + static_cast(b_ptrs.data_ptr()), + static_cast(b_strides.data_ptr()), + static_cast(a_ptrs.data_ptr()), + static_cast(a_strides.data_ptr()), + static_cast**>( + b_group_scales_ptrs.data_ptr()), + static_cast(group_scale_strides.data_ptr()), + static_cast(b_group_size)}; + + EpilogueArguments epilogue_arguments{ + // since we are doing SwapAB the channel scales comes first, then token + // scales + ChTokScalesEpilogue::prepare_args( // see ScaledEpilogueArray + static_cast( + b_scales_ptrs.data_ptr()), // per-channel + static_cast( + a_scales_ptrs.data_ptr()), // per-token + true, true), + nullptr, // C + static_cast(c_strides.data_ptr()), // C + static_cast(out_ptrs.data_ptr()), // D + static_cast(c_strides.data_ptr()) // D + }; + + static const cutlass::KernelHardwareInfo hw_info{ + device_id, + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + device_id)}; + + arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, + mainloop_arguments, epilogue_arguments, hw_info}; + + // Allocate workspace + size_t workspace_size = GemmShuffled::get_workspace_size(arguments); + torch::Tensor workspace = + torch::empty(workspace_size, + torch::TensorOptions().dtype(torch::kU8).device(device)); + + // Run GEMM + GemmShuffled gemm; + CUTLASS_CHECK(gemm.can_implement(arguments)); + CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream)); + CUTLASS_CHECK(gemm.run(stream)); + } +}; + +// ---------------------------------------------------------------------------- +// Kernel instantiations and dispatch logic +// ---------------------------------------------------------------------------- +using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative; +using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; + +// Kernel_TileShape_ClusterShape_Schedule +using Kernel_128x16_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_128x16_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_256x16_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_256x16_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_256x32_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_256x32_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_256x64_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_256x64_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_256x128_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_256x128_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_128x256_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +void mm_dispatch( + torch::Tensor& out_tensors, const torch::Tensor& a_tensors, + const torch::Tensor& b_tensors, const torch::Tensor& a_scales, + const torch::Tensor& b_scales, const torch::Tensor& b_group_scales, + const int64_t b_group_size, const torch::Tensor& expert_offsets, + const torch::Tensor& problem_sizes, const torch::Tensor& a_strides, + const torch::Tensor& b_strides, const torch::Tensor& c_strides, + const torch::Tensor& group_scale_strides, const std::string& schedule) { + if (schedule == "Kernel_128x16_1x1x1_Coop") { + Kernel_128x16_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_128x16_2x1x1_Coop") { + Kernel_128x16_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x16_1x1x1_Coop") { + Kernel_256x16_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x16_2x1x1_Coop") { + Kernel_256x16_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x32_1x1x1_Coop") { + Kernel_256x32_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x32_2x1x1_Coop") { + Kernel_256x32_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x64_1x1x1_Coop") { + Kernel_256x64_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x64_2x1x1_Coop") { + Kernel_256x64_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x128_1x1x1_Coop") { + Kernel_256x128_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x128_2x1x1_Coop") { + Kernel_256x128_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_128x256_2x1x1_Coop") { + Kernel_128x256_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else { + TORCH_CHECK(false, + "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule); + } +} + +void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors, + const torch::Tensor& b_tensors, const torch::Tensor& a_scales, + const torch::Tensor& b_scales, const torch::Tensor& b_group_scales, + const int64_t b_group_size, const torch::Tensor& expert_offsets, + const torch::Tensor& problem_sizes, const torch::Tensor& a_strides, + const torch::Tensor& b_strides, const torch::Tensor& c_strides, + const torch::Tensor& group_scale_strides, + std::optional maybe_schedule) { + // user has specified a schedule + if (maybe_schedule) { + mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + b_group_scales, b_group_size, expert_offsets, problem_sizes, + a_strides, b_strides, c_strides, group_scale_strides, + *maybe_schedule); + return; + } + + // use heuristic + int m_full = a_tensors.size(0); + int n = b_tensors.size(1); + int k = b_tensors.size(2) * PackFactor; // logical k + int num_experts = b_tensors.size(0); + // per-expert batch size assuming uniform distribution + int m_expert = m_full / num_experts; + + std::string schedule; + if (m_expert <= 16) { + schedule = "Kernel_128x16_2x1x1_Coop"; + } else if (m_expert <= 32) { + schedule = "Kernel_256x32_1x1x1_Coop"; + } else if (m_expert <= 64) { + schedule = "Kernel_256x64_1x1x1_Coop"; + } else if (m_expert <= 128) { + schedule = "Kernel_256x128_2x1x1_Coop"; + } else { // m_expert > 128 + schedule = "Kernel_128x256_2x1x1_Coop"; + } + + mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + b_group_scales, b_group_size, expert_offsets, problem_sizes, + a_strides, b_strides, c_strides, group_scale_strides, schedule); +} + +std::tuple encode_and_reorder_int4b( + torch::Tensor const& b_tensors) { + TORCH_CHECK(b_tensors.dtype() == torch::kInt32); + TORCH_CHECK(b_tensors.dim() == 3); // (experts, n, k) + TORCH_CHECK(b_tensors.is_contiguous()); + TORCH_CHECK(b_tensors.is_cuda()); + + int n = static_cast(b_tensors.size(1)); + int k = static_cast(b_tensors.size(2)) * PackFactor; // logical k + + // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0. + // These misalignments cause silent OOB unless run under Compute Sanitizer. + TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256"); + TORCH_CHECK(n % 16 == 0, "n must be divisible by 16"); + + // we will store the layout to an int32 tensor; + // this is the number of elements we need per layout + constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t); + + torch::Tensor b_tensors_packed = torch::empty_like(b_tensors); + int num_experts = static_cast(b_tensors.size(0)); + + auto b_ptr = static_cast(b_tensors.const_data_ptr()); + auto b_packed_ptr = static_cast(b_tensors_packed.data_ptr()); + + // multiply by ull so result does not overflow int32 + size_t num_int4_elems = 1ull * num_experts * n * k; + bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr, + num_int4_elems); + TORCH_CHECK(ok, "unified_encode_int4b failed"); + + // construct the layout once; assumes each expert has the same layout + using LayoutType = LayoutB_Reordered; + std::vector layout_B_reordered_host(num_experts); + auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}}); + auto shape_B = cute::make_shape(n, k, Int<1>{}); + auto layout_B = make_layout(shape_B, stride_B); + LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B); + + // reorder weights for each expert + for (int i = 0; i < num_experts; i++) { + // since the storage type of int4b is 1 byte but one element is 4 bits + // we need to adjust the offset + int64_t offset = + 1ull * i * n * k * cutlass::sizeof_bits::value / 8; + cutlass::reorder_tensor(b_packed_ptr + offset, layout_B, + layout_B_reordered); + } + + // save the packed layout to torch tensor so we can re-use it + auto cpu_opts = + torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU); + torch::Tensor layout_cpu = + torch::empty({num_experts, layout_width}, cpu_opts); + + int32_t* layout_data = layout_cpu.data_ptr(); + for (int i = 0; i < num_experts; ++i) { + std::memcpy(layout_data + i * layout_width, // dst (int32*) + &layout_B_reordered, // src (LayoutType*) + sizeof(LayoutType)); // number of bytes + } + + torch::Tensor packed_layout = + layout_cpu.to(b_tensors.device(), /*non_blocking=*/false); + + return {b_tensors_packed, packed_layout}; +} + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("cutlass_w4a8_moe_mm", &mm); + m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b); +} + +} // namespace vllm::cutlass_w4a8_moe +///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu index 2d1568b08651c..f77af06cd6c08 100644 --- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu +++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu @@ -7,6 +7,7 @@ #include #include #include "cutlass_extensions/torch_utils.hpp" +#include "w4a8_utils.cuh" #include "core/registration.h" @@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) { return packed_scales; } -/* - GPU-accelerated implementation of cutlass::unified_encode_int4b. - Constructs a lookup table in constant memory to map 8 bits - (two 4-bit values) at a time. Assumes memory is contiguous - and pointers are 16-byte aligned. -*/ -__constant__ uint8_t kNibbleLUT[256]; - -__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out, - size_t nbytes) { - constexpr size_t V = sizeof(uint4); // 16 bytes - const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const size_t nthreads = size_t(gridDim.x) * blockDim.x; - const size_t nvec = nbytes / V; - - // 1-D grid-stride loop over 16-byte chunks - for (size_t vec = tid; vec < nvec; vec += nthreads) { - uint4 v = reinterpret_cast(in)[vec]; - uint8_t* b = reinterpret_cast(&v); -#pragma unroll - for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]]; - reinterpret_cast(out)[vec] = v; - } -} - -static bool upload_lut() { - std::array lut{}; - auto map_nib = [](uint8_t v) -> uint8_t { - // 1..7 -> (8 - v); keep 0 and 8..15 - return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v); - }; - for (int b = 0; b < 256; ++b) { - uint8_t lo = b & 0xF; - uint8_t hi = (b >> 4) & 0xF; - lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo)); - } - cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(), - /*offset=*/0, cudaMemcpyHostToDevice); - - return (e == cudaSuccess); -} - -static bool unified_encode_int4b(cutlass::int4b_t const* in, - cutlass::int4b_t* out, size_t num_int4_elems) { - // Build/upload LUT - if (!upload_lut()) return false; - - static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1, - "int4 storage must be 1 byte"); - const size_t nbytes = num_int4_elems >> 1; - - auto* in_bytes = reinterpret_cast(in); - auto* out_bytes = reinterpret_cast(out); - - // kernel launch params - constexpr int block = 256; - const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors - int grid = int((nvec + block - 1) / block); - if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel - - unified_encode_int4b_device<<>>(in_bytes, out_bytes, nbytes); - cudaError_t err = cudaGetLastError(); - return (err == cudaSuccess); -} - torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) { TORCH_CHECK(B.dtype() == torch::kInt32); TORCH_CHECK(B.dim() == 2); @@ -477,8 +413,8 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) { LayoutB_Reordered layout_B_reordered = cute::tile_to_shape(LayoutAtomQuant{}, shape_B); - bool ok = - vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k); + bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr, + n * k); TORCH_CHECK(ok, "unified_encode_int4b failed"); cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered); diff --git a/csrc/quantization/cutlass_w4a8/w4a8_utils.cu b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu new file mode 100644 index 0000000000000..f238d0a5b2d78 --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu @@ -0,0 +1,90 @@ +#include "w4a8_utils.cuh" + +#include +#include +#include + +namespace vllm::cutlass_w4a8_utils { + +/* + GPU-accelerated implementation of cutlass::unified_encode_int4b. + Constructs a lookup table in constant memory to map 8 bits + (two 4-bit values) at a time. Assumes memory is contiguous + and pointers are 16-byte aligned. +*/ +__constant__ uint8_t kNibbleLUT[256]; + +__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out, + size_t nbytes) { + constexpr size_t V = sizeof(uint4); // 16 bytes + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const size_t nthreads = size_t(gridDim.x) * blockDim.x; + const size_t nvec = nbytes / V; + + // 1-D grid-stride loop over 16-byte chunks + for (size_t vec = tid; vec < nvec; vec += nthreads) { + uint4 v = reinterpret_cast(in)[vec]; + uint8_t* b = reinterpret_cast(&v); +#pragma unroll + for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]]; + reinterpret_cast(out)[vec] = v; + } +} + +static bool upload_lut() { + std::array lut{}; + auto map_nib = [](uint8_t v) -> uint8_t { + // 1..7 -> (8 - v); keep 0 and 8..15 + return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v); + }; + for (int b = 0; b < 256; ++b) { + uint8_t lo = b & 0xF; + uint8_t hi = (b >> 4) & 0xF; + lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo)); + } + cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(), + /*offset=*/0, cudaMemcpyHostToDevice); + + return (e == cudaSuccess); +} + +bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out, + size_t num_int4_elems) { + // Build/upload LUT + if (!upload_lut()) return false; + + static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1, + "int4 storage must be 1 byte"); + const size_t nbytes = num_int4_elems >> 1; + + auto* in_bytes = reinterpret_cast(in); + auto* out_bytes = reinterpret_cast(out); + + // kernel launch params + constexpr int block = 256; + const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors + int grid = int((nvec + block - 1) / block); + if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel + + unified_encode_int4b_device<<>>(in_bytes, out_bytes, nbytes); + + // launch errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("unified_encode_int4b_device launch error: %s (%d)\n", + cudaGetErrorString(err), err); + return false; + } + + // runtime errors + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("unified_encode_int4b_device runtime error: %s (%d)\n", + cudaGetErrorString(err), err); + return false; + } + + return true; +} + +} // namespace vllm::cutlass_w4a8_utils \ No newline at end of file diff --git a/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh new file mode 100644 index 0000000000000..25090091a368d --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh @@ -0,0 +1,11 @@ +#pragma once + +#include +#include "cutlass/numeric_types.h" + +namespace vllm::cutlass_w4a8_utils { + +bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out, + size_t num_int4_elems); + +} // namespace vllm::cutlass_w4a8_utils \ No newline at end of file diff --git a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu index 49cafcc32adc6..99fec8fd6febc 100644 --- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu +++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu @@ -136,15 +136,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids, void get_cutlass_moe_mm_problem_sizes_caller( const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets) { + const int64_t k, const std::optional& blockscale_offsets, + std::optional force_swap_ab = std::nullopt) { auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); auto options_int32 = torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); // Swap-AB should be disabled for FP4 path - bool may_swap_ab = (!blockscale_offsets.has_value()) && - (topk_ids.numel() <= SWAP_AB_THRESHOLD); + bool may_swap_ab = + force_swap_ab.value_or((!blockscale_offsets.has_value()) && + (topk_ids.numel() <= SWAP_AB_THRESHOLD)); launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, atomic_buffer, num_experts, n, k, stream, diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu index c5012a8669317..5de21cfbbaafb 100644 --- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu +++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu @@ -80,7 +80,8 @@ void get_cutlass_moe_mm_data_caller( void get_cutlass_moe_mm_problem_sizes_caller( const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets); + const int64_t k, const std::optional& blockscale_offsets, + std::optional force_swap_ab = std::nullopt); void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, @@ -303,14 +304,15 @@ void get_cutlass_moe_mm_data( void get_cutlass_moe_mm_problem_sizes( const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets) { + const int64_t k, const std::optional& blockscale_offsets, + std::optional force_swap_ab = std::nullopt) { int32_t version_num = get_sm_version_num(); #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \ (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120) get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, - blockscale_offsets); + blockscale_offsets, force_swap_ab); return; #endif TORCH_CHECK_NOT_IMPLEMENTED( diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 62212f98b4766..d4c6f8c67c516 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -350,6 +350,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("cutlass_encode_and_reorder_int4b(Tensor B) -> Tensor"); // conditionally compiled so impl registration is in source file + // CUTLASS w4a8 grouped GEMM + ops.def( + "cutlass_w4a8_moe_mm(" + " Tensor! out_tensors," + " Tensor a_tensors," + " Tensor b_tensors," + " Tensor a_scales," + " Tensor b_scales," + " Tensor b_group_scales," + " int b_group_size," + " Tensor expert_offsets," + " Tensor problem_sizes," + " Tensor a_strides," + " Tensor b_strides," + " Tensor c_strides," + " Tensor group_scale_strides," + " str? maybe_schedule" + ") -> ()"); + ops.def( + "cutlass_encode_and_reorder_int4b_grouped(Tensor b_tensors) -> (Tensor, " + "Tensor)"); + // conditionally compiled so impl registration is in source file + #endif // Dequantization for GGML. @@ -466,7 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, " " Tensor! problem_sizes2, " " int num_experts, int n, int k, " - " Tensor? blockscale_offsets) -> ()"); + " Tensor? blockscale_offsets, " + " bool? force_swap_ab) -> ()"); ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, &get_cutlass_moe_mm_problem_sizes); diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py index 465e24fd7eb97..cccef28f5e931 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8.py +++ b/tests/kernels/quantization/test_cutlass_w4a8.py @@ -12,8 +12,11 @@ import torch from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.quant_utils import ( + convert_packed_uint4b8_to_signed_int4_inplace, + pack_cols, pack_rows, quantize_weights, + unpack_quantized_values_into_int32, ) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -167,8 +170,7 @@ def create_test_tensors( # for the practical use case we need per-tok scales for fp8 activations w_tok_s = torch.randn((m,), device="cuda", dtype=types.token_scale_type) - # weights are already per-group quantized, use placeholder here - w_ch_s = torch.ones((n,), device="cuda", dtype=types.channel_scale_type) + w_ch_s = torch.randn((n,), device="cuda", dtype=types.channel_scale_type) return Tensors( w_ref=w_ref, @@ -211,7 +213,7 @@ def mm_test_helper( print(output_ref) torch.testing.assert_close( - output, output_ref.to(output.dtype), rtol=1e-3, atol=1e-3 + output, output_ref.to(output.dtype), rtol=1e-2, atol=1e-2 ) @@ -257,7 +259,7 @@ def test_w4a8_cuda_graph(): ) w_tok_s = torch.randn((m,), device="cuda", dtype=torch.float32) - w_ch_s = torch.ones((n,), device="cuda", dtype=torch.float32) + w_ch_s = torch.randn((n,), device="cuda", dtype=torch.float32) # Construct a trivial model with a single layer that calls the kernel model = W4A8Layer( @@ -287,4 +289,38 @@ def test_w4a8_cuda_graph(): output.zero_() g.replay() - torch.testing.assert_close(output, output_ref, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(output, output_ref, rtol=1e-2, atol=1e-2) + + +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, reason="CUTLASS W4A8 is not supported on this GPU type." +) +@pytest.mark.parametrize("shape", MNK_SHAPES) +def test_convert_packed_uint4b8_to_signed_int4_inplace(shape): + """ + The W4A16 checkpoints encode the weights as int4b8 packed to int32. + The CUTLASS kernels expect signed int4 packed to int32. + This tests checks that the runtime int4b8 -> signed int4 conversion + matches the offline conversion step exactly. + """ + _, N, K = shape + # random weights packed to int32 + t = torch.randint( + low=torch.iinfo(torch.int32).min, + high=torch.iinfo(torch.int32).max + 1, + size=(N, K // 8), + dtype=torch.int32, + device="cuda", + ) + + # compute reference + unpacked = unpack_quantized_values_into_int32( + t.clone(), scalar_types.uint4b8, packed_dim=1 + ) + unpacked = unpacked - 8 # int4b8 -> signed int4 + ref = pack_cols(unpacked & 0x0F, 4, *unpacked.shape) + + out = convert_packed_uint4b8_to_signed_int4_inplace(t.clone()) + + assert torch.equal(ref, out) + assert not torch.equal(ref, t) diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py new file mode 100644 index 0000000000000..3560402a29e90 --- /dev/null +++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py @@ -0,0 +1,340 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Tests for the CUTLASS-based W4A8 grouped GEMM kernel and the full MoE layer. +""" + +import random +from dataclasses import dataclass + +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + pack_rows, + quantize_weights, +) +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType, scalar_types + +IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9 + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return tensor.clamp(min=finfo.min, max=finfo.max).to(dtype=torch.float8_e4m3fn) + + +def cutlass_quantize( + atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: torch.dtype | None, + group_size: int | None, + zero_points: bool = False, +): + """ + Quantize weights into W4 and compute reference dequantized weights. + + Encoding/reordering of weights and packing of scales is deferred + until after all experts are combined. + """ + assert wtype.is_integer(), "TODO: support floating point weights" + + w_ref, w_q, w_s, w_zp = quantize_weights( + w, wtype, group_size=group_size, zero_points=zero_points + ) + + # Since scales are later cast to fp8, recompute w_ref in atype here. + w_ref = ( + w_q.to(torch.float32) + * w_s.to(atype).to(torch.float32).repeat_interleave(group_size, dim=0) + ).to(atype) + + # Bit mask prevents sign extension of int4 when packing. + w_q = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape) + # Make weights row-major (N, K). + w_q = w_q.t().contiguous() + + return w_ref, w_q, w_s.to(atype), w_zp + + +def cutlass_preprocess( + w_q_experts: list[torch.Tensor], w_s_experts: list[torch.Tensor] +): + """ + Reorder/encode expert weights and pack scales. + + Returns: + w_q_packed: Packed/encoded int4 weights for all experts. + w_s_packed: Packed fp8 scales for all experts. + packed_layout: Layout/stride metadata for grouped GEMM. + """ + w_s_packed = ops.cutlass_pack_scale_fp8(torch.stack(w_s_experts)) + w_q_packed, packed_layout = ops.cutlass_encode_and_reorder_int4b_grouped( + torch.stack(w_q_experts) + ) # expects dim 3 + return w_q_packed, w_s_packed, packed_layout + + +GROUP_SIZE = 128 +# (num_experts, N, K) +TEST_SHAPES = [ + (8, 512, 2048), + (8, 2048, 2048), + (64, 512, 1024), + (64, 2048, 2048), + (4, 2048, 768), + (8, 768, 2048), + (64, 1536, 2048), + (128, 8192, 4096), # test overflow int32 +] +ALIGNMENT = 16 # torch._scaled_mm alignment for M, needed for reference check + + +@dataclass +class MoETestSetup: + num_experts: int + K: int + N: int + Ms: list[int] + M_full: int + a: torch.Tensor + a_ref: torch.Tensor + a_strides: torch.Tensor + out: torch.Tensor + c_strides: torch.Tensor + per_tok_scales: torch.Tensor + per_chan_scales: torch.Tensor + w_refs: list[torch.Tensor] + w_q_packed: torch.Tensor + w_s_packed: torch.Tensor + problem_sizes: torch.Tensor + expert_offsets: torch.Tensor + b_strides: torch.Tensor + group_scale_strides: torch.Tensor + + +def make_moe_test_setup( + num_experts: int, + K: int, + N: int, + *, + alignment: int = ALIGNMENT, + max_blocks: int = 64, + device: str = "cuda", + random_zero: bool = False, +) -> MoETestSetup: + """Create a full set of tensors for testing cutlass_w4a8_moe_mm.""" + + assert K % GROUP_SIZE == 0 + # Token counts per expert (multiples of `alignment`). + Ms = [alignment * random.randint(1, max_blocks) for _ in range(num_experts)] + + # set random experts to 0 tokens + if random_zero and num_experts > 1: + num_zero = max(1, num_experts // 8) + zero_indices = random.sample(range(num_experts), k=num_zero) + for idx in zero_indices: + Ms[idx] = 0 + + M_full = sum(Ms) + assert M_full > 0 + + # Activations. + a = to_fp8(torch.randn((M_full, K), device=device)) + a_ref = a.to(torch.float32) + a_strides = torch.full((num_experts,), K, dtype=torch.int64, device=device) + + # Output buffer. + out = torch.empty((M_full, N), dtype=torch.bfloat16, device=device) + c_strides = torch.full((num_experts,), N, dtype=torch.int64, device=device) + + # Channel/token scales. + per_tok_scales = torch.randn((M_full, 1), dtype=torch.float32, device=device) + per_chan_scales = torch.randn( + (num_experts, N, 1), dtype=torch.float32, device=device + ) + + # Expert weights and scales. + wtype = scalar_types.int4 + atype = stype = torch.float8_e4m3fn + w_refs, w_qs, w_ss = [], [], [] + for _ in range(num_experts): + b = to_fp8(torch.randn((K, N), device=device)) + w_ref, w_q, w_s, _ = cutlass_quantize( + atype, b.to(torch.float16), wtype, stype, GROUP_SIZE, zero_points=False + ) + w_refs.append(w_ref) + w_qs.append(w_q) + w_ss.append(w_s) + + w_q_packed, w_s_packed, packed_layout = cutlass_preprocess(w_qs, w_ss) + + problem_sizes = torch.tensor( + [[N, M, K] for M in Ms], dtype=torch.int32, device=device + ) + + expert_offsets = torch.cat( + [ + torch.tensor([0], dtype=torch.int64), + torch.cumsum(torch.tensor(Ms, dtype=torch.int64), dim=0)[:-1], + ] + ).to(device=device) + + # B strides and group scale strides. + b_strides = packed_layout + group_scale_strides = torch.zeros( + (num_experts, 2), dtype=torch.int64, device=device + ) + group_scale_strides[:, 0] = N + + return MoETestSetup( + num_experts=num_experts, + K=K, + N=N, + Ms=Ms, + M_full=M_full, + a=a, + a_ref=a_ref, + a_strides=a_strides, + out=out, + c_strides=c_strides, + per_tok_scales=per_tok_scales, + per_chan_scales=per_chan_scales, + w_refs=w_refs, + w_q_packed=w_q_packed, + w_s_packed=w_s_packed, + problem_sizes=problem_sizes, + expert_offsets=expert_offsets, + b_strides=b_strides, + group_scale_strides=group_scale_strides, + ) + + +def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor: + """Compute reference output using torch._scaled_mm per expert.""" + out_ref = torch.empty_like(setup.out) + + ends = torch.cumsum(torch.tensor(setup.Ms), 0).tolist() + starts = setup.expert_offsets.cpu().tolist() + + for i in range(setup.num_experts): + start, end = starts[i], ends[i] + if start == end: + continue + + out_ref_i = torch._scaled_mm( + setup.a_ref[start:end].to(torch.float8_e4m3fn), + setup.w_refs[i].to(torch.float8_e4m3fn).t().contiguous().t(), + setup.per_tok_scales[start:end], # (M, 1) + setup.per_chan_scales[i].reshape(1, -1), # (1, N) + out_dtype=torch.bfloat16, + use_fast_accum=True, + ) + out_ref[start:end] = out_ref_i + + return out_ref + + +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, + reason="W4A8 Grouped GEMM is not supported on this GPU type.", +) +@pytest.mark.parametrize("shape", TEST_SHAPES) +@pytest.mark.parametrize("random_zero", [True, False]) +def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero): + num_experts, N, K = shape + current_platform.seed_everything(42) + setup = make_moe_test_setup( + num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero + ) + + ops.cutlass_w4a8_moe_mm( + setup.out, + setup.a, + setup.w_q_packed, + setup.per_tok_scales, + setup.per_chan_scales, + setup.w_s_packed, + GROUP_SIZE, + setup.expert_offsets, + setup.problem_sizes, + setup.a_strides, + setup.b_strides, + setup.c_strides, + setup.group_scale_strides, + ) + torch.cuda.synchronize() + + out_ref = compute_moe_reference_output(setup) + torch.testing.assert_close(setup.out, out_ref, rtol=1e-2, atol=1e-2) + + +class W4A8MoELayer(torch.nn.Module): + """ + Minimal wrapper module to test cuda graphs + """ + + def __init__(self, setup: MoETestSetup): + super().__init__() + self.setup = setup + + def forward(self, a: torch.Tensor) -> torch.Tensor: + s = self.setup + ops.cutlass_w4a8_moe_mm( + s.out, + a, + s.w_q_packed, + s.per_tok_scales, + s.per_chan_scales, + s.w_s_packed, + GROUP_SIZE, + s.expert_offsets, + s.problem_sizes, + s.a_strides, + s.b_strides, + s.c_strides, + s.group_scale_strides, + ) + return s.out + + +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, + reason="W4A8 Grouped GEMM is not supported on this GPU type.", +) +def test_cutlass_w4a8_moe_mm_cuda_graph(): + current_platform.seed_everything(42) + # Fixed config for CUDA graph test (single parameter point). + num_experts = 8 + K = 512 + N = 2048 + + setup = make_moe_test_setup( + num_experts=num_experts, + K=K, + N=N, + max_blocks=32, + ) + + # Construct model that calls the grouped GEMM kernel. + model = W4A8MoELayer(setup) + + # Build reference output once. + out_ref = compute_moe_reference_output(setup) + + # Capture and run the model in a CUDA graph. + a_static = setup.a.clone() # static input tensor for graph replay + + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + out_static = model(a_static) + + out_static.zero_() + g.replay() + + torch.testing.assert_close(out_static, out_ref, rtol=1e-2, atol=1e-2) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 56c780ceb1cb5..6bbfe11b6e925 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -695,6 +695,10 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): def cutlass_encode_and_reorder_int4b_fake(b: torch.Tensor) -> torch.Tensor: return torch.empty_like(b, memory_format=torch.contiguous_format) + @register_fake("_C::cutlass_encode_and_reorder_int4b_grouped") + def cutlass_encode_and_reorder_int4b_grouped_fake(b: torch.Tensor) -> torch.Tensor: + return torch.empty_like(b, memory_format=torch.contiguous_format) + if hasattr(torch.ops._C, "allspark_w8a16_gemm"): @@ -1058,6 +1062,7 @@ def get_cutlass_moe_mm_problem_sizes( n: int, k: int, blockscale_offsets: torch.Tensor | None = None, + force_swap_ab: bool | None = None, ): """ Compute only the per-expert problem sizes needed by the two grouped matrix @@ -1067,9 +1072,20 @@ def get_cutlass_moe_mm_problem_sizes( - problem_sizes1, problem_sizes2: M×N×K sizes of each expert's multiplication for the two grouped MMs used in the fused MoE operation. + Optional: + - force_swap_ab: If set to True or False, explicitly enable or disable the + A/B input swap optimization. If None (default), the swap + is selected automatically based on tensor sizes. """ return torch.ops._C.get_cutlass_moe_mm_problem_sizes( - topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, blockscale_offsets + topk_ids, + problem_sizes1, + problem_sizes2, + num_experts, + n, + k, + blockscale_offsets, + force_swap_ab, ) @@ -1457,6 +1473,78 @@ def cutlass_encode_and_reorder_int4b(b: torch.Tensor) -> torch.Tensor: return torch.ops._C.cutlass_encode_and_reorder_int4b(b) +def cutlass_w4a8_moe_mm( + out_tensors: torch.Tensor, + a_tensors: torch.Tensor, + b_tensors: torch.Tensor, + a_scales: torch.Tensor, + b_scales: torch.Tensor, + b_group_scales: torch.Tensor, + b_group_size: int, + expert_offsets: torch.Tensor, + problem_sizes: torch.Tensor, + a_strides: torch.Tensor, + b_strides: torch.Tensor, + c_strides: torch.Tensor, + group_scale_strides: torch.Tensor, + maybe_schedule: str | None = None, +): + """ + Executes the CUTLASS-based fused-MoE grouped matrix multiplication for the + W4A8 quantization scheme. Uses group-wise quantization (INT4 -> FP8) + and both per-channel + per-token scaling in the epilogue. + + Args: + out_tensors: + Output buffer for all experts (updated in-place). + a_tensors: + FP8 (E4M3FN) activations for all experts. + b_tensors: + INT4-packed weight matrix for all experts, packed to INT32 + a_scales: + Per-token FP8 activation scales, applied in the epilogue. + b_scales: + Per-channel FP8 weight scales for each expert, applied in the epilogue. + b_group_scales: + FP8 scale values for group-wise INT4 weight blocks. + b_group_size: + Number of elements grouped under each entry of b_group_scales. + expert_offsets: + Cumulative token offsets + problem_sizes: + Per-expert (M, N, K) GEMM sizes used by the grouped GEMM launcher. + a/b/c/group_scale_strides: + Strides describing the memory layout of the input tensors. + maybe_schedule: + Optional override to choose a specific kernel or epilogue schedule. + + Returns: + out_tensors updated in-place with the dequantized INT4xFP8 grouped GEMM result. + """ + return torch.ops._C.cutlass_w4a8_moe_mm( + out_tensors, + a_tensors, + b_tensors, + a_scales, + b_scales, + b_group_scales, + b_group_size, + expert_offsets, + problem_sizes, + a_strides, + b_strides, + c_strides, + group_scale_strides, + maybe_schedule, + ) + + +def cutlass_encode_and_reorder_int4b_grouped( + b_tensors: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops._C.cutlass_encode_and_reorder_int4b_grouped(b_tensors) + + if hasattr(torch.ops._C, "permute_cols"): @register_fake("_C::permute_cols") diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 9103e84aa7057..1e145a8fcd791 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -63,8 +63,10 @@ if HAS_TRITON: from vllm.model_executor.layers.fused_moe.cutlass_moe import ( CutlassBatchedExpertsFp8, CutlassExpertsFp8, + CutlassExpertsW4A8Fp8, cutlass_moe_fp4, cutlass_moe_fp8, + cutlass_moe_w4a8_fp8, ) from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( @@ -88,8 +90,10 @@ if HAS_TRITON: "grouped_topk", "cutlass_moe_fp8", "cutlass_moe_fp4", + "cutlass_moe_w4a8_fp8", "CutlassExpertsFp8", "CutlassBatchedExpertsFp8", + "CutlassExpertsW4A8Fp8", "TritonExperts", "BatchedTritonExperts", "DeepGemmExperts", diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index e52845dfa246d..f35cafa0f77dc 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -143,6 +143,7 @@ class FusedMoEQuantDesc: scale: Union[torch.Tensor, "PrecisionConfig", None] = None # Quantization alphas or gscales, used for nvfp4 types. + # W4A8 FP8: used for per-channel scales # TODO(bnell): put some of these in subclasses alpha_or_gscale: torch.Tensor | None = None @@ -442,7 +443,9 @@ class FusedMoEQuantConfig: - a1_scale: Optional scale to be used for a1. - a2_scale: Optional scale to be used for a2. - g1_alphas: Optional global quantization scales for w1 (for nvfp4). + per-channel scales for w1 (for W4A8 FP8). - g2_alphas: Optional global quantization scales for w2 (for nvfp4). + per-channel scales for w2 (for W4A8 FP8). - a1_gscale: Optional global quantization scales for a1 (for nvfp4). - a2_gscale: Optional global quantization scales for a2 (for nvfp4). - w1_bias: Optional biases for w1 (GPT OSS Triton). @@ -461,6 +464,7 @@ class FusedMoEQuantConfig: "mxfp4", "mxfp6_e3m2", "mxfp6_e2m3", + "int4", } if weight_dtype is None: @@ -671,6 +675,31 @@ def int8_w8a16_moe_quant_config( ) +def int4_w4afp8_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + per_act_token_quant: bool = False, + per_out_ch_quant: bool = False, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for fp8 activations and int4 weights. + """ + return FusedMoEQuantConfig.make( + torch.float8_e4m3fn, # quant dtype for activations + w1_scale=w1_scale, + w2_scale=w2_scale, + g1_alphas=g1_alphas, + g2_alphas=g2_alphas, + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_out_ch_quant, + block_shape=block_shape, + weight_dtype="int4", # weight dtype for weights + ) + + def biased_moe_quant_config( w1_bias: torch.Tensor | None, w2_bias: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 30144ca5452eb..552e38a71bf98 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1052,3 +1052,404 @@ def run_cutlass_block_scaled_fused_experts( return ( c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype) ).sum(dim=1) + + +# W4A8 +def run_cutlass_moe_w4a8_fp8( + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation_callable: Callable, + global_num_experts: int, + expert_map: torch.Tensor | None, + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + w1_chan_scale: torch.Tensor, + w2_chan_scale: torch.Tensor, + a_strides1: torch.Tensor, + a_strides2: torch.Tensor, + b_strides1: torch.Tensor, + b_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + s_strides1: torch.Tensor, + s_strides2: torch.Tensor, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: torch.Tensor | None, + out_dtype: torch.dtype, + per_act_token: bool, + per_out_ch: bool, + use_batched_format: bool, + topk_weights: torch.Tensor | None, + group_size: int, +): + a1q = hidden_states + M = a1q.size(0) + local_E = w1.size(0) + device = a1q.device + _, K, N_packed = w2.shape + N = N_packed * 8 # logical N, pack 8 int4 into 1 int32 + + assert per_act_token, "W4A8 must use per-token scales" + assert per_out_ch, "W4A8 must use per-channel scales" + assert w1_scale is not None + assert w2_scale is not None + assert w1_scale.dtype == torch.float8_e4m3fn + assert w2_scale.dtype == torch.float8_e4m3fn + assert w1.dtype == torch.int32 + assert w2.dtype == torch.int32 + assert w1_chan_scale.dtype == torch.float32 + assert w2_chan_scale.dtype == torch.float32 + assert w1.size(0) == w2.size(0), "Weights expert number mismatch" + assert a1q_scale is not None + assert a2_scale is None + assert out_dtype in [torch.bfloat16], f"Invalid output dtype: {out_dtype}" + if expert_map is not None: + assert expert_num_tokens is None + assert not use_batched_format, "batched format not supported yet" + assert group_size == 128, f"Only group size 128 supported but got {group_size=}" + + assert global_num_experts != -1 + assert w1.size(2) * 8 == K, ( + f"w1 hidden size mismatch: got {w1.size(2) * 8}, expected {K=}" + ) + + # Translate info from expert_map to topk_ids + if expert_map is not None: + local_topk_ids = torch.where( + expert_map[topk_ids] != -1, expert_map[topk_ids], -1 + ) + else: + local_topk_ids = topk_ids + + topk = local_topk_ids.size(1) + a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), (M * topk, K)) + mm1_out = _resize_cache(workspace13, (M * topk, N * 2)) + act_out = _resize_cache(workspace2, (M * topk, N)) + # original workspace are based on input hidden_states dtype (bf16) + quant_out = _resize_cache( + workspace13.view(dtype=torch.float8_e4m3fn), (M * topk, N) + ) + mm2_out = _resize_cache(workspace2, (M * topk, K)) + + problem_sizes1 = torch.empty( + (global_num_experts, 3), dtype=torch.int32, device=device + ) + problem_sizes2 = torch.empty( + (global_num_experts, 3), dtype=torch.int32, device=device + ) + + num_expert = global_num_experts if expert_map is None else expert_map.size(0) + # permuted a1q reuses workspace2 + a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, + a1q_scale, + topk_ids, + num_expert, + local_E, + expert_map, + permuted_hidden_states=a1q_perm, + ) + expert_offsets = expert_offsets[:-1] + + # For RS gemm SwapAB is always enabled (swap logical M, N in the problem shape) + ops.get_cutlass_moe_mm_problem_sizes( + local_topk_ids, + problem_sizes1, + problem_sizes2, + global_num_experts, + N, + K, + force_swap_ab=True, + ) + + ops.cutlass_w4a8_moe_mm( + mm1_out, + a1q, + w1, + a1q_scale, + w1_chan_scale, + w1_scale, + group_size, + expert_offsets, + problem_sizes1, + a_strides1, + b_strides1, + c_strides1, + s_strides1, + ) + + activation_callable(act_out, mm1_out) + + a2q, a2q_scale = ops.scaled_fp8_quant( + act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out + ) + + if expert_map is not None: + mm2_out.fill_(0) + + ops.cutlass_w4a8_moe_mm( + mm2_out, + a2q, + w2, + a2q_scale, + w2_chan_scale, + w2_scale, + group_size, + expert_offsets, + problem_sizes2, + a_strides2, + b_strides2, + c_strides2, + s_strides2, + ) + + # for non-chunking mode the output is resized from workspace13 + # so we need to make sure mm2_out uses workspace2. + moe_unpermute( + out=output, + permuted_hidden_states=mm2_out, + topk_weights=topk_weights, + inv_permuted_idx=inv_perm, + ) + + +class CutlassExpertsW4A8Fp8(mk.FusedMoEPermuteExpertsUnpermute): + def __init__( + self, + out_dtype: torch.dtype | None, + a_strides1: torch.Tensor, + a_strides2: torch.Tensor, + b_strides1: torch.Tensor, + b_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + s_strides1: torch.Tensor, + s_strides2: torch.Tensor, + quant_config: FusedMoEQuantConfig, + group_size: int, + ): + super().__init__(quant_config) + self.out_dtype = out_dtype + self.a_strides1 = a_strides1 + self.a_strides2 = a_strides2 + self.b_strides1 = b_strides1 + self.b_strides2 = b_strides2 + self.c_strides1 = c_strides1 + self.c_strides2 = c_strides2 + self.s_strides1 = s_strides1 + self.s_strides2 = s_strides2 + self.group_size = group_size + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_chunking(self) -> bool: + return True + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # topk weights and reduction are fused in moe_unpermute cuda kernel + return TopKWeightAndReduceNoOP() + + def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype: + return self.out_dtype if self.out_dtype is not None else act_dtype + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + workspace1 = (M * topk, max(N, K)) + workspace2 = (M * topk, max(N // 2, K)) + output = (M, K) + return (workspace1, workspace2, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor | None, + workspace2: torch.Tensor | None, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + assert self.w1_zp is None, "w1_zp is not supported in CUTLASS MoE" + assert self.w2_zp is None, "w2_zp is not supported in CUTLASS MoE" + + expert_num_tokens = None + activation_callable = lambda o, i: self.activation(activation, o, i) + + use_batched_format = ( + self.activation_formats[0] == mk.FusedMoEActivationFormat.BatchedExperts + ) + assert not use_batched_format, "batched format not supported" + + in_dtype = hidden_states.dtype + + run_cutlass_moe_w4a8_fp8( + output, + hidden_states, + w1, + w2, + topk_ids, + activation_callable, + global_num_experts, + expert_map, + self.w1_scale, + self.w2_scale, + a1q_scale, + a2_scale, + self.g1_alphas, # per-channel scales + self.g2_alphas, # per-channel scales + self.a_strides1, + self.a_strides2, + self.b_strides1, + self.b_strides2, + self.c_strides1, + self.c_strides2, + self.s_strides1, + self.s_strides2, + workspace13, + workspace2, + expert_num_tokens, + self.out_dtype if self.out_dtype is not None else in_dtype, + self.per_act_token_quant, + self.per_out_ch_quant, + use_batched_format, + topk_weights, + self.group_size, + ) + + +def cutlass_moe_w4a8_fp8( + a: torch.Tensor, + w1_q: torch.Tensor, + w2_q: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + a_strides1: torch.Tensor, + a_strides2: torch.Tensor, + b_strides1: torch.Tensor, + b_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + s_strides1: torch.Tensor, + s_strides2: torch.Tensor, + quant_config: FusedMoEQuantConfig, + activation: str = "silu", + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + global_num_experts: int = -1, + group_size: int = 128, +) -> torch.Tensor: + """ + This function computes a w4a8-quantized Mixture of Experts (MoE) layer + using two sets of quantized weights, w1_q and w2_q, and top-k gating + mechanism. The matrix multiplications are implemented with CUTLASS + mixed-dtype grouped gemm. + + Parameters: + - a (torch.Tensor): The input tensor to the MoE layer. + Shape: [M, K] + - w1_q (torch.Tensor): The first set of fp8-quantized expert weights. + Shape: [num_experts, 2*N, K // packed_factor] + - w2_q (torch.Tensor): The second set of fp8-quantized expert weights. + Shape: [num_experts, K, N // packed_factor] + - topk_weights (torch.Tensor): The weights of each token->expert mapping. + - topk_ids (torch.Tensor): The token->expert mappings. + - a_strides1 (torch.Tensor): The input strides for the first gemm. + Shape: [num_experts] + - a_strides2 (torch.Tensor): The input strides for the second gemm. + Shape: [num_experts] + - b_strides1 (torch.Tensor): The packed layout for the first gemm weights. + Shape: [num_experts, 3] + dtype: torch.int32 + - b_strides2 (torch.Tensor): The packed layout for the second gemm weights. + Shape: [num_experts, 3] + dtype: torch.int32 + - c_strides1 (torch.Tensor): The output strides for the first gemm. + Shape: [num_experts] + - c_strides2 (torch.Tensor): The output strides for the second gemm. + Shape: [num_experts] + - s_strides1 (torch.Tensor): strides for the group-wise scales for the first gemm. + Shape: [num_experts, 2] + dtype: torch.int64 + - s_strides2 (torch.Tensor): strides for the group-wise scales for the second gemm. + Shape: [num_experts, 2] + dtype: torch.int64 + - per_act_token (Optional[bool]): Whether the scale is per-token or + per-tensor. + - activation (str): The activation function to use. + - expert_map (Optional[torch.Tensor]): In the case of Expert parallel, + every Rank is responsible for a subset of experts. expert_map is a + mapping from global expert-id to local expert-id. When expert_map[i] + is -1, it means that this Rank is not responsible for global + expert-id i. + - apply_router_weight_on_input (bool): When true, the topk weights are + applied directly on the inputs. This is only applicable when topk is 1. + - global_num_experts (int): The total number of experts. + - group_size (int): The number of weights per scale factor + + Returns: + - torch.Tensor: The bf16 output tensor after applying the MoE layer. + """ + assert quant_config is not None + + num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(0) + + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + CutlassExpertsW4A8Fp8( + out_dtype=a.dtype, + a_strides1=a_strides1, + a_strides2=a_strides2, + b_strides1=b_strides1, + b_strides2=b_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + s_strides1=s_strides1, + s_strides2=s_strides2, + quant_config=quant_config, + group_size=group_size, + ), + ) + + return fn( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + activation=activation, + global_num_experts=num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 51d3299e7ddf1..075610ec588ae 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -367,7 +367,7 @@ class FusedMoEPrepareAndFinalize(ABC): class FusedMoEPermuteExpertsUnpermute(ABC): """ An abstract base class for the [Permute-Experts-Unpermute] step described - above. + above. """ def __init__( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b91ecb59fee18..21f4cfe51d08e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -256,7 +256,7 @@ class CompressedTensorsConfig(QuantizationConfig): if format is not None else is_activation_quantization_format(quant_format) ) - # TODO(czhu): w4a8fp8 is in packed-quantized format + # w4a8fp8 is in packed-quantized format # but needs input activation quantization input_activations = quant_config.get("input_activations") if act_quant_format or input_activations: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 8013b29f733bb..619162272c94f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -33,6 +33,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, int4_w4a16_moe_quant_config, + int4_w4afp8_moe_quant_config, int8_w8a8_moe_quant_config, int8_w8a16_moe_quant_config, nvfp4_moe_quant_config, @@ -79,7 +80,11 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( prepare_moe_fp8_layer_for_marlin, ) -from vllm.model_executor.layers.quantization.utils.quant_utils import swizzle_blockscale +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + convert_bf16_scales_to_fp8, + convert_packed_uint4b8_to_signed_int4_inplace, + swizzle_blockscale, +) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, @@ -204,6 +209,11 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): return CompressedTensorsW8A8Int8MoEMethod( weight_quant, input_quant, layer.moe_config ) + elif quant_config._is_fp8_w4a8_sm90(weight_quant, input_quant): + logger.info_once("Using CompressedTensorsW4A8Fp8MoEMethod") + return CompressedTensorsW4A8Fp8MoEMethod( + weight_quant, input_quant, layer.moe_config + ) elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant): return CompressedTensorsW4A8Int8MoEMethod( weight_quant, input_quant, layer.moe_config @@ -2428,3 +2438,331 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): apply_router_weight_on_input, int(_act_kind(activation)), ) + + +class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): + def __init__( + self, + weight_quant: QuantizationArgs, + input_quant: QuantizationArgs, + moe: FusedMoEConfig, + layer_name: str | None = None, + ): + super().__init__(moe) + self.weight_quant = weight_quant + self.input_quant = input_quant + + self.group_size = self.weight_quant.group_size + self.num_bits = self.weight_quant.num_bits + self.packed_factor = 32 // self.num_bits + + assert self.weight_quant.symmetric, ( + "Only symmetric quantization is supported for W4A8 MoE" + ) + assert self.weight_quant.actorder != "group" + assert self.group_size == 128, "Only group size 128 supported for W4A8 MoE" + + self.disable_expert_map = False + self.layer_name = layer_name + + from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 + from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + ) + + self.quant_fp8 = QuantFP8(static=False, group_shape=GroupShape.PER_TOKEN) + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + layer.intermediate_size_per_partition = intermediate_size_per_partition + layer.hidden_size = hidden_size + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None + + # requirement for CUTLASS reorder_tensor + assert hidden_size % 256 == 0, f"{hidden_size=} must be divisible by 256" + assert intermediate_size_per_partition % 256 == 0, ( + f"{intermediate_size_per_partition=} must be divisible by 256" + ) + # storage type, pack 8xint4 into int32 + params_dtype = torch.int32 + + # WEIGHTS + w13_weight_packed = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.packed_factor, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_packed", w13_weight_packed) + set_weight_attrs(w13_weight_packed, extra_weight_attrs) + + w2_weight_packed = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // self.packed_factor, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_packed", w2_weight_packed) + set_weight_attrs(w2_weight_packed, extra_weight_attrs) + + # SCALES + # weight_scale refers to the group-wise scales + # they are initially loaded as bf16, we will convert to fp8 + # after loading + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.group_size, + dtype=layer.orig_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + + w2_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + hidden_size, + intermediate_size_per_partition // self.group_size, + dtype=layer.orig_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-GROUP quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.GROUP.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # weight shapes + w2_weight_shape = torch.nn.Parameter( + torch.empty(num_experts, 2), requires_grad=False + ) + layer.register_parameter("w2_weight_shape", w2_weight_shape) + set_weight_attrs(w2_weight_shape, extra_weight_attrs) + w13_weight_shape = torch.nn.Parameter( + torch.empty(num_experts, 2), requires_grad=False + ) + layer.register_parameter("w13_weight_shape", w13_weight_shape) + set_weight_attrs(w13_weight_shape, extra_weight_attrs) + + # don't use input scales + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer): + device = layer.w13_weight_packed.device + + # STRIDES + # A, C + self.a_strides1_c_strides2 = torch.full( + (layer.local_num_experts,), + layer.hidden_size, + device=device, + dtype=torch.int64, + ) + self.a_strides2 = torch.full( + (layer.local_num_experts,), + layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64, + ) + self.c_strides1 = torch.full( + (layer.local_num_experts,), + 2 * layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64, + ) + + # S (group-wise scales) + # sizeof(StrideS) = 16 bytes, so we need to use 2xint64 to encode it + self.s_strides1 = torch.zeros( + (layer.local_num_experts, 2), device=device, dtype=torch.int64 + ) + self.s_strides1[:, 0] = 2 * layer.intermediate_size_per_partition + + self.s_strides2 = torch.zeros( + (layer.local_num_experts, 2), device=device, dtype=torch.int64 + ) + self.s_strides2[:, 0] = layer.hidden_size + + # encode and reorder weight tensors, and get the layout to pass to + # the grouped gemm kernel. `b_strides1/2` specifies the entire layout + convert_packed_uint4b8_to_signed_int4_inplace(layer.w13_weight_packed) + w13_weight_shuffled, self.b_strides1 = ( + ops.cutlass_encode_and_reorder_int4b_grouped(layer.w13_weight_packed) + ) + replace_parameter(layer, "w13_weight_packed", w13_weight_shuffled) + convert_packed_uint4b8_to_signed_int4_inplace(layer.w2_weight_packed) + w2_weight_shuffled, self.b_strides2 = ( + ops.cutlass_encode_and_reorder_int4b_grouped(layer.w2_weight_packed) + ) + replace_parameter(layer, "w2_weight_packed", w2_weight_shuffled) + + # convert bf16 scales to (fp8_scales, channel_scales) + w13_weight_scale, w13_weight_chan_scale = convert_bf16_scales_to_fp8( + self.quant_fp8, layer.w13_weight_scale + ) + w2_weight_scale, w2_weight_chan_scale = convert_bf16_scales_to_fp8( + self.quant_fp8, layer.w2_weight_scale + ) + + # register channel scales + layer.register_parameter( + "w13_weight_chan_scale", + torch.nn.Parameter(w13_weight_chan_scale, requires_grad=False), + ) + layer.register_parameter( + "w2_weight_chan_scale", + torch.nn.Parameter(w2_weight_chan_scale, requires_grad=False), + ) + + # The scales are stored as (E, N, K // 128) but the kernel expects + # (E, K // 128, N) in row-major format, so we need to permute the last 2 dims + # and make it contiguous + w13_weight_scale_packed = ops.cutlass_pack_scale_fp8( + w13_weight_scale.permute(0, 2, 1).contiguous() + ) + replace_parameter(layer, "w13_weight_scale", w13_weight_scale_packed) + w2_weight_scale_packed = ops.cutlass_pack_scale_fp8( + w2_weight_scale.permute(0, 2, 1).contiguous() + ) + replace_parameter(layer, "w2_weight_scale", w2_weight_scale_packed) + + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: + return super().maybe_make_prepare_finalize(routing_tables) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + # Store quantization scales; both per-group and per-channel + # Note we haven't specified the group size here because + # the quant config logic assumes group-wise scaling + # and channel-wise scaling are exclusive. + return int4_w4afp8_moe_quant_config( + w1_scale=layer.w13_weight_scale, # group scale + w2_scale=layer.w2_weight_scale, # group scale + g1_alphas=layer.w13_weight_chan_scale, + g2_alphas=layer.w2_weight_chan_scale, + per_act_token_quant=True, # always use dynamc per-token + per_out_ch_quant=True, # always use per-channel + ) + + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + layer: torch.nn.Module, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + assert self.moe_quant_config is not None + assert ( + prepare_finalize.activation_format == FusedMoEActivationFormat.Standard + ), "BatchedExperts not supported" + + from vllm.model_executor.layers.fused_moe import CutlassExpertsW4A8Fp8 + + experts: FusedMoEPermuteExpertsUnpermute + + logger.debug("CutlassExpertsW4A8Fp8(%s)", self.__class__.__name__) + experts = CutlassExpertsW4A8Fp8( + out_dtype=self.moe.in_dtype, + a_strides1=self.a_strides1_c_strides2, + a_strides2=self.a_strides2, + b_strides1=self.b_strides1, + b_strides2=self.b_strides2, + c_strides1=self.c_strides1, + c_strides2=self.a_strides1_c_strides2, + s_strides1=self.s_strides1, + s_strides2=self.s_strides2, + quant_config=self.moe_quant_config, + group_size=self.group_size, + ) + + num_dispatchers = prepare_finalize.num_dispatchers() + self.disable_expert_map = ( + num_dispatchers > 1 or not experts.supports_expert_map() + ) + + return experts + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ): + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet." + ) + assert self.moe_quant_config is not None + topk_weights, topk_ids, _ = layer.select_experts( + hidden_states=x, + router_logits=router_logits, + ) + + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + cutlass_moe_w4a8_fp8, + ) + + return cutlass_moe_w4a8_fp8( + x, + layer.w13_weight_packed, + layer.w2_weight_packed, + topk_weights, + topk_ids, + quant_config=self.moe_quant_config, + activation=activation, + global_num_experts=global_num_experts, + expert_map=None if self.disable_expert_map else expert_map, + a_strides1=self.a_strides1_c_strides2, + a_strides2=self.a_strides2, + b_strides1=self.b_strides1, + b_strides2=self.b_strides2, + c_strides1=self.c_strides1, + c_strides2=self.a_strides1_c_strides2, + s_strides1=self.s_strides1, + s_strides2=self.s_strides2, + group_size=self.group_size, + ) + + @property + def supports_eplb(self) -> bool: + return False diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py index a23961e897534..9a25e08cbad75 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py @@ -128,14 +128,15 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): ), ) - # TODO(czhu): allocate the packed fp8 scales memory here? - # the scales will be expanded by 8x via `cutlass_pack_scale_fp8` + # After loading, we will transform bf16 -> fp8 -> + # expand by 8x via `cutlass_pack_scale_fp8` + # and construct per-channel fp32 scales. weight_scale_args = { "weight_loader": weight_loader, "data": torch.empty( output_size_per_partition, scales_and_zp_size, - dtype=torch.float8_e4m3fn, + dtype=params_dtype, ), } @@ -152,17 +153,9 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader ) - # per-channel scales - weight_chan_scale = ChannelQuantScaleParameter( - data=torch.empty((output_size_per_partition, 1), dtype=torch.float32), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight_packed", weight) layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_shape", weight_shape) - layer.register_parameter("weight_chan_scale", weight_chan_scale) self.kernel = kernel_type( mp_linear_kernel_config, diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py index 8ef6457c952f1..c9c1a3abf7fd3 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py @@ -6,7 +6,11 @@ import torch from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 -from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + convert_bf16_scales_to_fp8, + convert_packed_uint4b8_to_signed_int4_inplace, +) from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_ from vllm.platforms import current_platform from vllm.scalar_type import scalar_types @@ -48,7 +52,6 @@ class CutlassW4A8LinearKernel(MPLinearKernel): "CUTLASS W4A8, only supported int4", ) - # TODO(czhu): support -1 (column-wise) if c.group_size != 128: return False, "Only group_size 128 is supported" @@ -71,9 +74,9 @@ class CutlassW4A8LinearKernel(MPLinearKernel): # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} # `weight_scale` is: {input_dim = 0, output_dim = 1} def process_weights_after_loading(self, layer: torch.nn.Module): - # TODO(czhu): optimize speed/mem usage def transform_w_q(x): assert isinstance(x, BasevLLMParameter) + convert_packed_uint4b8_to_signed_int4_inplace(x.data) permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0) x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t()) return x @@ -85,10 +88,18 @@ class CutlassW4A8LinearKernel(MPLinearKernel): x.data = ops.cutlass_pack_scale_fp8(x.data) return x + w_s = getattr(layer, self.w_s_name) + fp8_scales, chan_scales = convert_bf16_scales_to_fp8(self.quant_fp8, w_s.data) + w_s.data = fp8_scales + + # register per-channel scales + layer.register_parameter( + "weight_chan_scale", torch.nn.Parameter(chan_scales, requires_grad=False) + ) + # Encode/reorder weights and pack scales self._transform_param(layer, self.w_q_name, transform_w_q) self._transform_param(layer, self.w_s_name, transform_w_s) - self._transform_param(layer, "weight_chan_scale", lambda x: x) def apply_weights( self, diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 92ee8c498e01f..d01263f82007d 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This file is used for /tests and /benchmarks""" -from collections.abc import Mapping +from collections.abc import Callable, Mapping from dataclasses import dataclass from types import MappingProxyType from typing import ClassVar, NamedTuple @@ -691,3 +691,51 @@ def cutlass_fp4_supported() -> bool: capability_tuple = current_platform.get_device_capability() capability = -1 if capability_tuple is None else capability_tuple.to_int() return cutlass_scaled_mm_supports_fp4(capability) + + +def convert_bf16_scales_to_fp8( + quant_fp8: Callable, scales: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Convert a BF16 scale tensor into the pair of (fp8_scales, channel_scales) + expected by W4A8 GEMM kernels. + """ + assert scales.is_contiguous(), ( + f"scale tensor must be contiguous, got {scales.stride()=}" + ) + assert scales.is_cuda, "scales must be on gpu" + + orig_shape = scales.shape + k_groups = orig_shape[-1] + flat_scales = scales.view(-1, k_groups) + + fp8_scales, chan_scales = quant_fp8(flat_scales) + fp8_scales = (fp8_scales.float() / 8.0).to(torch.float8_e4m3fn) + chan_scales *= 8.0 + + # restore original shape + fp8_scales = fp8_scales.view(orig_shape) + chan_scales = chan_scales.view(orig_shape[:-1], -1) + + return fp8_scales, chan_scales + + +def convert_packed_uint4b8_to_signed_int4_inplace(t: torch.Tensor) -> torch.Tensor: + """ + Convert int4b8 (packed to int32) to signed int4 + """ + assert t.is_cuda, "tensor must be on gpu" + assert t.dtype == torch.int32, f"expected int32 packed weights but got {t.dtype}" + + # loop through the 8 4-bit nibbles in each int32 entry + for i in range(8): + shift = 4 * i + # extract the i-th 4-bit nibble + nib = (t >> shift) & 0xF + # clear the original nibble by masking out + t &= ~(0xF << shift) + # convert int4b8 [0..15] to signed int4 [-8..7] by subtracting 8 + # and update in-place + t |= ((nib - 8) & 0xF) << shift + + return t From 03b91f726214b4a022f73a084ee283001e5bba0c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 8 Dec 2025 23:44:28 -0500 Subject: [PATCH 093/133] [Bugfix] Fix compressed-tensors models failing to load with transformers backend (#30287) Signed-off-by: mgoin Signed-off-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../compressed_tensors/compressed_tensors.py | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 21f4cfe51d08e..f835584219cca 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -116,16 +116,37 @@ class CompressedTensorsConfig(QuantizationConfig): return "compressed-tensors" def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - self.target_scheme_map = hf_to_vllm_mapper.apply_dict(self.target_scheme_map) - self.ignore = hf_to_vllm_mapper.apply_list(self.ignore) - self.sparsity_scheme_map = hf_to_vllm_mapper.apply_dict( - self.sparsity_scheme_map - ) - self.sparsity_ignore_list = hf_to_vllm_mapper.apply_list( - self.sparsity_ignore_list - ) + """ + Transform layer paths in config targets to match vLLM's naming. + + The WeightsMapper is designed for weight paths, but some backends + (e.g. transformers) use broad prefix mappings like "" -> "model." + which would incorrectly transform non-path targets. + + compressed-tensors targets can be: + - Layer paths: "layers.0.self_attn.q_proj" -> transformed + - Module class names: "Linear" -> preserved (no ".") + - Regex patterns: "re:.*proj" -> preserved (starts with "re:") + """ + + def _map_target(target: str) -> str | None: + is_layer_path = "." in target and not target.startswith("re:") + if is_layer_path: + return hf_to_vllm_mapper._map_name(target) + return target + + def _apply_dict(d: dict) -> dict: + return {k: v for t, v in d.items() if (k := _map_target(t)) is not None} + + def _apply_list(lst: list) -> list: + return [t for x in lst if (t := _map_target(x)) is not None] + + self.target_scheme_map = _apply_dict(self.target_scheme_map) + self.ignore = _apply_list(self.ignore) + self.sparsity_scheme_map = _apply_dict(self.sparsity_scheme_map) + self.sparsity_ignore_list = _apply_list(self.sparsity_ignore_list) if self.kv_cache_scheme is not None: - self.kv_cache_scheme = hf_to_vllm_mapper.apply_dict(self.kv_cache_scheme) + self.kv_cache_scheme = _apply_dict(self.kv_cache_scheme) def get_quant_method( self, From 4c6fd258808ed42fc98a94f3a849f5fc9efebc20 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Tue, 9 Dec 2025 06:46:09 +0200 Subject: [PATCH 094/133] kv_transfer: Rename the shared storage connectors (#30201) Signed-off-by: Or Ozeri --- .../scripts/hardware_ci/run-xpu-test.sh | 2 +- docs/features/disagg_encoder.md | 6 +- docs/features/disagg_prefill.md | 4 +- .../decode_example.py | 2 +- .../prefill_example.py | 2 +- .../kv_load_failure_recovery/README.md | 4 +- .../decode_example.py | 6 +- ....py => load_recovery_example_connector.py} | 20 ++--- .../prefill_example.py | 2 +- .../disaggregated_encoder/README.md | 6 +- .../disagg_1e1p1d_example.sh | 4 +- .../disagg_1e1pd_example.sh | 4 +- tests/distributed/test_kvlayout.py | 2 +- tests/v1/core/test_scheduler.py | 6 +- tests/v1/core/utils.py | 4 +- .../integration/run_epd_correctness_test.sh | 8 +- ...nector.py => test_ec_example_connector.py} | 86 +++++++++---------- tests/v1/engine/test_engine_core.py | 4 +- .../unit/test_backwards_compatibility.py | 8 +- ...connector.py => test_example_connector.py} | 6 +- .../unit/test_kv_connector_lifecyle.py | 10 +-- .../kv_connector/unit/test_multi_connector.py | 22 ++--- tests/v1/kv_connector/unit/utils.py | 10 +-- ...rage_connector.py => example_connector.py} | 8 +- .../ec_transfer/ec_connector/factory.py | 6 +- .../kv_transfer/kv_connector/factory.py | 6 +- ...rage_connector.py => example_connector.py} | 10 +-- 27 files changed, 129 insertions(+), 129 deletions(-) rename examples/offline_inference/kv_load_failure_recovery/{rogue_shared_storage_connector.py => load_recovery_example_connector.py} (88%) rename tests/v1/ec_connector/unit/{test_ec_shared_storage_connector.py => test_ec_example_connector.py} (90%) rename tests/v1/kv_connector/unit/{test_shared_storage_connector.py => test_example_connector.py} (97%) rename vllm/distributed/ec_transfer/ec_connector/{shared_storage_connector.py => example_connector.py} (96%) rename vllm/distributed/kv_transfer/kv_connector/v1/{shared_storage_connector.py => example_connector.py} (98%) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 1d5dba3f26f5a..dfc9db512d1e9 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -47,6 +47,6 @@ docker run \ pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/test_serial_utils.py ' diff --git a/docs/features/disagg_encoder.md b/docs/features/disagg_encoder.md index 7d40af7069822..f18a0e85e4b3b 100644 --- a/docs/features/disagg_encoder.md +++ b/docs/features/disagg_encoder.md @@ -32,14 +32,14 @@ Design doc: None: - assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata) + assert isinstance(connector_metadata, LoadRecoveryExampleConnectorMetadata) index, failed_request = next( ( (i, x) @@ -84,7 +84,7 @@ class RogueSharedStorageConnector(SharedStorageConnector): ) -> tuple[set[str] | None, set[str] | None]: if self._async_load: meta = self._get_connector_metadata() - assert isinstance(meta, RogueSharedStorageConnectorMetadata) + assert isinstance(meta, LoadRecoveryExampleConnectorMetadata) if meta.req_to_block_ids: return None, set(meta.req_to_block_ids) @@ -126,9 +126,9 @@ class RogueSharedStorageConnector(SharedStorageConnector): ) -> KVConnectorMetadata: if not self._async_load: base = super().build_connector_meta(scheduler_output) - meta = RogueSharedStorageConnectorMetadata.from_base(base) + meta = LoadRecoveryExampleConnectorMetadata.from_base(base) else: - meta = RogueSharedStorageConnectorMetadata() + meta = LoadRecoveryExampleConnectorMetadata() if self._requests_need_load: for req_id, request in self._requests_need_load.items(): meta.add_request( diff --git a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py index 047b81c82df53..ee4a84fd95003 100644 --- a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py +++ b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py @@ -26,7 +26,7 @@ def main(): enforce_eager=True, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ), diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md index 5813a3cecf73b..b2c3bb974dfab 100644 --- a/examples/online_serving/disaggregated_encoder/README.md +++ b/examples/online_serving/disaggregated_encoder/README.md @@ -50,12 +50,12 @@ The vllm instances and `disagg_encoder_proxy` supports local URIs with ```{"url" ## EC connector and KV transfer -The `ECSharedStorageConnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration: +The `ECExampleonnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration: ```bash # Add to encoder instance: --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -64,7 +64,7 @@ The `ECSharedStorageConnector` is used to store the encoder cache on local disk # Add to prefill/prefill+decode instance: --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh index 57489df64f51e..95a418374ad28 100644 --- a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh +++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh @@ -102,7 +102,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -126,7 +126,7 @@ vllm serve "$MODEL" \ --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh index 6073e0580b11d..c4a591d7438cb 100644 --- a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh +++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh @@ -96,7 +96,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -117,7 +117,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \ --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py index b190b2820451b..c8177f1c7c2ff 100644 --- a/tests/distributed/test_kvlayout.py +++ b/tests/distributed/test_kvlayout.py @@ -61,7 +61,7 @@ def test_get_kv_connector_cache_layout_with_multi_connector(): kv_role="kv_both", kv_connector_extra_config={ "connectors": [ - {"kv_connector": "SharedStorageConnector", "kv_role": "kv_both"}, + {"kv_connector": "ExampleConnector", "kv_role": "kv_both"}, {"kv_connector": "NixlConnector", "kv_role": "kv_both"}, ] }, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index c6c4a5085bff7..1999e9f6c3b99 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1536,7 +1536,7 @@ def create_scheduler_with_priority( ) kv_transfer_config = ( KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) @@ -1552,7 +1552,7 @@ def create_scheduler_with_priority( ec_transfer_config = ( ECTransferConfig( - ec_connector="ECSharedStorageConnector", + ec_connector="ECExampleConnector", ec_role=ec_role, ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"}, ) @@ -2413,7 +2413,7 @@ def _assert_right_ec_connector_metadata( metadata_dict = {mm_data.mm_hash: mm_data for mm_data in metadata.mm_datas} # Check all required identifiers exist in metadata; and no extra - # In ECSharedStorageConnector format + # In ECExampleConnector format # NOTE: even having same identifier, the mm_features can be different # since their mm_position can be in different offsets, etc identifiers_dict = {f.identifier for f in mm_features_list} diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index f5ba613d38db1..531b9c595b04d 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -108,7 +108,7 @@ def create_scheduler( ) elif use_kv_connector: kv_transfer_config = KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) @@ -121,7 +121,7 @@ def create_scheduler( ec_transfer_config = ( ECTransferConfig( - ec_connector="ECSharedStorageConnector", + ec_connector="ECExampleConnector", ec_role=ec_role, ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"}, ) diff --git a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh index 55dd39c0a957f..0c2666306558c 100644 --- a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh +++ b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh @@ -148,7 +148,7 @@ run_epd_1e_1pd() { --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -167,7 +167,7 @@ run_epd_1e_1pd() { --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -348,7 +348,7 @@ run_epd_1e_1p_1d() { --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -369,7 +369,7 @@ run_epd_1e_1p_1d() { --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" diff --git a/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py b/tests/v1/ec_connector/unit/test_ec_example_connector.py similarity index 90% rename from tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py rename to tests/v1/ec_connector/unit/test_ec_example_connector.py index a58daa2628e21..7e9eb21310031 100644 --- a/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py +++ b/tests/v1/ec_connector/unit/test_ec_example_connector.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Unit tests for ECSharedStorageConnector. +Unit tests for ECExampleConnector. """ import os @@ -13,9 +13,9 @@ import torch from vllm.config import VllmConfig from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorRole -from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import ( - ECSharedStorageConnector, - ECSharedStorageConnectorMetadata, +from vllm.distributed.ec_transfer.ec_connector.example_connector import ( + ECExampleConnector, + ECExampleConnectorMetadata, MMMeta, ) from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange @@ -81,12 +81,12 @@ def mock_request_with_3_mm(): # ------------------ Unit Tests ------------------ # -class TestECSharedStorageConnectorBasics: +class TestECExampleConnectorBasics: """Test basic EC connector functionality.""" def test_initialization_producer(self, mock_vllm_config_producer, temp_storage): """Test connector initializes correctly as producer.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -98,7 +98,7 @@ class TestECSharedStorageConnectorBasics: def test_initialization_consumer(self, mock_vllm_config_consumer, temp_storage): """Test connector initializes correctly as consumer.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) @@ -109,11 +109,11 @@ class TestECSharedStorageConnectorBasics: def test_role_assignment(self, mock_vllm_config_producer): """Test role is correctly assigned.""" - scheduler_connector = ECSharedStorageConnector( + scheduler_connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) - worker_connector = ECSharedStorageConnector( + worker_connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -133,7 +133,7 @@ class TestCacheExistence: ): """Test has_caches returns True when all 3 caches exist.""" # Test for producer first - producer = ECSharedStorageConnector( + producer = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -154,7 +154,7 @@ class TestCacheExistence: assert all(producer_result), f"Expected all True, got {producer_result}" # Also test consumer can check if cache exists - consumer = ECSharedStorageConnector( + consumer = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.SCHEDULER, ) @@ -170,7 +170,7 @@ class TestCacheExistence: self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test has_caches returns False when no caches exist.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -186,7 +186,7 @@ class TestCacheExistence: self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test has_caches with some caches existing (1 of 3).""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -213,7 +213,7 @@ class TestStateManagement: self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test state update after allocation for 3 MM items.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -238,7 +238,7 @@ class TestStateManagement: self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test metadata building for 3 MM items.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -252,7 +252,7 @@ class TestStateManagement: metadata = connector.build_connector_meta(scheduler_output) # Assert - assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert isinstance(metadata, ECExampleConnectorMetadata) assert len(metadata.mm_datas) == 3 assert metadata.mm_datas[0].mm_hash == "img_hash_1" assert metadata.mm_datas[0].num_token == 100 @@ -266,7 +266,7 @@ class TestStateManagement: def test_build_connector_meta_empty(self, mock_vllm_config_producer): """Test metadata building with empty state.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -274,14 +274,14 @@ class TestStateManagement: scheduler_output = Mock(spec=SchedulerOutput) metadata = connector.build_connector_meta(scheduler_output) - assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert isinstance(metadata, ECExampleConnectorMetadata) assert len(metadata.mm_datas) == 0 def test_state_cleared_after_metadata_build( self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test that state is properly cleared after building metadata.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -310,7 +310,7 @@ class TestCacheSaving: self, mock_vllm_config_producer, mock_request_with_3_mm, temp_storage ): """Test cache saving as producer for 3 different MM items.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -336,7 +336,7 @@ class TestCacheSaving: def test_save_caches_consumer_skips(self, mock_vllm_config_consumer): """Test cache saving is skipped for consumer.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) @@ -366,7 +366,7 @@ class TestCacheLoading: ): """Test consumer loads 3 caches from storage.""" # First, create producer to save caches - producer = ECSharedStorageConnector( + producer = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -379,13 +379,13 @@ class TestCacheLoading: producer.save_caches(saved_caches, mm_hash) # Now consumer loads - consumer = ECSharedStorageConnector( + consumer = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) # Setup metadata for all 3 - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() for mm_hash in mm_hashes: metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100)) consumer.bind_connector_metadata(metadata) @@ -410,7 +410,7 @@ class TestCacheLoading: ): """Test cache loading skips already cached items.""" # Setup: producer saves cache - producer = ECSharedStorageConnector( + producer = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -420,12 +420,12 @@ class TestCacheLoading: producer.save_caches({mm_hash: saved_cache}, mm_hash) # Consumer setup - consumer = ECSharedStorageConnector( + consumer = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100)) consumer.bind_connector_metadata(metadata) @@ -444,13 +444,13 @@ class TestCacheLoading: def test_start_load_caches_empty_metadata(self, mock_vllm_config_consumer): """Test loading with empty metadata does nothing.""" - consumer = ECSharedStorageConnector( + consumer = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) # Setup empty metadata - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() consumer.bind_connector_metadata(metadata) # Load (should not raise) @@ -466,7 +466,7 @@ class TestFilenameGeneration: def test_generate_foldername(self, mock_vllm_config_producer, temp_storage): """Test folder name generation.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -479,7 +479,7 @@ class TestFilenameGeneration: def test_generate_filename(self, mock_vllm_config_producer, temp_storage): """Test filename generation.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -493,7 +493,7 @@ class TestFilenameGeneration: def test_generate_filename_consistency(self, mock_vllm_config_producer): """Test filename generation is consistent.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -510,12 +510,12 @@ class TestMetadataBindingLifecycle: def test_bind_connector_metadata(self, mock_vllm_config_consumer): """Test binding connector metadata.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() metadata.add_mm_data(MMMeta.make_meta("hash_1", 100)) connector.bind_connector_metadata(metadata) @@ -524,12 +524,12 @@ class TestMetadataBindingLifecycle: def test_clear_connector_metadata(self, mock_vllm_config_consumer): """Test clearing connector metadata.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() connector.bind_connector_metadata(metadata) connector.clear_connector_metadata() @@ -538,12 +538,12 @@ class TestMetadataBindingLifecycle: def test_get_connector_metadata(self, mock_vllm_config_consumer): """Test getting connector metadata.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() connector.bind_connector_metadata(metadata) retrieved = connector._get_connector_metadata() @@ -552,7 +552,7 @@ class TestMetadataBindingLifecycle: def test_get_connector_metadata_not_set(self, mock_vllm_config_consumer): """Test getting metadata when not set raises.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) @@ -566,7 +566,7 @@ class TestEdgeCases: def test_save_empty_cache(self, mock_vllm_config_producer): """Test saving empty tensor.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -579,12 +579,12 @@ class TestEdgeCases: def test_load_nonexistent_cache(self, mock_vllm_config_consumer): """Test loading cache that doesn't exist raises error.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() metadata.add_mm_data(MMMeta.make_meta("nonexistent_hash", 100)) connector.bind_connector_metadata(metadata) @@ -596,7 +596,7 @@ class TestEdgeCases: def test_has_caches_empty_request(self, mock_vllm_config_producer): """Test has_caches with request that has no MM data.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 48be8c15aba9e..5fa16897b4e0c 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -507,7 +507,7 @@ def test_encoder_instance_zero_kv_cache( ) kv_transfer_config = ( KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) @@ -515,7 +515,7 @@ def test_encoder_instance_zero_kv_cache( else None ) ec_transfer_config = ECTransferConfig( - ec_connector="ECSharedStorageConnector", + ec_connector="ECExampleConnector", ec_role=ec_role, ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test_encoder"}, ) diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py index 7cd23805c599d..0d29ca5fca5e5 100644 --- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py +++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py @@ -218,12 +218,12 @@ def test_internal_connector_uses_new_signature(): Test that internal connectors (registered in factory) always use the new signature and get kv_cache_config. """ - from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( - SharedStorageConnector, + from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import ( + ExampleConnector, ) vllm_config = create_vllm_config() - vllm_config.kv_transfer_config.kv_connector = "SharedStorageConnector" + vllm_config.kv_transfer_config.kv_connector = "ExampleConnector" scheduler = create_scheduler(vllm_config) kv_cache_config = scheduler.kv_cache_config @@ -233,7 +233,7 @@ def test_internal_connector_uses_new_signature(): ) assert connector is not None - assert isinstance(connector, SharedStorageConnector) + assert isinstance(connector, ExampleConnector) assert connector._kv_cache_config is not None assert connector._kv_cache_config == kv_cache_config diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py similarity index 97% rename from tests/v1/kv_connector/unit/test_shared_storage_connector.py rename to tests/v1/kv_connector/unit/test_example_connector.py index ff4697a978255..75edb79fb4af4 100644 --- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py +++ b/tests/v1/kv_connector/unit/test_example_connector.py @@ -119,16 +119,16 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]): ) def test_shared_storage_connector_hashes(tmp_path): """ - Tests that SharedStorageConnector saves KV to the storage locations + Tests that ExampleConnector saves KV to the storage locations with proper hashes; that are unique for inputs with identical text but different images (same size), or same multiple images but different orders. """ # Using tmp_path as the storage path to store KV print(f"KV storage path at: {str(tmp_path)}") - # Configure the SharedStorageConnector + # Configure the ExampleConnector kv_transfer_config = KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": str(tmp_path)}, ) diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py index d0a6eeae6286d..4ba6b2201d0e2 100644 --- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa: E501 - SharedStorageConnectorMetadata, +from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import ( # noqa: E501 + ExampleConnectorMetadata, ) from vllm.distributed.kv_transfer.kv_transfer_state import ( ensure_kv_transfer_initialized, @@ -11,7 +11,7 @@ from vllm.distributed.kv_transfer.kv_transfer_state import ( from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin -# Importing utils registers TestSharedStorageConnector with the factory +# Importing utils registers TestExampleConnector with the factory from .utils import create_vllm_config @@ -26,13 +26,13 @@ def _make_empty_scheduler_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - kv_connector_metadata=SharedStorageConnectorMetadata(), + kv_connector_metadata=ExampleConnectorMetadata(), ) def test_kv_connector_mixin_clears_metadata(): vllm_config = create_vllm_config() - vllm_config.kv_transfer_config.kv_connector = "TestSharedStorageConnector" + vllm_config.kv_transfer_config.kv_connector = "TestExampleConnector" vllm_config.kv_transfer_config.kv_role = "kv_both" vllm_config.kv_transfer_config.kv_connector_extra_config["name"] = "unit" diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py index ffa7d884d2762..9b6d52e7c294d 100644 --- a/tests/v1/kv_connector/unit/test_multi_connector.py +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -77,9 +77,9 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool: "https://github.com/ROCm/pytorch/issues/2822" ), ) -def test_multi_shared_storage_connector_consistency(): +def test_multi_example_connector_consistency(): """ - Tests that MultiConnector with two SharedStorageConnectors saves + Tests that MultiConnector with two ExampleConnectors saves identical KV cache data to separate storage locations. """ storage_1_path = Path("storage_1/") @@ -89,14 +89,14 @@ def test_multi_shared_storage_connector_consistency(): storage_1_path.mkdir() storage_2_path.mkdir() - # Configure MultiConnector with two SharedStorageConnectors + # Configure MultiConnector with two ExampleConnectors kv_transfer_config = KVTransferConfig( kv_connector="MultiConnector", kv_role="kv_both", kv_connector_extra_config={ "connectors": [ { - "kv_connector": "TestSharedStorageConnector", + "kv_connector": "TestExampleConnector", "kv_role": "kv_both", "kv_connector_extra_config": { "shared_storage_path": str(storage_1_path), @@ -105,7 +105,7 @@ def test_multi_shared_storage_connector_consistency(): "kv_connector_module_path": "tests.v1.kv_connector.unit.utils", }, { - "kv_connector": "TestSharedStorageConnector", + "kv_connector": "TestExampleConnector", "kv_role": "kv_both", "kv_connector_extra_config": { "shared_storage_path": str(storage_2_path), @@ -427,7 +427,7 @@ class TestMultiConnectorStats: def test_build_kv_connector_stats_skips_connectors_without_custom_stats(self): """Test that connectors without custom stats (return None) are skipped.""" - # SharedStorageConnector doesn't override build_kv_connector_stats, + # ExampleConnector doesn't override build_kv_connector_stats, # so it returns None and should be skipped serialized_data = { "NixlConnector": { @@ -440,7 +440,7 @@ class TestMultiConnectorStats: "num_failed_notifications": [], } }, - "SharedStorageConnector": {"data": {"some_field": [1, 2, 3]}}, + "ExampleConnector": {"data": {"some_field": [1, 2, 3]}}, } stats = MultiConnector.build_kv_connector_stats(data=serialized_data) @@ -451,8 +451,8 @@ class TestMultiConnectorStats: assert len(stats.data) == 1 assert "NixlConnector" in stats.data assert isinstance(stats.data["NixlConnector"], NixlKVConnectorStats) - # SharedStorageConnector should be skipped (returns None) - assert "SharedStorageConnector" not in stats.data + # ExampleConnector should be skipped (returns None) + assert "ExampleConnector" not in stats.data def test_build_kv_connector_stats_handles_malformed_data(self): """Test that malformed data raises appropriate errors.""" @@ -527,13 +527,13 @@ class TestMultiConnectorStats: ) stats2 = MultiKVConnectorStats( - data={"SharedStorageConnector": KVConnectorStats(data={"field": [1, 2]})} + data={"ExampleConnector": KVConnectorStats(data={"field": [1, 2]})} ) result = stats1.aggregate(stats2) assert "NixlConnector" in result.data - assert "SharedStorageConnector" in result.data + assert "ExampleConnector" in result.data def test_reduce(self): """Test that reduce() correctly reduces all nested connector stats.""" diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 58f1a7282352b..5cdb1f84b30d4 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -24,8 +24,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorMetadata, KVConnectorRole, ) -from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa - SharedStorageConnector, +from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import ( # noqa + ExampleConnector, ) from vllm.utils.hashing import sha256 from vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -264,10 +264,10 @@ def create_model_runner_output( ) -class TestSharedStorageConnector(SharedStorageConnector): +class TestExampleConnector(ExampleConnector): def __init__(self, config: VllmConfig, role, kv_cache_config): self.name = config.kv_transfer_config.kv_connector_extra_config["name"] - self._connector = SharedStorageConnector(config, role) + self._connector = ExampleConnector(config, role) self.call_record: dict[str, int] = defaultdict(int) # Use a unique temp file per connector self._event_file = ( @@ -394,7 +394,7 @@ class MockKVConnector(KVConnectorBase_V1): KVConnectorFactory.register_connector( - "TestSharedStorageConnector", __name__, TestSharedStorageConnector.__name__ + "TestExampleConnector", __name__, TestExampleConnector.__name__ ) KVConnectorFactory.register_connector( diff --git a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py similarity index 96% rename from vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py rename to vllm/distributed/ec_transfer/ec_connector/example_connector.py index c8388141dcc97..5f2eff5a8e6a8 100644 --- a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py @@ -32,7 +32,7 @@ class MMMeta: @dataclass -class ECSharedStorageConnectorMetadata(ECConnectorMetadata): +class ECExampleConnectorMetadata(ECConnectorMetadata): mm_datas: list[MMMeta] def __init__(self): @@ -42,7 +42,7 @@ class ECSharedStorageConnectorMetadata(ECConnectorMetadata): self.mm_datas.append(mm_data) -class ECSharedStorageConnector(ECConnectorBase): +class ECExampleConnector(ECConnectorBase): # NOTE: This is Simple debug implementation of the EC connector. # It save / load the EC cache to / from the disk. @@ -76,7 +76,7 @@ class ECSharedStorageConnector(ECConnectorBase): # Get the metadata metadata: ECConnectorMetadata = self._get_connector_metadata() - assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert isinstance(metadata, ECExampleConnectorMetadata) assert encoder_cache is not None if metadata is None: logger.warning( @@ -160,7 +160,7 @@ class ECSharedStorageConnector(ECConnectorBase): Args: scheduler_output (SchedulerOutput): the scheduler output object. """ - meta = ECSharedStorageConnectorMetadata() + meta = ECExampleConnectorMetadata() for mm_hash, num_encoder_token in self._mm_datas_need_loads.items(): meta.add_mm_data(MMMeta.make_meta(mm_hash, num_encoder_token)) self._mm_datas_need_loads.clear() diff --git a/vllm/distributed/ec_transfer/ec_connector/factory.py b/vllm/distributed/ec_transfer/ec_connector/factory.py index e51b32e6f6dff..32f36ffbb14d2 100644 --- a/vllm/distributed/ec_transfer/ec_connector/factory.py +++ b/vllm/distributed/ec_transfer/ec_connector/factory.py @@ -79,7 +79,7 @@ class ECConnectorFactory: # only load the files corresponding to the current connector. ECConnectorFactory.register_connector( - "ECSharedStorageConnector", - "vllm.distributed.ec_transfer.ec_connector.shared_storage_connector", - "ECSharedStorageConnector", + "ECExampleConnector", + "vllm.distributed.ec_transfer.ec_connector.example_connector", + "ECExampleConnector", ) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 02f51a1dce112..02d9a1ec9599e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -144,9 +144,9 @@ class KVConnectorFactory: # only load the files corresponding to the current connector. KVConnectorFactory.register_connector( - "SharedStorageConnector", - "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector", - "SharedStorageConnector", + "ExampleConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.example_connector", + "ExampleConnector", ) KVConnectorFactory.register_connector( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py similarity index 98% rename from vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py rename to vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py index ed641cfc43ddd..41243fc866b59 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py @@ -65,7 +65,7 @@ class ReqMeta: @dataclass -class SharedStorageConnectorMetadata(KVConnectorMetadata): +class ExampleConnectorMetadata(KVConnectorMetadata): requests: list[ReqMeta] = field(default_factory=list) def add_request( @@ -81,7 +81,7 @@ class SharedStorageConnectorMetadata(KVConnectorMetadata): ) -class SharedStorageConnector(KVConnectorBase_V1): +class ExampleConnector(KVConnectorBase_V1): # NOTE: This is Simple debug implementation of the KV connector. # It save / load the KV cache to / from the disk. # It does extra work which will overwrite the existing prefix-cache in GPU @@ -157,7 +157,7 @@ class SharedStorageConnector(KVConnectorBase_V1): # Get the metadata metadata: KVConnectorMetadata = self._get_connector_metadata() - assert isinstance(metadata, SharedStorageConnectorMetadata) + assert isinstance(metadata, ExampleConnectorMetadata) if metadata is None: logger.warning( @@ -241,7 +241,7 @@ class SharedStorageConnector(KVConnectorBase_V1): return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...] connector_metadata = self._get_connector_metadata() - assert isinstance(connector_metadata, SharedStorageConnectorMetadata) + assert isinstance(connector_metadata, ExampleConnectorMetadata) for request in connector_metadata.requests: if request.is_store: filename = self._generate_filename_debug( @@ -315,7 +315,7 @@ class SharedStorageConnector(KVConnectorBase_V1): Args: scheduler_output (SchedulerOutput): the scheduler output object. """ - meta = SharedStorageConnectorMetadata() + meta = ExampleConnectorMetadata() total_need_load = 0 for new_req in scheduler_output.scheduled_new_reqs: From 4b03b502119f857bbead7064a29267b9ea8999e5 Mon Sep 17 00:00:00 2001 From: liangel-02 Date: Mon, 8 Dec 2025 23:46:35 -0500 Subject: [PATCH 095/133] update torchao safetensors impl (#30155) Signed-off-by: Angel Li --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0496b7a84507b..610e6a620ade2 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -641,7 +641,6 @@ def safetensors_weights_iterator( if safetensors_load_strategy == "eager": loading_desc += " (eager)" - state_dict = {} leftover_state_dict: dict[str, torch.Tensor] = {} for st_file in tqdm( @@ -667,6 +666,7 @@ def safetensors_weights_iterator( ) with safe_open(st_file, framework="pt") as f: + state_dict = {} for name in f.keys(): # noqa: SIM118 state_dict[name] = f.get_tensor(name) From e130845984b77b248436bdbe8f3afdf7b3107a62 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Tue, 9 Dec 2025 04:55:39 +0000 Subject: [PATCH 096/133] [CPU][CI] Enable fused MoE tests in Arm CI (#30132) Signed-off-by: Fadi Arafeh --- .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index b5f6b2494792f..9c6e7766b2ac4 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -40,7 +40,8 @@ function cpu_tests() { docker exec cpu-test bash -c " set -e pytest -x -v -s tests/kernels/test_onednn.py - pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py + pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic" # basic online serving docker exec cpu-test bash -c ' From c2e1987a6e794b745be4c060ae4bbd06864d8028 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Tue, 9 Dec 2025 13:16:44 +0800 Subject: [PATCH 097/133] [Doc] update Intel GPU MM status in Feature x Hardware matrix (#30294) Signed-off-by: Lin, Fanli --- docs/features/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/features/README.md b/docs/features/README.md index 684802301a44f..e9e5232929b72 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -68,8 +68,8 @@ th:not(:first-child) { | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) | | [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | -| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | +| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | From 58d5b3f51455706bf4f1f2360a0feb83d161147e Mon Sep 17 00:00:00 2001 From: Tsukasa OI Date: Tue, 9 Dec 2025 14:30:05 +0900 Subject: [PATCH 098/133] [Model][Quantization] Restore MoE + GGUF models support (incl. Qwen3 MoE) by allowing Sideload Parameters (#30116) Signed-off-by: Tsukasa OI Co-authored-by: Isotr0py --- .../layers/quantization/gguf.py | 1 + .../model_loader/gguf_loader.py | 24 ++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index bcdfafb50fc5a..ee819df292ed1 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -82,6 +82,7 @@ class GGUFConfig(QuantizationConfig): return UnquantizedEmbeddingMethod() return GGUFEmbeddingMethod(self) elif isinstance(layer, FusedMoE): + # TODO: Select UnquantizedFusedMoEMethod on unquantized layers. return GGUFMoEMethod(self, layer.moe_config) return None diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 74052f72ceab9..7f94bd234fd38 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -4,6 +4,7 @@ import os from collections.abc import Generator import gguf +import regex as re import torch import torch.nn as nn from huggingface_hub import hf_hub_download @@ -94,6 +95,7 @@ class GGUFModelLoader(BaseModelLoader): hasattr(config, "vision_config") and config.vision_config is not None ) gguf_to_hf_name_map = {} + sideload_params: list[re.Pattern] = [] # hack: ggufs have a different name than transformers if model_type == "cohere": model_type = "command-r" @@ -118,6 +120,12 @@ class GGUFModelLoader(BaseModelLoader): gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( f"model.layers.{idx}.mlp.experts.0.up_proj.weight" ) + sideload_params.append( + re.compile( + f"model\\.layers\\.{idx}" + r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight" + ) + ) if model_type in ("qwen2_moe", "qwen3_moe"): model_type = model_type.replace("_", "") # GGUF layer map assumes that we will have a merged expert weights @@ -132,6 +140,12 @@ class GGUFModelLoader(BaseModelLoader): gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( f"model.layers.{idx}.mlp.experts.0.up_proj.weight" ) + sideload_params.append( + re.compile( + f"model\\.layers\\.{idx}" + r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight" + ) + ) arch = None for key, value in gguf.MODEL_ARCH_NAMES.items(): @@ -241,7 +255,15 @@ class GGUFModelLoader(BaseModelLoader): # Parameter not in manual overrides either unmapped_params.append(hf_name) - # All parameters must be mapped: both vision/projector and backbone + # All parameters (except those initialized by other means) must be mapped: + # both vision/projector and backbone + if unmapped_params: + unmapped_params = list( + filter( + lambda x: not any(re.fullmatch(p, x) for p in sideload_params), + unmapped_params, + ) + ) if unmapped_params: raise RuntimeError( f"Failed to map GGUF parameters " From e4605d225e020154bc98efd15a05e11b83eaefb7 Mon Sep 17 00:00:00 2001 From: Yongtao Huang Date: Tue, 9 Dec 2025 14:50:06 +0800 Subject: [PATCH 099/133] [Misc] Fix safetensors import for safe_open (#30300) Signed-off-by: Yongtao Huang --- vllm/lora/lora_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py index db170f13ae1c7..f5e36697ed18c 100644 --- a/vllm/lora/lora_model.py +++ b/vllm/lora/lora_model.py @@ -3,7 +3,7 @@ import os -import safetensors.torch +import safetensors import torch from vllm.logger import init_logger From aed846917fee3b27df876d21ade4d3a30d4de402 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 9 Dec 2025 02:24:01 -0500 Subject: [PATCH 100/133] [Attention] Make `split_decodes_and_prefills(..., require_uniform=True)` support padding (#29644) Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Co-authored-by: Benjamin Chislett --- .../v1/attention/test_attention_splitting.py | 25 ++++++++++++++++++- vllm/v1/attention/backends/utils.py | 10 +++++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py index f60861e3489d6..f08e2f480e30f 100644 --- a/tests/v1/attention/test_attention_splitting.py +++ b/tests/v1/attention/test_attention_splitting.py @@ -154,7 +154,10 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata): def apply_split_decodes_and_prefills( - query_lens: list[int], decode_threshold: int, require_uniform: bool + query_lens: list[int], + decode_threshold: int, + require_uniform: bool, + padded_num_tokens: int | None = None, ): """Helper function to apply split_decodes_and_prefills and return the results.""" @@ -165,6 +168,10 @@ def apply_split_decodes_and_prefills( block_size=16, device=device, ) + + if padded_num_tokens is not None: + common_metadata.num_actual_tokens = padded_num_tokens + return split_decodes_and_prefills( common_metadata, decode_threshold=decode_threshold, @@ -271,6 +278,22 @@ def test_split_decodes_and_prefills_uniform_mixed_batch_non_uniform_decodes(): assert num_prefill_tokens == (sum(query_lens) - 2) # rest of the tokens +def test_split_decodes_and_prefills_uniform_padded_batch_all_same(): + """uniform batch where all query lengths are identical with 0 length padded reqs.""" + # All query lengths are 2, with decode_threshold=3 (so 2 <= 3) + # This triggers the padded uniform path at line 891 + query_lens = [2, 2, 2, 0] + padded_num_tokens = 8 + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + apply_split_decodes_and_prefills(query_lens, 3, True, padded_num_tokens) + ) + # With uniform batch, all requests are treated as decodes + assert num_decodes == 4 + assert num_prefills == 0 + assert num_decode_tokens == padded_num_tokens + assert num_prefill_tokens == 0 + + @pytest.mark.parametrize( "seq_lens,query_lens,split_point,expected_first_reqs,expected_second_reqs", [ diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 8edfbb5140bc9..5200bc48b3156 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -883,11 +883,15 @@ def split_decodes_and_prefills( return 0, num_reqs, 0, num_tokens if require_uniform: + # check if we are in a padded uniform batch; this is used for full-CGs, some + # requests may have a query length of 0 but since they are padding its fine + # to treat them as decodes (ensures num_decodes matches the captured size) + if torch.all((query_lens == query_lens[0]) | (query_lens == 0)): + assert num_reqs * query_lens[0] == num_tokens, "tokens not padded correctly" + return num_reqs, 0, num_tokens, 0 # all decodes is_prefill = query_lens != query_lens[0] else: - # 0-query len indicates a padded request; leave this at the back - # of the batch with the prefills - is_prefill = (query_lens > decode_threshold) | (query_lens == 0) + is_prefill = query_lens > decode_threshold if not torch.any(is_prefill): return num_reqs, 0, num_tokens, 0 From aeb82b1930454498fccc7e91f7c4e0f360cf658a Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Tue, 9 Dec 2025 01:33:34 -0600 Subject: [PATCH 101/133] [CI] Fix Flaky test_eagle_max_len Test (#30306) Signed-off-by: Micah Williamson --- tests/v1/spec_decode/test_max_len.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 81da8609aa6cf..15a6bd2659ea9 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -82,7 +82,7 @@ def test_eagle_max_len( len(o.prompt_token_ids) < 80 < len(o.prompt_token_ids) + len(o.outputs[0].token_ids) - < 200 + <= 200 ), ( "This test is only meaningful if the output " "is longer than the eagle max length" From 9c32df6101b24040c824dee5b0e5543a94dc6f45 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 9 Dec 2025 16:04:02 +0800 Subject: [PATCH 102/133] [Bugfix] Qwen 3 VL Embedding loading (#30303) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/models/adapters.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 70f203b9f7c64..9ba76f312edac 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -175,9 +175,14 @@ def _create_pooling_model_cls(orig_cls: _T) -> _T: self.vllm_config = vllm_config # These are not used in pooling models - for attr in ("lm_head", "logits_processor"): - if hasattr(self, attr): - delattr(self, attr) + objects_to_clean = [self] + if language_model := getattr(self, "language_model", None): + objects_to_clean.append(language_model) + + for obj in objects_to_clean: + for attr in ("lm_head", "logits_processor"): + if hasattr(obj, attr): + delattr(obj, attr) # If the model already defines a pooler instance, don't overwrite it if not getattr(self, "pooler", None): From 67475a6e81abea915857f82e6f10d80b03b842c9 Mon Sep 17 00:00:00 2001 From: Jaya Yuan Date: Tue, 9 Dec 2025 16:22:14 +0800 Subject: [PATCH 103/133] [DCP][Bugfix][CI] Fix accuracy issue of DCP when using FLASH_ATTN_MLA (#30309) Signed-off-by: FENP --- tests/distributed/test_context_parallel.py | 5 ++++- vllm/v1/attention/backends/mla/flashattn_mla.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 3cb533dccd62c..aa47f28a34dd5 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -123,8 +123,11 @@ class CPTestSettings: CP_TEXT_GENERATION_MODELS = { "deepseek-ai/DeepSeek-V2-Lite-Chat": [ + CPTestSettings.detailed(dcp_multipliers=[1]), CPTestSettings.detailed( - dcp_multipliers=[0.5, 1], cp_kv_cache_interleave_size=64 + dcp_multipliers=[0.5], + cp_kv_cache_interleave_size=64, + attn_backend="FLASHMLA", ), ], "Qwen/Qwen2.5-1.5B-Instruct": [ diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index eccf4ec791095..b28814aceada9 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -105,13 +105,14 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] vllm_config: VllmConfig, device: torch.device, ): + interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size super().__init__( kv_cache_spec, layer_names, vllm_config, device, FlashAttnMLAMetadata, - supports_dcp_with_varlen=True, + supports_dcp_with_varlen=(interleave_size == 1), ) self.max_num_splits = 0 # No upper bound on the number of splits. self.fa_aot_schedule = get_flash_attn_version() == 3 From c72ea1072313b30fc8b4701697520fdd0b56fc35 Mon Sep 17 00:00:00 2001 From: Hubert de La Jonquiere Date: Tue, 9 Dec 2025 11:54:08 +0100 Subject: [PATCH 104/133] [Structured Output][Reasoning] Improves decoding throughput for models using single-token reasoning endings. (#30056) --- docs/features/reasoning_outputs.md | 3 ++ .../test_base_thinking_reasoning_parser.py | 35 +++++++++++++++++++ .../test_deepseekv3_reasoning_parser.py | 1 + .../test_reasoning_structured_output.py | 1 + vllm/reasoning/abs_reasoning_parsers.py | 25 +++++++++++++ vllm/reasoning/basic_parsers.py | 6 ++++ .../reasoning/deepseek_v3_reasoning_parser.py | 5 +++ vllm/reasoning/holo2_reasoning_parser.py | 5 +++ vllm/reasoning/identity_reasoning_parser.py | 5 +++ vllm/v1/structured_output/__init__.py | 4 ++- 10 files changed, 89 insertions(+), 1 deletion(-) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 3315c0949afca..93cca23856a9b 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner def is_reasoning_end(self, input_ids: list[int]) -> bool: return self.end_token_id in input_ids + + def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool: + return self.end_token_id in delta_token_ids ... ``` diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py index 34e9483de54b3..165e91a2c79f2 100644 --- a/tests/reasoning/test_base_thinking_reasoning_parser.py +++ b/tests/reasoning/test_base_thinking_reasoning_parser.py @@ -132,6 +132,41 @@ class TestBaseThinkingReasoningParserMethods: is False ) + def test_is_reasoning_end_streaming(self, test_tokenizer): + """Test the is_reasoning_end_streaming method.""" + parser = TestThinkingReasoningParser(test_tokenizer) + end_token_id = parser.end_token_id + start_token_id = parser.start_token_id + + assert ( + parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id]) + is True + ) + assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False + assert parser.is_reasoning_end_streaming([], []) is False + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id], [end_token_id] + ) + is True + ) + assert ( + parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False + ) + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id, 2, start_token_id, 2], + [2], + ) + is False + ) + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id, 2, 2], [2] + ) + is False + ) + def test_extract_content_ids(self, test_tokenizer): """Test the extract_content_ids method.""" parser = TestThinkingReasoningParser(test_tokenizer) diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py index 6e8f0e8dcc9b9..874fdef778110 100644 --- a/tests/reasoning/test_deepseekv3_reasoning_parser.py +++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py @@ -40,6 +40,7 @@ def test_identity_reasoning_parser_basic(tokenizer): input_tokens = tokenizer.tokenize(input_text) input_ids = tokenizer.convert_tokens_to_ids(input_tokens) assert parser.is_reasoning_end(input_ids) is True + assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True # Test extract_content_ids returns all input_ids assert parser.extract_content_ids(input_ids) == input_ids diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 70047a993c3f9..ba52af3ad604d 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -70,6 +70,7 @@ class TestReasoningStructuredOutput: request.use_structured_output = True request.prompt_token_ids = [1, 2, 3, 4, 5] request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8] + request.num_computed_tokens = 5 return request def test_should_fill_bitmask_with_enable_in_reasoning( diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index d0661d1f23b06..bf593ca4e52a0 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -63,6 +63,31 @@ class ReasoningParser: True if the reasoning content ends in the input_ids. """ + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + """ + Check if the reasoning content ends in the input_ids on a + decode step. + + It is used in structured engines like `xgrammar` to check if the + reasoning content ends in the model output during a decode step. + `input_ids` the entire model output and `delta_ids` are the last few + computed tokens of the model output (like during a decode step). + + Parameters: + input_ids: list[int] + The entire model output. + delta_ids: list[int] + The last few computed tokens of the model output at the current decode step. + + Returns: + bool + True if the reasoning content ends in the `delta_ids` on a + decode step. + """ + return self.is_reasoning_end(input_ids) + @abstractmethod def extract_content_ids(self, input_ids: list[int]) -> list[int]: """ diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py index e78ac4a5ebb37..43067ca4afe05 100644 --- a/vllm/reasoning/basic_parsers.py +++ b/vllm/reasoning/basic_parsers.py @@ -74,6 +74,12 @@ class BaseThinkingReasoningParser(ReasoningParser): return True return False + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + end_token_id = self.end_token_id + return end_token_id in delta_ids + def extract_content_ids(self, input_ids: list[int]) -> list[int]: """ Extract the content after the end tokens diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py index afdf73262aca0..6604f70badbcf 100644 --- a/vllm/reasoning/deepseek_v3_reasoning_parser.py +++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py @@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser): def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: return self._parser.is_reasoning_end(input_ids) + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return self._parser.is_reasoning_end_streaming(input_ids, delta_ids) + def extract_content_ids(self, input_ids: list[int]) -> list[int]: return self._parser.extract_content_ids(input_ids) diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py index 76de1c077c88b..f80190d28d6aa 100644 --- a/vllm/reasoning/holo2_reasoning_parser.py +++ b/vllm/reasoning/holo2_reasoning_parser.py @@ -56,6 +56,11 @@ class Holo2ReasoningParser(ReasoningParser): def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: return self._parser.is_reasoning_end(input_ids) + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return self._parser.is_reasoning_end_streaming(input_ids, delta_ids) + def extract_content_ids(self, input_ids: list[int]) -> list[int]: return self._parser.extract_content_ids(input_ids) diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py index e92f8add0391a..e998e071efcf6 100644 --- a/vllm/reasoning/identity_reasoning_parser.py +++ b/vllm/reasoning/identity_reasoning_parser.py @@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser): # Always return True, since we never treat reasoning specially return True + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return True + def extract_content_ids(self, input_ids: list[int]) -> list[int]: # Identity: return all tokens as content return input_ids diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 5ee88178cdf60..4dd478804049b 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -339,7 +339,9 @@ class StructuredOutputManager: return True # Check if reasoning ends in *this* step - if self.reasoner.is_reasoning_end(request.all_token_ids): + if self.reasoner.is_reasoning_end_streaming( + request.all_token_ids, request.all_token_ids[request.num_computed_tokens :] + ): # Reasoning just ended, so we shouldn't advance til # next pass structured_req.reasoning_ended = True From 03416eada6c01770fb71c3d988fc3c74958d8f5e Mon Sep 17 00:00:00 2001 From: haoyangli-amd Date: Tue, 9 Dec 2025 19:28:50 +0800 Subject: [PATCH 105/133] [bugfix][quantization] Fix fp8 per_tensor scale shape (#30257) Signed-off-by: Haoyang Li --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 6bbfe11b6e925..6d862c5812560 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1726,7 +1726,7 @@ def scaled_fp8_quant( output, input, scale, scale_ub ) else: - scale = torch.empty((1, 1), device=input.device, dtype=torch.float32) + scale = torch.empty(1, device=input.device, dtype=torch.float32) torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) else: assert scale.numel() == 1, f"{scale.shape}" From 1166c31cc78073378a16509fbbbed4cb4f040a4d Mon Sep 17 00:00:00 2001 From: Dongjie Zou <85092850+baonudesifeizhai@users.noreply.github.com> Date: Tue, 9 Dec 2025 07:20:21 -0500 Subject: [PATCH 106/133] [Bugfix]: Fix glm46 awq marlin moe wna16 compatibility (#30210) Signed-off-by: baonudesifeizhai --- .../layers/fused_moe/fused_moe.py | 45 +++++++++++++++++++ .../layers/quantization/moe_wna16.py | 9 ++-- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f3c158ee2f9dc..0b83a3f5c4803 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -895,6 +895,48 @@ def get_moe_configs( return None +def _ensure_block_size_k_divisible( + size_k: int, block_size_k: int, group_size: int +) -> int: + """Ensure block_size_k is a divisor of size_k and divisible by group_size. + + This ensures BLOCK_SIZE_K compatibility with MoeWNA16 CUDA kernel which + requires size_k % BLOCK_SIZE_K == 0 and BLOCK_SIZE_K % group_size == 0. + + Args: + size_k: The size_k dimension that must be divisible by result. + block_size_k: Preferred block size (will be adjusted if needed). + group_size: The result must be divisible by this. + + Returns: + A valid BLOCK_SIZE_K that divides size_k and is divisible by group_size. + """ + # Fast path: already valid + if size_k % block_size_k == 0 and block_size_k % group_size == 0: + return block_size_k + + # Find the largest value that: + # 1. Divides size_k (size_k % candidate == 0) + # 2. Is divisible by group_size (candidate % group_size == 0) + # 3. Is <= block_size_k (prefer smaller values close to block_size_k) + # + # Strategy: Search from min(block_size_k, size_k) down to group_size, + # stepping by group_size to ensure divisibility by group_size + max_search = min(block_size_k, size_k) + start = (max_search // group_size) * group_size + for candidate in range(start, group_size - 1, -group_size): + if size_k % candidate == 0: + return candidate + + # Fallback: if group_size divides size_k, use it + # This should always be true with correct group_size configuration + if size_k % group_size == 0: + return group_size + + # This should not happen with correct group_size, but ensure divisibility + return size_k + + def get_moe_wna16_block_config( config: dict[str, int], use_moe_wna16_cuda: bool, @@ -960,6 +1002,9 @@ def get_moe_wna16_block_config( # at the same time. block_size_n = 1024 + # Ensure BLOCK_SIZE_K is a divisor of size_k for CUDA kernel compatibility + block_size_k = _ensure_block_size_k_divisible(size_k, block_size_k, group_size) + return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k} diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index cf348290a2716..8570b8c339987 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -60,7 +60,7 @@ class MoeWNA16Config(QuantizationConfig): if self.linear_quant_method == "gptq": self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config) - elif self.linear_quant_method == "awq": + elif self.linear_quant_method in ("awq", "awq_marlin"): capability_tuple = current_platform.get_device_capability() device_capability = ( -1 if capability_tuple is None else capability_tuple.to_int() @@ -107,7 +107,7 @@ class MoeWNA16Config(QuantizationConfig): if linear_quant_method == "gptq": has_zp = not cls.get_from_keys(config, ["sym"]) modules_to_not_convert = [] - elif linear_quant_method == "awq": + elif linear_quant_method in ("awq", "awq_marlin"): has_zp = cls.get_from_keys(config, ["zero_point"]) modules_to_not_convert = cls.get_from_keys_or( config, ["modules_to_not_convert"], None @@ -184,7 +184,7 @@ class MoeWNA16Config(QuantizationConfig): return GPTQConfig.from_config(self.full_config).get_quant_method( layer, prefix ) - elif self.linear_quant_method == "awq": + elif self.linear_quant_method in ("awq", "awq_marlin"): if self.use_marlin and check_marlin_supports_layer( layer, self.group_size ): @@ -468,7 +468,8 @@ class MoeWNA16Method(FusedMoEMethodBase): shard_size = layer.intermediate_size_per_partition # convert gptq and awq weight to a standard format - if layer.quant_config.linear_quant_method == "awq": + # awq_marlin uses the same weight format as awq + if layer.quant_config.linear_quant_method in ("awq", "awq_marlin"): assert layer.quant_config.weight_bits == 4 if "weight" in weight_name: loaded_weight = convert_awq_tensor(loaded_weight, "qweight") From ee14644ba9a3696c83ede2c948b73ebc3e1ffb33 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 9 Dec 2025 22:27:37 +0800 Subject: [PATCH 107/133] [ROCm] Aiter Quant Kernels (#25552) Signed-off-by: vllmellm --- vllm/_aiter_ops.py | 87 +++++++++++++++++++ .../layers/quantization/input_quant_fp8.py | 31 +++++++ vllm/platforms/rocm.py | 7 +- 3 files changed, 123 insertions(+), 2 deletions(-) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 35920d826578e..94bbc9b00225e 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -9,6 +9,8 @@ import vllm.envs as envs from vllm.platforms import current_platform from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer +_FP8_DTYPE = current_platform.fp8_dtype() + def is_aiter_found() -> bool: from importlib.util import find_spec @@ -467,6 +469,59 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( return torch.empty_like(x), torch.empty_like(residual) +def _rocm_aiter_per_tensor_quant_impl( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.quant import per_tensor_quant_hip + + return per_tensor_quant_hip(x, scale, quant_dtype) + + +def _rocm_aiter_per_tensor_quant_fake( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + return torch.empty_like(x, dtype=quant_dtype), torch.empty( + 1, dtype=torch.float32, device=x.device + ) + + +def _rocm_aiter_per_token_quant_impl( + x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.quant import dynamic_per_token_scaled_quant + + assert quant_dtype in [torch.int8, _FP8_DTYPE] + + out_shape = x.shape + out = torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device) + if scale is None: + scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device) + dynamic_per_token_scaled_quant( + out, + x, + scale, + scale_ub=None, + shuffle_scale=False, + num_rows=None, + num_rows_factor=1, + ) + return out, scale + + +def _rocm_aiter_per_token_quant_fake( + x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + out_shape = x.shape + return ( + torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device), + torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device), + ) + + # Global flag to ensure ops are registered only once _OPS_REGISTERED = False @@ -665,6 +720,22 @@ class rocm_aiter_ops: dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_per_tensor_quant", + op_func=_rocm_aiter_per_tensor_quant_impl, + mutates_args=[], + fake_impl=_rocm_aiter_per_tensor_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_per_token_quant", + op_func=_rocm_aiter_per_token_quant_impl, + mutates_args=["scale"], + fake_impl=_rocm_aiter_per_token_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + _OPS_REGISTERED = True @staticmethod @@ -859,6 +930,22 @@ class rocm_aiter_ops: kv_scale=kv_scale, ) + @staticmethod + def per_tensor_quant( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops.vllm.rocm_aiter_per_tensor_quant(x, quant_dtype, scale) + + @staticmethod + def per_token_quant( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops.vllm.rocm_aiter_per_token_quant(x, quant_dtype, scale) + @staticmethod def triton_fp4_gemm_dynamic_qaunt( x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index 7ded8eea79060..a5db086fb4729 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -5,6 +5,7 @@ import torch import torch.nn.functional as F from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.platforms import current_platform @@ -45,10 +46,13 @@ class QuantFP8(CustomOp): super().__init__() self.static = static self.group_shape = group_shape + self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN self.num_token_padding = num_token_padding self.column_major_scales = column_major_scales self.use_ue8m0 = use_ue8m0 + self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled() + self.is_group_quant = group_shape.is_per_group() if self.is_group_quant: assert not static, "Group quantization only supports dynamic mode" @@ -92,6 +96,33 @@ class QuantFP8(CustomOp): use_per_token_if_dynamic=self.use_per_token_if_dynamic, ) + def forward_hip( + self, + x: torch.Tensor, + scale: torch.Tensor | None = None, + scale_ub: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + use_aiter_quant = ( + not self.is_group_quant + and self.use_aiter + and scale_ub is None + and x.is_contiguous() + ) + use_aiter_per_tensor_quant = ( + use_aiter_quant and self.group_shape == GroupShape.PER_TENSOR + ) + use_aiter_per_token_quant = ( + use_aiter_quant and self.group_shape == GroupShape.PER_TOKEN + ) + + if use_aiter_per_tensor_quant: + return rocm_aiter_ops.per_tensor_quant(x, _FP8_DTYPE, scale) + if use_aiter_per_token_quant: + return rocm_aiter_ops.per_token_quant(x, _FP8_DTYPE, scale) + + # Fallback to CUDA implementation + return self.forward_cuda(x, scale, scale_ub) + def forward_native( self, x: torch.Tensor, diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index ff0fc78517876..f7adecbd88746 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -381,6 +381,8 @@ class RocmPlatform(Platform): compilation_config = vllm_config.compilation_config parallel_config = vllm_config.parallel_config is_eager_execution = compilation_config == CUDAGraphMode.NONE + use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() + use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled() if compilation_config.cudagraph_mode.has_full_cudagraphs(): # decode context parallel does not support full cudagraphs @@ -400,8 +402,6 @@ class RocmPlatform(Platform): ) compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() - if cache_config and cache_config.block_size is None: cache_config.block_size = 16 @@ -415,6 +415,9 @@ class RocmPlatform(Platform): ): compilation_config.custom_ops.append("+rms_norm") + if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops: + compilation_config.custom_ops.append("+quant_fp8") + @classmethod def verify_model_arch(cls, model_arch: str) -> None: if model_arch in _ROCM_UNSUPPORTED_MODELS: From 5c213d2899f5a2d439c8d771a0abc156a5412a2b Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:55:38 +0100 Subject: [PATCH 108/133] [BUGFIX] Mistral tool call parser v11+ (#30332) Signed-off-by: juliendenize --- tests/tool_use/test_mistral_tool_parser.py | 16 ++++++++++ .../tool_parsers/mistral_tool_parser.py | 32 ++++++++----------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/tests/tool_use/test_mistral_tool_parser.py b/tests/tool_use/test_mistral_tool_parser.py index e5deb7f40eb35..2dd0399cb8eeb 100644 --- a/tests/tool_use/test_mistral_tool_parser.py +++ b/tests/tool_use/test_mistral_tool_parser.py @@ -615,6 +615,7 @@ def test_extract_tool_calls_streaming( "single_tool_weather", "multiple_tool_calls", "content_before_tool", + "complex", ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -673,6 +674,21 @@ def test_extract_tool_calls_streaming( ], "bla", ), + ( + # Complex + """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 + [ + ToolCall( + function=FunctionCall( + name="bash", + arguments=json.dumps( + {"command": "print(\"hello world!\")\nre.compile(r'{}')"} + ), + ) + ) + ], + "", + ), ], ) def test_extract_tool_calls_streaming_one_chunk( diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index aa5089ffe84d7..bc827f045606c 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -99,12 +99,7 @@ class MistralToolParser(ToolParser): self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) - if not _is_pre_v11_tokeniser(self.model_tokenizer): - self.fn_name_regex = re.compile( - r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL - ) - else: - self.fn_name_regex = None + self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer) if self.bot_token_id is None: raise RuntimeError( @@ -148,23 +143,24 @@ class MistralToolParser(ToolParser): tool_content = model_output.replace(self.bot_token, "").strip() try: - # we first try to directly load the json as parsing very nested - # jsons is difficult try: - if self.fn_name_regex: + if not self._is_pre_v11: function_call_arr = [] for single_tool_content in model_output.split(self.bot_token): - matches = self.fn_name_regex.findall(single_tool_content) + if "{" not in single_tool_content: + continue - for match in matches: - fn_name = match[0] - args = match[1] + end_name = single_tool_content.find("{") + fn_name, args = ( + single_tool_content[:end_name], + single_tool_content[end_name:], + ) - # fn_name is encoded outside serialized json dump - # only arguments are serialized - function_call_arr.append( - {"name": fn_name, "arguments": json.loads(args)} - ) + # fn_name is encoded outside serialized json dump + # only arguments are serialized + function_call_arr.append( + {"name": fn_name, "arguments": json.loads(args)} + ) else: function_call_arr = json.loads(tool_content) except json.JSONDecodeError: From 5dcd593baf9ceaad68a0e5e16f20f6ccccd10797 Mon Sep 17 00:00:00 2001 From: quanliu <18646313696@163.com> Date: Tue, 9 Dec 2025 23:01:38 +0800 Subject: [PATCH 109/133] [Feature] Batch-Invariant Support for FA2 and LoRA (#30018) Signed-off-by: quanliu <18646313696@163.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- tests/v1/determinism/test_batch_invariance.py | 10 ++++++++++ tests/v1/determinism/utils.py | 10 ++++++++-- vllm/model_executor/layers/batch_invariant.py | 6 +++++- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index fc953a66f0820..1c45e7fe366ff 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -10,6 +10,7 @@ from utils import ( BACKENDS, _extract_step_logprobs, _random_prompt, + is_device_capability_below_90, resolve_model_name, skip_unsupported, ) @@ -17,6 +18,8 @@ from utils import ( import vllm.model_executor.layers.batch_invariant as batch_invariant from vllm import LLM, SamplingParams +IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90() + @skip_unsupported @pytest.mark.timeout(1000) @@ -190,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( max_model_len=8192, dtype="bfloat16", # not everything is supported gpu_memory_utilization=0.9, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # Use more realistic prompts for better token generation @@ -393,6 +397,8 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): gpu_memory_utilization=0.9, max_model_len=2048, dtype="bfloat16", + enable_prefix_caching=False, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) prompt = "the capital of france is" @@ -459,6 +465,7 @@ def test_logprobs_without_batch_invariance_should_fail( max_num_seqs=32, max_model_len=8192, dtype="bfloat16", + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # build ragged prompts to change shapes significantly across BS=1 vs BS=N @@ -682,6 +689,7 @@ def test_decode_logprobs_match_prefill_logprobs( max_num_seqs=32, max_model_len=8192, dtype="bfloat16", + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # Use a few test prompts @@ -925,6 +933,8 @@ def LLM_with_max_seqs( max_model_len=max_model_len, dtype="bfloat16", tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), + enable_prefix_caching=False, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, # Enable for MOE models # enable_expert_parallel=True, ) diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index 6aab50cf84ab6..a8013ed229cfc 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -11,8 +11,10 @@ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer skip_unsupported = pytest.mark.skipif( - not (current_platform.is_cuda() and current_platform.has_device_capability(90)), - reason="Requires CUDA and >= Hopper (SM90)", + not (current_platform.is_cuda() and current_platform.has_device_capability(80)), + # Supports testing on Ampere and Ada Lovelace devices. + # Note: For devices with SM < 90, batch invariance does not support CUDA Graphs. + reason="Requires CUDA and >= Ampere (SM80)", ) BACKENDS: list[str] = [ @@ -97,3 +99,7 @@ def _extract_step_logprobs(request_output): return t, inner.token_ids return None, None + + +def is_device_capability_below_90() -> bool: + return not current_platform.has_device_capability(90) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 4cab47f4192a2..b14e7dad77f9a 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -935,7 +935,11 @@ def enable_batch_invariant_mode(): # Batch invariant matmuls are no longer needed after cublas overrides if not is_torch_equal_or_newer("2.10.0.dev"): - if current_platform.is_device_capability(100): + if ( + current_platform.is_device_capability(100) + or current_platform.is_device_capability(80) + or current_platform.is_device_capability(89) + ): # For PyTorch 2.9, B200 uses GEMV for bs=1 # Requires https://github.com/pytorch/pytorch/pull/166735 _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA") From 56037dfa2fc10a5f28e919d2c26e4c2abbd313a6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 9 Dec 2025 10:36:12 -0500 Subject: [PATCH 110/133] [BugFix] Fix `assert batch_descriptor.num_tokens == num_tokens_padded` (#30173) Signed-off-by: Lucas Wilkinson --- tests/v1/cudagraph/test_cudagraph_dispatch.py | 4 +- vllm/forward_context.py | 2 +- vllm/v1/cudagraph_dispatcher.py | 4 +- vllm/v1/spec_decode/eagle.py | 2 +- vllm/v1/worker/dp_utils.py | 49 ++++++++++++++----- vllm/v1/worker/gpu_model_runner.py | 37 ++++++++------ 6 files changed, 65 insertions(+), 33 deletions(-) diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index b86534d3d4381..0e71d6c63ce68 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -161,10 +161,10 @@ class TestCudagraphDispatcher: assert rt_mode == CUDAGraphMode.NONE assert key == BatchDescriptor(num_tokens=15) - # 4. Cascade attention should have a fall back mode + # 4. disable_full should have a fall back mode (e.g., cascade attention) desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False) rt_mode, key = dispatcher.dispatch( - num_tokens=8, uniform_decode=False, has_lora=False, use_cascade_attn=True + num_tokens=8, uniform_decode=False, has_lora=False, disable_full=True ) if "PIECEWISE" in cudagraph_mode_str: # string contains check assert rt_mode == CUDAGraphMode.PIECEWISE diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 173d366267e87..033cc1f544b3b 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -292,7 +292,7 @@ def set_forward_context( if num_tokens_across_dp is None: assert ubatch_slices is None assert num_tokens is not None - _, num_tokens_across_dp = coordinate_batch_across_dp( + _, num_tokens_across_dp, _ = coordinate_batch_across_dp( num_tokens_unpadded=num_tokens, parallel_config=vllm_config.parallel_config, allow_microbatching=False, diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index ef0f8d9e67452..8a3500c0aac6b 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -145,7 +145,7 @@ class CudagraphDispatcher: num_tokens: int, uniform_decode: bool, has_lora: bool, - use_cascade_attn: bool = False, + disable_full: bool = False, ) -> tuple[CUDAGraphMode, BatchDescriptor]: """ Given conditions(e.g.,batch descriptor and if using cascade attention), @@ -165,7 +165,7 @@ class CudagraphDispatcher: ) relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() - if not use_cascade_attn: + if not disable_full: # check if key exists for full cudagraph if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]: return CUDAGraphMode.FULL, batch_desc diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 31428db2d3afc..9f7859a5c3565 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1258,7 +1258,7 @@ class EagleProposer: num_tokens_padded: int, ) -> tuple[int, torch.Tensor]: # TODO(Flechman): support DBO ubatching - should_ubatch, num_toks_across_dp = coordinate_batch_across_dp( + should_ubatch, num_toks_across_dp, _ = coordinate_batch_across_dp( num_tokens_unpadded=num_tokens_unpadded, parallel_config=self.vllm_config.parallel_config, allow_microbatching=False, diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 5da55d740c347..1b9646e1980a8 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -40,16 +40,18 @@ def _run_ar( should_dp_pad: bool, orig_num_tokens_per_ubatch: int, padded_num_tokens_per_ubatch: int, + cudagraph_mode: int, parallel_config: ParallelConfig, ) -> torch.Tensor: dp_size = parallel_config.data_parallel_size dp_rank = parallel_config.data_parallel_rank device, group = _get_device_and_group(parallel_config) - tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32) + tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32) tensor[0][dp_rank] = orig_num_tokens_per_ubatch tensor[1][dp_rank] = padded_num_tokens_per_ubatch tensor[2][dp_rank] = 1 if should_ubatch else 0 tensor[3][dp_rank] = 1 if should_dp_pad else 0 + tensor[4][dp_rank] = cudagraph_mode dist.all_reduce(tensor, group=group) return tensor @@ -89,13 +91,23 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch return num_tokens_across_dp.cpu() +def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int: + """ + Synchronize cudagraph_mode across DP ranks by taking the minimum. + If any rank has NONE (0), all ranks use NONE. + This ensures all ranks send consistent values (all padded or all unpadded). + """ + return int(tensor[4, :].min().item()) + + def _synchronize_dp_ranks( num_tokens_unpadded: int, num_tokens_padded: int, should_attempt_ubatching: bool, should_attempt_dp_padding: bool, + cudagraph_mode: int, parallel_config: ParallelConfig, -) -> tuple[bool, torch.Tensor | None]: +) -> tuple[bool, torch.Tensor | None, int]: """ 1. Decides if each DP rank is going to microbatch. Either all ranks run with microbatching or none of them do. @@ -104,10 +116,13 @@ def _synchronize_dp_ranks( When running microbatched or if should_attempt_dp_padding is True, all ranks will be padded out so that the run with the same number of tokens + 3. Synchronizes cudagraph_mode across ranks by taking the minimum. + Returns: tuple[ should_ubatch: Are all DP ranks going to microbatch num_tokens_after_padding: A tensor containing the total number of tokens per-microbatch for each DP rank including any DP padding. + synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks) ] """ @@ -121,6 +136,7 @@ def _synchronize_dp_ranks( should_dp_pad=should_attempt_dp_padding, orig_num_tokens_per_ubatch=num_tokens_unpadded, padded_num_tokens_per_ubatch=num_tokens_padded, + cudagraph_mode=cudagraph_mode, parallel_config=parallel_config, ) @@ -148,7 +164,10 @@ def _synchronize_dp_ranks( should_dp_pad, ) - return should_ubatch, num_tokens_after_padding + # Synchronize cudagraph_mode across ranks (take min) + synced_cudagraph_mode = _post_process_cudagraph_mode(tensor) + + return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode def coordinate_batch_across_dp( @@ -159,7 +178,8 @@ def coordinate_batch_across_dp( num_tokens_padded: int | None = None, uniform_decode: bool | None = None, num_scheduled_tokens_per_request: np.ndarray | None = None, -) -> tuple[bool, torch.Tensor | None]: + cudagraph_mode: int = 0, +) -> tuple[bool, torch.Tensor | None, int]: """ Coordinates amongst all DP ranks to determine if and how the full batch should be split into microbatches. @@ -175,6 +195,7 @@ def coordinate_batch_across_dp( only contains single token decodes num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The number of tokens per request. + cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL) Returns: tuple[ ubatch_slices: if this is set then all DP ranks have agreed to @@ -183,12 +204,13 @@ def coordinate_batch_across_dp( tokens per-microbatch for each DP rank including padding. Will be padded up to the max value across all DP ranks when allow_dp_padding is True. + synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks) ] """ if parallel_config.data_parallel_size == 1: # Early exit. - return False, None + return False, None, cudagraph_mode # If the caller has explicitly enabled microbatching. should_attempt_ubatching = False @@ -204,12 +226,15 @@ def coordinate_batch_across_dp( if num_tokens_padded is None: num_tokens_padded = num_tokens_unpadded - (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks( - num_tokens_unpadded, - num_tokens_padded, - should_attempt_ubatching, - allow_dp_padding, - parallel_config, + (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) = ( + _synchronize_dp_ranks( + num_tokens_unpadded, + num_tokens_padded, + should_attempt_ubatching, + allow_dp_padding, + cudagraph_mode, + parallel_config, + ) ) - return (should_ubatch, num_tokens_after_padding) + return (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 22a3f9d8d2dda..766c2acd0e1d8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2788,17 +2788,19 @@ class GPUModelRunner( ) dispatch_cudagraph = ( - lambda num_tokens: self.cudagraph_dispatcher.dispatch( + lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch( num_tokens=num_tokens, has_lora=has_lora, - use_cascade_attn=use_cascade_attn, uniform_decode=uniform_decode, + disable_full=disable_full, ) if not force_eager else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) ) - cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, use_cascade_attn + ) num_tokens_padded = batch_descriptor.num_tokens # Extra coordination when running data-parallel since we need to coordinate @@ -2813,23 +2815,28 @@ class GPUModelRunner( self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE ) - should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp( - num_tokens_unpadded=num_tokens, - parallel_config=self.parallel_config, - allow_microbatching=allow_microbatching, - allow_dp_padding=allow_dp_padding, - num_tokens_padded=num_tokens_padded, - uniform_decode=uniform_decode, - num_scheduled_tokens_per_request=num_scheduled_tokens_np, + should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = ( + coordinate_batch_across_dp( + num_tokens_unpadded=num_tokens, + parallel_config=self.parallel_config, + allow_microbatching=allow_microbatching, + allow_dp_padding=allow_dp_padding, + num_tokens_padded=num_tokens_padded, + uniform_decode=uniform_decode, + num_scheduled_tokens_per_request=num_scheduled_tokens_np, + cudagraph_mode=cudagraph_mode.value, + ) ) - # Extract DP padding if there is any + # Extract DP-synced values if num_tokens_across_dp is not None: dp_rank = self.parallel_config.data_parallel_rank num_tokens_padded = int(num_tokens_across_dp[dp_rank].item()) - - # Re-dispatch with DP padding - cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) + # Re-dispatch with DP padding so we have the correct batch_descriptor + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, + disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value, + ) # Assert to make sure the agreed upon token count is correct otherwise # num_tokens_across_dp will no-longer be valid assert batch_descriptor.num_tokens == num_tokens_padded From 83319b44c26af45de4753c74f55a07df8c637a25 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 9 Dec 2025 10:40:37 -0500 Subject: [PATCH 111/133] [Compile] Fix torch warning `TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled` (#29897) Signed-off-by: yewentao256 --- tests/v1/e2e/test_async_scheduling.py | 2 ++ vllm/envs.py | 9 +++++++++ vllm/v1/worker/gpu_worker.py | 4 ++++ 3 files changed, 15 insertions(+) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 945276376d665..838d05f0486c1 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -124,6 +124,8 @@ def run_tests( with monkeypatch.context() as m: # avoid precision errors m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") + # lock matmul precision to full FP32 + m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest") # m.setenv("VLLM_BATCH_INVARIANT", "1") outputs: list[tuple[str, list, list]] = [] for n, ( diff --git a/vllm/envs.py b/vllm/envs.py index 91d1b01076b11..bda9e6e423356 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -75,6 +75,7 @@ if TYPE_CHECKING: VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" + VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -452,6 +453,14 @@ environment_variables: dict[str, Callable[[], Any]] = { # Main CUDA version of vLLM. This follows PyTorch but can be overridden. "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.9", + # Controls PyTorch float32 matmul precision mode within vLLM workers. + # Valid options mirror torch.set_float32_matmul_precision + "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices( + "VLLM_FLOAT32_MATMUL_PRECISION", + "highest", + ["highest", "high", "medium"], + case_sensitive=False, + ), # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a46ec2bd118fe..24a3533a169f0 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -79,6 +79,10 @@ class Worker(WorkerBase): is_driver_worker=is_driver_worker, ) + # configure float32 matmul precision according to vLLM env. + precision = envs.VLLM_FLOAT32_MATMUL_PRECISION + torch.set_float32_matmul_precision(precision) + if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils.import_utils import init_cached_hf_modules From 804e3468c04b1a43c0019d2835dabc74b779c1fc Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Tue, 9 Dec 2025 11:31:30 -0600 Subject: [PATCH 112/133] Update AMD test definitions (2025-12-08) (#30298) Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 188 +++++++++++++++++++++++++++------------ 1 file changed, 130 insertions(+), 58 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 6950ad774edd8..4038d32834e68 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -398,7 +398,8 @@ steps: timeout_in_minutes: 25 gpu: h100 source_file_dependencies: - - vllm/ + - vllm/v1/attention + - vllm/model_executor/layers - tests/v1/determinism/ commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn @@ -440,23 +441,29 @@ steps: working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints + - vllm/multimodal - examples/ commands: - pip install tensorizer # for tensorizer test + # for basic + - python3 offline_inference/basic/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -718,6 +725,18 @@ steps: - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - label: OpenAI API correctness # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] @@ -727,7 +746,7 @@ steps: - csrc/ - vllm/entrypoints/openai/ - vllm/model_executor/models/whisper.py - commands: # LMEval + commands: # LMEval+Transcription WER check # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - pytest -s entrypoints/openai/correctness/ @@ -963,6 +982,19 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - label: Multi-Modal Models Test (Extended) 1 # 60min timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] @@ -1098,7 +1130,6 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - vllm/model_executor/layers/fused_moe/layer.py - tests/compile/test_fusion_attn.py - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py @@ -1132,12 +1163,25 @@ steps: - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - tests/compile/distributed/test_fusions_e2e.py - - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - pytest -v -s tests/compile/distributed/test_fusions_e2e.py +- label: Blackwell GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1155,6 +1199,16 @@ steps: commands: - pytest -s -v tests/quantization/test_blackwell_moe.py +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1397,6 +1451,39 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + gpu: a100 + optional: true + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + gpu: h100 + optional: true + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + + ##### H200 test ##### - label: Distributed Tests (H200) # optional mirror_hardwares: [amdexperimental] @@ -1440,29 +1527,6 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - -- label: Multi-Modal Accuracy Eval (Small Models) # 10min - timeout_in_minutes: 70 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - - label: LM Eval Large Models (4 Card) mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 @@ -1478,21 +1542,6 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 -- label: LM Eval Large Models (H100) # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - - label: ROCm LM Eval Large Models (8 Card) mirror_hardwares: [amdproduction] agent_pool: mi325_8 @@ -1517,6 +1566,20 @@ steps: - uv pip install --system 'gpt-oss[eval]==0.0.5' - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 +##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh - label: DeepSeek V2-Lite Accuracy mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 @@ -1550,17 +1613,26 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 -##### RL Integration Tests ##### -- label: Prime-RL Integration Test # 15min +- label: DeepSeek V2-Lite Async EPLB Accuracy + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 + agent_pool: mi325_4 # grade: Blocking - timeout_in_minutes: 30 + gpu: h100 optional: true - num_gpus: 2 + num_gpus: 4 working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh commands: - - bash .buildkite/scripts/run-prime-rl-test.sh + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 From 0b6a8a304cd8bab21383c4c2904064f6f6f2fd62 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Tue, 9 Dec 2025 18:57:55 +0100 Subject: [PATCH 113/133] [BugFix] Fix non detected failing tests (#30277) Signed-off-by: ilmarkov --- .buildkite/test-pipeline.yaml | 8 ++- .../fullgraph/test_multimodal_compile.py | 1 - tests/compile/test_compile_ranges.py | 6 ++ tests/compile/test_pass_manager.py | 65 ++++++++++--------- vllm/compilation/inductor_pass.py | 8 ++- vllm/compilation/piecewise_backend.py | 19 ++++++ 6 files changed, 73 insertions(+), 34 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0a99994e243ae..8fc3587f7813c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -468,7 +468,9 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -482,7 +484,9 @@ steps: # as it is a heavy test that is covered in other steps. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 diff --git a/tests/compile/fullgraph/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py index e2897b2271b8f..621f6a51a918f 100644 --- a/tests/compile/fullgraph/test_multimodal_compile.py +++ b/tests/compile/fullgraph/test_multimodal_compile.py @@ -17,7 +17,6 @@ def test_compile(): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") -@pytest.mark.xfail def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch): """Test that Qwen2.5-VL vision submodules are compiled. diff --git a/tests/compile/test_compile_ranges.py b/tests/compile/test_compile_ranges.py index d849a8617ebd2..14ae8233f1131 100644 --- a/tests/compile/test_compile_ranges.py +++ b/tests/compile/test_compile_ranges.py @@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache): vllm_config = VllmConfig( scheduler_config=SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, @@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges(): VllmConfig( scheduler_config=SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ), compilation_config=compilation_config, ) @@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache): ) scheduler_config = SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ) torch.set_default_device("cuda") diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 6d0ba6b655031..6ed77b0085f51 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -5,9 +5,14 @@ import copy import pytest import torch -from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.compilation.inductor_pass import ( + CallableInductorPass, + InductorPass, + pass_context, +) from vllm.compilation.pass_manager import PostGradPassManager from vllm.config import ModelConfig, VllmConfig +from vllm.config.utils import Range # dummy custom pass that doesn't inherit @@ -42,35 +47,37 @@ class ProperPass(InductorPass): ], ) def test_pass_manager_uuid(callable): - # Some passes need dtype to be set - config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) + # Set the pass context as PassManager uuid uses it + with pass_context(Range(start=1, end=8)): + # Some passes need dtype to be set + config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) - pass_manager = PostGradPassManager() - pass_manager.configure(config) + pass_manager = PostGradPassManager() + pass_manager.configure(config) - # Check that UUID is different if the same pass is added 2x - pass_manager.add(callable) - uuid1 = pass_manager.uuid() - pass_manager.add(callable) - uuid2 = pass_manager.uuid() - assert uuid1 != uuid2 + # Check that UUID is different if the same pass is added 2x + pass_manager.add(callable) + uuid1 = pass_manager.uuid() + pass_manager.add(callable) + uuid2 = pass_manager.uuid() + assert uuid1 != uuid2 - # UUID should be the same as the original one, - # as we constructed in the same way. - pass_manager2 = PostGradPassManager() - pass_manager2.configure(config) - pass_manager2.add(callable) - assert uuid1 == pass_manager2.uuid() + # UUID should be the same as the original one, + # as we constructed in the same way. + pass_manager2 = PostGradPassManager() + pass_manager2.configure(config) + pass_manager2.add(callable) + assert uuid1 == pass_manager2.uuid() - # UUID should be different due to config change - config2 = copy.deepcopy(config) - config2.compilation_config.pass_config.fuse_norm_quant = ( - not config2.compilation_config.pass_config.fuse_norm_quant - ) - config2.compilation_config.pass_config.fuse_act_quant = ( - not config2.compilation_config.pass_config.fuse_act_quant - ) - pass_manager3 = PostGradPassManager() - pass_manager3.configure(config2) - pass_manager3.add(callable) - assert uuid1 != pass_manager3.uuid() + # UUID should be different due to config change + config2 = copy.deepcopy(config) + config2.compilation_config.pass_config.fuse_norm_quant = ( + not config2.compilation_config.pass_config.fuse_norm_quant + ) + config2.compilation_config.pass_config.fuse_act_quant = ( + not config2.compilation_config.pass_config.fuse_act_quant + ) + pass_manager3 = PostGradPassManager() + pass_manager3.configure(config2) + pass_manager3.add(callable) + assert uuid1 != pass_manager3.uuid() diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 8159b817f637a..dbf154eeb86a4 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + import functools import hashlib import inspect @@ -8,15 +10,17 @@ import json import types from collections.abc import Callable from contextlib import contextmanager -from typing import Any +from typing import TYPE_CHECKING, Any import torch from torch import fx from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily -from vllm.config.utils import Range from vllm.utils.torch_utils import is_torch_equal_or_newer +if TYPE_CHECKING: + from vllm.config.utils import Range + if is_torch_equal_or_newer("2.6"): from torch._inductor.custom_graph_pass import CustomGraphPass else: diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 129b9b5deea31..a15c693767a51 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -53,8 +53,27 @@ class PiecewiseBackend: self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1 self.is_full_graph = total_piecewise_compiles == 1 + # TODO: we need to generalize encoder compilation to other models + self.is_encoder_compilation = vllm_backend.prefix in [ + "Qwen2_5_VisionPatchEmbed", + "Qwen2_5_VisionPatchMerger", + "Qwen2_5_VisionBlock", + ] self.compile_ranges = self.compilation_config.get_compile_ranges() + if self.is_encoder_compilation: + # For encoder compilation we use the max int32 value + # to set the upper bound of the compile ranges + max_int32 = 2**31 - 1 + last_compile_range = self.compile_ranges[-1] + assert ( + last_compile_range.end + == vllm_config.scheduler_config.max_num_batched_tokens + ) + self.compile_ranges[-1] = Range( + start=last_compile_range.start, end=max_int32 + ) + log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}" logger.debug_once(log_string) From 9e6562a3f625279fd7c8b9ac53c30fed3b01f5b9 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 9 Dec 2025 09:59:54 -0800 Subject: [PATCH 114/133] [Model Runner V2] Fix Triton warning on tl.where (#30355) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/sample/penalties.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py index c8d4b7d81841d..b4fcc822ecfce 100644 --- a/vllm/v1/worker/gpu/sample/penalties.py +++ b/vllm/v1/worker/gpu/sample/penalties.py @@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel( mask=packed_block < tl.cdiv(vocab_size, 32), ) prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1 + prompt_bin_mask = prompt_bin_mask.to(tl.int1) prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE) # If token appears in prompt or output, apply, otherwise use 1.0 for no-op. From d471b2aff09028f9c62e861f760a74fd8f99081d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 9 Dec 2025 10:00:49 -0800 Subject: [PATCH 115/133] [Model Runner V2] Support num NaNs in logits (#30187) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/async_utils.py | 41 +++++++++++++------------ vllm/v1/worker/gpu/metrics/__init__.py | 0 vllm/v1/worker/gpu/metrics/logits.py | 42 ++++++++++++++++++++++++++ vllm/v1/worker/gpu/model_runner.py | 2 +- vllm/v1/worker/gpu/sample/min_p.py | 4 +-- vllm/v1/worker/gpu/sample/output.py | 14 +++++++++ vllm/v1/worker/gpu/sample/sampler.py | 12 ++++++-- 7 files changed, 89 insertions(+), 26 deletions(-) create mode 100644 vllm/v1/worker/gpu/metrics/__init__.py create mode 100644 vllm/v1/worker/gpu/metrics/logits.py create mode 100644 vllm/v1/worker/gpu/sample/output.py diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py index f6bc607c1ae67..a2e3decad0486 100644 --- a/vllm/v1/worker/gpu/async_utils.py +++ b/vllm/v1/worker/gpu/async_utils.py @@ -2,14 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager +import numpy as np import torch from vllm.v1.outputs import ( AsyncModelRunnerOutput, LogprobsTensors, ModelRunnerOutput, - SamplerOutput, ) +from vllm.v1.worker.gpu.sample.output import SamplerOutput class AsyncOutput(AsyncModelRunnerOutput): @@ -34,29 +35,18 @@ class AsyncOutput(AsyncModelRunnerOutput): with torch.cuda.stream(self.copy_stream): self.copy_stream.wait_stream(default_stream) - # NOTE(woosuk): We must ensure that CPU tensors are not freed - # before the device-to-host copy is fully completed. For instance, - # operations like - # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy() - # are unsafe because the underlying CPU tensor can be prematurely freed and - # reused by other tensors before the asynchronous copy finishes, potentially - # causing race conditions. To prevent this, we delay freeing by holding - # references until the copy event signals completion. - # Likewise, we also need to keep the reference to the GPU tensors. - # This is done by keeping the reference to sampler_output and - # model_runner_output. - self.sampled_token_ids = sampler_output.sampled_token_ids.to( - "cpu", non_blocking=True - ) + self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids) if sampler_output.logprobs_tensors is not None: self.logprobs_tensors: LogprobsTensors | None = ( sampler_output.logprobs_tensors.to_cpu_nonblocking() ) else: self.logprobs_tensors = None - self.num_sampled_tokens_cpu = num_sampled_tokens.to( - "cpu", non_blocking=True - ) + if sampler_output.num_nans is not None: + self.num_nans = async_copy_to_np(sampler_output.num_nans) + else: + self.num_nans = None + self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens) self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {} if self.model_runner_output.prompt_logprobs_dict: for k, v in self.model_runner_output.prompt_logprobs_dict.items(): @@ -68,7 +58,6 @@ class AsyncOutput(AsyncModelRunnerOutput): def get_output(self) -> ModelRunnerOutput: self.copy_event.synchronize() - num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy() # NOTE(woosuk): The following code is to ensure compatibility with # the existing model runner. @@ -76,10 +65,18 @@ class AsyncOutput(AsyncModelRunnerOutput): # rather than Python lists. sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist() num_reqs = len(sampled_token_ids) + num_sampled_tokens = self.num_sampled_tokens_np.tolist() for i in range(num_reqs): - del sampled_token_ids[i][num_sampled_tokens_np[i] :] + del sampled_token_ids[i][num_sampled_tokens[i] :] self.model_runner_output.sampled_token_ids = sampled_token_ids + if self.num_nans is not None: + num_nans = self.num_nans.tolist() + self.model_runner_output.num_nans_in_logits = { + req_id: num_nans[i] + for i, req_id in enumerate(self.model_runner_output.req_ids) + } + if self.logprobs_tensors is not None: self.model_runner_output.logprobs = self.logprobs_tensors.tolists() self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict @@ -95,3 +92,7 @@ def async_barrier(event: torch.cuda.Event | None): finally: if event is not None: event.record() + + +def async_copy_to_np(x: torch.Tensor) -> np.ndarray: + return x.to("cpu", non_blocking=True).numpy() diff --git a/vllm/v1/worker/gpu/metrics/__init__.py b/vllm/v1/worker/gpu/metrics/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/worker/gpu/metrics/logits.py b/vllm/v1/worker/gpu/metrics/logits.py new file mode 100644 index 0000000000000..fd7b30beaa1f8 --- /dev/null +++ b/vllm/v1/worker/gpu/metrics/logits.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +from torch._inductor.runtime.triton_helpers import libdevice + +from vllm.triton_utils import tl, triton + + +@triton.jit +def _num_nans_kernel( + logits_ptr, + logits_stride, + num_nans_ptr, + vocab_size, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + num_nans = 0 + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + logits = tl.load( + logits_ptr + req_idx * logits_stride + block, mask=mask, other=0 + ) + logits = logits.to(tl.float32) + is_nan = libdevice.isnan(logits).to(tl.int1) + num_nans += tl.sum(is_nan).to(tl.int32) + tl.store(num_nans_ptr + req_idx, num_nans) + + +def get_num_nans(logits: torch.Tensor) -> torch.Tensor: + num_reqs, vocab_size = logits.shape + BLOCK_SIZE = 8192 + num_nans = torch.empty(num_reqs, dtype=torch.int32, device=logits.device) + _num_nans_kernel[(num_reqs,)]( + logits, + logits.stride(0), + num_nans, + vocab_size, + BLOCK_SIZE=BLOCK_SIZE, + ) + return num_nans diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 464f7b7bd3532..9f4c6edfb6aa9 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -25,7 +25,6 @@ from vllm.v1.outputs import ( LogprobsTensors, ModelRunnerOutput, ) -from vllm.v1.sample.sampler import SamplerOutput from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier from vllm.v1.worker.gpu.attn_utils import ( build_attn_metadata, @@ -53,6 +52,7 @@ from vllm.v1.worker.gpu.sample.metadata import ( SamplingMetadata, expand_sampling_metadata, ) +from vllm.v1.worker.gpu.sample.output import SamplerOutput from vllm.v1.worker.gpu.sample.sampler import Sampler from vllm.v1.worker.gpu.spec_decode import init_speculator from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample diff --git a/vllm/v1/worker/gpu/sample/min_p.py b/vllm/v1/worker/gpu/sample/min_p.py index 0638818006f50..c98a42cb2b1bb 100644 --- a/vllm/v1/worker/gpu/sample/min_p.py +++ b/vllm/v1/worker/gpu/sample/min_p.py @@ -39,9 +39,7 @@ def _min_p_kernel( tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask) -def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None: - if min_p is None: - return +def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor) -> None: num_reqs, vocab_size = logits.shape BLOCK_SIZE = 1024 _min_p_kernel[(num_reqs,)]( diff --git a/vllm/v1/worker/gpu/sample/output.py b/vllm/v1/worker/gpu/sample/output.py new file mode 100644 index 0000000000000..13e8cf1d6c1ec --- /dev/null +++ b/vllm/v1/worker/gpu/sample/output.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + +import torch + +from vllm.v1.outputs import LogprobsTensors + + +@dataclass +class SamplerOutput: + sampled_token_ids: torch.Tensor + logprobs_tensors: LogprobsTensors | None + num_nans: torch.Tensor | None diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py index 9a4224d8fddef..84a3e18671b2c 100644 --- a/vllm/v1/worker/gpu/sample/sampler.py +++ b/vllm/v1/worker/gpu/sample/sampler.py @@ -3,13 +3,15 @@ import torch +import vllm.envs as envs from vllm.config.model import LogprobsMode -from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p +from vllm.v1.worker.gpu.metrics.logits import get_num_nans from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu.sample.min_p import apply_min_p +from vllm.v1.worker.gpu.sample.output import SamplerOutput from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature @@ -21,12 +23,16 @@ class Sampler: if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]: raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}") self.logprobs_mode = logprobs_mode + self.compute_nans = envs.VLLM_COMPUTE_NANS_IN_LOGITS # False by default. def __call__( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: + # NOTE(woosuk): We intentionally compute num_nans before sampling to make clear + # that num_nans is computed before applying penalties and temperature. + num_nans = get_num_nans(logits) if self.compute_nans else None sampled, processed_logits = self.sample(logits, sampling_metadata) if sampling_metadata.max_num_logprobs is not None: logits = ( @@ -49,6 +55,7 @@ class Sampler: # token per request. sampled_token_ids=sampled.view(-1, 1), logprobs_tensors=logprobs_tensors, + num_nans=num_nans, ) return sampler_output @@ -63,7 +70,8 @@ class Sampler: # Apply penalties and temperature in place. apply_penalties_and_temperature(logits, sampling_metadata) # Apply min_p in place. - apply_min_p(logits, sampling_metadata.min_p) + if sampling_metadata.min_p is not None: + apply_min_p(logits, sampling_metadata.min_p) # Apply top_k and/or top_p. This might return a new tensor. logits = apply_top_k_top_p( logits, sampling_metadata.top_k, sampling_metadata.top_p From e858bfe05167a3bbb064e283da5a1a7709dee24e Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Tue, 9 Dec 2025 13:29:33 -0500 Subject: [PATCH 116/133] [Cleanup] Refactor profiling env vars into a CLI config (#29912) Signed-off-by: Benjamin Chislett Signed-off-by: Benjamin Chislett Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/auto_tune/auto_tune.sh | 5 +- .../benchmark_serving_structured_output.py | 3 +- docs/api/README.md | 1 + docs/contributing/profiling.md | 23 +- .../offline_inference/simple_profiling.py | 13 +- tests/v1/worker/test_gpu_profiler.py | 71 ++++--- vllm/benchmarks/latency.py | 12 +- vllm/benchmarks/serve.py | 3 +- vllm/benchmarks/throughput.py | 3 +- vllm/config/__init__.py | 3 + vllm/config/profiler.py | 199 ++++++++++++++++++ vllm/config/vllm.py | 5 + vllm/engine/arg_utils.py | 6 +- vllm/entrypoints/llm.py | 17 ++ vllm/entrypoints/serve/profile/api_router.py | 17 +- vllm/envs.py | 110 +++++----- vllm/profiler/{gpu_profiler.py => wrapper.py} | 72 ++++--- vllm/v1/engine/async_llm.py | 22 +- vllm/v1/worker/cpu_worker.py | 36 +--- vllm/v1/worker/gpu_worker.py | 18 +- vllm/v1/worker/tpu_worker.py | 4 +- vllm/v1/worker/xpu_worker.py | 42 +--- 22 files changed, 433 insertions(+), 252 deletions(-) create mode 100644 vllm/config/profiler.py rename vllm/profiler/{gpu_profiler.py => wrapper.py} (73%) diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 56b721cbb4021..25baa9cbda39c 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -96,8 +96,9 @@ start_server() { # This correctly passes each element as a separate argument. if [[ -n "$profile_dir" ]]; then # Start server with profiling enabled - VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ - vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}" + VLLM_SERVER_DEV_MODE=1 \ + vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 & else # Start server without profiling VLLM_SERVER_DEV_MODE=1 \ diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index df122b4c5e8db..a4e1b163dcca9 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -963,8 +963,7 @@ def create_argument_parser(): parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) parser.add_argument( "--result-dir", diff --git a/docs/api/README.md b/docs/api/README.md index d3a141f327308..d51329ec2faa3 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes. - [vllm.config.MultiModalConfig][] - [vllm.config.PoolerConfig][] - [vllm.config.StructuredOutputsConfig][] +- [vllm.config.ProfilerConfig][] - [vllm.config.ObservabilityConfig][] - [vllm.config.KVTransferConfig][] - [vllm.config.CompilationConfig][] diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 65382afbe4f21..cbce14ce992ec 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -5,16 +5,15 @@ ## Profile with PyTorch Profiler -We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables: +We support tracing vLLM workers using the `torch.profiler` module. You can enable the torch profiler by setting `--profiler-config` +when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config: -- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default -- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default -- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default -- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default -- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default -- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default - -The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. +- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default +- `torch_profiler_with_memory` to record memory, off by default +- `torch_profiler_with_stack` to enable recording stack information, on by default +- `torch_profiler_with_flops` to enable recording FLOPs, off by default +- `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default +- `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag. @@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline #### OpenAI Server ```bash -VLLM_TORCH_PROFILER_DIR=./vllm_profile \ - vllm serve meta-llama/Llama-3.1-8B-Instruct +vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}' ``` vllm bench command: @@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with ` ```bash # server -VLLM_TORCH_CUDA_PROFILE=1 \ nsys profile \ --trace-fork-before-exec=true \ --cuda-graph-trace=node \ --capture-range=cudaProfilerApi \ --capture-range-end repeat \ - vllm serve meta-llama/Llama-3.1-8B-Instruct + vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda # client vllm bench serve \ diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index 46858fffadc52..e8a75cd03befb 100644 --- a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import time from vllm import LLM, SamplingParams -# enable torch profiler, can also be set on cmd line -os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile" - # Sample prompts. prompts = [ "Hello, my name is", @@ -22,7 +18,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) def main(): # Create an LLM. - llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) + llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=1, + profiler_config={ + "profiler": "torch", + "torch_profiler_dir": "./vllm_profile", + }, + ) llm.start_profile() diff --git a/tests/v1/worker/test_gpu_profiler.py b/tests/v1/worker/test_gpu_profiler.py index f7255fae05a4e..933ea42f18cd5 100644 --- a/tests/v1/worker/test_gpu_profiler.py +++ b/tests/v1/worker/test_gpu_profiler.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -import vllm.envs as envs -from vllm.profiler.gpu_profiler import WorkerProfiler +from vllm.config import ProfilerConfig +from vllm.profiler.wrapper import WorkerProfiler class ConcreteWorkerProfiler(WorkerProfiler): @@ -11,11 +11,11 @@ class ConcreteWorkerProfiler(WorkerProfiler): A basic implementation of a worker profiler for testing purposes. """ - def __init__(self): + def __init__(self, profiler_config: ProfilerConfig): self.start_call_count = 0 self.stop_call_count = 0 self.should_fail_start = False - super().__init__() + super().__init__(profiler_config) def _start(self) -> None: if self.should_fail_start: @@ -26,17 +26,19 @@ class ConcreteWorkerProfiler(WorkerProfiler): self.stop_call_count += 1 -@pytest.fixture(autouse=True) -def reset_mocks(): - """Fixture to reset mocks and env variables before each test.""" - envs.VLLM_PROFILER_DELAY_ITERS = 0 - envs.VLLM_PROFILER_MAX_ITERS = 0 +@pytest.fixture +def default_profiler_config(): + return ProfilerConfig( + profiler="torch", + torch_profiler_dir="/tmp/mock", + delay_iterations=0, + max_iterations=0, + ) -def test_immediate_start_stop(): +def test_immediate_start_stop(default_profiler_config): """Test standard start without delay.""" - profiler = ConcreteWorkerProfiler() - + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() assert profiler._running is True assert profiler._active is True @@ -48,10 +50,10 @@ def test_immediate_start_stop(): assert profiler.stop_call_count == 1 -def test_delayed_start(): +def test_delayed_start(default_profiler_config): """Test that profiler waits for N steps before actually starting.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) # User requests start profiler.start() @@ -71,10 +73,10 @@ def test_delayed_start(): assert profiler.start_call_count == 1 -def test_max_iterations(): +def test_max_iterations(default_profiler_config): """Test that profiler stops automatically after max iterations.""" - envs.VLLM_PROFILER_MAX_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.max_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() assert profiler._running is True @@ -95,12 +97,11 @@ def test_max_iterations(): assert profiler.stop_call_count == 1 -def test_delayed_start_and_max_iters(): +def test_delayed_start_and_max_iters(default_profiler_config): """Test combined delayed start and max iterations.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - envs.VLLM_PROFILER_MAX_ITERS = 2 - profiler = ConcreteWorkerProfiler() - + default_profiler_config.delay_iterations = 2 + default_profiler_config.max_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() # Step 1 @@ -127,9 +128,9 @@ def test_delayed_start_and_max_iters(): assert profiler.stop_call_count == 1 -def test_idempotency(): +def test_idempotency(default_profiler_config): """Test that calling start/stop multiple times doesn't break logic.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) # Double Start profiler.start() @@ -142,10 +143,10 @@ def test_idempotency(): assert profiler.stop_call_count == 1 # Should only stop once -def test_step_inactive(): +def test_step_inactive(default_profiler_config): """Test that stepping while inactive does nothing.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) # Not started yet profiler.step() @@ -155,9 +156,9 @@ def test_step_inactive(): assert profiler.start_call_count == 0 -def test_start_failure(): +def test_start_failure(default_profiler_config): """Test behavior when the underlying _start method raises exception.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.should_fail_start = True profiler.start() @@ -168,9 +169,9 @@ def test_start_failure(): assert profiler.start_call_count == 0 # Logic failed inside start -def test_shutdown(): +def test_shutdown(default_profiler_config): """Test that shutdown calls stop only if running.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) # Case 1: Not running profiler.shutdown() @@ -182,10 +183,10 @@ def test_shutdown(): assert profiler.stop_call_count == 1 -def test_mixed_delay_and_stop(): +def test_mixed_delay_and_stop(default_profiler_config): """Test manual stop during the delay period.""" - envs.VLLM_PROFILER_DELAY_ITERS = 5 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 5 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() profiler.step() diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index b4f1751837f48..99c1c846f19af 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -12,7 +12,6 @@ from typing import Any import numpy as np from tqdm import tqdm -import vllm.envs as envs from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType @@ -79,12 +78,11 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace): - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler." - ) engine_args = EngineArgs.from_cli_args(args) + if args.profile and not engine_args.profiler_config.profiler == "torch": + raise ValueError( + "The torch profiler is not enabled. Please provide profiler_config." + ) # Lazy import to avoid importing LLM when the bench command is not selected. from vllm import LLM, SamplingParams @@ -144,7 +142,7 @@ def main(args: argparse.Namespace): run_to_completion(profile_dir=None) if args.profile: - profile_dir = envs.VLLM_TORCH_PROFILER_DIR + profile_dir = engine_args.profiler_config.torch_profiler_dir print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 568290aa894ff..2e2054a8a4b13 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1097,8 +1097,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) parser.add_argument( "--save-result", diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index ea693613fdd16..d824e982b7489 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -655,8 +655,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--profile", action="store_true", default=False, - help="Use Torch Profiler. The env variable " - "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) # prefix repetition dataset diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 0f84f3ca9d3e3..0e91dd57420a8 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig from vllm.config.observability import ObservabilityConfig from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.pooler import PoolerConfig +from vllm.config.profiler import ProfilerConfig from vllm.config.scheduler import SchedulerConfig from vllm.config.speculative import SpeculativeConfig from vllm.config.speech_to_text import SpeechToTextConfig @@ -89,6 +90,8 @@ __all__ = [ "SpeechToTextConfig", # From vllm.config.structured_outputs "StructuredOutputsConfig", + # From vllm.config.profiler + "ProfilerConfig", # From vllm.config.utils "ConfigType", "SupportsMetricsInfo", diff --git a/vllm/config/profiler.py b/vllm/config/profiler.py new file mode 100644 index 0000000000000..76cc546f3c9e2 --- /dev/null +++ b/vllm/config/profiler.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +from typing import Any, Literal + +from pydantic import Field, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash + +logger = init_logger(__name__) + +ProfilerKind = Literal["torch", "cuda"] + + +@config +@dataclass +class ProfilerConfig: + """Dataclass which contains profiler config for the engine.""" + + profiler: ProfilerKind | None = None + """Which profiler to use. Defaults to None. Options are: + + - 'torch': Use PyTorch profiler.\n + - 'cuda': Use CUDA profiler.""" + + torch_profiler_dir: str = "" + """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and + worker's traces (CPU & GPU) will be saved under this directory. Note that + it must be an absolute path.""" + + torch_profiler_with_stack: bool = True + """If `True`, enables stack tracing in the torch profiler. Enabled by default.""" + + torch_profiler_with_flops: bool = False + """If `True`, enables FLOPS counting in the torch profiler. Disabled by default.""" + + torch_profiler_use_gzip: bool = True + """If `True`, saves torch profiler traces in gzip format. Enabled by default""" + + torch_profiler_dump_cuda_time_total: bool = True + """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default.""" + + torch_profiler_record_shapes: bool = False + """If `True`, records tensor shapes in the torch profiler. Disabled by default.""" + + torch_profiler_with_memory: bool = False + """If `True`, enables memory profiling in the torch profiler. + Disabled by default.""" + + ignore_frontend: bool = False + """If `True`, disables the front-end profiling of AsyncLLM when using the + 'torch' profiler. This is needed to reduce overhead when using delay/limit options, + since the front-end profiling does not track iterations and will capture the + entire range. + """ + + delay_iterations: int = Field(default=0, ge=0) + """Number of engine iterations to skip before starting profiling. + Defaults to 0, meaning profiling starts immediately after receiving /start_profile. + """ + + max_iterations: int = Field(default=0, ge=0) + """Maximum number of engine iterations to profile after starting profiling. + Defaults to 0, meaning no limit. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. + factors: list[Any] = [] + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> None: + """Get field from env var if set, with deprecation warning.""" + + if envs.is_set(env_var_name): + value = getattr(envs, env_var_name) + logger.warning_once( + "Using %s environment variable is deprecated and will be removed in " + "v0.14.0 or v1.0.0, whichever is soonest. Please use " + "--profiler-config.%s command line argument or " + "ProfilerConfig(%s=...) config field instead.", + env_var_name, + field_name, + field_name, + ) + return value + return None + + def _set_from_env_if_set( + self, + field_name: str, + env_var_name: str, + to_bool: bool = True, + to_int: bool = False, + ) -> None: + """Set field from env var if set, with deprecation warning.""" + value = self._get_from_env_if_set(field_name, env_var_name) + if value is not None: + if to_bool: + value = value == "1" + if to_int: + value = int(value) + setattr(self, field_name, value) + + @model_validator(mode="after") + def _validate_profiler_config(self) -> Self: + maybe_use_cuda_profiler = self._get_from_env_if_set( + "profiler", "VLLM_TORCH_CUDA_PROFILE" + ) + if maybe_use_cuda_profiler is not None: + self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None + else: + self._set_from_env_if_set( + "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False + ) + if self.torch_profiler_dir: + self.profiler = "torch" + self._set_from_env_if_set( + "torch_profiler_record_shapes", + "VLLM_TORCH_PROFILER_RECORD_SHAPES", + ) + self._set_from_env_if_set( + "torch_profiler_with_memory", + "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", + ) + self._set_from_env_if_set( + "torch_profiler_with_stack", + "VLLM_TORCH_PROFILER_WITH_STACK", + ) + self._set_from_env_if_set( + "torch_profiler_with_flops", + "VLLM_TORCH_PROFILER_WITH_FLOPS", + ) + self._set_from_env_if_set( + "ignore_frontend", + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", + ) + self._set_from_env_if_set( + "torch_profiler_use_gzip", + "VLLM_TORCH_PROFILER_USE_GZIP", + ) + self._set_from_env_if_set( + "torch_profiler_dump_cuda_time_total", + "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", + ) + + self._set_from_env_if_set( + "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True + ) + self._set_from_env_if_set( + "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True + ) + + has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0 + if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend: + logger.warning_once( + "Using 'torch' profiler with delay_iterations or max_iterations " + "while ignore_frontend is False may result in high overhead." + ) + + profiler_dir = self.torch_profiler_dir + if profiler_dir and self.profiler != "torch": + raise ValueError( + "torch_profiler_dir is only applicable when profiler is set to 'torch'" + ) + if self.profiler == "torch" and not profiler_dir: + raise ValueError("torch_profiler_dir must be set when profiler is 'torch'") + + if profiler_dir: + is_gs_path = ( + profiler_dir.startswith("gs://") + and profiler_dir[5:] + and profiler_dir[5] != "/" + ) + if not is_gs_path: + self.torch_profiler_dir = os.path.abspath( + os.path.expanduser(profiler_dir) + ) + + return self diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index a74413536407b..614a3226cb711 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -39,6 +39,7 @@ from .lora import LoRAConfig from .model import ModelConfig from .observability import ObservabilityConfig from .parallel import ParallelConfig +from .profiler import ProfilerConfig from .scheduler import SchedulerConfig from .speculative import SpeculativeConfig from .structured_outputs import StructuredOutputsConfig @@ -218,6 +219,8 @@ class VllmConfig: You can specify the full compilation config like so: `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` """ + profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig) + """Profiling configuration.""" kv_transfer_config: KVTransferConfig | None = None """The configurations for distributed KV cache transfer.""" kv_events_config: KVEventsConfig | None = None @@ -296,6 +299,8 @@ class VllmConfig: vllm_factors.append("None") if self.structured_outputs_config: vllm_factors.append(self.structured_outputs_config.compute_hash()) + if self.profiler_config: + vllm_factors.append(self.profiler_config.compute_hash()) else: vllm_factors.append("None") vllm_factors.append(self.observability_config.compute_hash()) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ceac5407af6e2..2f307a7ccf16d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -50,6 +50,7 @@ from vllm.config import ( ObservabilityConfig, ParallelConfig, PoolerConfig, + ProfilerConfig, SchedulerConfig, SpeculativeConfig, StructuredOutputsConfig, @@ -532,6 +533,8 @@ class EngineArgs: worker_cls: str = ParallelConfig.worker_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls + profiler_config: ProfilerConfig = get_field(VllmConfig, "profiler_config") + kv_transfer_config: KVTransferConfig | None = None kv_events_config: KVEventsConfig | None = None @@ -1164,7 +1167,7 @@ class EngineArgs: vllm_group.add_argument( "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"] ) - + vllm_group.add_argument("--profiler-config", **vllm_kwargs["profiler_config"]) vllm_group.add_argument( "--optimization-level", **vllm_kwargs["optimization_level"] ) @@ -1782,6 +1785,7 @@ class EngineArgs: kv_transfer_config=self.kv_transfer_config, kv_events_config=self.kv_events_config, ec_transfer_config=self.ec_transfer_config, + profiler_config=self.profiler_config, additional_config=self.additional_config, optimization_level=self.optimization_level, ) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 913324fd5f9c3..5d5c4a1cdb77b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -20,6 +20,7 @@ from vllm.beam_search import ( from vllm.config import ( CompilationConfig, PoolerConfig, + ProfilerConfig, StructuredOutputsConfig, is_init_field, ) @@ -211,6 +212,7 @@ class LLM: structured_outputs_config: dict[str, Any] | StructuredOutputsConfig | None = None, + profiler_config: dict[str, Any] | ProfilerConfig | None = None, kv_cache_memory_bytes: int | None = None, compilation_config: int | dict[str, Any] | CompilationConfig | None = None, logits_processors: list[str | type[LogitsProcessor]] | None = None, @@ -282,6 +284,20 @@ class LLM: else: structured_outputs_instance = StructuredOutputsConfig() + if profiler_config is not None: + if isinstance(profiler_config, dict): + profiler_config_instance = ProfilerConfig( + **{ + k: v + for k, v in profiler_config.items() + if is_init_field(ProfilerConfig, k) + } + ) + else: + profiler_config_instance = profiler_config + else: + profiler_config_instance = ProfilerConfig() + # warn about single-process data parallel usage. _dp_size = int(kwargs.get("data_parallel_size", 1)) _distributed_executor_backend = kwargs.get("distributed_executor_backend") @@ -324,6 +340,7 @@ class LLM: mm_processor_kwargs=mm_processor_kwargs, pooler_config=pooler_config, structured_outputs_config=structured_outputs_instance, + profiler_config=profiler_config_instance, compilation_config=compilation_config_instance, logits_processors=logits_processors, **kwargs, diff --git a/vllm/entrypoints/serve/profile/api_router.py b/vllm/entrypoints/serve/profile/api_router.py index 166f13764eb36..eeed6b45ef4e9 100644 --- a/vllm/entrypoints/serve/profile/api_router.py +++ b/vllm/entrypoints/serve/profile/api_router.py @@ -5,7 +5,7 @@ from fastapi import APIRouter, FastAPI, Request from fastapi.responses import Response -import vllm.envs as envs +from vllm.config import ProfilerConfig from vllm.engine.protocol import EngineClient from vllm.logger import init_logger @@ -35,15 +35,12 @@ async def stop_profile(raw_request: Request): def attach_router(app: FastAPI): - if envs.VLLM_TORCH_PROFILER_DIR: + profiler_config = getattr(app.state.args, "profiler_config", None) + assert profiler_config is None or isinstance(profiler_config, ProfilerConfig) + if profiler_config is not None and profiler_config.profiler is not None: logger.warning_once( - "Torch Profiler is enabled in the API server. This should ONLY be " - "used for local development!" + "Profiler with mode '%s' is enabled in the " + "API server. This should ONLY be used for local development!", + profiler_config.profiler, ) - elif envs.VLLM_TORCH_CUDA_PROFILE: - logger.warning_once( - "CUDA Profiler is enabled in the API server. This should ONLY be " - "used for local development!" - ) - if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE: app.include_router(router) diff --git a/vllm/envs.py b/vllm/envs.py index bda9e6e423356..8246109eb73af 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -89,20 +89,23 @@ if TYPE_CHECKING: VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds VLLM_PLUGINS: list[str] | None = None VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None - VLLM_TORCH_CUDA_PROFILE: bool = False + # Deprecated env variables for profiling, kept for backward compatibility + # See also vllm/config/profiler.py and `--profiler-config` argument + VLLM_TORCH_CUDA_PROFILE: str | None = None VLLM_TORCH_PROFILER_DIR: str | None = None - VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False - VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False - VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False + VLLM_TORCH_PROFILER_RECORD_SHAPES: str | None = None + VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: str | None = None + VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: str | None = None + VLLM_TORCH_PROFILER_WITH_STACK: str | None = None + VLLM_TORCH_PROFILER_WITH_FLOPS: str | None = None + VLLM_TORCH_PROFILER_USE_GZIP: str | None = None + VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: str | None = None + VLLM_PROFILER_DELAY_ITERS: str | None = None + VLLM_PROFILER_MAX_ITERS: str | None = None + # End of deprecated env variables for profiling VLLM_USE_AOT_COMPILE: bool = False VLLM_USE_BYTECODE_HOOK: bool = False VLLM_FORCE_AOT_LOAD: bool = False - VLLM_TORCH_PROFILER_WITH_STACK: bool = True - VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False - VLLM_PROFILER_DELAY_ITERS: int = 0 - VLLM_PROFILER_MAX_ITERS: int = 0 - VLLM_TORCH_PROFILER_USE_GZIP: bool = True - VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -850,71 +853,52 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv( "VLLM_LORA_RESOLVER_CACHE_DIR", None ), - # Enables torch CUDA profiling if set. - # On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered. - "VLLM_TORCH_CUDA_PROFILE": lambda: bool( - os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0" - ), + # Enables torch CUDA profiling if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"), # Enables torch profiler if set. - # Both AsyncLLM's CPU traces as well as workers' - # traces (CPU & GPU) will be saved under this directory. - # Note that it must be an absolute path. - "VLLM_TORCH_PROFILER_DIR": lambda: ( - None - if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None - else ( - val - if val.startswith("gs://") and val[5:] and val[5] != "/" - else os.path.abspath(os.path.expanduser(val)) - ) + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"), + # Enable torch profiler to record shapes if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES") ), - # Enable torch profiler to record shapes if set - # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will - # not record shapes. - "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0" + # Enable torch profiler to profile memory if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY") ), - # Enable torch profiler to profile memory if set - # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler - # will not profile memory. - "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0" + # Enable torch profiler to profile stack if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_STACK": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_STACK") ), - # Enable torch profiler to profile stack if set - # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL - # profile stack by default. - "VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0" + # Enable torch profiler to profile flops if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS") ), - # Enable torch profiler to profile flops if set - # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will - # not profile flops. - "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0" - ), - # Disable torch profiling of the AsyncLLMEngine process. - # If set to 1, will not profile the engine process. - "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0" + # Disable torch profiling of the AsyncLLMEngine process if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM") ), # Delay number of iterations before starting profiling when using # the torch/torch CUDA profiler. If set to 0, will start profiling immediately. - "VLLM_PROFILER_DELAY_ITERS": lambda: int( - os.getenv("VLLM_PROFILER_DELAY_ITERS", "0") - ), + # Deprecated, see profiler_config. + "VLLM_PROFILER_DELAY_ITERS": lambda: (os.getenv("VLLM_PROFILER_DELAY_ITERS")), # Maximum number of iterations to profile when using the torch/torch CUDA profiler. # If set to 0, will not limit the number of iterations. - "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")), + "VLLM_PROFILER_MAX_ITERS": lambda: os.getenv("VLLM_PROFILER_MAX_ITERS"), # Control whether torch profiler gzip-compresses profiling files. - # Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default). - "VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0" - ), + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_USE_GZIP": lambda: os.getenv("VLLM_TORCH_PROFILER_USE_GZIP"), # Control whether torch profiler dumps the self_cuda_time_total table. - # Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping - # (enabled by default). - "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0" + # Set to 0 to disable dumping the table. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL") ), # If set, vLLM will use Triton implementations of AWQ. "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/wrapper.py similarity index 73% rename from vllm/profiler/gpu_profiler.py rename to vllm/profiler/wrapper.py index 798c615221b9f..a44a6a5eea0dd 100644 --- a/vllm/profiler/gpu_profiler.py +++ b/vllm/profiler/wrapper.py @@ -3,26 +3,27 @@ from abc import ABC, abstractmethod from contextlib import nullcontext +from typing import Literal import torch from typing_extensions import override -import vllm.envs as envs +from vllm.config import ProfilerConfig from vllm.logger import init_logger logger = init_logger(__name__) class WorkerProfiler(ABC): - def __init__(self) -> None: - self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS + def __init__(self, profiler_config: ProfilerConfig) -> None: + self._delay_iters = profiler_config.delay_iterations if self._delay_iters > 0: logger.info_once( "GPU profiling will start " f"{self._delay_iters} steps after start_profile." ) - self._max_iters = envs.VLLM_PROFILER_MAX_ITERS + self._max_iters = profiler_config.max_iterations if self._max_iters > 0: logger.info_once( "GPU profiling will stop " @@ -133,12 +134,27 @@ class WorkerProfiler(ABC): return nullcontext() +TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"] +TorchProfilerActivityMap = { + "CPU": torch.profiler.ProfilerActivity.CPU, + "CUDA": torch.profiler.ProfilerActivity.CUDA, + "XPU": torch.profiler.ProfilerActivity.XPU, +} + + class TorchProfilerWrapper(WorkerProfiler): - def __init__(self, worker_name: str, local_rank: int) -> None: - super().__init__() + def __init__( + self, + profiler_config: ProfilerConfig, + worker_name: str, + local_rank: int, + activities: list[TorchProfilerActivity], + ) -> None: + super().__init__(profiler_config) self.local_rank = local_rank - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + self.profiler_config = profiler_config + torch_profiler_trace_dir = profiler_config.torch_profiler_dir if local_rank in (None, 0): logger.info( "Torch profiling enabled. Traces will be saved to: %s", @@ -147,24 +163,23 @@ class TorchProfilerWrapper(WorkerProfiler): logger.debug( "Profiler config: record_shapes=%s," "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + profiler_config.torch_profiler_record_shapes, + profiler_config.torch_profiler_with_memory, + profiler_config.torch_profiler_with_stack, + profiler_config.torch_profiler_with_flops, ) + + self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1 self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + activities=[TorchProfilerActivityMap[activity] for activity in activities], + record_shapes=profiler_config.torch_profiler_record_shapes, + profile_memory=profiler_config.torch_profiler_with_memory, + with_stack=profiler_config.torch_profiler_with_stack, + with_flops=profiler_config.torch_profiler_with_flops, on_trace_ready=torch.profiler.tensorboard_trace_handler( torch_profiler_trace_dir, worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, + use_gzip=profiler_config.torch_profiler_use_gzip, ), ) @@ -176,9 +191,10 @@ class TorchProfilerWrapper(WorkerProfiler): def _stop(self) -> None: self.profiler.stop() - if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: - rank = self.local_rank - profiler_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = self.profiler_config + rank = self.local_rank + if profiler_config.torch_profiler_dump_cuda_time_total: + profiler_dir = profiler_config.torch_profiler_dir profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" sort_key = "self_cuda_time_total" table = self.profiler.key_averages().table(sort_by=sort_key) @@ -189,6 +205,12 @@ class TorchProfilerWrapper(WorkerProfiler): # only print profiler results on rank 0 if rank == 0: print(table) + if self.dump_cpu_time_total and rank == 0: + logger.info( + self.profiler.key_averages().table( + sort_by="self_cpu_time_total", row_limit=50 + ) + ) @override def annotate_context_manager(self, name: str): @@ -196,8 +218,8 @@ class TorchProfilerWrapper(WorkerProfiler): class CudaProfilerWrapper(WorkerProfiler): - def __init__(self) -> None: - super().__init__() + def __init__(self, profiler_config: ProfilerConfig) -> None: + super().__init__(profiler_config) # Note: lazy import to avoid dependency issues if CUDA is not available. import torch.cuda.profiler as cuda_profiler diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fd7e04dc02082..931d13be3d9b6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -166,32 +166,24 @@ class AsyncLLM(EngineClient): pass if ( - envs.VLLM_TORCH_PROFILER_DIR - and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM + vllm_config.profiler_config.profiler == "torch" + and not vllm_config.profiler_config.ignore_frontend ): + profiler_dir = vllm_config.profiler_config.torch_profiler_dir logger.info( "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 - envs.VLLM_TORCH_PROFILER_DIR, + profiler_dir, ) - if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0: - logger.warning_once( - "Torch profiler received max_iters or delay_iters setting. These " - "are not compatible with the AsyncLLM profiler and will be ignored " - "for the AsyncLLM process. Engine process profiling will still " - "respect these settings. Consider setting " - "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable " - "AsyncLLM profiling." - ) worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, ], - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_stack=vllm_config.profiler_config.torch_profiler_with_stack, on_trace_ready=torch.profiler.tensorboard_trace_handler( - envs.VLLM_TORCH_PROFILER_DIR, + profiler_dir, worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, + use_gzip=vllm_config.profiler_config.torch_profiler_use_gzip, ), ) else: diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index b080fea1d2dd6..e54b995ab908f 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -13,6 +13,7 @@ from vllm.logger import init_logger from vllm.model_executor.utils import set_random_seed from vllm.platforms import CpuArchEnum, current_platform from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo +from vllm.profiler.wrapper import TorchProfilerWrapper from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment @@ -38,30 +39,17 @@ class CPUWorker(Worker): self.parallel_config.disable_custom_all_reduce = True - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # Torch profiler. Enabled and configured through profiler_config. self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir, + self.profiler = TorchProfilerWrapper( + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU"], ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False - ), - ) - else: - self.profiler = None def init_device(self): # Setup OpenMP threads affinity. @@ -202,9 +190,3 @@ class CPUWorker(Worker): self.profiler.start() else: self.profiler.stop() - if self.local_rank == 0: - logger.info( - self.profiler.key_averages().table( - sort_by="self_cpu_time_total", row_limit=50 - ) - ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 24a3533a169f0..f2b6a1f76b0b9 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -38,7 +38,7 @@ from vllm.model_executor import set_random_seed from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform -from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper +from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes @@ -92,17 +92,19 @@ class Worker(WorkerBase): # Buffers saved before sleep self._sleep_saved_buffers: dict[str, torch.Tensor] = {} - # Torch/CUDA profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - # VLLM_TORCH_CUDA_PROFILE=1 + # Torch/CUDA profiler. Enabled and configured through profiler_config. self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" self.profiler = TorchProfilerWrapper( - worker_name=worker_name, local_rank=self.local_rank + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU", "CUDA"], ) - elif envs.VLLM_TORCH_CUDA_PROFILE: - self.profiler = CudaProfilerWrapper() + elif profiler_config.profiler == "cuda": + self.profiler = CudaProfilerWrapper(profiler_config) else: self.profiler = None diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index ce18ca6c37165..7a10ac198985e 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -98,10 +98,10 @@ class TPUWorker: # MP runtime is initialized. self.profiler = None self.profile_dir = None - if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1: + if vllm_config.profiler_config.profiler == "torch" and self.rank < 1: # For TPU, we can only have 1 active profiler session for 1 profiler # server. So we only profile on rank0. - self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR + self.profile_dir = vllm_config.profiler_config.torch_profiler_dir logger.info( "Profiling enabled. Traces will be saved to: %s", self.profile_dir ) diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 267369c730368..1faa1a24ff0ea 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -6,12 +6,12 @@ from typing import Any import torch import torch.distributed -import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed import get_world_group from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform +from vllm.profiler.wrapper import TorchProfilerWrapper from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment from vllm.v1.worker.xpu_model_runner import XPUModelRunner @@ -36,41 +36,17 @@ class XPUWorker(Worker): assert device_config.device_type == "xpu" assert current_platform.is_xpu() - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # Torch profiler. Enabled and configured through profiler_config. self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir, + self.profiler = TorchProfilerWrapper( + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU", "XPU"], ) - logger.debug( - "Profiler config: record_shapes=%s," - "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.XPU, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, - worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, - ), - ) - else: - self.profiler = None # we provide this function due to `torch.xpu.mem_get_info()` doesn't # return correct free_gpu_memory on intel client GPU. We need to From 95501a70ec69d182b124774ff708c3050ab4e91e Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 9 Dec 2025 13:51:19 -0500 Subject: [PATCH 117/133] [BugFix] Fix DeepSeek-R1 hang with DP and MTP (#30119) Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Co-authored-by: Tyler Michael Smith Co-authored-by: Matthew Bonanni --- vllm/v1/worker/gpu_model_runner.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 766c2acd0e1d8..7398defd74a38 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4168,10 +4168,19 @@ class GPUModelRunner( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) + # Eagle currently only supports PIECEWISE cudagraphs. + # Therefore only use cudagraphs if the main model uses PIECEWISE + # NOTE(lucas): this is a hack, need to clean up. use_cudagraphs = ( - cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) - and not self.speculative_config.enforce_eager - ) + ( + is_graph_capturing + and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ) + or ( + not is_graph_capturing + and cudagraph_runtime_mode != CUDAGraphMode.NONE + ) + ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two # lora cases when cudagraph_specialize_lora is enabled. This is a From b37bf51e7594133772f4f06446dd5aa08aa5be0e Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 9 Dec 2025 13:52:20 -0500 Subject: [PATCH 118/133] [CI/Test] Fix FP8 per-tensor quant test reference scale shape (#30352) Signed-off-by: Lucas Wilkinson --- tests/kernels/quant_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 830d43569e98b..e29f66dca313f 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -103,7 +103,7 @@ def ref_dynamic_per_tensor_fp8_quant( .clamp(fp8_traits_min, fp8_traits_max) .to(FP8_DTYPE) ) - return ref_out, ref_scale.view((1, 1)) + return ref_out, ref_scale.view(1) def native_w8a8_block_matmul( From 73a484caa1ad320d6e695f098c25c479a71e6774 Mon Sep 17 00:00:00 2001 From: Tsukasa OI Date: Wed, 10 Dec 2025 04:13:10 +0900 Subject: [PATCH 119/133] [Model][Quantization] Fix / Add GGUF support for Qwen2 MoE models (#30307) Signed-off-by: Tsukasa OI --- vllm/model_executor/models/qwen2_moe.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 5a428740082f6..cbc618f1abd08 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -367,6 +367,8 @@ class Qwen2MoeModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, @@ -512,6 +514,12 @@ class Qwen2MoeModel(nn.Module): continue else: name = remapped_kv_scale_name + # GGUF: make sure that shared_expert_gate is a 2D tensor. + if ( + "mlp.shared_expert_gate" in name + and len(loaded_weight.shape) == 1 + ): + loaded_weight = loaded_weight[None, :] param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader From 7cab92fd45ce6ad7fd78d705ef31b12d3beebb4b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Dec 2025 20:03:16 +0000 Subject: [PATCH 120/133] Bump actions/checkout from 6.0.0 to 6.0.1 (#30233) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/cleanup_pr_body.yml | 2 +- .github/workflows/macos-smoke-test.yml | 2 +- .github/workflows/pre-commit.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 56fbe5ca704a1..df8910837715d 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -13,7 +13,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: Set up Python uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 3a12c4b3a8300..e80a5c0cc80f9 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -12,7 +12,7 @@ jobs: timeout-minutes: 30 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v6.0.1 - uses: astral-sh/setup-uv@v7 with: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index a03b979ad761d..1041653c2f57e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -16,7 +16,7 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: "3.12" From f8dacc66b69bfbd9a9addf542572487035f0a1db Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Dec 2025 20:12:14 +0000 Subject: [PATCH 121/133] Bump actions/stale from 10.1.0 to 10.1.1 (#30234) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c8a52f1a63269..44bf71db5e9de 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -15,7 +15,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 + - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months From 7618dc973dd1e56a46162bc7bd6e7625143bead0 Mon Sep 17 00:00:00 2001 From: rasmith Date: Tue, 9 Dec 2025 14:18:17 -0600 Subject: [PATCH 122/133] [CI/Build] Make test_mha_attn.py run on correct platform only and check for flash_attn_varlen_func in layer.py (#29145) --- tests/kernels/attention/test_mha_attn.py | 11 +++++++++-- vllm/attention/layer.py | 5 ++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index ae3c63cc62d6b..639abdf6f0487 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -26,7 +26,14 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() -@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) +devices = ["cpu"] +if current_platform.is_cuda(): + devices.append("cuda") +if current_platform.is_rocm(): + devices.append("hip") + + +@pytest.mark.parametrize("device", devices) def test_mha_attn_platform(device: str): """ Test the attention selector between different platform and device. @@ -46,7 +53,7 @@ def test_mha_attn_platform(device: str): patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()), ): attn = MultiHeadAttention(16, 64, scale=1) - assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA + assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN else: # Test CUDA with head_size=64 (divisible by 32) # - should use vLLM's FlashAttention diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 7e5adfe0742d3..c77fc0fad0038 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -89,7 +89,10 @@ def maybe_get_vit_flash_attn_backend( if attn_backend == AttentionBackendEnum.ROCM_AITER_FA: from aiter import flash_attn_varlen_func else: - from vllm.attention.utils.fa_utils import flash_attn_varlen_func + try: + from vllm.attention.utils.fa_utils import flash_attn_varlen_func + except ImportError: + flash_attn_varlen_func = None else: flash_attn_varlen_func = None From 00e5cbb96789fd9d083b4015cdcff318e4de3808 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Tue, 9 Dec 2025 16:48:25 -0500 Subject: [PATCH 123/133] [MoE][Refactor] Remove most arguments to FusedMoEMethodBase.apply (#29066) Signed-off-by: Bill Nell Signed-off-by: Tyler Michael Smith Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Lucas Wilkinson Co-authored-by: Tyler Michael Smith --- .../layers/fused_moe/__init__.py | 6 +- .../layers/fused_moe/fused_moe_method_base.py | 18 -- .../fused_moe/fused_moe_modular_method.py | 26 +- vllm/model_executor/layers/fused_moe/layer.py | 74 ++--- .../fused_moe/unquantized_fused_moe_method.py | 224 +++++---------- .../layers/quantization/awq_marlin.py | 26 +- .../layers/quantization/bitsandbytes.py | 26 +- .../compressed_tensors_moe.py | 254 +++++------------- .../layers/quantization/experts_int8.py | 26 +- .../model_executor/layers/quantization/fp8.py | 104 +++---- .../layers/quantization/gguf.py | 25 +- .../layers/quantization/gptq_marlin.py | 26 +- .../layers/quantization/ipex_quant.py | 30 +-- .../layers/quantization/modelopt.py | 106 +++----- .../layers/quantization/moe_wna16.py | 26 +- .../layers/quantization/mxfp4.py | 97 +++---- .../layers/quantization/quark/quark_moe.py | 72 ++--- .../model_executor/layers/quantization/rtn.py | 24 +- 18 files changed, 318 insertions(+), 872 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 1e145a8fcd791..d71cfc5ad8200 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -4,7 +4,10 @@ from contextlib import contextmanager from typing import Any -from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + RoutingMethodType, +) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) @@ -49,6 +52,7 @@ __all__ = [ "FusedMoEPermuteExpertsUnpermute", "FusedMoEActivationFormat", "FusedMoEPrepareAndFinalize", + "RoutingMethodType", "SharedFusedMoE", "activation_without_mul", "override_config", diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index ef7090c349fc6..8c9d8a2777d58 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod -from collections.abc import Callable import torch @@ -100,22 +99,5 @@ class FusedMoEMethodBase(QuantizeMethodBase): layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index b33e7fd8a0215..1947423bf4777 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable import torch @@ -97,23 +96,6 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, zero_expert_result = layer.select_experts( hidden_states=x, @@ -127,10 +109,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): topk_weights=topk_weights, topk_ids=topk_ids, inplace=self.allow_inplace, - activation=activation, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=None if self.disable_expert_map else expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=None if self.disable_expert_map else layer.expert_map, ) if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5df3486093cd9..e635382068a63 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -33,10 +33,6 @@ from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton -from vllm.model_executor.layers.fused_moe.modular_kernel import ( - FusedMoEPermuteExpertsUnpermute, - FusedMoEPrepareAndFinalize, -) from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) @@ -57,11 +53,8 @@ from vllm.utils.torch_utils import ( from vllm.v1.worker.ubatching import dbo_current_ubatch_id if current_platform.is_cuda_alike(): - from .fused_moe import eplb_map_to_physical_and_record, fused_experts + from .fused_moe import eplb_map_to_physical_and_record else: - fused_experts = None # type: ignore - FusedMoEPermuteExpertsUnpermute = object # type: ignore - FusedMoEPrepareAndFinalize = object # type: ignore def _eplb_map_to_physical_and_record( topk_ids: torch.Tensor, @@ -483,7 +476,7 @@ class FusedMoE(CustomOp): enable_eplb=self.enable_eplb, ) - self.expert_map: torch.Tensor | None + self._expert_map: torch.Tensor | None local_num_experts, expert_map, expert_mask = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, @@ -493,7 +486,7 @@ class FusedMoE(CustomOp): return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts - self.register_buffer("expert_map", expert_map) + self.register_buffer("_expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) self._maybe_init_expert_routing_tables() logger.info_once( @@ -506,10 +499,10 @@ class FusedMoE(CustomOp): self.expert_placement_strategy, self.local_num_experts, self.global_num_experts, - get_compressed_expert_map(self.expert_map), + get_compressed_expert_map(self._expert_map), ) else: - self.local_num_experts, self.expert_map, self.expert_mask = ( + self.local_num_experts, self._expert_map, self.expert_mask = ( self.global_num_experts, None, None, @@ -781,7 +774,7 @@ class FusedMoE(CustomOp): ), ) - if self.expert_map is None: + if self._expert_map is None: return None routing_tables = self.ensure_round_robin_expert_routing_tables( @@ -789,7 +782,7 @@ class FusedMoE(CustomOp): ep_size=self.ep_size, ep_rank=self.ep_rank, local_num_experts=self.local_num_experts, - device=self.expert_map.device, + device=self._expert_map.device, ) global_to_physical, physical_to_global, local_global = routing_tables @@ -840,8 +833,8 @@ class FusedMoE(CustomOp): def update_expert_map(self): # ep_size and ep_rank should already be updated - assert self.expert_map is not None - with self.expert_map.device: + assert self._expert_map is not None + with self._expert_map.device: local_num_experts, expert_map, expert_mask = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, @@ -851,7 +844,7 @@ class FusedMoE(CustomOp): return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts - self.register_buffer("expert_map", expert_map) + self.register_buffer("_expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) self._maybe_init_expert_routing_tables() if self.aiter_fmoe_shared_expert_enabled: @@ -1068,9 +1061,9 @@ class FusedMoE(CustomOp): expert_data.copy_(loaded_weight) def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - if self.expert_map is None: + if self._expert_map is None: return expert_id - return self.expert_map[expert_id].item() + return self._expert_map[expert_id].item() def _init_aiter_shared_experts_topK_buffer( self, vllm_config: VllmConfig, dp_size: int @@ -1744,6 +1737,12 @@ class FusedMoE(CustomOp): reduce_output(fused_output)[..., :og_hidden_states], ) + @property + def expert_map(self) -> torch.Tensor | None: + return ( + self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask + ) + def forward_cuda( self, hidden_states: torch.Tensor, @@ -1805,24 +1804,6 @@ class FusedMoE(CustomOp): layer=self, x=staged_hidden_states, router_logits=staged_router_logits, - top_k=self.top_k, - renormalize=self.renormalize, - use_grouped_topk=self.use_grouped_topk, - global_num_experts=self.global_num_experts, - expert_map=self.expert_map - if not self.rocm_aiter_fmoe_enabled - else self.expert_mask, - topk_group=self.topk_group, - num_expert_group=self.num_expert_group, - custom_routing_function=self.custom_routing_function, - scoring_func=self.scoring_func, - routed_scaling_factor=self.routed_scaling_factor, - e_score_correction_bias=self.e_score_correction_bias, - activation=self.activation, - enable_eplb=self.enable_eplb, - expert_load_view=self.expert_load_view, - logical_to_physical_map=self.logical_to_physical_map, - logical_replica_count=self.logical_replica_count, ) if has_separate_shared_experts: @@ -1968,25 +1949,6 @@ class FusedMoE(CustomOp): if do_naive_dispatch_combine else hidden_states, router_logits=router_logits, - top_k=self.top_k, - renormalize=self.renormalize, - use_grouped_topk=self.use_grouped_topk, - global_num_experts=self.global_num_experts, - expert_map=self.expert_map - if not self.rocm_aiter_fmoe_enabled - else self.expert_mask, - topk_group=self.topk_group, - num_expert_group=self.num_expert_group, - custom_routing_function=self.custom_routing_function, - scoring_func=self.scoring_func, - routed_scaling_factor=self.routed_scaling_factor, - e_score_correction_bias=self.e_score_correction_bias, - activation=self.activation, - apply_router_weight_on_input=self.apply_router_weight_on_input, - enable_eplb=self.enable_eplb, - expert_load_view=self.expert_load_view, - logical_to_physical_map=self.logical_to_physical_map, - logical_replica_count=self.logical_replica_count, ) if has_separate_shared_experts: diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 48e5a8907f926..6182f10aa70f0 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable import torch import torch.nn.functional as F @@ -269,53 +268,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def apply( self, - layer: torch.nn.Module, + layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - assert expert_load_view is not None - assert logical_to_physical_map is not None - assert logical_replica_count is not None - return self.forward( - x=x, layer=layer, + x=x, router_logits=router_logits, - top_k=top_k, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - topk_group=topk_group, - num_expert_group=num_expert_group, - global_num_experts=global_num_experts, - expert_map=expert_map, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - enable_eplb=enable_eplb, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, ) def get_fused_moe_quant_config( @@ -333,24 +293,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, zero_expert_result = layer.select_experts( hidden_states=x, @@ -364,9 +307,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - expert_map=expert_map, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=layer.expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) elif self.flashinfer_cutlass_moe_enabled: return self.flashinfer_cutlass_moe( @@ -375,8 +318,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: result = fused_experts( @@ -386,11 +329,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, + activation=layer.activation, quant_config=self.moe_quant_config, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, ) if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: @@ -405,148 +348,101 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if ( - enable_eplb is not False - or expert_load_view is not None - or logical_to_physical_map is not None - or logical_replica_count is not None + layer.enable_eplb is not False + or layer.expert_load_view is not None + or layer.logical_to_physical_map is not None + or layer.logical_replica_count is not None ): raise NotImplementedError("Expert load balancing is not supported for CPU.") + return layer.cpu_fused_moe( layer, x, - use_grouped_topk, - top_k, + layer.use_grouped_topk, + layer.top_k, router_logits, - renormalize, - topk_group, - num_expert_group, - global_num_experts, - expert_map, - custom_routing_function, - scoring_func, - routed_scaling_factor, - e_score_correction_bias, - apply_router_weight_on_input, - activation, + layer.renormalize, + layer.topk_group, + layer.num_expert_group, + layer.global_num_experts, + layer.expert_map, + layer.custom_routing_function, + layer.scoring_func, + layer.routed_scaling_factor, + layer.e_score_correction_bias, + layer.apply_router_weight_on_input, + layer.activation, ) def forward_xpu( self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if ( - enable_eplb is not False - or expert_load_view is not None - or logical_to_physical_map is not None - or logical_replica_count is not None + layer.enable_eplb is not False + or layer.expert_load_view is not None + or layer.logical_to_physical_map is not None + or layer.logical_replica_count is not None ): raise NotImplementedError("Expert load balancing is not supported for XPU.") return layer.ipex_fusion( x, - use_grouped_topk, - top_k, + layer.use_grouped_topk, + layer.top_k, router_logits, - renormalize, - topk_group, - num_expert_group, - custom_routing_function=custom_routing_function, + layer.renormalize, + layer.topk_group, + layer.num_expert_group, + custom_routing_function=layer.custom_routing_function, ) def forward_tpu( self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert not use_grouped_topk - assert num_expert_group is None - assert topk_group is None - assert custom_routing_function is None - assert apply_router_weight_on_input is False - if scoring_func != "softmax": + assert not layer.use_grouped_topk + assert layer.num_expert_group is None + assert layer.topk_group is None + assert layer.custom_routing_function is None + assert layer.apply_router_weight_on_input is False + if layer.scoring_func != "softmax": raise NotImplementedError( "Only softmax scoring function is supported for TPU." ) - if e_score_correction_bias is not None: + if layer.e_score_correction_bias is not None: raise NotImplementedError( "Expert score correction bias is not supported for TPU." ) - assert activation == "silu", f"{activation} is not supported for TPU." - assert routed_scaling_factor == 1.0, ( - f"routed_scaling_factor {routed_scaling_factor} is not supported for TPU." + assert layer.activation == "silu", ( + f"{layer.activation} is not supported for TPU." + ) + assert layer.routed_scaling_factor == 1.0, ( + f"routed_scaling_factor {layer.routed_scaling_factor} is " + "not supported for TPU." ) if ( - enable_eplb is not False - or expert_load_view is not None - or logical_to_physical_map is not None - or logical_replica_count is not None + layer.enable_eplb is not False + or layer.expert_load_view is not None + or layer.logical_to_physical_map is not None + or layer.logical_replica_count is not None ): raise NotImplementedError("Expert load balancing is not supported for TPU.") return fused_moe_pallas( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, - topk=top_k, + topk=layer.top_k, gating_output=router_logits, - global_num_experts=global_num_experts, - expert_map=expert_map, - renormalize=renormalize, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + renormalize=layer.renormalize, ) if current_platform.is_tpu(): diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index d463e181fd2db..16aa4f1e22698 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import TYPE_CHECKING, Any, Optional import torch @@ -669,25 +668,8 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", "Only SiLU activation is supported." + assert layer.activation == "silu", "Only SiLU activation is supported." topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -708,9 +690,9 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): input_global_scale1=getattr(layer, "w13_input_global_scale", None), input_global_scale2=getattr(layer, "w2_input_global_scale", None), quant_type_id=self.quant_type.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, w1_zeros=layer.w13_qzeros, w2_zeros=layer.w2_qzeros, workspace=layer.workspace, diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 1e57fa218b797..1fd959cb3423d 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any, Union import torch @@ -498,23 +497,6 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -534,10 +516,10 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 619162272c94f..5ad26f9318df3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum -from collections.abc import Callable from enum import Enum import torch @@ -558,31 +557,14 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", "Only SiLU activation is supported." + assert layer.activation == "silu", "Only SiLU activation is supported." if ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): - if enable_eplb: + if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `CompressedTensorsW4A4MoEMethod` yet." ) @@ -591,12 +573,12 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): layer=layer, x=x, router_logits=router_logits, - top_k=top_k, - global_num_experts=global_num_experts, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - e_score_correction_bias=e_score_correction_bias, + top_k=layer.top_k, + global_num_experts=layer.global_num_experts, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, + custom_routing_function=layer.custom_routing_function, + e_score_correction_bias=layer.e_score_correction_bias, ) topk_weights, topk_ids, _ = layer.select_experts( @@ -619,9 +601,9 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): global_scale1=layer.w13_weight_scale_2, global_scale2=layer.w2_weight_scale_2, quant_type_id=scalar_types.float4_e2m1f.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, workspace=layer.workspace, ) @@ -646,15 +628,15 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): topk_ids=topk_ids, quant_config=self.moe_quant_config, inplace=False, # TODO(shuw): fix later, now output is high prec - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 - assert expert_map is None, ( + assert layer.expert_map is None, ( "Expert Parallelism / expert_map " "is currently not supported for " "CompressedTensorsW4A4Nvfp4MoEMethod." @@ -670,7 +652,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, quant_config=self.moe_quant_config, - apply_router_weight_on_input=apply_router_weight_on_input, + apply_router_weight_on_input=layer.apply_router_weight_on_input, # TODO(bnell): derive these from arguments m=x.shape[0], n=layer.w2_weight.shape[2] * 2, @@ -1188,23 +1170,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -1215,7 +1180,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL if self.use_marlin: - assert activation == "silu", f"{activation} not supported for Marlin MoE." + assert layer.activation == "silu", ( + f"{layer.activation} not supported for Marlin MoE." + ) return fused_marlin_moe( x, layer.w13_weight, @@ -1228,9 +1195,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights, topk_ids, quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, workspace=layer.workspace, ) @@ -1248,9 +1215,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -1270,10 +1237,12 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=None + if self.disable_expert_map + else layer.expert_map, # ??? quant_config=self.moe_quant_config, ) else: @@ -1290,9 +1259,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights, topk_ids, quant_config=self.moe_quant_config, - activation=activation, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=None if self.disable_expert_map else layer.expert_map, ab_strides1=self.ab_strides1_c_strides2, ab_strides2=self.ab_strides2, c_strides1=self.c_strides1, @@ -1314,10 +1283,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -1437,23 +1406,6 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -1469,10 +1421,10 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -1814,25 +1766,10 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", f"{activation} not supported for Marlin MoE." + assert layer.activation == "silu", ( + f"{layer.activation} not supported for Marlin MoE." + ) topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -1853,9 +1790,9 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): input_global_scale1=getattr(layer, "w13_input_global_scale", None), input_global_scale2=getattr(layer, "w2_input_global_scale", None), quant_type_id=self.quant_type.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, g_idx1=layer.w13_weight_g_idx, g_idx2=layer.w2_weight_g_idx, sort_indices1=layer.w13_g_idx_sort_indices, @@ -2057,23 +1994,6 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -2089,10 +2009,10 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -2372,32 +2292,15 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor: - assert not enable_eplb, "EPLB not supported for W4A8-int MoE yet." - assert activation in ("silu", "swigluoai", "swiglu"), ( + assert not layer.enable_eplb, "EPLB not supported for W4A8-int MoE yet." + assert layer.activation in ("silu", "swigluoai", "swiglu"), ( "Only SiLU/SwiGLUGU/SwiGLUUG are supported." ) - assert expert_map is None, """expert_map/EP not implemented + assert layer.expert_map is None, """expert_map/EP not implemented for CPU dyn-4bit MoE.""" def _act_kind(s: str) -> int: @@ -2414,15 +2317,9 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): topk_weights, topk_ids = select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, + top_k=layer.top_k, + use_grouped_topk=layer.use_grouped_topk, + renormalize=layer.renormalize, ) return torch.ops._C.dynamic_4bit_int_moe( @@ -2435,8 +2332,8 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): layer.w2_in_features, layer.w13_out_features, layer.group_size, - apply_router_weight_on_input, - int(_act_kind(activation)), + layer.apply_router_weight_on_input, + int(_act_kind(layer.activation)), ) @@ -2707,28 +2604,11 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ): - if enable_eplb: + if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet." ) @@ -2749,9 +2629,9 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights, topk_ids, quant_config=self.moe_quant_config, - activation=activation, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=None if self.disable_expert_map else layer.expert_map, a_strides1=self.a_strides1_c_strides2, a_strides2=self.a_strides2, b_strides1=self.b_strides1, diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 7ebe40ec84687..11097cf36f5ca 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any, Optional import torch @@ -140,23 +139,6 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -172,10 +154,10 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 419ddd91b64e0..8567f64b936b5 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from enum import Enum from functools import partial from typing import TYPE_CHECKING, Any, Optional @@ -1242,41 +1241,20 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - assert expert_load_view is not None - assert logical_to_physical_map is not None - assert logical_replica_count is not None - assert isinstance(layer, FusedMoE) - if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: - assert activation == "silu", ( - f"Expected 'silu' activation but got {activation}" + if layer.enable_eplb: + raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.") + assert layer.activation == "silu", ( + f"Expected 'silu' activation but got {layer.activation}" ) if self.block_quant: import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 e_score_correction_bias = ( - e_score_correction_bias.to(x.dtype) - if e_score_correction_bias is not None + layer.e_score_correction_bias.to(x.dtype) + if layer.e_score_correction_bias is not None else None ) routing_method_type = layer.routing_method_type @@ -1290,29 +1268,31 @@ class Fp8MoEMethod(FusedMoEMethodBase): w13_weight_scale_inv=layer.w13_weight_scale_inv, w2_weight=layer.w2_weight, w2_weight_scale_inv=layer.w2_weight_scale_inv, - global_num_experts=global_num_experts, - top_k=top_k, - num_expert_group=num_expert_group, - topk_group=topk_group, + global_num_experts=layer.global_num_experts, + top_k=layer.top_k, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, intermediate_size=layer.intermediate_size_per_partition, expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, block_shape=self.weight_block_size, routing_method_type=routing_method_type, - routed_scaling=routed_scaling_factor, + routed_scaling=layer.routed_scaling_factor, ) else: - assert not renormalize and custom_routing_function is not None + assert ( + not layer.renormalize and layer.custom_routing_function is not None + ) result = apply_flashinfer_per_tensor_scale_fp8( layer=layer, hidden_states=x, router_logits=router_logits, - routing_bias=e_score_correction_bias, - global_num_experts=global_num_experts, - top_k=top_k, - num_expert_group=num_expert_group, - topk_group=topk_group, - apply_router_weight_on_input=apply_router_weight_on_input, + routing_bias=layer.e_score_correction_bias, + global_num_experts=layer.global_num_experts, + top_k=layer.top_k, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) select_result = layer.select_experts( @@ -1333,13 +1313,15 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) elif self.use_marlin: - assert activation == "silu", f"{activation} not supported for Marlin MoE." + assert layer.activation == "silu", ( + f"{layer.activation} not supported for Marlin MoE." + ) result = fused_marlin_moe( x, layer.w13_weight, @@ -1352,20 +1334,22 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_weights, topk_ids, quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, workspace=layer.workspace, ) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert activation == "silu", ( - f"Expected 'silu' activation but got {activation}" + assert layer.activation == "silu", ( + f"Expected 'silu' activation but got {layer.activation}" ) if not self.block_quant: - assert not renormalize and custom_routing_function is not None - assert scoring_func == "sigmoid", ( - f"Expected 'sigmoid' scoring func but got {scoring_func}" + assert ( + not layer.renormalize and layer.custom_routing_function is not None + ) + assert layer.scoring_func == "sigmoid", ( + f"Expected 'sigmoid' scoring func but got {layer.scoring_func}" ) # Delegate to CUTLASS FlashInfer path; function already bound with # use_deepseek_fp8_block_scale for block-quant when applicable @@ -1375,10 +1359,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_weights, topk_ids, inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: from vllm.model_executor.layers.fused_moe import fused_experts @@ -1390,10 +1374,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, allow_deep_gemm=self.allow_deep_gemm, allow_cutlass_block_scaled_grouped_gemm=( diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index ee819df292ed1..13aa2bcad21ba 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable, Mapping +from collections.abc import Mapping from types import MappingProxyType from typing import Any, Optional @@ -625,26 +625,9 @@ class GGUFMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", "Only SiLU activation is supported." - if apply_router_weight_on_input: + assert layer.activation == "silu", "Only SiLU activation is supported." + if layer.apply_router_weight_on_input: raise NotImplementedError( "Apply router weight on input is not supported for" "fused GGUF MoE method." @@ -662,7 +645,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): topk_ids, layer.w13_qweight_type.weight_type, layer.w2_qweight_type.weight_type, - activation, + layer.activation, ) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 56034e11329dc..8d1715f52f097 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from copy import deepcopy from typing import Any, Optional @@ -790,25 +789,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", "Only SiLU activation is supported." + assert layer.activation == "silu", "Only SiLU activation is supported." topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -829,9 +811,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): input_global_scale1=getattr(layer, "w13_input_global_scale", None), input_global_scale2=getattr(layer, "w2_input_global_scale", None), quant_type_id=self.quant_type.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, g_idx1=layer.w13_g_idx, g_idx2=layer.w2_g_idx, sort_indices1=layer.w13_g_idx_sort_indices, diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index a1571afba2974..463c74c1c1482 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any, Optional import torch @@ -440,31 +439,14 @@ class XPUFp8MoEMethod(FusedMoEMethodBase): layer: torch.nn.Module, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor: return layer.ipex_fusion( x, - use_grouped_topk, - top_k, + layer.use_grouped_topk, + layer.top_k, router_logits, - renormalize, - topk_group, - num_expert_group, - custom_routing_function=custom_routing_function, + layer.renormalize, + layer.topk_group, + layer.num_expert_group, + custom_routing_function=layer.custom_routing_function, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 034e97a713cdd..e825cb33c3580 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from fnmatch import fnmatch from typing import TYPE_CHECKING, Any, Optional @@ -707,43 +706,27 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `ModelOptFp8MoEMethod` yet." ) - assert activation == "silu", ( - f"Expected 'silu' activation but got {activation}" + assert layer.activation == "silu", ( + f"Expected 'silu' activation but got {layer.activation}" ) - assert not renormalize + + assert not layer.renormalize return apply_flashinfer_per_tensor_scale_fp8( layer=layer, hidden_states=x, router_logits=router_logits, - routing_bias=e_score_correction_bias, - global_num_experts=global_num_experts, - top_k=top_k, - num_expert_group=num_expert_group, - topk_group=topk_group, - apply_router_weight_on_input=apply_router_weight_on_input, + routing_bias=layer.e_score_correction_bias, + global_num_experts=layer.global_num_experts, + top_k=layer.top_k, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) # Expert selection @@ -753,9 +736,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert activation in ("silu", "relu2_no_mul"), ( + assert layer.activation in ("silu", "relu2_no_mul"), ( "Expected activation to be in ('silu', 'relu2_no_mul')," - f"but got {activation}" + f"but got {layer.activation}" ) return flashinfer_cutlass_moe_fp8( x, @@ -763,10 +746,10 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): topk_weights, topk_ids, inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts @@ -780,11 +763,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, + activation=layer.activation, quant_config=self.moe_quant_config, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) @@ -1504,23 +1487,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if not self.moe.is_act_and_mul: assert ( @@ -1535,7 +1501,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): - if enable_eplb: + if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `ModelOptNvFp4FusedMoE` yet." ) @@ -1543,12 +1509,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer=layer, x=x, router_logits=router_logits, - top_k=top_k, - global_num_experts=global_num_experts, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - e_score_correction_bias=e_score_correction_bias, + top_k=layer.top_k, + global_num_experts=layer.global_num_experts, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, + custom_routing_function=layer.custom_routing_function, + e_score_correction_bias=layer.e_score_correction_bias, ) topk_weights, topk_ids, _ = layer.select_experts( @@ -1571,9 +1537,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): global_scale1=layer.w13_weight_scale_2, global_scale2=layer.w2_weight_scale_2, quant_type_id=scalar_types.float4_e2m1f.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, ) @@ -1604,10 +1570,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): topk_ids=topk_ids, quant_config=self.moe_quant_config, inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: # If no modular kernel is provided, use cutlass_moe_fp4 for TP case @@ -1622,8 +1588,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, quant_config=self.moe_quant_config, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, # TODO: derive from arguments m=x.shape[0], n=layer.w2_weight.shape[2] * 2, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 8570b8c339987..0131a330f70d2 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any, Optional import torch @@ -362,27 +361,10 @@ class MoeWNA16Method(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - assert activation == "silu", "Only SiLU activation is supported." + assert layer.activation == "silu", "Only SiLU activation is supported." topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, @@ -395,9 +377,9 @@ class MoeWNA16Method(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 5d330e837eea0..6eae4e9e66e1b 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from enum import Enum from typing import Optional @@ -892,25 +891,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: + if layer.enable_eplb: raise NotImplementedError("EPLB is not supported for mxfp4") if self.mxfp4_backend == Mxfp4Backend.MARLIN: @@ -933,26 +915,26 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): global_scale1=None, global_scale2=None, quant_type_id=scalar_types.float4_e2m1f.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - activation=activation, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + activation=layer.activation, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, ) assert _can_support_mxfp4( - use_grouped_topk, - topk_group, - num_expert_group, - expert_map, - custom_routing_function, - e_score_correction_bias, - apply_router_weight_on_input, - scoring_func, - activation, - expert_load_view, - logical_to_physical_map, - logical_replica_count, + layer.use_grouped_topk, + layer.topk_group, + layer.num_expert_group, + layer.expert_map, + layer.custom_routing_function, + layer.e_score_correction_bias, + layer.apply_router_weight_on_input, + layer.scoring_func, + layer.activation, + layer.expert_load_view, + layer.logical_to_physical_map, + layer.logical_replica_count, ), "MXFP4 are not supported with this configuration." if ( @@ -988,8 +970,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): None, # output1_scale_scalar None, # output1_scale_gate_scalar None, # output2_scale_scalar - global_num_experts, - top_k, + layer.global_num_experts, + layer.top_k, None, # n_group None, # topk_group self.intermediate_size, # padded to multiple of 256 @@ -997,7 +979,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self.num_experts, # local num experts None, None, - 1 if renormalize else 0, # routing_method_type, renormalize + 1 if layer.renormalize else 0, # routing_method_type, renormalize True, # do finalize tune_max_num_tokens=max(self.max_capture_size, 1), )[0] @@ -1081,12 +1063,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): w1=layer.w13_weight, w2=layer.w2_weight, gating_output=router_logits, - topk=top_k, - renormalize=renormalize, - global_num_experts=global_num_experts, - expert_map=expert_map, + topk=layer.top_k, + renormalize=layer.renormalize, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, - apply_router_weight_on_input=apply_router_weight_on_input, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") @@ -1138,37 +1120,20 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod): layer: torch.nn.Module, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor: - assert activation == "swigluoai", ( + assert layer.activation == "swigluoai", ( "Only swiglu_oai activation is supported for IPEX MXFP4 MoE" ) hidden_size_pad = round_up(self.original_hidden_size, 128) x_pad = torch.nn.functional.pad(x, (0, hidden_size_pad - x.size(-1))) hidden_states = layer.ipex_fusion( x_pad, - use_grouped_topk, - top_k, + layer.use_grouped_topk, + layer.top_k, router_logits, - renormalize, - topk_group, - num_expert_group, + layer.renormalize, + layer.topk_group, + layer.num_expert_group, activation="swiglu_oai", ) hidden_states = hidden_states[..., : self.original_hidden_size].contiguous() diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 9e2b2134310fc..d84e22d1fa0f2 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any import torch @@ -337,23 +336,6 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -371,13 +353,15 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, quant_config=self.moe_quant_config, - expert_map=expert_map, + expert_map=layer.expert_map, ) elif self.use_marlin: - assert activation == "silu", f"{activation} not supported for Marlin MoE." + assert layer.activation == "silu", ( + f"{layer.activation} not supported for Marlin MoE." + ) return fused_marlin_moe( x, layer.w13_weight, @@ -390,9 +374,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): topk_weights, topk_ids, quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, ) else: from vllm.model_executor.layers.fused_moe import fused_experts @@ -404,10 +388,10 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -597,23 +581,6 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -631,9 +598,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, + activation=layer.activation, quant_config=self.moe_quant_config, - expert_map=expert_map, + expert_map=layer.expert_map, ) else: from vllm.model_executor.layers.fused_moe import fused_experts @@ -645,10 +612,11 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) + return out diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 7b51b828009fc..b2ecb0b175f81 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -3,7 +3,6 @@ # Copyright © 2025, Oracle and/or its affiliates. import os -from collections.abc import Callable from typing import Any, Optional import numpy as np @@ -359,23 +358,6 @@ class RTNMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -394,9 +376,9 @@ class RTNMoEMethod(FusedMoEMethodBase): topk_weights, topk_ids, quant_type_id=self.quant_config.quant_type.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, workspace=workspace, ) From fccd5325874321f34daef0a8ed4d1b15b26ca34e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 9 Dec 2025 16:54:32 -0500 Subject: [PATCH 124/133] [Quantization] FP8 Weight Reloading for Quantized RL Rollout (#28480) Signed-off-by: Kyle Sayers --- tests/quantization/test_fp8.py | 88 +++++++++++ .../model_executor/layers/quantization/fp8.py | 149 +++++++++--------- .../layers/quantization/kv_cache.py | 7 + .../layers/quantization/utils/fp8_utils.py | 7 +- .../quantization/utils/marlin_utils_fp8.py | 11 +- .../layers/quantization/utils/w8a8_utils.py | 5 +- vllm/model_executor/utils.py | 25 +++ 7 files changed, 206 insertions(+), 86 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7bcac9ad768e7..62203186510ce 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -10,10 +10,14 @@ import torch from tests.quantization.utils import is_quant_method_supported from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.quantization.fp8 import ( + Fp8Config, Fp8KVCacheMethod, Fp8LinearMethod, + Fp8MoEMethod, ) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.platforms import current_platform MODELS = [ @@ -261,3 +265,87 @@ def test_scaled_fp8_quant(dtype) -> None: torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]), inv_scale_nc, dtype ), ) + + +@pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod]) +# FP8 weight reloading does not support online quantization +@pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True]) # skip False +@pytest.mark.parametrize("weight_block_size", [None, [1, 1]]) +# any postprocessing that is applied to the weights such as padding and repacking +# (excluding device sharding) must also be applied to the reloaded weights +# +# this is the case for marlin as well as per-tensor Fp8MoEMethod +@pytest.mark.parametrize("use_marlin", [False]) # skip True +def test_fp8_reloading( + method_cls, is_checkpoint_fp8_serialized, weight_block_size, use_marlin, dist_init +): + if is_checkpoint_fp8_serialized is False: + pytest.skip("FP8 weight reloading does not support online quantization") + + if method_cls is Fp8MoEMethod and weight_block_size is None: + pytest.skip( + "FP8 Tensor weight reloading does not support fusing w13_weight_scale. " + "If this is your use case, consider using a restore function like #26327" + ) + + with torch.device("cuda:0"): + config = Fp8Config( + is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, + weight_block_size=weight_block_size, + ) + + if method_cls is Fp8LinearMethod: + layer = torch.nn.Linear(1, 1) + method = method_cls(config) + method.create_weights( + layer=layer, + input_size_per_partition=1, + output_partition_sizes=[1], + input_size=1, + output_size=1, + params_dtype=torch.bfloat16, + weight_loader=default_weight_loader, + ) + + else: + layer = FusedMoE( + num_experts=1, + top_k=1, + hidden_size=1, + intermediate_size=1, + ) + method = method_cls(config, layer) + method.create_weights( + layer=layer, + num_experts=1, + hidden_size=1, + intermediate_size_per_partition=1, + params_dtype=torch.bfloat16, + weight_loader=default_weight_loader, + ) + + method.use_marlin = use_marlin + + # capture weights format during loading + original_metadata = [ + (name, param.shape, getattr(param, "weight_loader", default_weight_loader)) + for name, param in layer.named_parameters() + ] + + # test loading + for name, shape, _ in original_metadata: + param = getattr(layer, name) + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, torch.zeros(shape)) # cannot use empty + + method.process_weights_after_loading(layer) + + # test reloading works after loading + # assuming that no reshaping occurred + for name, shape, original_weight_loader in original_metadata: + param = getattr(layer, name) + weight_loader = getattr(param, "weight_loader", default_weight_loader) + assert weight_loader is original_weight_loader + weight_loader(param, torch.zeros(shape)) # cannot use empty + + method.process_weights_after_loading(layer) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 8567f64b936b5..60dde9eb57e0f 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -94,7 +94,7 @@ from vllm.model_executor.parameter import ( ModelWeightParameter, PerTensorScaleParameter, ) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils.deep_gemm import ( @@ -548,46 +548,50 @@ class Fp8LinearMethod(LinearMethodBase): assert not self.act_q_static size_k_first = False - weight, weight_scale = process_fp8_weight_block_strategy( + weight, weight_scale_inv = process_fp8_weight_block_strategy( layer.weight, layer.weight_scale_inv ) - # Delete the weight_scale_inv parameter to avoid confusion - # with the weight_scale parameter - del layer.weight_scale_inv + + # Update layer with new values + replace_parameter(layer, "weight", weight.data) + replace_parameter(layer, "weight_scale_inv", weight_scale_inv.data) # If checkpoint not serialized fp8, quantize the weights. - elif not self.quant_config.is_checkpoint_fp8_serialized: - qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) - weight = qweight.t() - - # If checkpoint is fp8 per-tensor, handle that there are N scales for N - # shards in a fused module else: - weight = layer.weight - weight_scale = layer.weight_scale + if not self.quant_config.is_checkpoint_fp8_serialized: + qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + weight = qweight.t() - # If using w8a8, torch._scaled_mm needs per tensor, so - # requantize the logical shards as a single weight. - if not self.use_marlin: - weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy( - weight, - weight_scale, - layer.logical_widths, - getattr(layer, "input_scale", None), - ) - if self.act_q_static: - assert input_scale is not None - input_scale = input_scale.max() - weight = weight.t() + # If checkpoint is fp8 per-tensor, handle that there are N scales for N + # shards in a fused module + else: + weight = layer.weight + weight_scale = layer.weight_scale - # Update layer with new values. - layer.weight = Parameter(weight.data, requires_grad=False) - layer.weight_scale = Parameter(weight_scale.data, requires_grad=False) - layer.input_scale = ( - Parameter(input_scale, requires_grad=False) - if input_scale is not None - else None - ) + # If using w8a8, torch._scaled_mm needs per tensor, so + # requantize the logical shards as a single weight. + if not self.use_marlin: + weight, weight_scale, input_scale = ( + process_fp8_weight_tensor_strategy( + weight, + weight_scale, + layer.logical_widths, + getattr(layer, "input_scale", None), + ) + ) + if self.act_q_static: + assert input_scale is not None + input_scale = input_scale.max() + weight = weight.t() + + # Update layer with new values. + replace_parameter(layer, "weight", weight.data) + replace_parameter(layer, "weight_scale", weight_scale.data) + + if input_scale is not None: + replace_parameter(layer, "input_scale", input_scale) + else: + layer.input_scale = None if self.use_marlin: prepare_fp8_layer_for_marlin( @@ -614,7 +618,7 @@ class Fp8LinearMethod(LinearMethodBase): return self.w8a8_block_fp8_linear.apply( input=x, weight=layer.weight, - weight_scale=layer.weight_scale, + weight_scale=layer.weight_scale_inv, input_scale=layer.input_scale, bias=bias, ) @@ -643,10 +647,15 @@ class Fp8LinearMethod(LinearMethodBase): return torch.nn.functional.linear(x, weight_bf16.t(), bias) if self.use_marlin: + if self.block_quant: + weight_scale = layer.weight_scale_inv + else: + weight_scale = layer.weight_scale + return apply_fp8_marlin_linear( input=x, weight=layer.weight, - weight_scale=layer.weight_scale, + weight_scale=weight_scale, workspace=layer.workspace, size_n=layer.output_size_per_partition, size_k=layer.input_size_per_partition, @@ -660,7 +669,7 @@ class Fp8LinearMethod(LinearMethodBase): return self.w8a8_block_fp8_linear.apply( input=x, weight=layer.weight, - weight_scale=layer.weight_scale, + weight_scale=layer.weight_scale_inv, input_scale=layer.input_scale, bias=bias, ) @@ -937,22 +946,18 @@ class Fp8MoEMethod(FusedMoEMethodBase): w2_weight_scale_inv = layer.w2_weight_scale_inv # torch.compile() cannot use Parameter subclasses. - layer.w13_weight = Parameter(w13_weight, requires_grad=False) - layer.w13_weight_scale_inv = Parameter( - w13_weight_scale_inv, requires_grad=False - ) - layer.w2_weight = Parameter(w2_weight, requires_grad=False) - layer.w2_weight_scale_inv = Parameter( - w2_weight_scale_inv, requires_grad=False - ) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w13_weight_scale_inv", w13_weight_scale_inv) + replace_parameter(layer, "w2_weight", w2_weight) + replace_parameter(layer, "w2_weight_scale_inv", w2_weight_scale_inv) if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight.data, layer.w2_weight.data ) - layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + replace_parameter(layer, "w13_weight", shuffled_w13) + replace_parameter(layer, "w2_weight", shuffled_w2) # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. @@ -990,13 +995,14 @@ class Fp8MoEMethod(FusedMoEMethodBase): # Re-initialize w13_scale because we directly quantize # merged w13 weights and generate a single scaling factor. - layer.w13_weight_scale = torch.nn.Parameter( + replace_parameter( + layer, + "w13_weight_scale", torch.ones( layer.local_num_experts, dtype=torch.float32, device=w13_weight.device, ), - requires_grad=False, ) for expert in range(layer.local_num_experts): w13_weight[expert, :, :], layer.w13_weight_scale[expert] = ( @@ -1005,16 +1011,17 @@ class Fp8MoEMethod(FusedMoEMethodBase): w2_weight[expert, :, :], layer.w2_weight_scale[expert] = ( ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :]) ) - layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w2_weight", w2_weight) + if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight, layer.w2_weight ) - layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + replace_parameter(layer, "w13_weight", shuffled_w13) + replace_parameter(layer, "w2_weight", shuffled_w2) # If checkpoint is fp8, we need to handle that the # MoE kernels require single activation scale and single weight # scale for w13 per expert. @@ -1035,12 +1042,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): "fp8 MoE layer. Using the maximum across experts " "for each layer." ) - layer.w13_input_scale = torch.nn.Parameter( - layer.w13_input_scale.max(), requires_grad=False - ) - layer.w2_input_scale = torch.nn.Parameter( - layer.w2_input_scale.max(), requires_grad=False - ) + replace_parameter(layer, "w13_input_scale", layer.w13_input_scale.max()) + replace_parameter(layer, "w2_input_scale", layer.w2_input_scale.max()) if current_platform.is_fp8_fnuz(): # Normalize the weights and scales w13_weight, w13_weight_scale, w13_input_scale = ( @@ -1054,22 +1057,14 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) ) # Reset the parameter - layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) - layer.w13_weight_scale = torch.nn.Parameter( - w13_weight_scale, requires_grad=False - ) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w13_weight_scale", w13_weight_scale) if w13_input_scale is not None: - layer.w13_input_scale = torch.nn.Parameter( - w13_input_scale, requires_grad=False - ) - layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - layer.w2_weight_scale = torch.nn.Parameter( - w2_weight_scale, requires_grad=False - ) + replace_parameter(layer, "w13_input_scale", w13_input_scale) + replace_parameter(layer, "w2_weight", w2_weight) + replace_parameter(layer, "w2_weight_scale", w2_weight_scale) if w2_input_scale is not None: - layer.w2_input_scale = torch.nn.Parameter( - w2_input_scale, requires_grad=False - ) + replace_parameter(layer, "w2_input_scale", w2_input_scale) # Fp8 moe kernel needs single weight scale for w13 per expert. # We take the max then dequant and requant each expert. @@ -1093,12 +1088,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w13_weight, layer.w2_weight ) - layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + replace_parameter(layer, "w13_weight", shuffled_w13) + replace_parameter(layer, "w2_weight", shuffled_w2) - layer.w13_weight_scale = torch.nn.Parameter( - max_w13_scales, requires_grad=False - ) + replace_parameter(layer, "w13_weight_scale", max_w13_scales) if self.flashinfer_moe_backend is not None: # NOTE: weights have to be swapped since the activation is diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index 78456dcf1ca56..f0497a8722909 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -45,6 +45,13 @@ class BaseKVCacheMethod(QuantizeMethodBase): raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.") def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # skip if there are no weights to process (for example, weight reloading) + if not hasattr(layer, "q_scale"): + assert not hasattr(layer, "k_scale") + assert not hasattr(layer, "v_scale") + assert not hasattr(layer, "prob_scale") + return + # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0 # regardless whether the kv-scale is available in the checkpoint. # No need to process kv scales after loading if we are going to diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 366c5778fa5d9..f5200d7d34891 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -27,6 +27,7 @@ from vllm.model_executor.parameter import ( ChannelQuantScaleParameter, PerTensorScaleParameter, ) +from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import ( @@ -1404,12 +1405,12 @@ def maybe_post_process_fp8_weight_block(layer: torch.nn.Module): if should_use_deepgemm: dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block( wq=layer.weight.data, - ws=layer.weight_scale.data, + ws=layer.weight_scale_inv.data, quant_block_shape=tuple(layer.weight_block_size), use_e8m0=is_deep_gemm_e8m0_used(), ) - layer.weight = torch.nn.Parameter(dg_weight, requires_grad=False) - layer.weight_scale = torch.nn.Parameter(dg_weight_scale, requires_grad=False) + replace_parameter(layer, "weight", dg_weight) + replace_parameter(layer, "weight_scale_inv", dg_weight_scale) def expert_weight_is_col_major(x: torch.Tensor) -> bool: diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index e6b4f567caea4..c67e4f437cf0c 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_quant_input, should_use_atomic_add_reduce, ) +from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform from vllm.scalar_type import scalar_types @@ -130,7 +131,7 @@ def prepare_fp8_layer_for_marlin( size_n=part_size_n, num_bits=8, ) - layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False) + replace_parameter(layer, "weight", marlin_qweight) # WEIGHT SCALES # Permute scales @@ -138,7 +139,6 @@ def prepare_fp8_layer_for_marlin( scales = layer.weight_scale.to(layer.orig_dtype) elif "weight_scale_inv" in dir(layer): scales = layer.weight_scale_inv.to(layer.orig_dtype) - del layer.weight_scale_inv group_size = -1 if weight_block_size is None else weight_block_size[1] @@ -177,12 +177,15 @@ def prepare_fp8_layer_for_marlin( ) if input_dtype != torch.float8_e4m3fn: marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales) - layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False) + if hasattr(layer, "weight_scale"): + replace_parameter(layer, "weight_scale", marlin_scales) + elif hasattr(layer, "weight_scale_inv"): + replace_parameter(layer, "weight_scale_inv", marlin_scales) if hasattr(layer, "bias") and layer.bias is not None: assert layer.bias.shape == (part_size_n,) bias = marlin_permute_bias(layer.bias) - layer.bias = torch.nn.Parameter(bias, requires_grad=False) + replace_parameter(layer, "bias", bias) def prepare_moe_fp8_layer_for_marlin( diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index fceed3e55c2df..4287922417c63 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -118,8 +118,11 @@ def requantize_with_max_scale( # from disk in this case. Skip requantization in this case (since) # we already are quantized with the single scale. # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 + # + # Extra note: upon weight reloading weight_scale.ndim == 0 unfused_module_in_checkpoint = ( - weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min + weight_scale.ndim != 0 + and weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min ) # If unfused checkpoint, need requanize with the single scale. diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 8aad59e84ff25..b89371d987541 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -50,6 +50,31 @@ def set_weight_attrs( setattr(weight, key, value) +def replace_parameter(layer: torch.nn.Module, param_name: str, new_data: torch.Tensor): + """ + Replace a parameter of a layer while maintaining the ability to reload the weight. + Called within implementations of the `process_weights_after_loading` method. + + This function should not be called on weights which are tied/shared + + Args: + layer: Layer containing parameter to replace + param_name: Name of parameter to replace + new_data: New data of the new parameter + """ + # should not be used on a tied/shared param + if isinstance(new_data, torch.nn.Parameter): + new_data = new_data.data + new_param = torch.nn.Parameter(new_data, requires_grad=False) + + old_param: torch.nn.Parameter | None = getattr(layer, param_name, None) + if old_param is not None and hasattr(old_param, "weight_loader"): + weight_loader = old_param.weight_loader + set_weight_attrs(new_param, {"weight_loader": weight_loader}) + + setattr(layer, param_name, new_param) + + def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: parent_map = getattr(model, "packed_modules_mapping", None) parent_map = copy.deepcopy(parent_map) if parent_map is not None else {} From 3c680f4a17057d7994af8fbb1dc8c2d98307c890 Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Tue, 9 Dec 2025 16:39:26 -0600 Subject: [PATCH 125/133] [Rocm][torch.compile] Adding layernorm + fp8 block quant and silu + fp8 block quant for Aiter (#25693) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: charlifu Signed-off-by: Micah Williamson Signed-off-by: Charlie Fu Co-authored-by: Micah Williamson Co-authored-by: wuhuikx Co-authored-by: Luka Govedič Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> --- tests/compile/test_fusion.py | 98 ++++++- tests/compile/test_silu_mul_quant_fusion.py | 62 ++++- vllm/_aiter_ops.py | 214 ++++++++++++---- vllm/compilation/pass_manager.py | 11 + vllm/compilation/rocm_aiter_fusion.py | 242 ++++++++++++++++++ .../layers/quantization/utils/fp8_utils.py | 43 +++- 6 files changed, 610 insertions(+), 60 deletions(-) create mode 100644 vllm/compilation/rocm_aiter_fusion.py diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 2ad34a79859a3..6b72c595cd779 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -1,10 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools + import pytest import torch import vllm.plugins +from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops from vllm.compilation.fusion import FUSED_OPS, FusedRMSQuantKey, RMSNormQuantFusionPass from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.matcher_utils import QUANT_OPS @@ -152,13 +155,79 @@ GROUP_SHAPES = [ ] +class TestRmsnormGroupFp8QuantModel(torch.nn.Module): + def __init__(self, hidden_size: int, eps: float, **kwargs): + super().__init__() + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(128, 128), + act_quant_group_shape=GroupShape(1, 128), + cutlass_block_fp8_supported=False, + use_aiter_and_is_supported=True, + ) + self.w = [ + torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() + for _ in range(3) + ] + + scale_hidden_size = (hidden_size + 128 - 1) // 128 + self.wscale = [ + torch.rand((scale_hidden_size, scale_hidden_size), dtype=torch.float32) + for _ in range(3) + ] + + self.norm_weight = [torch.ones(hidden_size) for _ in range(4)] + self.eps = eps + + def forward(self, x): + # avoid having graph input be an arg to a pattern directly + x = resid = torch.relu(x) + y = rocm_aiter_ops.rms_norm(x, self.norm_weight[0], self.eps) + + x2 = self.w8a8_block_fp8_linear.apply(y, self.w[0], self.wscale[0]) + # make sure resid is used for replacement to work + y2, resid = rocm_aiter_ops.rms_norm2d_with_add( + x2, resid, self.norm_weight[1], self.eps + ) + + x3 = self.w8a8_block_fp8_linear.apply(y2, self.w[1], self.wscale[1]) + + y3, resid = rocm_aiter_ops.rms_norm2d_with_add( + x3, resid, self.norm_weight[2], self.eps + ) + + x4 = self.w8a8_block_fp8_linear.apply(y3, self.w[2], self.wscale[2]) + + y4, resid = rocm_aiter_ops.rms_norm2d_with_add( + x4, resid, self.norm_weight[3], self.eps + ) + return y4 + + def ops_in_model_before(self): + return [ + torch.ops.vllm.rocm_aiter_rms_norm, + torch.ops.vllm.rocm_aiter_group_fp8_quant, + ] + + def ops_in_model_before_partial(self): + return [] + + def ops_in_model_after(self): + return [ + torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant, + torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant, + ] + + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("hidden_size", [256]) @pytest.mark.parametrize("num_tokens", [257]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("group_shape", GROUP_SHAPES) -@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False]) -@pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False]) +@pytest.mark.parametrize( + "model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op", + list(itertools.product([TestModel], [True, False], [True, False])) + + [(TestRmsnormGroupFp8QuantModel, False, False)], +) # cuda_force_torch used to test torch code path on platforms that # cutlass_fp8_supported() == True. @pytest.mark.parametrize( @@ -173,10 +242,14 @@ def test_fusion_rmsnorm_quant( num_tokens, eps, group_shape, + model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op, cuda_force_torch, ): + if model_class is TestRmsnormGroupFp8QuantModel and not IS_AITER_FOUND: + pytest.skip("AITER is not supported on this GPU.") + torch.set_default_device("cuda") torch.set_default_dtype(dtype) torch.manual_seed(1) @@ -209,12 +282,24 @@ def test_fusion_rmsnorm_quant( with vllm.config.set_current_vllm_config(vllm_config): # Reshape pass is needed for the fusion pass to work noop_pass = NoOpEliminationPass(vllm_config) - fusion_pass = RMSNormQuantFusionPass(vllm_config) + if model_class is TestRmsnormGroupFp8QuantModel: + from vllm.compilation.rocm_aiter_fusion import ( + RocmAiterRMSNormFp8GroupQuantFusionPass, + ) + + fusion_pass = RocmAiterRMSNormFp8GroupQuantFusionPass(vllm_config) + else: + fusion_pass = RMSNormQuantFusionPass(vllm_config) cleanup_pass = PostCleanupPass(vllm_config) backend = TestBackend(noop_pass, fusion_pass, cleanup_pass) backend2 = TestBackend(noop_pass, cleanup_pass) - model = TestModel(hidden_size, eps, group_shape, cuda_force_torch) + model = model_class( + hidden_size=hidden_size, + eps=eps, + group_shape=group_shape, + cuda_force_torch=cuda_force_torch, + ) # First dimension dynamic x = torch.rand(num_tokens, hidden_size) torch._dynamo.mark_dynamic(x, 0) @@ -243,7 +328,10 @@ def test_fusion_rmsnorm_quant( # there's a risk that the fused add doesn't get included in the # replacement and only the rms part gets fused with quant. # Hence, we check only 2 add nodes are left (final fused rmsnorm add). - if not enable_rms_norm_custom_op: + if ( + not enable_rms_norm_custom_op + and model_class is not TestRmsnormGroupFp8QuantModel + ): n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g)) # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each) assert n_add_nodes(backend.graph_pre_pass) == 7 diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index c336a45955cb5..eb0dee8d4e399 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -7,6 +7,7 @@ import torch import vllm.envs as envs from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor +from vllm._aiter_ops import IS_AITER_FOUND from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.compilation.activation_quant_fusion import ( FUSED_OPS, @@ -24,6 +25,7 @@ from vllm.config import ( set_current_vllm_config, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, kFp8StaticTensorSym, @@ -126,6 +128,39 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): return [FUSED_OPS[kNvfp4Quant]] +class TestSiluMulGroupFp8QuantModel(torch.nn.Module): + def __init__(self, hidden_size: int, **kwargs): + super().__init__() + self.silu_and_mul = SiluAndMul() + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(128, 128), + act_quant_group_shape=GroupShape(1, 128), + cutlass_block_fp8_supported=False, + use_aiter_and_is_supported=True, + ) + self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() + + scale_hidden_size = (hidden_size + 128 - 1) // 128 + self.wscale = torch.rand( + (scale_hidden_size, scale_hidden_size), dtype=torch.float32 + ) + + self.enable_silu_mul_custom_op = self.silu_and_mul.enabled() + + def forward(self, x): + y = self.silu_and_mul(x) + x2 = self.w8a8_block_fp8_linear.apply(y, self.w, self.wscale) + return x2 + + def ops_in_model_before(self): + return [ + SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul, + ] + + def ops_in_model_after(self): + return [torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant] + + @pytest.mark.parametrize("num_tokens", [32, 64]) @pytest.mark.parametrize("hidden_size", [128, 256]) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @@ -133,7 +168,10 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): @pytest.mark.parametrize( "model_class, enable_quant_fp8_custom_op, cuda_force_torch", list(itertools.product([TestSiluMulFp8QuantModel], [True, False], [True, False])) - + [(TestSiluMulNvfp4QuantModel, False, False)], + + [ + (TestSiluMulNvfp4QuantModel, False, False), + (TestSiluMulGroupFp8QuantModel, False, False), + ], ) # cuda_force_torch used to test torch code path on platforms that # cutlass_fp8_supported() == True. @@ -144,13 +182,19 @@ def test_fusion_silu_and_mul_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, - model_class: type[TestSiluMulFp8QuantModel | TestSiluMulNvfp4QuantModel], + model_class: type[ + TestSiluMulFp8QuantModel + | TestSiluMulNvfp4QuantModel + | TestSiluMulGroupFp8QuantModel + ], enable_silu_mul_custom_op: bool, enable_quant_fp8_custom_op: bool, cuda_force_torch: bool, ): if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported(): pytest.skip("NVFP4 is not supported on this GPU.") + if model_class is TestSiluMulGroupFp8QuantModel and not IS_AITER_FOUND: + pytest.skip("AITER is not supported on this GPU.") torch.set_default_device("cuda") torch.set_default_dtype(dtype) @@ -173,9 +217,15 @@ def test_fusion_silu_and_mul_quant( ) with set_current_vllm_config(config): - fusion_pass = ActivationQuantFusionPass(config) + fusion_passes = [ActivationQuantFusionPass(config)] + if IS_AITER_FOUND: + from vllm.compilation.rocm_aiter_fusion import ( + RocmAiterSiluMulFp8GroupQuantFusionPass, + ) - passes = [NoOpEliminationPass(config), fusion_pass, PostCleanupPass(config)] + fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] + + passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] backend = TestBackend(*passes) model = model_class( hidden_size=hidden_size, cuda_force_torch=cuda_force_torch, x=x @@ -194,12 +244,14 @@ def test_fusion_silu_and_mul_quant( atol, rtol = 1e-3, 1e-3 elif model_class == TestSiluMulNvfp4QuantModel: atol, rtol = 1e-1, 1e-1 + elif model_class == TestSiluMulGroupFp8QuantModel: + atol, rtol = 5e-2, 5e-2 torch.testing.assert_close( result[0].to(dtype=dtype), result2[0].to(dtype=dtype), atol=atol, rtol=rtol ) - assert fusion_pass.matched_count == 1 + assert sum([p.matched_count for p in fusion_passes]) == 1 # In pre-nodes, quant op should be present and fused kernels should not backend.check_before_ops(model.ops_in_model_before()) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 94bbc9b00225e..010817e79a936 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -24,6 +24,15 @@ def is_aiter_found() -> bool: # we keep this global outside to not cause torch compile breaks. IS_AITER_FOUND = is_aiter_found() +# Can't use dtypes.fp8 directly inside an op +# because it returns wrong result on gfx942. +# This is a workaround to get the correct FP8 dtype. +# This might because that the get_gfx() is wrapped as a custom op. +if IS_AITER_FOUND: + from aiter import dtypes + + AITER_FP8_DTYPE = dtypes.fp8 + def if_aiter_supported(func: Callable) -> Callable: """Decorator that only executes the function if @@ -45,36 +54,6 @@ def if_aiter_supported(func: Callable) -> Callable: return wrapper -def _rocm_aiter_group_fp8_quant_impl( - x: torch.Tensor, - group_size: int, -) -> tuple[torch.Tensor, torch.Tensor]: - assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size" - from aiter import QuantType, dtypes, get_hip_quant - - aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) - return aiter_per1x128_quant(x.contiguous(), quant_dtype=dtypes.fp8) - - -def _rocm_aiter_group_fp8_quant_fake( - x: torch.Tensor, - group_size: int, -) -> tuple[torch.Tensor, torch.Tensor]: - from aiter import dtypes - - M, N = x.shape - x_fp8 = torch.empty((M, N), dtype=dtypes.fp8, device=x.device) - out_bs = torch.empty( - ( - M, - (N + group_size - 1) // group_size, - ), - dtype=torch.float32, - device=x.device, - ) - return x_fp8, out_bs - - def _rocm_aiter_fused_moe_impl( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -522,6 +501,142 @@ def _rocm_aiter_per_token_quant_fake( ) +def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + from aiter.ops.triton.fused_fp8_quant import fused_rms_fp8_group_quant + + (x_quant, x_quant_scales), _, _, res = fused_rms_fp8_group_quant( + x, + weight, + variance_epsilon, + None, + None, + None, + group_size=group_size, + dtype_quant=AITER_FP8_DTYPE, + res1=residual, + ) + return (x_quant, x_quant_scales, res) + + +def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + M, N = x.shape + scale_shape = (M, (N + group_size - 1) // group_size) + return ( + torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device), + torch.empty(scale_shape, dtype=torch.float32, device=x.device), + torch.empty_like(residual, device=residual.device), + ) + + +def _rocm_aiter_rmsnorm_fp8_group_quant_impl( + x: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.triton.fused_fp8_quant import fused_rms_fp8_group_quant + + (x_quant, x_quant_scales), _, _, res = fused_rms_fp8_group_quant( + x, + weight, + variance_epsilon, + None, + None, + None, + group_size=group_size, + dtype_quant=AITER_FP8_DTYPE, + res1=None, + ) + return (x_quant, x_quant_scales) + + +def _rocm_aiter_rmsnorm_fp8_group_quant_fake( + x: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + M, N = x.shape + scale_shape = (M, (N + group_size - 1) // group_size) + return ( + torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device), + torch.empty(scale_shape, dtype=torch.float32, device=x.device), + ) + + +def _rocm_aiter_group_fp8_quant_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size" + from aiter import QuantType, get_hip_quant + + aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) + return aiter_per1x128_quant(x.contiguous(), quant_dtype=AITER_FP8_DTYPE) + + +def _rocm_aiter_group_fp8_quant_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + M, N = x.shape + x_fp8 = torch.empty((M, N), dtype=AITER_FP8_DTYPE, device=x.device) + out_bs = torch.empty( + ( + M, + (N + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + +def _rocm_aiter_act_mul_and_fp8_group_quant_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.triton.activation import act_mul_and_fp8_group_quant + + return act_mul_and_fp8_group_quant( + x, + activation="silu", + group_size=group_size, + dtype_quant=AITER_FP8_DTYPE, + ) + + +def _rocm_aiter_act_mul_and_fp8_group_quant_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + M, N = x.shape + assert N % 2 == 0 + N_half = N // 2 + x_fp8 = torch.empty((M, N_half), dtype=AITER_FP8_DTYPE, device=x.device) + out_bs = torch.empty( + ( + M, + (N_half + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + # Global flag to ensure ops are registered only once _OPS_REGISTERED = False @@ -557,7 +672,7 @@ class rocm_aiter_ops: @if_aiter_supported def is_linear_fp8_enaled(cls) -> bool: """ "Verifies device specs and availability of env variable.""" - return cls.is_linear_enabled() and current_platform.is_fp8_fnuz() + return cls.is_linear_enabled() @classmethod @if_aiter_supported @@ -632,14 +747,6 @@ class rocm_aiter_ops: ) # register all the custom ops here - direct_register_custom_op( - op_name="rocm_aiter_group_fp8_quant", - op_func=_rocm_aiter_group_fp8_quant_impl, - mutates_args=[], - fake_impl=_rocm_aiter_group_fp8_quant_fake, - dispatch_key=current_platform.dispatch_key, - ) - direct_register_custom_op( op_name="rocm_aiter_asm_moe_tkw1", op_func=_rocm_aiter_asm_moe_tkw1_impl, @@ -699,27 +806,46 @@ class rocm_aiter_ops: direct_register_custom_op( op_name="rocm_aiter_gemm_a8w8_blockscale", op_func=_rocm_aiter_gemm_a8w8_blockscale_impl, - mutates_args=[], fake_impl=_rocm_aiter_gemm_a8w8_blockscale_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( op_name="rocm_aiter_rms_norm", op_func=_rocm_aiter_rms_norm_impl, - mutates_args=[], fake_impl=_rocm_aiter_rms_norm_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( op_name="rocm_aiter_rmsnorm2d_fwd_with_add", op_func=_rocm_aiter_rmsnorm2d_fwd_with_add_impl, - mutates_args=[], fake_impl=_rocm_aiter_rmsnorm2d_fwd_with_add_fake, dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_rmsnorm_fp8_group_quant", + op_func=_rocm_aiter_rmsnorm_fp8_group_quant_impl, + fake_impl=_rocm_aiter_rmsnorm_fp8_group_quant_fake, + ) + + direct_register_custom_op( + op_name="rocm_aiter_rmsnorm_with_add_fp8_group_quant", + op_func=_rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl, + fake_impl=_rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake, + ) + + direct_register_custom_op( + op_name="rocm_aiter_act_mul_and_fp8_group_quant", + op_func=_rocm_aiter_act_mul_and_fp8_group_quant_impl, + fake_impl=_rocm_aiter_act_mul_and_fp8_group_quant_fake, + ) + + direct_register_custom_op( + op_name="rocm_aiter_group_fp8_quant", + op_func=_rocm_aiter_group_fp8_quant_impl, + fake_impl=_rocm_aiter_group_fp8_quant_fake, + ) + direct_register_custom_op( op_name="rocm_aiter_per_tensor_quant", op_func=_rocm_aiter_per_tensor_quant_impl, diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 6848bfb6a3c53..4ebb386f75ed8 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -5,6 +5,7 @@ import functools from torch import fx as fx from vllm import envs +from vllm._aiter_ops import rocm_aiter_ops from vllm.config import VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform @@ -13,6 +14,12 @@ from vllm.utils.system_utils import set_env_var from .post_cleanup import PostCleanupPass from .vllm_inductor_pass import VllmInductorPass +if rocm_aiter_ops.is_enabled(): + from vllm.compilation.rocm_aiter_fusion import ( + RocmAiterRMSNormFp8GroupQuantFusionPass, + RocmAiterSiluMulFp8GroupQuantFusionPass, + ) + if current_platform.is_cuda_alike(): from .activation_quant_fusion import ActivationQuantFusionPass from .fusion import RMSNormQuantFusionPass @@ -109,8 +116,12 @@ class PostGradPassManager(CustomGraphPass): if self.pass_config.fuse_norm_quant: self.passes += [RMSNormQuantFusionPass(config)] + if rocm_aiter_ops.is_enabled(): + self.passes += [RocmAiterRMSNormFp8GroupQuantFusionPass(config)] if self.pass_config.fuse_act_quant: self.passes += [ActivationQuantFusionPass(config)] + if rocm_aiter_ops.is_enabled(): + self.passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] if self.pass_config.fuse_attn_quant: self.passes += [AttnFusionPass(config)] diff --git a/vllm/compilation/rocm_aiter_fusion.py b/vllm/compilation/rocm_aiter_fusion.py new file mode 100644 index 0000000000000..8b5db9de38181 --- /dev/null +++ b/vllm/compilation/rocm_aiter_fusion.py @@ -0,0 +1,242 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +import torch +import torch._inductor.pattern_matcher as pm +from torch import fx +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch._ops import OpOverload + +import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401 +from vllm.compilation.activation_quant_fusion import ActivationQuantPattern +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.platforms import current_platform + +from .fusion import empty_bf16 +from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherSiluAndMul +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass + +logger = init_logger(__name__) +FP8_DTYPE = current_platform.fp8_dtype() + +AITER_RMS_GROUP_QUANT_OP = torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default +AITER_RMS_ADD_GROUP_QUANT_OP = ( + torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default +) + +AITER_RMS_OP = torch.ops.vllm.rocm_aiter_rms_norm.default +AITER_RMS_ADD_OP = torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default + +AITER_GROUP_FP8_QUANT_OP = torch.ops.vllm.rocm_aiter_group_fp8_quant.default +TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default + +FUSED_SILU_MUL_QUANT_OP = torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default + + +class AiterRMSFp8GroupQuantPattern: + """ + This pattern fuses aiter rms_norm & group fp8 quant custom + ops into an aiter rms_norm_group_fp8_quant op. + """ + + def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload): + self.epsilon = epsilon + self.quant_dtype = quant_dtype + self.quant_op = quant_op + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + weight: torch.Tensor, + ): + at1 = AITER_RMS_OP(x=input, weight=weight, variance_epsilon=self.epsilon) + + at2 = self.quant_op(at1, 128) + + return at2[0], at2[1] + + def replacement( + input: torch.Tensor, + weight: torch.Tensor, + ): + at = AITER_RMS_GROUP_QUANT_OP( + x=input, + weight=weight, + variance_epsilon=self.epsilon, + group_size=128, + ) + + return at[0], at[1] + + inputs = [ + empty_bf16(5, 4), # input + empty_bf16(1, 5), # weight + ] + + pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + + +class AiterFusedAddRMSFp8GroupQuantPattern: + """ + This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops + into a aiter rms_norm_with_add_group_fp8_quant op. + """ + + def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload): + self.epsilon = epsilon + self.quant_dtype = quant_dtype + self.quant_op = quant_op + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + ): + at1 = AITER_RMS_ADD_OP( + x=input, + residual=residual, + weight=weight, + variance_epsilon=self.epsilon, + ) + + at2 = self.quant_op(at1[0], 128) + + # result, scale, residual + return at2[0], at2[1], at1[1] + + def replacement( + input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + ): + at = AITER_RMS_ADD_GROUP_QUANT_OP( + x=input, + residual=residual, + weight=weight, + variance_epsilon=self.epsilon, + group_size=128, + ) + + # result, scale, residual + return at[0], at[1], at[2] + + inputs = [ + empty_bf16(5, 4), # input + empty_bf16(5, 4), # residual + empty_bf16(1, 5), # weight + ] + + pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + + +class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass): + """ + This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op. + It also supports fused_add_rms_norm. + """ + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="rocm_aiter_rms_norm_fp8_group_quant_fusion_pass" + ) + + # Make sure fused add patterns are before simple rms norm, + # as the latter is a subset of the former in torch ops + for epsilon in [1e-5, 1e-6]: + # Fuse rms_norm + dynamic group fp8 quant + for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]: + AiterRMSFp8GroupQuantPattern(epsilon, FP8_DTYPE, quant_op).register( + self.patterns + ) + + AiterFusedAddRMSFp8GroupQuantPattern( + epsilon, FP8_DTYPE, quant_op + ).register(self.patterns) + + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph): + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) + + def uuid(self) -> Any: + fusion_patterns = [ + AiterRMSFp8GroupQuantPattern, + AiterFusedAddRMSFp8GroupQuantPattern, + ] + return self.hash_source(self, *fusion_patterns) + + +class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): + """ + This pattern fuses aiter silu_and_mul & group fp8 quant custom + ops into an aiter silu_and_mul_group_fp8_quant op. + """ + + def __init__(self, quant_op: OpOverload): + self.silu_and_mul_matcher = MatcherSiluAndMul() + self.quant_op = quant_op + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + ): + at1 = self.silu_and_mul_matcher(input) + at2 = self.quant_op(at1, 128) + return at2[0], at2[1] + + def replacement( + input: torch.Tensor, + ): + at = FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128) + return at[0], at[1] + + inputs = [ + self.silu_and_mul_matcher.inputs()[0], + ] + + pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + + +class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass): + """ + This pass fuses a pre-defined set of custom ops into fused ops. + It uses the torch pattern matcher to find the patterns and replace them. + + Because patterns can only be registered once, the pass is a singleton. + This will be addressed in a future version of PyTorch: + https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 + """ + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass" + ) + + for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]: + AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns) + + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log + def __call__(self, graph: torch.fx.Graph): + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) + + def uuid(self): + fusion_patterns = [ + ActivationQuantPattern, + AiterSiluMulFp8GroupQuantPattern, + ] + return VllmInductorPass.hash_source(self, *fusion_patterns) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index f5200d7d34891..b459d5947863b 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -196,6 +196,39 @@ direct_register_custom_op( ) +def _triton_per_token_group_quant_fp8_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + return per_token_group_quant_fp8( + x, group_size, column_major_scales=False, use_ue8m0=False + ) + + +def _triton_per_token_group_quant_fp8_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + M, N = x.shape + x_fp8 = torch.empty((M, N), dtype=current_platform.fp8_dtype(), device=x.device) + out_bs = torch.empty( + ( + M, + (N + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + +direct_register_custom_op( + "triton_per_token_group_quant_fp8", + _triton_per_token_group_quant_fp8_impl, + fake_impl=_triton_per_token_group_quant_fp8_fake, +) + + # TODO fix ROCm->Triton custom path: # https://github.com/vllm-project/vllm/issues/14397 class W8A8BlockFp8LinearOp: @@ -341,17 +374,15 @@ class W8A8BlockFp8LinearOp: if input_scale is not None: q_input = input_2d - # MI350 case uses triton kernel elif use_triton: - q_input, input_scale = per_token_group_quant_fp8( + q_input, input_scale = torch.ops.vllm.triton_per_token_group_quant_fp8( input_2d, self.act_quant_group_shape.col, - column_major_scales=False, - use_ue8m0=False, ) - # MI300 uses tuned AITER ASM/C++ kernel else: - q_input, input_scale = rocm_aiter_ops.group_fp8_quant(input_2d) + q_input, input_scale = rocm_aiter_ops.group_fp8_quant( + input_2d, self.act_quant_group_shape.col + ) return gemm_a8w8_blockscale_op( q_input, From 2e7054da065504a4786d251f4c5bd099a9ddab86 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:51:32 -0800 Subject: [PATCH 126/133] Improve wvsplitK tile and balance heristics. (#29937) Signed-off-by: Hashem Hashemi --- csrc/rocm/skinny_gemms.cu | 97 +++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index 2ef579a1b7537..8ebe55cef391d 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx, } #endif // defined(__HIP__GFX9__) TODO: Add NAVI support +// Find the min val of div2 that doesn't increase N/(div1*div2) int mindiv(int N, int div1, int div2) { int nPrRnd = div1 * div2; - int rnds0 = N / nPrRnd; - nPrRnd -= div1 * 3; - int rnds3 = N / nPrRnd; - nPrRnd -= div1; - int rnds4 = N / nPrRnd; - nPrRnd -= div1; - int rnds5 = N / nPrRnd; - nPrRnd -= div1; - int rnds6 = N / nPrRnd; - nPrRnd -= div1; - int rnds7 = N / nPrRnd; - nPrRnd -= div1; - int rnds8 = N / nPrRnd; - nPrRnd -= div1; - int rnds9 = N / nPrRnd; - nPrRnd -= div1; - int rtn = div2; - if (rnds0 == rnds3) rtn = div2 - 3; - if (rnds0 == rnds4) rtn = div2 - 4; - if (rnds0 == rnds5) rtn = div2 - 5; - if (rnds0 == rnds6) rtn = div2 - 6; - if (rnds0 == rnds7) rtn = div2 - 7; - if (rnds0 == rnds8) rtn = div2 - 8; - if (rnds0 == rnds9) rtn = div2 - 9; - return rtn; + int rnds[13]; + for (int i = 0; i < 13; i++) { + rnds[i] = (N + nPrRnd - 1) / nPrRnd; + nPrRnd -= div1; + } + for (int i = 12; i >= 0; i--) + if (rnds[0] == rnds[i]) return (div2 - i); } torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, @@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const int max_lds_len = get_lds_size() / 2; -#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ - _N) \ - { \ - dim3 block(64, _WvPrGrp); \ - if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ - int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ - wvSplitK_hf_sml_ \ - <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ - biasf4, c, __wvPrGrp, CuCount); \ - } else if (K_in * N_in <= max_lds_len * 1.2) { \ - int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ - wvSplitK_hf_ \ - <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ - biasf4, c, __wvPrGrp, CuCount); \ - } else { \ - int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \ - wvSplitK_hf_big_ \ - <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ - biasf4, c, __wvPrGrp, CuCount); \ - } \ +#define WVSPLITK(_YTILE, _UNRL, _N) \ + { \ + dim3 block(64, 16); \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \ + if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \ + wvSplitK_hf_sml_ \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ + else if (K_in * N_in <= max_lds_len * 1.2) \ + wvSplitK_hf_ \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ + else \ + wvSplitK_hf_big_ \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ + } + +#define WVSPLIT_TILE(_sYT, __N) \ + { \ + bool fit_lds = (K_in * N_in <= max_lds_len); \ + if (_sYT <= 1) \ + WVSPLITK(1, 4, __N) \ + else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \ + WVSPLITK(2, 2, __N) \ + else if (_sYT <= 4 * 3) \ + WVSPLITK(3, 2, __N) \ + else if (__N == 4) \ + WVSPLITK(4, 1, __N) \ + else \ + WVSPLITK(4, 2, __N) \ } AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] { @@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, ? reinterpret_cast(in_bias->data_ptr()) : nullptr; fptype* c = reinterpret_cast(out_c.data_ptr()); + + // first shoot for biggest tile-size that keeps all simd busy, + // then cut the active waves to balance their distribution... + int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4); + switch (N_in) { case 1: - WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1) + WVSPLIT_TILE(sYT, 1) break; case 2: - WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2) + WVSPLIT_TILE(sYT, 2) break; case 3: - WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3) + WVSPLIT_TILE(sYT, 3) break; case 4: - WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4) + WVSPLIT_TILE(sYT, 4) break; default: throw std::runtime_error( From 03b5f940fdcff25024ce5d37c357d770344a8f20 Mon Sep 17 00:00:00 2001 From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com> Date: Wed, 10 Dec 2025 08:15:01 +0800 Subject: [PATCH 127/133] [V1][Spec Decode] Optimize Medusa proposer to avoid GPU-CPU sync (#29723) Signed-off-by: dongbo910220 <1275604947@qq.com> --- vllm/v1/spec_decode/medusa.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index 12b903ccaca97..989478f348161 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -38,16 +38,16 @@ class MedusaProposer: self, target_hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> list[list[int]]: + ) -> torch.Tensor: # Generate blocks and compute logits blocks = self.model(target_hidden_states) logits = self.model.compute_logits(blocks) - # Get draft tokens and transpose the result - # TODO(woosuk): OPTIMIZATION: Return GPU tensor without GPU-CPU - # synchronization. - draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits] - return [list(row) for row in zip(*draft_tokens)] + # Compute argmax for each Medusa head and stack into a single tensor + # Shape: [batch_size, num_heads] + draft_tokens = torch.stack([logit.argmax(dim=-1) for logit in logits], dim=1) + + return draft_tokens def load_model(self, target_model: nn.Module) -> None: from vllm.compilation.backends import set_model_tag From 4c2e10ea19b9053924d66f30f3d7121fbd9684f8 Mon Sep 17 00:00:00 2001 From: PatrykSaffer Date: Wed, 10 Dec 2025 01:47:07 +0100 Subject: [PATCH 128/133] [Bugfix] Fix cuda graph sizes when running with speculative decoding (#30330) Signed-off-by: Patryk Saffer Signed-off-by: PatrykSaffer Co-authored-by: Patryk Saffer --- vllm/config/vllm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 614a3226cb711..8f27db0013305 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1047,8 +1047,14 @@ class VllmConfig: self.compilation_config.max_cudagraph_capture_size ) if max_cudagraph_capture_size is None: + decode_query_len = 1 + if ( + self.speculative_config + and self.speculative_config.num_speculative_tokens + ): + decode_query_len += self.speculative_config.num_speculative_tokens max_cudagraph_capture_size = min( - self.scheduler_config.max_num_seqs * 2, 512 + self.scheduler_config.max_num_seqs * decode_query_len * 2, 512 ) max_num_tokens = self.scheduler_config.max_num_batched_tokens max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size) From 2e7035dd8cc2e6c907873462b4ac0bb9f08e0abb Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Wed, 10 Dec 2025 02:17:25 +0100 Subject: [PATCH 129/133] [Bugfix] Fix fp8 DeepGemm compilation issues (#30336) --- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index b459d5947863b..e12fe61bf3d97 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -31,7 +31,6 @@ from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import ( - DeepGemmQuantScaleFMT, fp8_gemm_nt, is_deep_gemm_e8m0_used, is_deep_gemm_supported, @@ -248,6 +247,7 @@ class W8A8BlockFp8LinearOp: self.act_quant_group_shape = act_quant_group_shape self.is_deep_gemm_supported = is_deep_gemm_supported() self.is_hopper = current_platform.is_device_capability(90) + self.is_blackwell = current_platform.is_device_capability(100) self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used() # Get the correct blockscale mul and input quant operations. @@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp: weight: torch.Tensor, weight_scale: torch.Tensor, ) -> torch.Tensor: - if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0: + if self.use_deep_gemm_e8m0 and self.is_blackwell: q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm( input_2d, group_size=self.act_quant_group_shape.col, From abe93bce5952ed8adf90d4b77af6ed3515958620 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 9 Dec 2025 20:18:10 -0500 Subject: [PATCH 130/133] [Attention] Make seq_lens_cpu optional in CommonAttentionMetadata to enable true async spec-decode (#29624) Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Co-authored-by: Benjamin Chislett --- tests/v1/attention/utils.py | 4 +- tests/v1/e2e/test_async_spec_decode.py | 131 ++++++++++++++++++++ tests/v1/spec_decode/test_tree_attention.py | 4 +- vllm/attention/layers/cross_attention.py | 2 +- vllm/v1/attention/backends/gdn_attn.py | 2 +- vllm/v1/attention/backends/utils.py | 66 +++++++--- vllm/v1/spec_decode/eagle.py | 20 +-- vllm/v1/worker/gpu/attn_utils.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 4 +- 9 files changed, 200 insertions(+), 37 deletions(-) create mode 100644 tests/v1/e2e/test_async_spec_decode.py diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 6cab129c116c5..4dcaf9d908690 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -106,8 +106,8 @@ def create_common_attn_metadata( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=batch_spec.batch_size, num_actual_tokens=num_tokens, max_query_len=max_query_len, diff --git a/tests/v1/e2e/test_async_spec_decode.py b/tests/v1/e2e/test_async_spec_decode.py new file mode 100644 index 0000000000000..561f37a52d573 --- /dev/null +++ b/tests/v1/e2e/test_async_spec_decode.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test that verifies no implicit GPU-CPU synchronization occurs during +speculative decoding generation under expected conditions. +""" + +import multiprocessing +import sys +import traceback + +import pytest +import torch + + +@pytest.fixture +def sync_tracker(): + """ + Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect + lazy init syncs. Prints stack traces immediately when syncs occur. + """ + from vllm.v1.attention.backends.utils import CommonAttentionMetadata + + # Shared counter for cross-process communication (inherited by fork) + sync_count = multiprocessing.Value("i", 0) + + # Save original property + original_prop = CommonAttentionMetadata.seq_lens_cpu + original_fget = original_prop.fget + + # Create tracking wrapper + def tracking_seq_lens_cpu(self): + if self._seq_lens_cpu is None: + # Increment counter + with sync_count.get_lock(): + sync_count.value += 1 + count = sync_count.value + # Print stack trace immediately (shows in subprocess output) + print(f"\n{'=' * 60}", file=sys.stderr) + print(f"SYNC #{count}: seq_lens_cpu lazy init triggered!", file=sys.stderr) + print(f"{'=' * 60}", file=sys.stderr) + traceback.print_stack(file=sys.stderr) + print(f"{'=' * 60}\n", file=sys.stderr) + sys.stderr.flush() + return original_fget(self) + + # Apply patch + CommonAttentionMetadata.seq_lens_cpu = property(tracking_seq_lens_cpu) + + class SyncTracker: + @property + def count(self) -> int: + return sync_count.value + + def assert_no_sync(self, msg: str = ""): + count = sync_count.value + assert count == 0, ( + f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered " + f"{count} times. See stack traces above. {msg}" + ) + + yield SyncTracker() + + # Restore original property + CommonAttentionMetadata.seq_lens_cpu = original_prop + torch._dynamo.reset() + + +# Test configurations: (model, spec_model, method, num_spec_tokens, backend_env) +SPEC_DECODE_CONFIGS = [ + pytest.param( + "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama3_2_1B_speculator.eagle3", + "eagle3", + 2, + id="eagle3-llama", + ), + pytest.param( + "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", + "eagle", + 2, + id="eagle-mla-deepseek", + ), +] + + +@pytest.mark.parametrize( + "model,spec_model,method,num_spec_tokens", + SPEC_DECODE_CONFIGS, +) +def test_no_sync_with_spec_decode( + sync_tracker, + model: str, + spec_model: str, + method: str, + num_spec_tokens: int, +): + """ + Test that no implicit GPU-CPU sync occurs during speculative decoding + generation. + """ + # Import vLLM AFTER sync_tracker fixture has applied the patch + from vllm import LLM, SamplingParams + from vllm.distributed import cleanup_dist_env_and_memory + + llm = LLM( + model=model, + max_model_len=256, + speculative_config={ + "method": method, + "num_speculative_tokens": num_spec_tokens, + "model": spec_model, + }, + enforce_eager=True, + async_scheduling=True, + ) + + outputs = llm.generate( + ["Hello, my name is"], + SamplingParams(temperature=0, max_tokens=10), + ) + + assert len(outputs) == 1 + assert len(outputs[0].outputs[0].text) > 0 + + del llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + sync_tracker.assert_no_sync() diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index a4ee53008ce82..0afeeb8914b87 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -88,8 +88,8 @@ def forward_attention( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc.cpu(), seq_lens=seq_lens, - seq_lens_cpu=seq_lens.cpu(), - num_computed_tokens_cpu=context_lens.cpu(), + _seq_lens_cpu=seq_lens.cpu(), + _num_computed_tokens_cpu=context_lens.cpu(), num_reqs=batch_size, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py index 068fd0a0eb7d0..cfd203bdd37b9 100644 --- a/vllm/attention/layers/cross_attention.py +++ b/vllm/attention/layers/cross_attention.py @@ -103,7 +103,7 @@ def create_cross_attention_backend( # needed here to know how many tokens to attend to from the cached # cross-attention KV cache. new_metadata.seq_lens = common_attn_metadata.encoder_seq_lens - new_metadata.seq_lens_cpu = torch.from_numpy( + new_metadata._seq_lens_cpu = torch.from_numpy( common_attn_metadata.encoder_seq_lens_cpu ) diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index e921f8c3de073..3a2f92d9921c3 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -370,6 +370,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] num_accepted_tokens = torch.diff(m.query_start_loc) num_decode_draft_tokens_cpu = (num_accepted_tokens - 1).cpu() - m.num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu() + m._num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu() return self.build(0, m, num_accepted_tokens, num_decode_draft_tokens_cpu) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 5200bc48b3156..79a1f7d4757d9 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -18,7 +18,7 @@ from typing import ( import numpy as np import torch -from typing_extensions import runtime_checkable +from typing_extensions import deprecated, runtime_checkable from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.utils.math_utils import cdiv @@ -66,11 +66,6 @@ class CommonAttentionMetadata: """(batch_size + 1,), the start location of each request in query Tensor""" seq_lens: torch.Tensor - seq_lens_cpu: torch.Tensor - """(batch_size,), the length of each request including both computed tokens - and newly scheduled tokens""" - - num_computed_tokens_cpu: torch.Tensor """(batch_size,), the number of computed tokens for each request""" num_reqs: int @@ -81,7 +76,7 @@ class CommonAttentionMetadata: max_query_len: int """Longest query in batch""" max_seq_len: int - """Longest context length in batch""" + """Longest context length (may be an upper bound)""" block_table_tensor: torch.Tensor slot_mapping: torch.Tensor @@ -100,6 +95,40 @@ class CommonAttentionMetadata: dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" + # WARNING: Deprecated fields. Will be removed in a future release (v0.14.0) + _seq_lens_cpu: torch.Tensor | None = None + _num_computed_tokens_cpu: torch.Tensor | None = None + + @property + @deprecated( + """ + Prefer using device seq_lens directly to avoid implicit H<>D sync. + If a CPU copy is needed, use `seq_lens.cpu()` instead. + Will be removed in a future release (v0.14.0) + """ + ) + def seq_lens_cpu(self) -> torch.Tensor: + if self._seq_lens_cpu is None: + self._seq_lens_cpu = self.seq_lens.to("cpu") + return self._seq_lens_cpu + + @property + @deprecated( + """ + Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full + async scheduling. If a CPU copy is needed, it can be derived from + query_start_loc_cpu and seq_lens. + Will be removed in a future release (v0.14.0) + """ + ) + def num_computed_tokens_cpu(self) -> torch.Tensor: + if self._num_computed_tokens_cpu is None: + query_seq_lens = ( + self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1] + ) + self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens + return self._num_computed_tokens_cpu + # TODO(lucas): remove once we have FULL-CG spec-decode support def unpadded( self, num_actual_tokens: int, num_actual_reqs: int @@ -109,8 +138,12 @@ class CommonAttentionMetadata: query_start_loc=self.query_start_loc[: num_actual_reqs + 1], query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1], seq_lens=self.seq_lens[:num_actual_reqs], - seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs], - num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs], + _seq_lens_cpu=self._seq_lens_cpu[:num_actual_reqs] + if self._seq_lens_cpu is not None + else None, + _num_computed_tokens_cpu=self._num_computed_tokens_cpu[:num_actual_reqs] + if self._num_computed_tokens_cpu is not None + else None, num_reqs=num_actual_reqs, num_actual_tokens=num_actual_tokens, max_query_len=self.max_query_len, @@ -224,14 +257,14 @@ def _make_metadata_with_slice( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=num_requests, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, ) @@ -689,9 +722,7 @@ def make_local_attention_virtual_batches( return CommonAttentionMetadata( query_start_loc_cpu=query_start_loc_cpu, query_start_loc=query_start_loc_cpu.to(device=device, non_blocking=True), - seq_lens_cpu=seq_lens_cpu, seq_lens=seq_lens_cpu.to(device=device, non_blocking=True), - num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local), num_reqs=len(seq_lens_cpu), num_actual_tokens=common_attn_metadata.num_actual_tokens, max_query_len=seqlens_q_local.max(), @@ -699,6 +730,8 @@ def make_local_attention_virtual_batches( block_table_tensor=block_table_local, slot_mapping=common_attn_metadata.slot_mapping, causal=True, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local), ) @@ -719,7 +752,6 @@ def make_kv_sharing_fast_prefill_common_attn_metadata( logits_indices = logits_indices_padded[:num_logits_indices] num_reqs = common_attn_metadata.num_reqs query_start_loc = common_attn_metadata.query_start_loc - seq_lens = common_attn_metadata.seq_lens # Example inputs # num_reqs: 3 # generation_indices: [14, 18, 19, 27] @@ -748,9 +780,7 @@ def make_kv_sharing_fast_prefill_common_attn_metadata( common_attn_metadata = CommonAttentionMetadata( query_start_loc=decode_query_start_loc, query_start_loc_cpu=decode_query_start_loc.to("cpu", non_blocking=True), - seq_lens=seq_lens, - seq_lens_cpu=seq_lens.to("cpu", non_blocking=True), - num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu, + seq_lens=common_attn_metadata.seq_lens, num_reqs=num_reqs, num_actual_tokens=total_num_decode_tokens, max_query_len=decode_max_query_len, @@ -758,6 +788,8 @@ def make_kv_sharing_fast_prefill_common_attn_metadata( block_table_tensor=common_attn_metadata.block_table_tensor, slot_mapping=common_attn_metadata.slot_mapping, causal=True, + _seq_lens_cpu=common_attn_metadata._seq_lens_cpu, + _num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu, ) return common_attn_metadata diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 9f7859a5c3565..4cc78ae9d23ae 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -440,16 +440,16 @@ class EagleProposer: # of main model. # Increment the sequence lengths. common_attn_metadata.seq_lens += 1 - # This is an out-of-place operation to avoid modifying the original tensor. - common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1 # For the requests that exceed the max model length, we set the # sequence length to 1 to minimize their overheads in attention. - common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1) - common_attn_metadata.num_computed_tokens_cpu = ( - common_attn_metadata.seq_lens_cpu - 1 - ) + # Also update the CPU-side shadow; NOTE: this is hacky and should be + # removed in when common_attn_metadata.seq_lens_cpu is deprecated. + if common_attn_metadata._seq_lens_cpu is not None: + common_attn_metadata._seq_lens_cpu += 1 + if common_attn_metadata._num_computed_tokens_cpu is not None: + common_attn_metadata._num_computed_tokens_cpu += 1 # Compute the slot mapping. if self.uses_mrope: @@ -656,8 +656,8 @@ class EagleProposer: query_start_loc=common_attn_metadata.query_start_loc, seq_lens=common_attn_metadata.seq_lens, query_start_loc_cpu=query_start_loc_cpu, - seq_lens_cpu=common_attn_metadata.seq_lens_cpu, - num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu, + _seq_lens_cpu=common_attn_metadata._seq_lens_cpu, + _num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu, num_reqs=common_attn_metadata.num_reqs, num_actual_tokens=total_num_tokens, max_query_len=new_query_len_per_req.max().item(), @@ -932,8 +932,8 @@ class EagleProposer: query_start_loc=new_query_start_loc_cpu.to(device, non_blocking=True), seq_lens=new_seq_lens_cpu.to(device, non_blocking=True), query_start_loc_cpu=new_query_start_loc_cpu, - seq_lens_cpu=new_seq_lens_cpu, - num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu, + _seq_lens_cpu=new_seq_lens_cpu, + _num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu, num_reqs=common_attn_metadata.num_reqs, num_actual_tokens=total_num_tokens, max_query_len=new_query_len_per_req.max().item(), diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py index 5aa1a33d851cc..6386f1a08b446 100644 --- a/vllm/v1/worker/gpu/attn_utils.py +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -168,9 +168,9 @@ def build_attn_metadata( query_start_loc=query_start_loc_gpu, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, + _seq_lens_cpu=seq_lens_cpu, max_seq_len=max_seq_len, - num_computed_tokens_cpu=num_computed_tokens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=num_reqs, num_actual_tokens=num_tokens, max_query_len=max_query_len, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7398defd74a38..f6f89d6eb6736 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1626,8 +1626,8 @@ class GPUModelRunner( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, num_actual_tokens=num_tokens_padded, num_reqs=num_reqs_padded, max_query_len=max_query_len, From c3487aca3425f532730c3433cfbd44e880fce2a8 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Tue, 9 Dec 2025 18:13:13 -0800 Subject: [PATCH 131/133] [responsesAPI][6] Fix multi turn MCP tokenization (#30230) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- tests/entrypoints/test_responses_utils.py | 52 ++++++++++++++++--- vllm/entrypoints/constants.py | 2 + vllm/entrypoints/context.py | 6 ++- vllm/entrypoints/openai/serving_engine.py | 1 + vllm/entrypoints/responses_utils.py | 62 +++++++++++++++++++++-- 5 files changed, 110 insertions(+), 13 deletions(-) diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py index 3951bd4840085..a522967111307 100644 --- a/tests/entrypoints/test_responses_utils.py +++ b/tests/entrypoints/test_responses_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall from openai.types.responses.response_function_tool_call_output_item import ( ResponseFunctionToolCallOutputItem, ) @@ -14,7 +15,8 @@ from openai.types.responses.response_reasoning_item import ( ) from vllm.entrypoints.responses_utils import ( - construct_chat_message_with_tool_call, + _construct_single_message_from_response_item, + construct_chat_messages_with_tool_call, convert_tool_responses_to_completions_format, ) @@ -42,7 +44,43 @@ class TestResponsesUtils: assert result == {"type": "function", "function": input_tool} - def test_construct_chat_message_with_tool_call(self): + def test_construct_chat_messages_with_tool_call(self): + """Test construction of chat messages with tool calls.""" + reasoning_item = ResponseReasoningItem( + id="lol", + summary=[], + type="reasoning", + content=[ + Content( + text="Leroy Jenkins", + type="reasoning_text", + ) + ], + encrypted_content=None, + status=None, + ) + mcp_tool_item = ResponseFunctionToolCall( + id="mcp_123", + call_id="call_123", + type="function_call", + status="completed", + name="python", + arguments='{"code": "123+456"}', + ) + input_items = [reasoning_item, mcp_tool_item] + messages = construct_chat_messages_with_tool_call(input_items) + + assert len(messages) == 1 + message = messages[0] + assert message["role"] == "assistant" + assert message["reasoning"] == "Leroy Jenkins" + assert message["tool_calls"][0]["id"] == "call_123" + assert message["tool_calls"][0]["function"]["name"] == "python" + assert ( + message["tool_calls"][0]["function"]["arguments"] == '{"code": "123+456"}' + ) + + def test_construct_single_message_from_response_item(self): item = ResponseReasoningItem( id="lol", summary=[], @@ -56,7 +94,7 @@ class TestResponsesUtils: encrypted_content=None, status=None, ) - formatted_item = construct_chat_message_with_tool_call(item) + formatted_item = _construct_single_message_from_response_item(item) assert formatted_item["role"] == "assistant" assert formatted_item["reasoning"] == "Leroy Jenkins" @@ -74,7 +112,7 @@ class TestResponsesUtils: status=None, ) - formatted_item = construct_chat_message_with_tool_call(item) + formatted_item = _construct_single_message_from_response_item(item) assert formatted_item["role"] == "assistant" assert ( formatted_item["reasoning"] @@ -88,7 +126,7 @@ class TestResponsesUtils: output="1234", status="completed", ) - formatted_item = construct_chat_message_with_tool_call(tool_call_output) + formatted_item = _construct_single_message_from_response_item(tool_call_output) assert formatted_item["role"] == "tool" assert formatted_item["content"] == "1234" assert formatted_item["tool_call_id"] == "temp" @@ -102,7 +140,7 @@ class TestResponsesUtils: status=None, ) with pytest.raises(ValueError): - construct_chat_message_with_tool_call(item) + _construct_single_message_from_response_item(item) output_item = ResponseOutputMessage( id="msg_bf585bbbe3d500e0", @@ -119,6 +157,6 @@ class TestResponsesUtils: type="message", ) - formatted_item = construct_chat_message_with_tool_call(output_item) + formatted_item = _construct_single_message_from_response_item(output_item) assert formatted_item["role"] == "assistant" assert formatted_item["content"] == "dongyi" diff --git a/vllm/entrypoints/constants.py b/vllm/entrypoints/constants.py index b5bcccc35d6c8..5726ee0735d4c 100644 --- a/vllm/entrypoints/constants.py +++ b/vllm/entrypoints/constants.py @@ -8,3 +8,5 @@ Shared constants for vLLM entrypoints. # These constants help mitigate header abuse attacks H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB H11_MAX_HEADER_COUNT_DEFAULT = 256 + +MCP_PREFIX = "mcp_" diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 01ddab473723b..c70eaaa082fe5 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -19,6 +19,7 @@ from vllm import envs from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, ) +from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.openai.parser.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, @@ -303,7 +304,7 @@ class ParsableContext(ConversationContext): result_str = result.content[0].text message = ResponseFunctionToolCallOutputItem( - id=f"fco_{random_uuid()}", + id=f"mcpo_{random_uuid()}", type="function_call_output", call_id=f"call_{random_uuid()}", output=result_str, @@ -385,6 +386,9 @@ class ParsableContext(ConversationContext): if not self.parser.response_messages: return [] last_msg = self.parser.response_messages[-1] + # change this to a mcp_ function call + last_msg.id = f"{MCP_PREFIX}{random_uuid()}" + self.parser.response_messages[-1] = last_msg if last_msg.name == "code_interpreter": return await self.call_python_tool(self._tool_sessions["python"], last_msg) elif last_msg.name == "web_search_preview": diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 99936f588f28b..44b0f1842a6c1 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1339,6 +1339,7 @@ class OpenAIServing: ) engine_prompt = engine_prompts[0] request_prompt = request_prompts[0] + prompt_text, _, _ = self._get_prompt_components(request_prompt) # Update the sampling params. sampling_params.max_tokens = self.max_model_len - len( diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index fbc137bac4543..99080fa43cb8e 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -22,6 +22,7 @@ from openai.types.responses.response_reasoning_item import ResponseReasoningItem from openai.types.responses.tool import Tool from vllm import envs +from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.openai.protocol import ( ChatCompletionMessageParam, ResponseInputOutputItem, @@ -44,13 +45,13 @@ def make_response_output_items_from_parsable_context( ) if isinstance(output_messages[-1], ResponseFunctionToolCall): mcp_message = McpCall( - id=f"mcp_{random_uuid()}", + id=f"{MCP_PREFIX}{random_uuid()}", arguments=output_messages[-1].arguments, name=output_messages[-1].name, server_label=output_messages[ -1 ].name, # TODO: store the server label - type="mcp_call", + type=f"{MCP_PREFIX}call", status="completed", output=message.output, # TODO: support error output @@ -98,12 +99,63 @@ def construct_input_messages( if isinstance(request_input, str): messages.append({"role": "user", "content": request_input}) else: - for item in request_input: - messages.append(construct_chat_message_with_tool_call(item)) + input_messages = construct_chat_messages_with_tool_call(request_input) + messages.extend(input_messages) return messages -def construct_chat_message_with_tool_call( +def _maybe_combine_reasoning_and_tool_call( + item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam] +) -> ChatCompletionMessageParam | None: + """Many models treat MCP calls and reasoning as a single message. + This function checks if the last message is a reasoning message and + the current message is a tool call""" + if not ( + isinstance(item, ResponseFunctionToolCall) and item.id.startswith(MCP_PREFIX) + ): + return None + if len(messages) == 0: + return None + last_message = messages[-1] + if not ( + last_message.get("role") == "assistant" + and last_message.get("reasoning") is not None + ): + return None + + last_message["tool_calls"] = [ + ChatCompletionMessageToolCallParam( + id=item.call_id, + function=FunctionCallTool( + name=item.name, + arguments=item.arguments, + ), + type="function", + ) + ] + return last_message + + +def construct_chat_messages_with_tool_call( + input_messages: list[ResponseInputOutputItem], +) -> list[ChatCompletionMessageParam]: + """This function wraps _construct_single_message_from_response_item + Because some chatMessages come from multiple response items + for example a reasoning item and a MCP tool call are two response items + but are one chat message + """ + messages: list[ChatCompletionMessageParam] = [] + for item in input_messages: + maybe_combined_message = _maybe_combine_reasoning_and_tool_call(item, messages) + if maybe_combined_message is not None: + messages[-1] = maybe_combined_message + else: + messages.append(_construct_single_message_from_response_item(item)) + + return messages + + +def _construct_single_message_from_response_item( item: ResponseInputOutputItem, ) -> ChatCompletionMessageParam: if isinstance(item, ResponseFunctionToolCall): From b75f826fca4febb17a76c12a45d5e315111c7618 Mon Sep 17 00:00:00 2001 From: rasmith Date: Tue, 9 Dec 2025 20:28:37 -0600 Subject: [PATCH 132/133] [CI/Build][AMD] Skip quantization kernels tests that require CUTLASS or e4m3fn when not supported by platform (#30020) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/quantization/test_block_fp8.py | 17 ++++++++++++++--- .../quantization/test_cutlass_scaled_mm.py | 3 +++ tests/kernels/quantization/test_cutlass_w4a8.py | 3 +++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index d0e4f6554a91f..32c77b9a01ece 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -54,6 +54,10 @@ def setup_cuda(): torch.set_default_device("cuda") +@pytest.mark.skipif( + current_platform.is_fp8_fnuz(), + reason="This platform supports e4m3fnuz, not e4m3fn.", +) @pytest.mark.parametrize( "num_tokens,d,dtype,group_size,seed", itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS), @@ -78,14 +82,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) factor_for_scale = 1e-2 - fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_info = torch.finfo(current_platform.fp8_dtype()) fp8_max, fp8_min = fp8_info.max, fp8_info.min A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max - A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype()) B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max - B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype()) block_n, block_k = block_size[0], block_size[1] n_tiles = (N + block_n - 1) // block_n @@ -103,6 +107,9 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="CUTLASS only supported on CUDA platform." +) @torch.inference_mode() def test_w8a8_block_fp8_cutlass_matmul(): # Test simple case where weight.shape % 128 != 0, @@ -151,6 +158,10 @@ def test_w8a8_block_fp8_cutlass_matmul(): assert rel_diff < 0.001 +@pytest.mark.skipif( + current_platform.is_fp8_fnuz(), + reason="This platform supports e4m3fnuz, not e4m3fn.", +) @pytest.mark.parametrize( "M,N,K,block_size,out_dtype,seed", itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS), diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index de595b0a34e46..bc4744df7e69e 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -15,6 +15,9 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv +if not current_platform.is_cuda(): + pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True) + MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py index cccef28f5e931..8cfc993fe8e82 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8.py +++ b/tests/kernels/quantization/test_cutlass_w4a8.py @@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +if not current_platform.is_cuda(): + pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True) + # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel # unit tests to a common utility function. Currently the use of # `is_quant_method_supported` conflates kernels with quantization methods From 7d80c73d4277187d0468f15a22bba959ce853261 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Tue, 9 Dec 2025 20:35:49 -0600 Subject: [PATCH 133/133] [CI] Reduce Flakiness For test_spec_decode.py::test_suffix_decoding_acceptance (#30367) Signed-off-by: Micah Williamson --- tests/v1/e2e/test_spec_decode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 416b582dfaa63..8c904a8cddac4 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -191,8 +191,8 @@ def test_suffix_decoding_acceptance( # Expect the acceptance rate to improve. assert first_accept_rate < last_accept_rate - # Heuristic: expect at least 82.5% acceptance rate at the end. - assert last_accept_rate > 0.825 + # Heuristic: expect at least 80.0% acceptance rate at the end. + assert last_accept_rate > 0.80 del spec_llm torch.cuda.empty_cache()