From c2c661af9be413fb22adc59fc17fe5f5a680b313 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 25 Nov 2025 04:38:36 -0800 Subject: [PATCH 01/24] [Bugfix] Fix overallocation in MM profiling (#29386) Signed-off-by: Roger Wang --- vllm/v1/worker/gpu_model_runner.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0ce6c4a3204b0..e78d3c71af77a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4245,14 +4245,18 @@ class GPUModelRunner( # NOTE: This happens when encoder cache needs to store # the embeddings that encoder outputs are scattered onto. # In this case we create dummy embeddings of size - # (encode_budget, hidden_size) and scatter encoder - # output into it. + # (max_tokens_for_modality, hidden_size) and scatter + # encoder output into it. encoder_output_shape = dummy_encoder_outputs[0].shape - if encoder_output_shape[0] < encoder_budget: + max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[ + dummy_modality + ] + if encoder_output_shape[0] < max_mm_tokens_per_item: + encoder_hidden_size = encoder_output_shape[-1] expanded_outputs = [] for output in dummy_encoder_outputs: expanded = output.new_zeros( - (encoder_budget, encoder_output_shape[-1]) + (max_mm_tokens_per_item, encoder_hidden_size) ) num_tokens = output.shape[0] expanded[:num_tokens].copy_(output) From bf0c75cd4f638b359f52602bb0fd54ef434068cb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Nov 2025 12:41:15 +0000 Subject: [PATCH 02/24] Make Transformers Nightly tests soft-fail and enable all tests (#29401) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e444becd9867b..10a19c52c72dc 100644 --- 
a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -903,11 +903,12 @@ steps: - label: Transformers Nightly Models Test working_dir: "/vllm-workspace/" optional: true + soft_fail: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_transformers.py - # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl From 51fc9e017a721c7fb283cecf3231bbe6e358132b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Nov 2025 12:55:42 +0000 Subject: [PATCH 03/24] Scheduled removal of `CompilationConfig.use_inductor` (#29323) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/compile/fullgraph/test_simple.py | 12 +++++------ tests/engine/test_arg_utils.py | 9 ++++---- tests/utils_/test_argparse_utils.py | 4 ++-- vllm/config/compilation.py | 29 +------------------------- 4 files changed, 13 insertions(+), 41 deletions(-) diff --git a/tests/compile/fullgraph/test_simple.py b/tests/compile/fullgraph/test_simple.py index e258133ab50a7..36cc1510ed798 100644 --- a/tests/compile/fullgraph/test_simple.py +++ b/tests/compile/fullgraph/test_simple.py @@ -55,7 +55,7 @@ class SillyModel(nn.Module): def _run_simple_model( splitting_ops, use_inductor_graph_partition, - use_inductor, + backend, expected_num_piecewise_graphs_seen, expected_num_piecewise_capturable_graphs_seen, expected_num_backend_compilations, @@ -64,7 +64,7 @@ def _run_simple_model( vllm_config = 
VllmConfig( compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - use_inductor=use_inductor, + backend=backend, splitting_ops=splitting_ops, use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_copy_inputs=True, @@ -124,14 +124,14 @@ def _run_simple_model( assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) -@pytest.mark.parametrize("use_inductor", [True, False]) +@pytest.mark.parametrize("backend", ["inductor", "eager"]) @torch.inference_mode() @create_new_process_for_each_test("spawn") -def test_simple_piecewise_compile(use_inductor): +def test_simple_piecewise_compile(backend): _run_simple_model( splitting_ops=["silly::attention"], use_inductor_graph_partition=False, - use_inductor=use_inductor, + backend=backend, # 2 * num_layers + 1 expected_num_piecewise_graphs_seen=5, # 1 + num_layers @@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch): _run_simple_model( splitting_ops=["silly::attention"], use_inductor_graph_partition=True, - use_inductor=True, + backend="inductor", # Since not splitting at fx graph level expected_num_piecewise_graphs_seen=1, # Since not splitting at fx graph level diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 10827e3b4b9cd..93bc94123aaa9 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -249,14 +249,13 @@ def test_compilation_config(): args = parser.parse_args( [ "-O", - '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' - '"use_inductor": false}', + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}', ] ) assert ( args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] - and not args.compilation_config.use_inductor + and args.compilation_config.backend == "eager" ) # set to string form of a dict @@ -264,13 +263,13 @@ def test_compilation_config(): [ "--compilation-config=" '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], 
' - '"use_inductor": true}', + '"backend": "inductor"}', ] ) assert ( args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] - and args.compilation_config.use_inductor + and args.compilation_config.backend == "inductor" ) diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py index 3310753d2b6d6..32d4eca541356 100644 --- a/tests/utils_/test_argparse_utils.py +++ b/tests/utils_/test_argparse_utils.py @@ -166,7 +166,7 @@ def test_dict_args(parser): "--hf-overrides.key2.key4", "val3", # Test compile config and compilation mode - "-O.use_inductor=true", + "-O.use_inductor_graph_partition=true", "-O.backend", "custom", "-O1", @@ -219,7 +219,7 @@ def test_dict_args(parser): } assert parsed_args.compilation_config == { "mode": 1, - "use_inductor": True, + "use_inductor_graph_partition": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], } diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 556b2d9168b32..865d045676d14 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -264,7 +264,6 @@ class CompilationConfig: - [`cudagraph_copy_inputs`] [vllm.config.CompilationConfig.cudagraph_copy_inputs] - Inductor compilation: - - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] - [`inductor_compile_config`] [vllm.config.CompilationConfig.inductor_compile_config] @@ -348,7 +347,7 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. + disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor". 
Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None """A list of ops to exclude from cudagraphs, used in piecewise compilation. @@ -374,24 +373,6 @@ class CompilationConfig: Disabled by default until more models are supported/tested to work.""" # Inductor capture - use_inductor: bool | None = None - """ - Whether to use inductor compilation. - - This flag is deprecated and will be removed in the next release 0.12.0. - Please use the 'backend' option instead. - - - False: inductor compilation is not used. graph runs in eager - (custom_ops enabled by default). - - True: inductor compilation is used (custom_ops disabled by default). - One graph for symbolic shape and one graph per size in compile_sizes - are compiled using configurations in inductor_compile_config. - - This setting is ignored if mode Date: Tue, 25 Nov 2025 15:05:10 +0100 Subject: [PATCH 04/24] [Bugfix] Fix logic for choosing default prefix caching setting (#29393) Signed-off-by: Thomas Parnell --- tests/engine/test_arg_utils.py | 3 ++- vllm/engine/arg_utils.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 93bc94123aaa9..be926764e4948 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -277,8 +277,9 @@ def test_prefix_cache_default(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) args = parser.parse_args([]) + # should be None by default (depends on model). engine_args = EngineArgs.from_cli_args(args=args) - assert engine_args.enable_prefix_caching, "prefix caching should default to on." + assert engine_args.enable_prefix_caching is None # with flag to turn it on. 
args = parser.parse_args(["--enable-prefix-caching"]) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6b5c8ba87ecbf..177915715140f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -880,7 +880,11 @@ class EngineArgs: "--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"] ) cache_group.add_argument( - "--enable-prefix-caching", **cache_kwargs["enable_prefix_caching"] + "--enable-prefix-caching", + **{ + **cache_kwargs["enable_prefix_caching"], + "default": None, + }, ) cache_group.add_argument( "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"] From 0231ce836a27ad0ed722e4b7162821baeeb19fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> Date: Tue, 25 Nov 2025 15:23:38 +0100 Subject: [PATCH 05/24] Revert back to torch.equal over torch.allclose from #28819 (#29086) Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> --- vllm/v1/spec_decode/eagle.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 784ccbc04932f..7b9037c03d4f0 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1055,11 +1055,11 @@ class EagleProposer: elif ( isinstance(target_embed_tokens.weight, torch.Tensor) and isinstance(self.model.model.embed_tokens.weight, torch.Tensor) - and torch.allclose( + # TODO: Offload to CPU for comparison to avoid extra GPU memory + # usage in CI testing environments with limited GPU memory + and torch.equal( target_embed_tokens.weight.cpu(), self.model.model.embed_tokens.weight.cpu(), - rtol=1e-5, - atol=1e-7, ) ): share_embeddings = True @@ -1105,8 +1105,11 @@ class EagleProposer: hasattr(target_language_model, "lm_head") and isinstance(target_language_model.lm_head.weight, torch.Tensor) and isinstance(self.model.lm_head.weight, torch.Tensor) + # TODO: Offload to CPU for comparison 
to avoid extra GPU memory + # usage in CI testing environments with limited GPU memory and torch.equal( - target_language_model.lm_head.weight, self.model.lm_head.weight + target_language_model.lm_head.weight.cpu(), + self.model.lm_head.weight.cpu(), ) ): share_lm_head = True From 794029f012066e6039b63566ef7d845af1ade831 Mon Sep 17 00:00:00 2001 From: Injae Ryou Date: Tue, 25 Nov 2025 23:28:53 +0900 Subject: [PATCH 06/24] [Feature]: Improve GGUF loading from HuggingFace user experience like repo_id:quant_type (#29137) Signed-off-by: Injae Ryou Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- tests/models/test_gguf_download.py | 240 ++++++++++++++++++ tests/transformers_utils/test_utils.py | 146 +++++++++++ vllm/config/model.py | 15 +- vllm/engine/arg_utils.py | 6 +- .../model_loader/gguf_loader.py | 31 ++- .../model_loader/weight_utils.py | 46 ++++ vllm/transformers_utils/config.py | 51 +++- vllm/transformers_utils/processor.py | 10 +- vllm/transformers_utils/tokenizer.py | 17 +- vllm/transformers_utils/utils.py | 53 ++++ 10 files changed, 579 insertions(+), 36 deletions(-) create mode 100644 tests/models/test_gguf_download.py diff --git a/tests/models/test_gguf_download.py b/tests/models/test_gguf_download.py new file mode 100644 index 0000000000000..155768ac9bff7 --- /dev/null +++ b/tests/models/test_gguf_download.py @@ -0,0 +1,240 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock, patch + +import pytest + +from vllm.config import ModelConfig +from vllm.config.load import LoadConfig +from vllm.model_executor.model_loader.gguf_loader import GGUFModelLoader +from vllm.model_executor.model_loader.weight_utils import download_gguf + + +class TestGGUFDownload: + """Test GGUF model downloading functionality.""" + + @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf") + def test_download_gguf_single_file(self, mock_download): + """Test 
downloading a single GGUF file.""" + # Setup mock + mock_folder = "/tmp/mock_cache" + mock_download.return_value = mock_folder + + # Mock glob to return a single file + with patch("glob.glob") as mock_glob: + mock_glob.side_effect = lambda pattern, **kwargs: ( + [f"{mock_folder}/model-IQ1_S.gguf"] if "IQ1_S" in pattern else [] + ) + + result = download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S") + + # Verify download_weights_from_hf was called with correct patterns + mock_download.assert_called_once_with( + model_name_or_path="unsloth/Qwen3-0.6B-GGUF", + cache_dir=None, + allow_patterns=[ + "*-IQ1_S.gguf", + "*-IQ1_S-*.gguf", + "*/*-IQ1_S.gguf", + "*/*-IQ1_S-*.gguf", + ], + revision=None, + ignore_patterns=None, + ) + + # Verify result is the file path, not folder + assert result == f"{mock_folder}/model-IQ1_S.gguf" + + @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf") + def test_download_gguf_sharded_files(self, mock_download): + """Test downloading sharded GGUF files.""" + mock_folder = "/tmp/mock_cache" + mock_download.return_value = mock_folder + + # Mock glob to return sharded files + with patch("glob.glob") as mock_glob: + mock_glob.side_effect = lambda pattern, **kwargs: ( + [ + f"{mock_folder}/model-Q2_K-00001-of-00002.gguf", + f"{mock_folder}/model-Q2_K-00002-of-00002.gguf", + ] + if "Q2_K" in pattern + else [] + ) + + result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K") + + # Should return the first file after sorting + assert result == f"{mock_folder}/model-Q2_K-00001-of-00002.gguf" + + @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf") + def test_download_gguf_subdir(self, mock_download): + """Test downloading GGUF files from subdirectory.""" + mock_folder = "/tmp/mock_cache" + mock_download.return_value = mock_folder + + with patch("glob.glob") as mock_glob: + mock_glob.side_effect = lambda pattern, **kwargs: ( + [f"{mock_folder}/Q2_K/model-Q2_K.gguf"] + if "Q2_K" in pattern or 
"**/*.gguf" in pattern + else [] + ) + + result = download_gguf("unsloth/gpt-oss-120b-GGUF", "Q2_K") + + assert result == f"{mock_folder}/Q2_K/model-Q2_K.gguf" + + @patch("vllm.model_executor.model_loader.weight_utils.download_weights_from_hf") + @patch("glob.glob", return_value=[]) + def test_download_gguf_no_files_found(self, mock_glob, mock_download): + """Test error when no GGUF files are found.""" + mock_folder = "/tmp/mock_cache" + mock_download.return_value = mock_folder + + with pytest.raises(ValueError, match="Downloaded GGUF files not found"): + download_gguf("unsloth/Qwen3-0.6B-GGUF", "IQ1_S") + + +class TestGGUFModelLoader: + """Test GGUFModelLoader class methods.""" + + @patch("os.path.isfile", return_value=True) + def test_prepare_weights_local_file(self, mock_isfile): + """Test _prepare_weights with local file.""" + load_config = LoadConfig(load_format="gguf") + loader = GGUFModelLoader(load_config) + + # Create a simple mock ModelConfig with only the model attribute + model_config = MagicMock() + model_config.model = "/path/to/model.gguf" + + result = loader._prepare_weights(model_config) + assert result == "/path/to/model.gguf" + mock_isfile.assert_called_once_with("/path/to/model.gguf") + + @patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download") + @patch("os.path.isfile", return_value=False) + def test_prepare_weights_https_url(self, mock_isfile, mock_hf_download): + """Test _prepare_weights with HTTPS URL.""" + load_config = LoadConfig(load_format="gguf") + loader = GGUFModelLoader(load_config) + + mock_hf_download.return_value = "/downloaded/model.gguf" + + # Create a simple mock ModelConfig with only the model attribute + model_config = MagicMock() + model_config.model = "https://huggingface.co/model.gguf" + + result = loader._prepare_weights(model_config) + assert result == "/downloaded/model.gguf" + mock_hf_download.assert_called_once_with( + url="https://huggingface.co/model.gguf" + ) + + 
@patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download") + @patch("os.path.isfile", return_value=False) + def test_prepare_weights_repo_filename(self, mock_isfile, mock_hf_download): + """Test _prepare_weights with repo_id/filename.gguf format.""" + load_config = LoadConfig(load_format="gguf") + loader = GGUFModelLoader(load_config) + + mock_hf_download.return_value = "/downloaded/model.gguf" + + # Create a simple mock ModelConfig with only the model attribute + model_config = MagicMock() + model_config.model = "unsloth/Qwen3-0.6B-GGUF/model.gguf" + + result = loader._prepare_weights(model_config) + assert result == "/downloaded/model.gguf" + mock_hf_download.assert_called_once_with( + repo_id="unsloth/Qwen3-0.6B-GGUF", filename="model.gguf" + ) + + @patch("vllm.config.model.get_hf_image_processor_config", return_value=None) + @patch("vllm.transformers_utils.config.file_or_path_exists", return_value=True) + @patch("vllm.config.model.get_config") + @patch("vllm.config.model.is_gguf", return_value=True) + @patch("vllm.model_executor.model_loader.gguf_loader.download_gguf") + @patch("os.path.isfile", return_value=False) + def test_prepare_weights_repo_quant_type( + self, + mock_isfile, + mock_download_gguf, + mock_is_gguf, + mock_get_config, + mock_file_exists, + mock_get_image_config, + ): + """Test _prepare_weights with repo_id:quant_type format.""" + mock_hf_config = MagicMock() + mock_hf_config.architectures = ["Qwen3ForCausalLM"] + + class MockTextConfig: + max_position_embeddings = 4096 + sliding_window = None + model_type = "qwen3" + num_attention_heads = 32 + + mock_text_config = MockTextConfig() + mock_hf_config.get_text_config.return_value = mock_text_config + mock_hf_config.dtype = "bfloat16" + mock_get_config.return_value = mock_hf_config + + load_config = LoadConfig(load_format="gguf") + loader = GGUFModelLoader(load_config) + + mock_download_gguf.return_value = "/downloaded/model-IQ1_S.gguf" + + model_config = ModelConfig( + 
model="unsloth/Qwen3-0.6B-GGUF:IQ1_S", tokenizer="Qwen/Qwen3-0.6B" + ) + result = loader._prepare_weights(model_config) + # The actual result will be the downloaded file path from mock + assert result == "/downloaded/model-IQ1_S.gguf" + mock_download_gguf.assert_called_once_with( + "unsloth/Qwen3-0.6B-GGUF", + "IQ1_S", + cache_dir=None, + revision=None, + ignore_patterns=["original/**/*"], + ) + + @patch("vllm.config.model.get_hf_image_processor_config", return_value=None) + @patch("vllm.config.model.get_config") + @patch("vllm.config.model.is_gguf", return_value=False) + @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False) + @patch("os.path.isfile", return_value=False) + def test_prepare_weights_invalid_format( + self, + mock_isfile, + mock_check_gguf, + mock_is_gguf, + mock_get_config, + mock_get_image_config, + ): + """Test _prepare_weights with invalid format.""" + mock_hf_config = MagicMock() + mock_hf_config.architectures = ["Qwen3ForCausalLM"] + + class MockTextConfig: + max_position_embeddings = 4096 + sliding_window = None + model_type = "qwen3" + num_attention_heads = 32 + + mock_text_config = MockTextConfig() + mock_hf_config.get_text_config.return_value = mock_text_config + mock_hf_config.dtype = "bfloat16" + mock_get_config.return_value = mock_hf_config + + load_config = LoadConfig(load_format="gguf") + loader = GGUFModelLoader(load_config) + + # Create ModelConfig with a valid repo_id to avoid validation errors + # Then test _prepare_weights with invalid format + model_config = ModelConfig(model="unsloth/Qwen3-0.6B") + # Manually set model to invalid format after creation + model_config.model = "invalid-format" + with pytest.raises(ValueError, match="Unrecognised GGUF reference"): + loader._prepare_weights(model_config) diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py index bfe1cec76c138..a8d0b9be9ec29 100644 --- a/tests/transformers_utils/test_utils.py +++ 
b/tests/transformers_utils/test_utils.py @@ -1,11 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from pathlib import Path +from unittest.mock import patch +import pytest from vllm.transformers_utils.utils import ( is_cloud_storage, is_gcs, + is_gguf, + is_remote_gguf, is_s3, + split_remote_gguf, ) @@ -28,3 +34,143 @@ def test_is_cloud_storage(): assert is_cloud_storage("s3://model-path/path-to-model") assert not is_cloud_storage("/unix/local/path") assert not is_cloud_storage("nfs://nfs-fqdn.local") + + +class TestIsRemoteGGUF: + """Test is_remote_gguf utility function.""" + + def test_is_remote_gguf_with_colon_and_slash(self): + """Test is_remote_gguf with repo_id:quant_type format.""" + # Valid quant types + assert is_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S") + assert is_remote_gguf("user/repo:Q2_K") + assert is_remote_gguf("repo/model:Q4_K") + assert is_remote_gguf("repo/model:Q8_0") + + # Invalid quant types should return False + assert not is_remote_gguf("repo/model:quant") + assert not is_remote_gguf("repo/model:INVALID") + assert not is_remote_gguf("repo/model:invalid_type") + + def test_is_remote_gguf_without_colon(self): + """Test is_remote_gguf without colon.""" + assert not is_remote_gguf("repo/model") + assert not is_remote_gguf("unsloth/Qwen3-0.6B-GGUF") + + def test_is_remote_gguf_without_slash(self): + """Test is_remote_gguf without slash.""" + assert not is_remote_gguf("model.gguf") + # Even with valid quant_type, no slash means not remote GGUF + assert not is_remote_gguf("model:IQ1_S") + assert not is_remote_gguf("model:quant") + + def test_is_remote_gguf_local_path(self): + """Test is_remote_gguf with local file path.""" + assert not is_remote_gguf("/path/to/model.gguf") + assert not is_remote_gguf("./model.gguf") + + def test_is_remote_gguf_with_path_object(self): + """Test is_remote_gguf with Path object.""" + assert is_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S")) + 
assert not is_remote_gguf(Path("repo/model")) + + def test_is_remote_gguf_with_http_https(self): + """Test is_remote_gguf with HTTP/HTTPS URLs.""" + # HTTP/HTTPS URLs should return False even with valid quant_type + assert not is_remote_gguf("http://example.com/repo/model:IQ1_S") + assert not is_remote_gguf("https://huggingface.co/repo/model:Q2_K") + assert not is_remote_gguf("http://repo/model:Q4_K") + assert not is_remote_gguf("https://repo/model:Q8_0") + + def test_is_remote_gguf_with_cloud_storage(self): + """Test is_remote_gguf with cloud storage paths.""" + # Cloud storage paths should return False even with valid quant_type + assert not is_remote_gguf("s3://bucket/repo/model:IQ1_S") + assert not is_remote_gguf("gs://bucket/repo/model:Q2_K") + assert not is_remote_gguf("s3://repo/model:Q4_K") + assert not is_remote_gguf("gs://repo/model:Q8_0") + + +class TestSplitRemoteGGUF: + """Test split_remote_gguf utility function.""" + + def test_split_remote_gguf_valid(self): + """Test split_remote_gguf with valid repo_id:quant_type format.""" + repo_id, quant_type = split_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S") + assert repo_id == "unsloth/Qwen3-0.6B-GGUF" + assert quant_type == "IQ1_S" + + repo_id, quant_type = split_remote_gguf("repo/model:Q2_K") + assert repo_id == "repo/model" + assert quant_type == "Q2_K" + + def test_split_remote_gguf_with_path_object(self): + """Test split_remote_gguf with Path object.""" + repo_id, quant_type = split_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S")) + assert repo_id == "unsloth/Qwen3-0.6B-GGUF" + assert quant_type == "IQ1_S" + + def test_split_remote_gguf_invalid(self): + """Test split_remote_gguf with invalid format.""" + # Invalid format (no colon) - is_remote_gguf returns False + with pytest.raises(ValueError, match="Wrong GGUF model"): + split_remote_gguf("repo/model") + + # Invalid quant type - is_remote_gguf returns False + with pytest.raises(ValueError, match="Wrong GGUF model"): + 
split_remote_gguf("repo/model:INVALID_TYPE") + + # HTTP URL - is_remote_gguf returns False + with pytest.raises(ValueError, match="Wrong GGUF model"): + split_remote_gguf("http://repo/model:IQ1_S") + + # Cloud storage - is_remote_gguf returns False + with pytest.raises(ValueError, match="Wrong GGUF model"): + split_remote_gguf("s3://bucket/repo/model:Q2_K") + + +class TestIsGGUF: + """Test is_gguf utility function.""" + + @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=True) + def test_is_gguf_with_local_file(self, mock_check_gguf): + """Test is_gguf with local GGUF file.""" + assert is_gguf("/path/to/model.gguf") + assert is_gguf("./model.gguf") + + def test_is_gguf_with_remote_gguf(self): + """Test is_gguf with remote GGUF format.""" + # Valid remote GGUF format (repo_id:quant_type with valid quant_type) + assert is_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S") + assert is_gguf("repo/model:Q2_K") + assert is_gguf("repo/model:Q4_K") + + # Invalid quant_type should return False + assert not is_gguf("repo/model:quant") + assert not is_gguf("repo/model:INVALID") + + @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False) + def test_is_gguf_false(self, mock_check_gguf): + """Test is_gguf returns False for non-GGUF models.""" + assert not is_gguf("unsloth/Qwen3-0.6B") + assert not is_gguf("repo/model") + assert not is_gguf("model") + + def test_is_gguf_edge_cases(self): + """Test is_gguf with edge cases.""" + # Empty string + assert not is_gguf("") + + # Only colon, no slash (even with valid quant_type) + assert not is_gguf("model:IQ1_S") + + # Only slash, no colon + assert not is_gguf("repo/model") + + # HTTP/HTTPS URLs + assert not is_gguf("http://repo/model:IQ1_S") + assert not is_gguf("https://repo/model:Q2_K") + + # Cloud storage + assert not is_gguf("s3://bucket/repo/model:IQ1_S") + assert not is_gguf("gs://bucket/repo/model:Q2_K") diff --git a/vllm/config/model.py b/vllm/config/model.py index caa9a3440c41d..14ffdec2e09d1 100644 
--- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -39,7 +39,12 @@ from vllm.transformers_utils.gguf_utils import ( maybe_patch_hf_config_from_gguf, ) from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri -from vllm.transformers_utils.utils import check_gguf_file, maybe_model_redirect +from vllm.transformers_utils.utils import ( + is_gguf, + is_remote_gguf, + maybe_model_redirect, + split_remote_gguf, +) from vllm.utils.import_utils import LazyLoader from vllm.utils.torch_utils import common_broadcastable_dtype @@ -440,7 +445,8 @@ class ModelConfig: self.model = maybe_model_redirect(self.model) # The tokenizer is consistent with the model by default. if self.tokenizer is None: - if check_gguf_file(self.model): + # Check if this is a GGUF model (either local file or remote GGUF) + if is_gguf(self.model): raise ValueError( "Using a tokenizer is mandatory when loading a GGUF model. " "Please specify the tokenizer path or name using the " @@ -832,7 +838,10 @@ class ModelConfig: self.tokenizer = object_storage_tokenizer.dir def _get_encoder_config(self): - return get_sentence_transformer_tokenizer_config(self.model, self.revision) + model = self.model + if is_remote_gguf(model): + model, _ = split_remote_gguf(model) + return get_sentence_transformer_tokenizer_config(model, self.revision) def _verify_tokenizer_mode(self) -> None: tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower()) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 177915715140f..6d5b3392baa2b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -86,7 +86,7 @@ from vllm.transformers_utils.config import ( is_interleaved, maybe_override_with_speculators, ) -from vllm.transformers_utils.utils import check_gguf_file, is_cloud_storage +from vllm.transformers_utils.utils import is_cloud_storage, is_gguf from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GiB_bytes from 
vllm.utils.network_utils import get_ip @@ -1148,8 +1148,8 @@ class EngineArgs: return engine_args def create_model_config(self) -> ModelConfig: - # gguf file needs a specific model loader and doesn't use hf_repo - if check_gguf_file(self.model): + # gguf file needs a specific model loader + if is_gguf(self.model): self.quantization = self.load_format = "gguf" # NOTE(woosuk): In V1, we use separate processes for workers (unless diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 2416836be03c4..74052f72ceab9 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -18,6 +18,7 @@ from vllm.model_executor.model_loader.utils import ( process_weights_after_loading, ) from vllm.model_executor.model_loader.weight_utils import ( + download_gguf, get_gguf_extra_tensor_names, get_gguf_weight_type_map, gguf_quant_weights_iterator, @@ -43,7 +44,8 @@ class GGUFModelLoader(BaseModelLoader): f"load format {load_config.load_format}" ) - def _prepare_weights(self, model_name_or_path: str): + def _prepare_weights(self, model_config: ModelConfig): + model_name_or_path = model_config.model if os.path.isfile(model_name_or_path): return model_name_or_path # for raw HTTPS link @@ -55,12 +57,23 @@ class GGUFModelLoader(BaseModelLoader): if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"): repo_id, filename = model_name_or_path.rsplit("/", 1) return hf_hub_download(repo_id=repo_id, filename=filename) - else: - raise ValueError( - f"Unrecognised GGUF reference: {model_name_or_path} " - "(expected local file, raw URL, or /.gguf)" + # repo_id:quant_type + elif "/" in model_name_or_path and ":" in model_name_or_path: + repo_id, quant_type = model_name_or_path.rsplit(":", 1) + return download_gguf( + repo_id, + quant_type, + cache_dir=self.load_config.download_dir, + revision=model_config.revision, + ignore_patterns=self.load_config.ignore_patterns, ) + 
raise ValueError( + f"Unrecognised GGUF reference: {model_name_or_path} " + "(expected local file, raw URL, /.gguf, " + "or :)" + ) + def _get_gguf_weights_map(self, model_config: ModelConfig): """ GGUF uses this naming convention for their tensors from HF checkpoint: @@ -244,7 +257,7 @@ class GGUFModelLoader(BaseModelLoader): gguf_to_hf_name_map: dict[str, str], ) -> dict[str, str]: weight_type_map = get_gguf_weight_type_map( - model_config.model, gguf_to_hf_name_map + model_name_or_path, gguf_to_hf_name_map ) is_multimodal = hasattr(model_config.hf_config, "vision_config") if is_multimodal: @@ -290,10 +303,10 @@ class GGUFModelLoader(BaseModelLoader): yield from gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) def download_model(self, model_config: ModelConfig) -> None: - self._prepare_weights(model_config.model) + self._prepare_weights(model_config) def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: - local_model_path = self._prepare_weights(model_config.model) + local_model_path = self._prepare_weights(model_config) gguf_weights_map = self._get_gguf_weights_map(model_config) model.load_weights( self._get_weights_iterator(model_config, local_model_path, gguf_weights_map) @@ -303,7 +316,7 @@ class GGUFModelLoader(BaseModelLoader): self, vllm_config: VllmConfig, model_config: ModelConfig ) -> nn.Module: device_config = vllm_config.device_config - local_model_path = self._prepare_weights(model_config.model) + local_model_path = self._prepare_weights(model_config) gguf_weights_map = self._get_gguf_weights_map(model_config) # we can only know if tie word embeddings after mapping weights if "lm_head.weight" in get_gguf_extra_tensor_names( diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 4572ebe2ea11b..0809bdfa9d4c2 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -369,6 +369,52 @@ def 
get_sparse_attention_config( return config +def download_gguf( + repo_id: str, + quant_type: str, + cache_dir: str | None = None, + revision: str | None = None, + ignore_patterns: str | list[str] | None = None, +) -> str: + # Use patterns that snapshot_download can handle directly + # Patterns to match: + # - *-{quant_type}.gguf (root) + # - *-{quant_type}-*.gguf (root sharded) + # - */*-{quant_type}.gguf (subdir) + # - */*-{quant_type}-*.gguf (subdir sharded) + allow_patterns = [ + f"*-{quant_type}.gguf", + f"*-{quant_type}-*.gguf", + f"*/*-{quant_type}.gguf", + f"*/*-{quant_type}-*.gguf", + ] + + # Use download_weights_from_hf which handles caching and downloading + folder = download_weights_from_hf( + model_name_or_path=repo_id, + cache_dir=cache_dir, + allow_patterns=allow_patterns, + revision=revision, + ignore_patterns=ignore_patterns, + ) + + # Find the downloaded file(s) in the folder + local_files = [] + for pattern in allow_patterns: + # Convert pattern to glob pattern for local filesystem + glob_pattern = os.path.join(folder, pattern) + local_files.extend(glob.glob(glob_pattern)) + + if not local_files: + raise ValueError( + f"Downloaded GGUF files not found in {folder} for quant_type {quant_type}" + ) + + # Sort to ensure consistent ordering (prefer non-sharded files) + local_files.sort(key=lambda x: (x.count("-"), x)) + return local_files[0] + + def download_weights_from_hf( model_name_or_path: str, cache_dir: str | None, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index c1880a3fba0ee..a29d92f67f5df 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -42,7 +42,10 @@ from vllm.logger import init_logger from vllm.transformers_utils.config_parser_base import ConfigParserBase from vllm.transformers_utils.utils import ( check_gguf_file, + is_gguf, + is_remote_gguf, parse_safetensors_file_metadata, + split_remote_gguf, ) if envs.VLLM_USE_MODELSCOPE: @@ -629,10 +632,12 @@ def 
maybe_override_with_speculators( Returns: Tuple of (resolved_model, resolved_tokenizer, speculative_config) """ - is_gguf = check_gguf_file(model) - if is_gguf: + if check_gguf_file(model): kwargs["gguf_file"] = Path(model).name gguf_model_repo = Path(model).parent + elif is_remote_gguf(model): + repo_id, _ = split_remote_gguf(model) + gguf_model_repo = Path(repo_id) else: gguf_model_repo = None kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE @@ -678,10 +683,18 @@ def get_config( ) -> PretrainedConfig: # Separate model folder from file path for GGUF models - is_gguf = check_gguf_file(model) - if is_gguf: - kwargs["gguf_file"] = Path(model).name - model = Path(model).parent + _is_gguf = is_gguf(model) + _is_remote_gguf = is_remote_gguf(model) + if _is_gguf: + if check_gguf_file(model): + # Local GGUF file + kwargs["gguf_file"] = Path(model).name + model = Path(model).parent + elif _is_remote_gguf: + # Remote GGUF - extract repo_id from repo_id:quant_type format + # The actual GGUF file will be downloaded later by GGUFModelLoader + # Keep model as repo_id:quant_type for download, but use repo_id for config + model, _ = split_remote_gguf(model) if config_format == "auto": try: @@ -689,10 +702,25 @@ def get_config( # Transformers implementation. if file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision): config_format = "mistral" - elif is_gguf or file_or_path_exists( + elif (_is_gguf and not _is_remote_gguf) or file_or_path_exists( model, HF_CONFIG_NAME, revision=revision ): config_format = "hf" + # Remote GGUF models must have config.json in repo, + # otherwise the config can't be parsed correctly. + # FIXME(Isotr0py): Support remote GGUF repos without config.json + elif _is_remote_gguf and not file_or_path_exists( + model, HF_CONFIG_NAME, revision=revision + ): + err_msg = ( + "Could not find config.json for remote GGUF model repo. " + "To load remote GGUF model through `:`, " + "ensure your model has config.json (HF format) file. 
" + "Otherwise please specify --hf-config-path " + "in engine args to fetch config from unquantized hf model." + ) + logger.error(err_msg) + raise ValueError(err_msg) else: raise ValueError( "Could not detect config format for no config file found. " @@ -713,9 +741,6 @@ def get_config( "'config.json'.\n" " - For Mistral models: ensure the presence of a " "'params.json'.\n" - "3. For GGUF: pass the local path of the GGUF checkpoint.\n" - " Loading GGUF from a remote repo directly is not yet " - "supported.\n" ).format(model=model) raise ValueError(error_message) from e @@ -729,7 +754,7 @@ def get_config( **kwargs, ) # Special architecture mapping check for GGUF models - if is_gguf: + if _is_gguf: if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: raise RuntimeError(f"Can't get gguf config for {config.model_type}.") model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] @@ -889,6 +914,8 @@ def get_pooling_config(model: str, revision: str | None = "main") -> dict | None A dictionary containing the pooling type and whether normalization is used, or None if no pooling configuration is found. 
""" + if is_remote_gguf(model): + model, _ = split_remote_gguf(model) modules_file_name = "modules.json" @@ -1108,6 +1135,8 @@ def get_hf_image_processor_config( # Separate model folder from file path for GGUF models if check_gguf_file(model): model = Path(model).parent + elif is_remote_gguf(model): + model, _ = split_remote_gguf(model) return get_image_processor_config( model, token=hf_token, revision=revision, **kwargs ) diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 8deacb5b07913..63cdf63370342 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -18,7 +18,7 @@ from transformers.processing_utils import ProcessorMixin from transformers.video_processing_utils import BaseVideoProcessor from typing_extensions import TypeVar -from vllm.transformers_utils.utils import check_gguf_file, convert_model_repo_to_path +from vllm.transformers_utils.utils import convert_model_repo_to_path, is_gguf from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: @@ -236,8 +236,8 @@ def cached_processor_from_config( processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, **kwargs: Any, ) -> _P: - if check_gguf_file(model_config.model): - assert not check_gguf_file(model_config.tokenizer), ( + if is_gguf(model_config.model): + assert not is_gguf(model_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load processor." ) @@ -350,8 +350,8 @@ def cached_image_processor_from_config( model_config: "ModelConfig", **kwargs: Any, ): - if check_gguf_file(model_config.model): - assert not check_gguf_file(model_config.tokenizer), ( + if is_gguf(model_config.model): + assert not is_gguf(model_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load image processor." 
) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 233076741503d..f0e0ba8ef4246 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -20,7 +20,12 @@ from vllm.transformers_utils.config import ( list_filtered_repo_files, ) from vllm.transformers_utils.tokenizers import MistralTokenizer -from vllm.transformers_utils.utils import check_gguf_file +from vllm.transformers_utils.utils import ( + check_gguf_file, + is_gguf, + is_remote_gguf, + split_remote_gguf, +) if TYPE_CHECKING: from vllm.config import ModelConfig @@ -180,10 +185,12 @@ def get_tokenizer( kwargs["truncation_side"] = "left" # Separate model folder from file path for GGUF models - is_gguf = check_gguf_file(tokenizer_name) - if is_gguf: - kwargs["gguf_file"] = Path(tokenizer_name).name - tokenizer_name = Path(tokenizer_name).parent + if is_gguf(tokenizer_name): + if check_gguf_file(tokenizer_name): + kwargs["gguf_file"] = Path(tokenizer_name).name + tokenizer_name = Path(tokenizer_name).parent + elif is_remote_gguf(tokenizer_name): + tokenizer_name, _ = split_remote_gguf(tokenizer_name) # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format # first to use official Mistral tokenizer if possible. 
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 901a64d9d2633..45a873c9f7001 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -9,6 +9,8 @@ from os import PathLike from pathlib import Path from typing import Any +from gguf import GGMLQuantizationType + import vllm.envs as envs from vllm.logger import init_logger @@ -46,6 +48,57 @@ def check_gguf_file(model: str | PathLike) -> bool: return False +@cache +def is_remote_gguf(model: str | Path) -> bool: + """Check if the model is a remote GGUF model.""" + model = str(model) + return ( + (not is_cloud_storage(model)) + and (not model.startswith(("http://", "https://"))) + and ("/" in model and ":" in model) + and is_valid_gguf_quant_type(model.rsplit(":", 1)[1]) + ) + + +def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool: + """Check if the quant type is a valid GGUF quant type.""" + return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None + + +def split_remote_gguf(model: str | Path) -> tuple[str, str]: + """Split the model into repo_id and quant type.""" + model = str(model) + if is_remote_gguf(model): + parts = model.rsplit(":", 1) + return (parts[0], parts[1]) + raise ValueError( + "Wrong GGUF model or invalid GGUF quant type: %s.\n" + "- It should be in repo_id:quant_type format.\n" + "- Valid GGMLQuantizationType values: %s", + model, + GGMLQuantizationType._member_names_, + ) + + +def is_gguf(model: str | Path) -> bool: + """Check if the model is a GGUF model. + + Args: + model: Model name, path, or Path object to check. + + Returns: + True if the model is a GGUF model, False otherwise. 
+ """ + model = str(model) + + # Check if it's a local GGUF file + if check_gguf_file(model): + return True + + # Check if it's a remote GGUF model (repo_id:quant_type format) + return is_remote_gguf(model) + + def modelscope_list_repo_files( repo_id: str, revision: str | None = None, From dbc3d9991ab0e5adc0db6a8c71c9059268032a14 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 25 Nov 2025 09:46:18 -0500 Subject: [PATCH 07/24] [UX] Put CUDA attention backend selection log into one line (#29337) Signed-off-by: mgoin --- vllm/platforms/cuda.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 06793a3d1bb14..75b6bc77e4c1c 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -407,9 +407,6 @@ class CudaPlatformBase(Platform): # We have found some valid backends. Select the one with the # highest priority. - logger.info( - "Valid backends: %s", [b[0].name for b in valid_backends_priorities] - ) sorted_indices = sorted( range(len(valid_backends_priorities)), key=lambda i: valid_backends_priorities[i][1], @@ -417,8 +414,9 @@ class CudaPlatformBase(Platform): selected_index = sorted_indices[0] selected_backend = valid_backends_priorities[selected_index][0] logger.info( - "Using %s backend.", + "Using %s attention backend out of potential backends: %s", selected_backend.name, + [b[0].name for b in valid_backends_priorities], ) return selected_backend.get_path() From e502098643b863f09dd9d8e74249523641f01298 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 25 Nov 2025 09:59:07 -0500 Subject: [PATCH 08/24] [Kernel] Add NVFP4 MoE CUTLASS support for SM120 (#29242) Signed-off-by: mgoin Signed-off-by: Michael Goin --- CMakeLists.txt | 5 +- .../fp4/nvfp4_blockwise_moe_kernel.cu | 230 +++++++++++++++++- csrc/quantization/fp4/nvfp4_experts_quant.cu | 2 +- csrc/quantization/fp4/nvfp4_quant_entry.cu | 10 +- .../w8a8/cutlass/scaled_mm_entry.cu | 27 +- docs/design/moe_kernel_features.md 
| 2 +- .../compressed_tensors_moe.py | 14 +- .../layers/quantization/modelopt.py | 4 + 8 files changed, 264 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 86746a0db4c0e..d88ba3aa66303 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -604,12 +604,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" - "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") + "csrc/quantization/fp4/nvfp4_experts_quant.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu" + "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") else() message(STATUS "Not building NVFP4 as no compatible archs were found.") diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu index 5b007e5ea3283..6744402783832 100644 --- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu +++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu @@ -22,6 +22,7 @@ #include #include #include +#include "cutlass_extensions/common.hpp" #include "cute/tensor.hpp" #include "cutlass/tensor_ref.h" @@ -173,7 +174,7 @@ void run_get_group_gemm_starts( } template -void run_fp4_blockwise_scaled_group_mm( +void run_fp4_blockwise_scaled_group_mm_sm100( torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& alphas, const torch::Tensor& problem_sizes, @@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm( auto can_implement_status = gemm_op.can_implement(args); TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, - "Failed 
to implement GEMM"); + "Failed to implement GEMM: status=", (int)can_implement_status); // Run the GEMM auto status = gemm_op.initialize(args, workspace.data_ptr()); - TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM"); + TORCH_CHECK(status == cutlass::Status::kSuccess, + "Failed to initialize GEMM: status=", (int)status, + " workspace_size=", workspace_size, " num_experts=", num_experts, + " M=", M, " N=", N, " K=", K); status = gemm_op.run(args, workspace.data_ptr(), stream); TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); } +void run_fp4_blockwise_scaled_group_mm_sm120( + torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, + const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, + const torch::Tensor& alphas, const torch::Tensor& problem_sizes, + const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M, + int N, int K) { + using ProblemShape = + cutlass::gemm::GroupProblemShape>; + using ElementType = cutlass::float_e2m1_t; + using ElementSFType = cutlass::float_ue4m3_t; + using ElementA = cutlass::nv_float4_t; + using ElementB = cutlass::nv_float4_t; + + // NOTE: For SM120 it seems templating the output type is not supported and + // we need to hardcode the output type to bfloat16 + using ElementC = cutlass::bfloat16_t; + using ElementD = ElementC; + using ElementAccumulator = float; + // Layout definitions + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = LayoutC; + + // Alignment constraints + static constexpr int AlignmentA = 32; + static constexpr int AlignmentB = 32; + static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + + // Architecture definitions + using ArchTag = cutlass::arch::Sm120; + using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; + + 
using ClusterShape = Shape<_1, _1, _1>; + using MmaTileShape = Shape<_128, _128, _128>; + + using FusionOperation = cutlass::epilogue::fusion::LinearCombination< + ElementD, ElementAccumulator, ElementC, ElementAccumulator>; + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, MmaTileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD, + LayoutD*, AlignmentD, + cutlass::epilogue::collective::EpilogueScheduleAuto, + FusionOperation>::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB, + LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp; + + using GemmKernel = + cutlass::gemm::kernel::GemmUniversal; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + using StrideA = typename Gemm::GemmKernel::InternalStrideA; + using StrideB = typename Gemm::GemmKernel::InternalStrideB; + using StrideC = typename Gemm::GemmKernel::InternalStrideC; + using StrideD = typename Gemm::GemmKernel::InternalStrideD; + + using LayoutSFA = + typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA; + using LayoutSFB = + typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB; + using ScaleConfig = + typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig; + + using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape; + int num_experts = static_cast(expert_offsets.size(0)); + auto options_int = + torch::TensorOptions().dtype(torch::kInt64).device(a.device()); + + torch::Tensor a_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_ptrs = torch::empty(num_experts, 
options_int); + torch::Tensor out_ptrs = torch::empty(num_experts, options_int); + torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int); + torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int); + torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int); + torch::Tensor c_strides1 = + torch::full({num_experts}, output.stride(0), options_int); + torch::Tensor a_strides1 = + torch::full({num_experts}, a.stride(0) * 2, options_int); + torch::Tensor b_strides1 = + torch::full({num_experts}, b.stride(1) * 2, options_int); + + run_get_group_gemm_starts( + a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs, + layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas, + expert_offsets, sf_offsets, problem_sizes, M, N, K); + + // Create an instance of the GEMM + Gemm gemm_op; + + // Initialize problem_sizes_as_shapes correctly + UnderlyingProblemShape* problem_sizes_as_shapes = + static_cast(problem_sizes.data_ptr()); + + // Set the Scheduler info + cutlass::KernelHardwareInfo hw_info; + using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions; + typename Gemm::GemmKernel::TileSchedulerArguments scheduler; + scheduler.raster_order = RasterOrderOptions::AlongM; + hw_info.device_id = a.get_device(); + static std::unordered_map cached_sm_counts; + if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) { + cached_sm_counts[hw_info.device_id] = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + } + hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX); + + // Mainloop Arguments + typename GemmKernel::MainloopArguments mainloop_args{ + static_cast(a_ptrs.data_ptr()), + static_cast(a_strides1.data_ptr()), + static_cast(b_ptrs.data_ptr()), + static_cast(b_strides1.data_ptr()), 
+ static_cast(a_scales_ptrs.data_ptr()), + reinterpret_cast(layout_sfa.data_ptr()), + static_cast(b_scales_ptrs.data_ptr()), + reinterpret_cast(layout_sfb.data_ptr())}; + + // Epilogue Arguments + typename GemmKernel::EpilogueArguments epilogue_args{ + {}, // epilogue.thread + nullptr, + static_cast(c_strides1.data_ptr()), + static_cast(out_ptrs.data_ptr()), + static_cast(c_strides1.data_ptr())}; + auto& fusion_args = epilogue_args.thread; + fusion_args.alpha_ptr_array = + reinterpret_cast(alpha_ptrs.data_ptr()); + fusion_args.dAlpha = {_0{}, _0{}, 1}; + fusion_args.beta = 0.0f; + + // Gemm Arguments + typename GemmKernel::Arguments args{ + cutlass::gemm::GemmUniversalMode::kGrouped, + {num_experts, problem_sizes_as_shapes, nullptr}, + mainloop_args, + epilogue_args, + hw_info, + scheduler}; + + size_t workspace_size = Gemm::get_workspace_size(args); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + auto can_implement_status = gemm_op.can_implement(args); + TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, + "Failed to implement GEMM: status=", (int)can_implement_status); + + // Run the GEMM + auto status = gemm_op.initialize(args, workspace.data_ptr()); + TORCH_CHECK(status == cutlass::Status::kSuccess, + "Failed to initialize GEMM: status=", (int)status, + " workspace_size=", workspace_size, " num_experts=", num_experts, + " M=", M, " N=", N, " K=", K); + + status = gemm_op.run(args, workspace.data_ptr(), stream); + TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); +} + +template +void run_fp4_blockwise_scaled_group_mm( + torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, + const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, + const torch::Tensor& alphas, const torch::Tensor& problem_sizes, + 
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M, + int N, int K) { + int32_t version_num = get_sm_version_num(); +#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120 + if (version_num >= 120 && version_num < 130) { + run_fp4_blockwise_scaled_group_mm_sm120( + output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, + expert_offsets, sf_offsets, M, N, K); + return; + } +#endif #if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 + if (version_num >= 100 && version_num < 120) { + run_fp4_blockwise_scaled_group_mm_sm100( + output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, + expert_offsets, sf_offsets, M, N, K); + return; + } +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ", + version_num, ". Required capability: 100 or 120"); +} + +#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \ + (defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120) constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte; constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn; #endif @@ -374,7 +583,8 @@ void cutlass_fp4_group_mm( const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales, const torch::Tensor& alphas, const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) { -#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 +#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \ + (defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120) // Input validation CHECK_INPUT(a, FLOAT4_E2M1X2, "a"); CHECK_INPUT(b, FLOAT4_E2M1X2, "b"); @@ -408,6 +618,14 @@ void cutlass_fp4_group_mm( output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, expert_offsets, sf_offsets, M, N, K); } else { + #if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120 + int32_t version_num = get_sm_version_num(); + if (version_num >= 120 && version_num < 130) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, "SM120 NVFP4 MOE only supports 
bfloat16 output, got: ", + output.scalar_type()); + } + #endif run_fp4_blockwise_scaled_group_mm( output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes, expert_offsets, sf_offsets, M, N, K); @@ -416,8 +634,8 @@ void cutlass_fp4_group_mm( TORCH_CHECK_NOT_IMPLEMENTED( false, "No compiled cutlass_fp4_group_mm kernel, vLLM must " - "be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA " - "12.8 or above."); + "be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 " + "and CUDA 12.8 or above."); #endif } diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index 6d385e0dd94e7..82c53c2375a31 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float; constexpr auto INT = at::ScalarType::Int; constexpr auto UINT8 = at::ScalarType::Byte; -void scaled_fp4_experts_quant_sm100a( +void scaled_fp4_experts_quant_sm1xxa( torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input_offset_by_experts, diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu index c2b39e5438805..fb6d22f035b99 100644 --- a/csrc/quantization/fp4/nvfp4_quant_entry.cu +++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu @@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, torch::Tensor const& input_sf); #endif -#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 -void scaled_fp4_experts_quant_sm100a( +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) +void scaled_fp4_experts_quant_sm1xxa( torch::Tensor& output, torch::Tensor& output_scale, torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input_offset_by_experts, @@ -54,8 +55,9 @@ void 
scaled_fp4_experts_quant( torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input_offset_by_experts, torch::Tensor const& output_scale_offset_by_experts) { -#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100 - return scaled_fp4_experts_quant_sm100a( +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + return scaled_fp4_experts_quant_sm1xxa( output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts); #endif diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu index 1001af05ff003..c5012a8669317 100644 --- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu +++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu @@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a, std::optional const& bias); #endif -#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \ - defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \ - defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120 +#if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \ + (defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \ + (defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120) void get_cutlass_moe_mm_data_caller( const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data( // This function currently gets compiled only if we have a valid cutlass moe // mm to run it for. 
int32_t version_num = get_sm_version_num(); -#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ - (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) +#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ + (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \ + (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120) get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1, problem_sizes2, input_permutation, output_permutation, num_experts, n, k, @@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data( false, "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for " "CUDA device capability: ", - version_num, ". Required capability: 90 or 100"); + version_num, ". Required capability: 90, 100, or 120"); } void get_cutlass_moe_mm_problem_sizes( @@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes( torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets) { int32_t version_num = get_sm_version_num(); -#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ - (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) +#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ + (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \ + (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120) get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, blockscale_offsets); @@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes( false, "No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm " "kernel for CUDA device capability: ", - version_num, ". Required capability: 90 or 100"); + version_num, ". 
Required capability: 90, 100, or 120"); } void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, @@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, // This function currently gets compiled only if we have a valid cutlass moe // mm to run it for. int32_t version_num = get_sm_version_num(); -#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ - (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) +#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ + (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \ + (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120) get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1, problem_sizes2, expert_num_tokens, num_local_experts, padded_m, n, k); @@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, false, "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel " "for CUDA device capability: ", - version_num, ". Required capability: 90 or 100"); + version_num, ". Required capability: 90, 100, or 120"); } void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index f0d5a3e934f39..e54a9e2bc5e77 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. 
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod] - [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod] -- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod] +- [`CompressedTensorsW4A4Nvfp4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoeMethod] - [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod] - [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod] - [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 149e4419c64a4..71d7de97d4a10 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -103,7 +103,7 @@ __all__ = [ "CompressedTensorsW8A8Int8MoEMethod", "CompressedTensorsWNA16MarlinMoEMethod", "CompressedTensorsWNA16MoEMethod", - "CompressedTensorsW4A4MoeMethod", + "CompressedTensorsW4A4Nvfp4MoeMethod", "CompressedTensorsW4A8Int8MoEMethod", ] @@ -171,7 +171,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): quant_config, layer.moe_config ) elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant): - return CompressedTensorsW4A4MoeMethod(layer.moe_config) + return CompressedTensorsW4A4Nvfp4MoeMethod(layer.moe_config) elif ( quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant) or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant) @@ -188,7 +188,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): ) 
-class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): +class CompressedTensorsW4A4Nvfp4MoeMethod(CompressedTensorsMoEMethod): def __init__(self, moe: FusedMoEConfig): from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 detect_nvfp4_moe_support, @@ -205,8 +205,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): self.flashinfer_moe_backend = get_flashinfer_moe_backend() logger.info_once( f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels" - " for CompressedTensorsW4A4MoeMethod." + " for CompressedTensorsW4A4Nvfp4MoeMethod." ) + elif self.use_marlin: + logger.info_once("Using Marlin for CompressedTensorsW4A4Nvfp4MoeMethod.") + else: + logger.info_once("Using Cutlass for CompressedTensorsW4A4Nvfp4MoeMethod.") def create_weights( self, @@ -612,7 +616,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): assert expert_map is None, ( "Expert Parallelism / expert_map " "is currently not supported for " - "CompressedTensorsW4A4MoeMethod." + "CompressedTensorsW4A4Nvfp4MoeMethod." ) assert self.moe_quant_config is not None diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 8165673135910..2cf7089e0ff90 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1132,6 +1132,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels" " for ModelOptNvFp4FusedMoE." 
) + elif self.use_marlin: + logger.info_once("Using Marlin for ModelOptNvFp4FusedMoE.") + else: + logger.info_once("Using Cutlass for ModelOptNvFp4FusedMoE.") def maybe_make_prepare_finalize( self, From 48ddb02b79d7e22e2eefbf5294bf70de50afd1b2 Mon Sep 17 00:00:00 2001 From: Yifan Qiao Date: Tue, 25 Nov 2025 07:30:57 -0800 Subject: [PATCH 09/24] [Hybrid Allocator] Support KV cache groups with different block_size (#29143) Signed-off-by: Yifan Qiao Co-authored-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 49 +++++- tests/v1/core/test_prefix_caching.py | 95 ++++++++++- .../core/test_single_type_kv_cache_manager.py | 22 ++- vllm/engine/arg_utils.py | 8 +- vllm/model_executor/models/config.py | 11 +- vllm/v1/core/block_pool.py | 22 ++- vllm/v1/core/kv_cache_coordinator.py | 98 ++++++++--- vllm/v1/core/kv_cache_manager.py | 25 +-- vllm/v1/core/kv_cache_utils.py | 156 +++++++++++++++--- vllm/v1/core/sched/scheduler.py | 1 + vllm/v1/core/single_type_kv_cache_manager.py | 72 +++++++- 11 files changed, 472 insertions(+), 87 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 12ed59b6e863b..58a7a2692bfc8 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1248,7 +1248,9 @@ def test_allocate_with_lookahead(): ) # Test case 1: Requires additional lookahead tokens - kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100) + kv_cache_manager = KVCacheManager( + kv_cache_config=config, max_model_len=100, hash_block_size=block_size + ) blocks = kv_cache_manager.allocate_slots( request, num_new_tokens=3, @@ -1257,7 +1259,9 @@ def test_allocate_with_lookahead(): assert len(blocks.get_block_ids()[0]) == 2 # ceil(5/4)=2 blocks # Test case 2: With precomputed blocks - kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100) + kv_cache_manager = KVCacheManager( + kv_cache_config=config, max_model_len=100, hash_block_size=block_size + ) # 
required_blocks = ceil((3 + 2) /4) = 2 blocks = kv_cache_manager.allocate_slots( request, @@ -1268,7 +1272,9 @@ def test_allocate_with_lookahead(): # Test case 3: With precomputed blocks # required_blocks = ceil((3 + 4) / 4) = 2 - kv_cache_manager = KVCacheManager(kv_cache_config=config, max_model_len=100) + kv_cache_manager = KVCacheManager( + kv_cache_config=config, max_model_len=100, hash_block_size=block_size + ) blocks = kv_cache_manager.allocate_slots( request, num_new_tokens=3, @@ -1495,7 +1501,8 @@ def test_get_kv_cache_config_one_worker(): ), ], ) - # different hidden size + + # different hidden size but same type, use UniformTypeKVCacheSpecs kv_cache_specs_hybrid = { "layer_1": new_kv_cache_spec(head_size=128), "layer_2": new_kv_cache_spec(head_size=64), @@ -1519,6 +1526,40 @@ def test_get_kv_cache_config_one_worker(): ], ) + # Different hidden size and different type, align by different block size + kv_cache_specs_hybrid = { + "layer_1": new_kv_cache_spec(head_size=64), + "layer_2": new_sliding_window_spec(head_size=32), + } + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 32] + )[0] + assert kv_cache_config_hybrid == KVCacheConfig( + num_blocks=32, + kv_cache_tensors=[ + KVCacheTensor( + size=mem_per_block_per_layer * 32, shared_by=["layer_1", "layer_2"] + ), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer_1"], new_kv_cache_spec(head_size=64)), + KVCacheGroupSpec( + ["layer_2"], new_sliding_window_spec(head_size=32, block_size=32) + ), + ], + ) + + # different hidden size that cannot be aligned by using different block size + kv_cache_specs_hybrid = { + "layer_1": new_kv_cache_spec(head_size=64), + "layer_2": new_sliding_window_spec(head_size=96), + } + + with pytest.raises(NotImplementedError): + get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 2 * 32] + )[0] + # Test num_gpu_blocks_override vllm_config.cache_config.num_gpu_blocks_override = 
16 kv_cache_config_override_blocks = get_kv_cache_configs( diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 2291f363731f2..64fd5ab1dd9aa 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -134,6 +134,7 @@ def test_prefill(hash_fn): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) # Complete 3 blocks (48 tokens) @@ -256,6 +257,7 @@ def test_prefill_hybrid_model(): make_kv_cache_config_hybrid_model(block_size, 21), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) hash_fn = sha256 @@ -416,6 +418,7 @@ def test_prefill_plp(): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) # the default hash function is sha256 hash_fn = sha256 @@ -523,6 +526,7 @@ def test_decode(): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) # Complete 3 blocks (48 tokens) @@ -585,6 +589,7 @@ def test_evict(): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) last_token_id = 5 * 16 + 7 @@ -643,6 +648,7 @@ def test_hash_block_correct_reuse(): make_kv_cache_config(16, 2), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) # Allocate 1 block and cache it. @@ -683,6 +689,7 @@ def test_computed_blocks_not_evicted(): make_kv_cache_config(block_size, 3), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) # Allocate a block and cache it. 
@@ -741,6 +748,7 @@ def test_basic_prefix_caching_disabled(): make_kv_cache_config(block_size, 5), max_model_len=8192, enable_caching=False, + hash_block_size=block_size, ) req1 = make_request( @@ -790,6 +798,7 @@ def test_cache_blocks(hash_fn): block_pool = BlockPool( num_gpu_blocks=5, enable_caching=True, + hash_block_size=block_size, ) # Req: # Block 0: [0, 1, 2, 3] @@ -833,7 +842,9 @@ def test_cache_blocks_multi_group(): This tests that blocks are cached correctly for different kv cache groups. """ block_size = 4 - block_pool = BlockPool(num_gpu_blocks=10, enable_caching=True) + block_pool = BlockPool( + num_gpu_blocks=10, enable_caching=True, hash_block_size=block_size + ) # Req: # Block 0/4: [0, 1, 2, 3] @@ -921,6 +932,7 @@ def test_mm_prefix_caching(): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) # Common prompt tokens (T is text tokens and P is image placeholder tokens) @@ -1020,6 +1032,7 @@ def test_cache_key_salting(): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) # 3 complete blocks and an incomplete block with 11 tokens. @@ -1101,6 +1114,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) # Complete 3 blocks (48 tokens) # | Common-0 | Common-1 | Common-2 | ... 
| @@ -1173,6 +1187,7 @@ def test_reset_prefix_cache(): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, ) full_block_token_ids = [i for i in range(3) for _ in range(16)] @@ -1213,6 +1228,7 @@ def test_prefix_cache_stats_disabled(): make_kv_cache_config(block_size, 11), max_model_len=8192, enable_caching=True, + hash_block_size=block_size, log_stats=False, # Disable logging stats ) assert manager.prefix_cache_stats is None @@ -1232,7 +1248,7 @@ def test_prefix_cache_stats_disabled(): def test_maybe_evict_cached_block(): - pool = BlockPool(num_gpu_blocks=4, enable_caching=True) + pool = BlockPool(num_gpu_blocks=4, enable_caching=True, hash_block_size=16) block_hash0 = make_block_hash_with_group_id(BlockHash(b"10"), 1000) block_hash1 = make_block_hash_with_group_id(BlockHash(b"20"), 2000) block_hash2 = make_block_hash_with_group_id(BlockHash(b"30"), 3000) @@ -1293,6 +1309,7 @@ def test_kv_cache_events(blocks_to_cache: int): max_model_len=8192, enable_caching=True, enable_kv_cache_events=True, + hash_block_size=block_size, ) num_tokens = block_size * blocks_to_cache @@ -1351,6 +1368,7 @@ def test_kv_cache_events_with_lora(blocks_to_cache: int): max_model_len=8192, enable_caching=True, enable_kv_cache_events=True, + hash_block_size=block_size, ) # Test with LoRA request @@ -1405,6 +1423,7 @@ def test_eagle_enabled_removes_last_block(): max_model_len=8192, enable_caching=True, use_eagle=True, + hash_block_size=block_size, ) # Request with 3 full blocks (48 tokens) @@ -1437,6 +1456,7 @@ def test_eagle_with_partial_blocks(): max_model_len=8192, enable_caching=True, use_eagle=True, + hash_block_size=block_size, ) # 2 full blocks + 5 tokens (non-divisible length) token_ids = [0] * (2 * block_size + 5) @@ -1476,6 +1496,7 @@ def test_eagle_with_sliding_window(): max_model_len=8192, enable_caching=True, use_eagle=True, + hash_block_size=block_size, ) # 2 full blocks + 5 tokens (non-divisible length) @@ -1522,6 +1543,76 
def test_different_block_size():
    """Prefix caching across two KV cache groups with different block sizes.

    Group 0 is full attention with block_size 32 (float16) and group 1 is
    sliding-window attention with block_size 16 (float32), so — per the page
    size note below — both groups occupy the same number of bytes per block
    while hashing is done at hash_block_size=16. Cache-hit lengths must be a
    multiple of both block sizes (i.e. of 32 tokens).
    """
    block_size = 16
    # full attention and sliding window attention layers have the same page size:
    # (32 tokens/block * float16 token, vs. 16 tokens/block * float32 token)
    kv_cache_config = KVCacheConfig(
        num_blocks=100,
        kv_cache_tensors=[],
        kv_cache_groups=[
            KVCacheGroupSpec(
                ["layer1"],
                FullAttentionSpec(block_size * 2, 1, 1, torch.float16),
            ),
            KVCacheGroupSpec(
                ["layer2"],
                SlidingWindowSpec(
                    block_size,
                    1,
                    1,
                    torch.float32,
                    sliding_window=2 * block_size,
                ),
            ),
        ],
    )
    manager = KVCacheManager(
        kv_cache_config=kv_cache_config,
        max_model_len=8192,
        enable_caching=True,
        hash_block_size=block_size,
    )

    # 10 blocks of 16 tokens each. Token ids are not strictly aligned for each block.
    common_token_ids = [i for i in range(10) for _ in range(block_size)]

    # Fresh request: nothing cached yet in either group.
    req0 = make_request("0", common_token_ids, block_size, sha256)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks.blocks[0]
    assert not computed_blocks.blocks[1]
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(
        req0, 7 * block_size, len(computed_blocks.blocks[0]) * 16, computed_blocks
    )
    # 7*16 = 112 tokens: group 0 (block_size 32) needs ceil(112/32) = 4 blocks,
    # group 1 (block_size 16) needs 7 blocks.
    assert blocks.get_block_ids() == ([1, 2, 3, 4], [5, 6, 7, 8, 9, 10, 11])
    req1 = make_request("1", common_token_ids[: 7 * block_size + 1], block_size, sha256)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    # Hit is capped below the full prompt; 6*16 = 96 tokens = 3 full-attention
    # blocks (3 * 32) = 6 sliding-window blocks (6 * 16), a multiple of both.
    assert len(computed_blocks.blocks[0]) == 3
    assert len(computed_blocks.blocks[1]) == 6
    assert num_computed_tokens == 6 * 16

    req2 = make_request("2", common_token_ids[: 6 * block_size + 1], block_size, sha256)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(computed_blocks.blocks[0]) == 3
    assert len(computed_blocks.blocks[1]) == 6
    assert num_computed_tokens == 6 * 16

    # Evict some blocks to make sliding window cache hit length 5*16
    # But should return 4 * 16 because full attention cache hit length must be
    # a multiple of 32
    manager.block_pool.cached_block_hash_to_block.pop(
        make_block_hash_with_group_id(req1.block_hashes[6], 1), 11
    )
    manager.block_pool.cached_block_hash_to_block.pop(
        make_block_hash_with_group_id(req1.block_hashes[5], 1), 10
    )
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(computed_blocks.blocks[0]) == 2
    assert len(computed_blocks.blocks[1]) == 4
    assert num_computed_tokens == 4 * 16
block_pool=block_pool, kv_cache_spec=sliding_window_spec, use_eagle=False, + alignment_tokens=block_size, )[0] assert len(computed_blocks) == expect_length @@ -178,7 +184,7 @@ def test_chunked_local_attention_remove_skipped_blocks(): attention_chunk_size=4, ) - block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True) + block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2) manager = get_chunked_local_attention_manager(attention_spec, block_pool) @@ -239,7 +245,7 @@ def test_sliding_window_remove_skipped_blocks(): sliding_window=4, ) - block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True) + block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True, hash_block_size=2) manager = get_sliding_window_manager(sliding_window_spec, block_pool) @@ -316,7 +322,9 @@ def test_get_num_blocks_to_allocate(): sliding_window=4, # Placeholder value, not related to test result ) - block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) + block_pool = BlockPool( + num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size + ) manager = get_sliding_window_manager(sliding_window_spec, block_pool) cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)] cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [ @@ -341,7 +349,9 @@ def test_chunked_local_attention_get_num_blocks_to_allocate(): attention_chunk_size=4, # Placeholder value, not related to test result ) - block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) + block_pool = BlockPool( + num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size + ) manager = get_chunked_local_attention_manager(attention_spec, block_pool) cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)] cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6d5b3392baa2b..bdccb15e3f655 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1816,9 +1816,11 @@ 
class EngineArgs: if model_config.runner_type != "pooling": default_chunked_prefill = True - # Disable prefix caching default for hybrid models - # since the feature is still experimental. - default_prefix_caching = not model_config.is_hybrid + # Disable prefix caching default for hybrid models and mamba-only + # models since the feature is still experimental. + default_prefix_caching = not ( + model_config.is_hybrid or model_config.is_attention_free + ) else: assert model_config.pooler_config is not None diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 3cf4bf991e667..d7e802ba1aca0 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -289,9 +289,6 @@ class MambaModelConfig(VerifyAndUpdateConfig): model_config = vllm_config.model_config cache_config = vllm_config.cache_config - if cache_config.mamba_block_size is None: - cache_config.mamba_block_size = model_config.max_model_len - if cache_config.enable_prefix_caching: if model_config.supports_mamba_prefix_caching: logger.info( @@ -299,6 +296,11 @@ class MambaModelConfig(VerifyAndUpdateConfig): "Its support for Mamba layers is experimental. " "Please report any issues you may observe." ) + # By default, mamba block size will be set to max_model_len (see + # below). When enabling prefix caching, we align mamba block size + # to the block size as the basic granularity for prefix caching. + if cache_config.mamba_block_size is None: + cache_config.mamba_block_size = cache_config.block_size else: logger.info( "Hybrid or mamba-based model detected without " @@ -306,6 +308,9 @@ class MambaModelConfig(VerifyAndUpdateConfig): ) cache_config.enable_prefix_caching = False + if cache_config.mamba_block_size is None: + cache_config.mamba_block_size = model_config.max_model_len + # TODO(tdoublep): remove once cascade attention is supported logger.info( "Disabling cascade attention since it is not supported for hybrid models." 
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 55710ad5cc693..8b0e8fd3a2410 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -13,6 +13,8 @@ from vllm.distributed.kv_events import ( from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import ( BlockHash, + BlockHashList, + BlockHashListWithBlockSize, BlockHashWithGroupId, ExternalBlockHash, FreeKVCacheBlockQueue, @@ -133,6 +135,10 @@ class BlockPool: Args: num_gpu_blocks: The number of blocks in the pool. enable_caching: Whether to enable prefix caching. + hash_block_size: The block size of which the block hashes are computed. + The actual block size usually equals hash_block_size, but in cases + where different KV cache groups have different block sizes, the + actual block size can be a multiple of hash_block_size. enable_kv_cache_events: Whether to enable kv cache events. """ @@ -140,11 +146,13 @@ class BlockPool: self, num_gpu_blocks: int, enable_caching: bool, + hash_block_size: int, enable_kv_cache_events: bool = False, ): assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 self.num_gpu_blocks = num_gpu_blocks self.enable_caching = enable_caching + self.hash_block_size = hash_block_size # All kv-cache blocks. self.blocks: list[KVCacheBlock] = [ KVCacheBlock(idx) for idx in range(num_gpu_blocks) @@ -223,8 +231,20 @@ class BlockPool: return new_full_blocks = blocks[num_cached_blocks:num_full_blocks] assert len(request.block_hashes) >= num_full_blocks - new_block_hashes = request.block_hashes[num_cached_blocks:] + if block_size == self.hash_block_size: + # Common case. + block_hashes: BlockHashList = request.block_hashes + else: + # block_size is a multiple of hash_block_size. This happens when + # different KV cache groups have different block sizes. 
+ assert block_size % self.hash_block_size == 0 + # Recalculate block_hashes at the granularity of block_size, using + # the original block_hashes (at the granularity of hash_block_size). + block_hashes = BlockHashListWithBlockSize( + request.block_hashes, self.hash_block_size, block_size + ) + new_block_hashes = block_hashes[num_cached_blocks:] new_hashes: list[ExternalBlockHash] | None = ( [] if self.enable_kv_cache_events else None ) diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 1531b61f88fe2..fd1ec8e27fba2 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -2,15 +2,25 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Sequence +from math import lcm from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock +from vllm.v1.core.kv_cache_utils import ( + BlockHash, + BlockHashList, + BlockHashListWithBlockSize, + KVCacheBlock, +) from vllm.v1.core.single_type_kv_cache_manager import ( CrossAttentionManager, FullAttentionManager, get_manager_for_kv_cache_spec, ) -from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig, KVCacheSpec +from vllm.v1.kv_cache_interface import ( + FullAttentionSpec, + KVCacheConfig, + KVCacheSpec, +) from vllm.v1.request import Request @@ -28,13 +38,17 @@ class KVCacheCoordinator(ABC): enable_kv_cache_events: bool, dcp_world_size: int, pcp_world_size: int, + hash_block_size: int, ): self.kv_cache_config = kv_cache_config self.max_model_len = max_model_len self.enable_caching = enable_caching self.block_pool = BlockPool( - kv_cache_config.num_blocks, enable_caching, enable_kv_cache_events + kv_cache_config.num_blocks, + enable_caching, + hash_block_size, + enable_kv_cache_events, ) # Needs special handling for find_longest_cache_hit if eagle is enabled @@ -213,6 +227,7 @@ class 
KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator): enable_kv_cache_events: bool, dcp_world_size: int, pcp_world_size: int, + hash_block_size: int, ): super().__init__( kv_cache_config, @@ -222,6 +237,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator): enable_kv_cache_events, dcp_world_size=dcp_world_size, pcp_world_size=pcp_world_size, + hash_block_size=hash_block_size, ) self.num_single_type_manager = len(self.single_type_managers) @@ -255,6 +271,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): enable_kv_cache_events: bool, dcp_world_size: int, pcp_world_size: int, + hash_block_size: int, ): super().__init__( kv_cache_config, @@ -264,6 +281,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): enable_kv_cache_events, dcp_world_size=dcp_world_size, pcp_world_size=pcp_world_size, + hash_block_size=hash_block_size, ) self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec self.block_size = self.kv_cache_spec.block_size @@ -273,6 +291,11 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): self.block_size *= dcp_world_size if pcp_world_size > 1: self.block_size *= pcp_world_size + # For models using only Mamba, block_size is set to max_model_len when + # prefix caching is disabled, and hash_block_size validation is skipped. 
+ assert not enable_caching or (hash_block_size == self.block_size), ( + "UnitaryKVCacheCoordinator assumes hash_block_size == block_size" + ) assert len(self.kv_cache_config.kv_cache_groups) == 1, ( "UnitaryKVCacheCoordinator assumes only one kv cache group" ) @@ -289,6 +312,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): block_pool=self.block_pool, kv_cache_spec=self.kv_cache_spec, use_eagle=self.use_eagle, + alignment_tokens=self.block_size, dcp_world_size=self.dcp_world_size, pcp_world_size=self.pcp_world_size, ) @@ -313,6 +337,7 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): enable_kv_cache_events: bool, dcp_world_size: int, pcp_world_size: int, + hash_block_size: int, ): super().__init__( kv_cache_config, @@ -322,7 +347,17 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): enable_kv_cache_events, dcp_world_size=dcp_world_size, pcp_world_size=pcp_world_size, + hash_block_size=hash_block_size, ) + # hash_block_size: the block size used to compute block hashes. + # The actual block size usually equals hash_block_size, but in cases where + # different KV cache groups have different block sizes, the actual block size + # can be a multiple of hash_block_size. + self.hash_block_size = hash_block_size + assert all( + g.kv_cache_spec.block_size % hash_block_size == 0 + for g in kv_cache_config.kv_cache_groups + ), "block_size must be divisible by hash_block_size" assert dcp_world_size == 1, "DCP not support hybrid attn now." assert pcp_world_size == 1, "PCP not support hybrid attn now." 
self.verify_and_split_kv_cache_groups() @@ -373,14 +408,12 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): self.other_spec = other_spec self.full_attention_block_size = self.full_attention_spec.block_size self.other_block_size = self.other_spec.block_size - - if self.enable_caching: - # this requirement is only needed for the prefix caching logic - divisible = self.other_block_size % self.full_attention_block_size - assert divisible == 0, ( - "KVCacheCoordinator assumes the block_size of full " - "attention layers is divisible by other layers now." - ) + # The LCM of the block sizes of full attention and other attention. + # The cache hit length must be a multiple of the LCM of the block sizes + # to make sure the cache hit length is a multiple of the block size of + # each attention type. Requiring this because we don't support partial + # block cache hit yet. + self.lcm_block_size = lcm(self.full_attention_block_size, self.other_block_size) if max(self.full_attention_group_ids) < min(self.other_group_ids): self.full_attn_first = True @@ -414,25 +447,48 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): - The number of tokens of the longest cache hit. """ # First, find the longest cache hit for full attention. + if self.full_attention_spec.block_size == self.hash_block_size: + # Common case. + full_attention_block_hashes: BlockHashList = block_hashes + else: + # block_size is a multiple of hash_block_size. This happens when different + # KV cache groups have different block sizes. In this case, we need to + # recalculate block_hashes at the granularity of block_size, using the + # original block_hashes (at the granularity of hash_block_size). 
+ full_attention_block_hashes = BlockHashListWithBlockSize( + block_hashes, self.hash_block_size, self.full_attention_spec.block_size + ) hit_blocks_full_attn = self.full_attention_manager_cls.find_longest_cache_hit( - block_hashes=block_hashes, + block_hashes=full_attention_block_hashes, max_length=max_cache_hit_length, kv_cache_group_ids=self.full_attention_group_ids, block_pool=self.block_pool, kv_cache_spec=self.full_attention_spec, use_eagle=self.use_eagle, + alignment_tokens=self.lcm_block_size, ) hit_length = len(hit_blocks_full_attn[0]) * self.full_attention_block_size # Next, find the cache hit for the other attention WITHIN # the cache hit of full attention. + if self.other_spec.block_size == self.hash_block_size: + # Common case. + other_block_hashes: BlockHashList = block_hashes + else: + # Similar to the full attention case, here we need to recalculate + # block_hashes at the granularity of block_size, using the original + # block_hashes (at the granularity of hash_block_size). 
+ other_block_hashes = BlockHashListWithBlockSize( + block_hashes, self.hash_block_size, self.other_spec.block_size + ) hit_blocks_other_attn = self.other_attention_cls.find_longest_cache_hit( - block_hashes=block_hashes, + block_hashes=other_block_hashes, max_length=hit_length, kv_cache_group_ids=self.other_group_ids, block_pool=self.block_pool, kv_cache_spec=self.other_spec, use_eagle=self.use_eagle, + alignment_tokens=self.lcm_block_size, ) hit_length = len(hit_blocks_other_attn[0]) * self.other_block_size @@ -466,6 +522,7 @@ def get_kv_cache_coordinator( enable_kv_cache_events: bool, dcp_world_size: int, pcp_world_size: int, + hash_block_size: int, ) -> KVCacheCoordinator: if not enable_caching: return KVCacheCoordinatorNoPrefixCache( @@ -473,8 +530,9 @@ def get_kv_cache_coordinator( max_model_len, use_eagle, enable_kv_cache_events, - dcp_world_size=dcp_world_size, - pcp_world_size=pcp_world_size, + dcp_world_size, + pcp_world_size, + hash_block_size, ) if len(kv_cache_config.kv_cache_groups) == 1: return UnitaryKVCacheCoordinator( @@ -483,8 +541,9 @@ def get_kv_cache_coordinator( use_eagle, enable_caching, enable_kv_cache_events, - dcp_world_size=dcp_world_size, - pcp_world_size=pcp_world_size, + dcp_world_size, + pcp_world_size, + hash_block_size, ) return HybridKVCacheCoordinator( kv_cache_config, @@ -492,6 +551,7 @@ def get_kv_cache_coordinator( use_eagle, enable_caching, enable_kv_cache_events, - dcp_world_size=dcp_world_size, - pcp_world_size=pcp_world_size, + dcp_world_size, + pcp_world_size, + hash_block_size, ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 2012c3fef88bc..b061e5cc831dd 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -95,6 +95,7 @@ class KVCacheManager: self, kv_cache_config: KVCacheConfig, max_model_len: int, + hash_block_size: int, enable_caching: bool = True, use_eagle: bool = False, log_stats: bool = False, @@ -107,28 +108,11 @@ class KVCacheManager: 
self.enable_caching = enable_caching self.use_eagle = use_eagle self.log_stats = log_stats - # FIXME: make prefix cache stats conditional on log_stats + # FIXME: make prefix cache stats conditional on log_stats. We still need + # this comment because when the log stats is enabled there are still + # potential configs we could expose in the future. self.prefix_cache_stats = PrefixCacheStats() if log_stats else None - self.block_size: int | None = None - if self.enable_caching: - assert ( - len( - set( - g.kv_cache_spec.block_size - for g in kv_cache_config.kv_cache_groups - ) - ) - == 1 - ), "Only one block size is supported for now" - self.block_size = kv_cache_config.kv_cache_groups[ - 0 - ].kv_cache_spec.block_size - - if dcp_world_size * pcp_world_size > 1: - assert len(kv_cache_config.kv_cache_groups) == 1 - self.block_size *= dcp_world_size * pcp_world_size - self.coordinator = get_kv_cache_coordinator( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, @@ -137,6 +121,7 @@ class KVCacheManager: enable_kv_cache_events=enable_kv_cache_events, dcp_world_size=dcp_world_size, pcp_world_size=pcp_world_size, + hash_block_size=hash_block_size, ) self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups) self.block_pool = self.coordinator.block_pool diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index a0033fa650baa..602eb81beb010 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -5,9 +5,9 @@ import copy import os from collections import defaultdict -from collections.abc import Callable, Iterable, Sequence -from dataclasses import dataclass -from typing import Any, NewType, TypeAlias +from collections.abc import Callable, Iterable, Iterator, Sequence +from dataclasses import dataclass, replace +from typing import Any, NewType, TypeAlias, overload from vllm import envs from vllm.config import VllmConfig @@ -825,11 +825,11 @@ def get_num_blocks( return num_blocks -def 
get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int: +def get_uniform_page_size(kv_cache_specs: Iterable[KVCacheSpec]) -> int: """ Get the page size of the KV cache. """ - page_sizes = set(layer.page_size_bytes for layer in kv_cache_spec.values()) + page_sizes = {layer.page_size_bytes for layer in kv_cache_specs} assert len(page_sizes) == 1 return page_sizes.pop() @@ -882,6 +882,46 @@ def is_kv_cache_page_size_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool return len(page_sizes) == 1 +def unify_kv_cache_spec_page_size( + kv_cache_spec: dict[str, KVCacheSpec], +) -> dict[str, KVCacheSpec]: + """ + Unify the page size of the given KVCacheSpec. If the page size of all layers + are the same, return the original KVCacheSpec. If not same, unify the page + size by increasing the block size of layers with smaller page size. Raise + NotImplementedError if failed to unify the page size. + + Args: + kv_cache_spec: The KVCacheSpec of each attention layer in the model + + Returns: + The updated KVCacheSpec with the same page_size_bytes. + """ + page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()} + if len(page_sizes) <= 1: + # All layers have the same page size, no need to unify. + return kv_cache_spec + + max_page_size = max(page_sizes) + new_kv_cache_spec = {} + for layer_name, layer_spec in kv_cache_spec.items(): + if layer_spec.page_size_bytes == max_page_size: + new_kv_cache_spec[layer_name] = layer_spec + else: + layer_page_size = layer_spec.page_size_bytes + if max_page_size % layer_page_size != 0: + raise NotImplementedError( + "The page size of the layer is not divisible by the " + "maximum page size. Cannot unify by adjusting block_size." 
+ ) + ratio = max_page_size // layer_page_size + new_block_size = layer_spec.block_size * ratio + new_spec = replace(layer_spec, block_size=new_block_size) + assert new_spec.page_size_bytes == max_page_size + new_kv_cache_spec[layer_name] = new_spec + return new_kv_cache_spec + + def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: # kv_cache_spec is an empty dict for attention free models return not kv_cache_spec @@ -1010,7 +1050,6 @@ def _get_kv_cache_groups_uniform_page_size( def get_kv_cache_config_from_groups( vllm_config: VllmConfig, kv_cache_groups: list[KVCacheGroupSpec], - kv_cache_specs: dict[str, KVCacheSpec], available_memory: int, ) -> KVCacheConfig: """ @@ -1020,7 +1059,6 @@ def get_kv_cache_config_from_groups( Args: vllm_config: The global VllmConfig kv_cache_groups: The KV cache groups - kv_cache_specs: The KV cache spec of each attention layer in the model available_memory: Memory available for KV cache in bytes Returns: The generated KVCacheConfig @@ -1064,7 +1102,9 @@ def get_kv_cache_config_from_groups( # full.1, sw.2: share another Tensor with size=available_memory//2 group_size = max(len(group.layer_names) for group in kv_cache_groups) - page_size = get_uniform_page_size(kv_cache_specs) + page_size = get_uniform_page_size( + [group.kv_cache_spec for group in kv_cache_groups] + ) assert group_size > 0, "group_size must be greater than 0" num_blocks = get_num_blocks( vllm_config, group_size, available_memory, page_size @@ -1166,7 +1206,8 @@ def get_kv_cache_groups( # This returns an empty list to allow for the KVCacheManager to handle # attention free models. return [] - elif is_kv_cache_spec_uniform(kv_cache_spec): + + if is_kv_cache_spec_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. Allocate the same amount of memory for # each layer. 
@@ -1176,14 +1217,16 @@ def get_kv_cache_groups( # full attention, or all layers are sliding window attention with the # same window size). Put all layers into one group. return _get_kv_cache_groups_uniform_type(uniform_spec) - elif is_kv_cache_page_size_uniform(kv_cache_spec): - # Model contains multiple attention types, but KV cache of all layers - # have the same physical memory per block per layer. Split the layers - # into groups with the same number of layers, and thus same total page - # size. - return _get_kv_cache_groups_uniform_page_size(kv_cache_spec) - raise NotImplementedError + # As KVCacheManager can only allocate memory of one size, we need to unify + # the page size of the layers. For cases cannot be unified, this function + # will raise an error. + kv_cache_spec = unify_kv_cache_spec_page_size(kv_cache_spec) + # Model contains multiple attention types, but KV cache of all layers + # have the same physical memory per block per layer. Split the layers + # into groups with the same number of layers, and thus same total page + # size. + return _get_kv_cache_groups_uniform_page_size(kv_cache_spec) def generate_scheduler_kv_cache_config( @@ -1327,10 +1370,7 @@ def get_kv_cache_configs( ) == len(kv_cache_spec_one_worker), "Some layers are not assigned to any group." kv_cache_configs.append( get_kv_cache_config_from_groups( - vllm_config, - kv_cache_groups_one_worker, - kv_cache_spec_one_worker, - available_memory_one_worker, + vllm_config, kv_cache_groups_one_worker, available_memory_one_worker ) ) @@ -1353,3 +1393,79 @@ def get_kv_cache_configs( _report_kv_cache_config(vllm_config, kv_cache_config) return kv_cache_configs + + +class BlockHashListWithBlockSize: + """ + Convert block-hash granularity from `hash_block_size` to `target_block_size`. + Used when KV cache groups have different block sizes: `hash_block_size` + is the size used to compute the original `block_hashes`; `target_block_size` + is the group's actual block size. 
+ + Currently, only scaling up by an integer factor is supported (i.e., + `target_block_size` is a multiple of `hash_block_size`). Conversion is + performed lazily on access for efficiency, by concatenating consecutive + hashes at `hash_block_size` to form each hash at `target_block_size`. + + Example (`hash_block_size` = 16, `target_block_size` = 32): + concatenating two 16-size hashes yields one 32-size hash: + + Block hashes with block_size 16: + | Token Range | 0-15 | 16-31 | 32-47 | 48-63 | + |-------------|------|-------|-------|-------| + | Hash | A | B | C | D | + + Block hashes with block_size 32: + | Token Range | 0-31 | 32-63 | + |-------------|------|-------| + | Hash | AB | CD | + + Args: + block_hashes: Block hashes to convert, computed at `hash_block_size`. + hash_block_size: Block size at which `block_hashes` were computed. + target_block_size: Desired block size; must be a multiple of `hash_block_size`. + """ + + def __init__( + self, + block_hashes: list[BlockHash], + hash_block_size: int, + target_block_size: int, + ): + self.block_hashes = block_hashes + assert target_block_size % hash_block_size == 0 + self.scale_factor = target_block_size // hash_block_size + + def __len__(self) -> int: + return len(self.block_hashes) // self.scale_factor + + @overload + def __getitem__(self, idx: int) -> BlockHash: ... + + @overload + def __getitem__(self, idx: slice) -> list[BlockHash]: ... 
+ + def __getitem__(self, idx): + if isinstance(idx, int): + return self._get_value_at(idx) + + if isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + return [self._get_value_at(i) for i in range(start, stop, step)] + + raise TypeError(f"Invalid index type: {type(idx)!r}") + + def __iter__(self) -> Iterator[BlockHash]: + for i in range(len(self)): + yield self._get_value_at(i) + + def _get_value_at(self, idx: int) -> BlockHash: + base = idx * self.scale_factor + end = base + self.scale_factor + merged_hash: bytes = self.block_hashes[base] + for i in range(base + 1, end): + merged_hash += self.block_hashes[i] + return BlockHash(merged_hash) + + +BlockHashList = list[BlockHash] | BlockHashListWithBlockSize diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 23af014c10364..bea2f865bad46 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -186,6 +186,7 @@ class Scheduler(SchedulerInterface): enable_kv_cache_events=self.enable_kv_cache_events, dcp_world_size=self.dcp_world_size, pcp_world_size=self.pcp_world_size, + hash_block_size=self.block_size, ) self.use_pp = self.parallel_config.pipeline_parallel_size > 1 self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index d90ec550f7666..4aeb17a156bb3 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -7,7 +7,7 @@ from collections.abc import Sequence from vllm.utils.math_utils import cdiv from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock +from vllm.v1.core.kv_cache_utils import BlockHashList, KVCacheBlock from vllm.v1.kv_cache_interface import ( ChunkedLocalAttentionSpec, CrossAttentionSpec, @@ -207,12 +207,13 @@ class SingleTypeKVCacheManager(ABC): @abstractmethod def find_longest_cache_hit( cls, - block_hashes: 
list[BlockHash], + block_hashes: BlockHashList, max_length: int, kv_cache_group_ids: list[int], block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + alignment_tokens: int, dcp_world_size: int = 1, pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: @@ -232,6 +233,11 @@ class SingleTypeKVCacheManager(ABC): block_pool: The block pool. kv_cache_spec: The kv cache spec. use_eagle: Whether to use eagle. + alignment_tokens: The returned cache hit length (in tokens) should + be a multiple of this value (in tokens). By default, it should + be set to the block_size. + dcp_world_size: The world size of decode context parallelism. + pcp_world_size: The world size of prefill context parallelism. Returns: A list of cached blocks with skipped blocks replaced by null block @@ -299,17 +305,18 @@ class FullAttentionManager(SingleTypeKVCacheManager): @classmethod def find_longest_cache_hit( cls, - block_hashes: list[BlockHash], + block_hashes: BlockHashList, max_length: int, kv_cache_group_ids: list[int], block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + alignment_tokens: int, dcp_world_size: int = 1, pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance( - kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec) + kv_cache_spec, FullAttentionSpec | ChunkedLocalAttentionSpec ), ( "FullAttentionManager can only be used for full attention " "and chunked local attention groups" @@ -333,6 +340,13 @@ class FullAttentionManager(SingleTypeKVCacheManager): else: break if use_eagle and computed_blocks[0]: + # Need to drop the last matched block if eagle is enabled. + for computed in computed_blocks: + computed.pop() + while ( + block_size != alignment_tokens # Faster for common case. 
+ and len(computed_blocks[0]) * block_size % alignment_tokens != 0 + ): for computed in computed_blocks: computed.pop() return computed_blocks @@ -359,12 +373,13 @@ class SlidingWindowManager(SingleTypeKVCacheManager): @classmethod def find_longest_cache_hit( cls, - block_hashes: list[BlockHash], + block_hashes: BlockHashList, max_length: int, kv_cache_group_ids: list[int], block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + alignment_tokens: int, dcp_world_size: int = 1, pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: @@ -396,6 +411,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager): [block_pool.null_block] * max_num_blocks for _ in range(len(kv_cache_group_ids)) ) + block_size = kv_cache_spec.block_size num_contiguous_blocks = 0 match_found = False # Search from right to left and early stop when a match is found. @@ -403,6 +419,15 @@ class SlidingWindowManager(SingleTypeKVCacheManager): if cached_block := block_pool.get_cached_block( block_hashes[i], kv_cache_group_ids ): + # Skip prefix matching check if the block is not aligned with + # `alignment_tokens`. + if ( + num_contiguous_blocks == 0 + and block_size != alignment_tokens # Faster for common case. + and (i + 1) * block_size % alignment_tokens != 0 + ): + continue + # Add the cached block to the computed blocks. for computed, cached in zip(computed_blocks, cached_block): computed[i] = cached num_contiguous_blocks += 1 @@ -421,7 +446,16 @@ class SlidingWindowManager(SingleTypeKVCacheManager): # `num_contiguous_blocks < sliding_window_contiguous_blocks`. for computed in computed_blocks: del computed[num_contiguous_blocks:] + while ( + block_size != alignment_tokens # Faster for common case. 
+ and len(computed_blocks[0]) * block_size % alignment_tokens != 0 + ): + for computed in computed_blocks: + computed.pop() if use_eagle and computed_blocks[0]: + assert kv_cache_spec.block_size == alignment_tokens, ( + "aligned_length is not compatible with eagle now" + ) for computed in computed_blocks: computed.pop() return computed_blocks @@ -475,12 +509,13 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): @classmethod def find_longest_cache_hit( cls, - block_hashes: list[BlockHash], + block_hashes: BlockHashList, max_length: int, kv_cache_group_ids: list[int], block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + alignment_tokens: int, dcp_world_size: int = 1, pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: @@ -511,6 +546,10 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): block_pool: The block pool. kv_cache_spec: The kv cache spec. use_eagle: Whether to use eagle. + dcp_world_size: The world size of decode context parallelism. + pcp_world_size: The world size of prefill context parallelism. + alignment_tokens: The returned cache hit length (in tokens) should + be a multiple of this value (in tokens). Returns: A list of cached blocks @@ -524,6 +563,10 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): ) assert dcp_world_size == 1, "DCP not support chunked local attn now." assert pcp_world_size == 1, "PCP not support chunked local attn now." 
+ assert kv_cache_spec.block_size == alignment_tokens, ( + "KV cache groups with different block sizes are not compatible with " + "chunked local attention now" + ) max_num_blocks = max_length // kv_cache_spec.block_size if max_length > 0: local_attention_start_idx = ( @@ -612,12 +655,13 @@ class MambaManager(SingleTypeKVCacheManager): @classmethod def find_longest_cache_hit( cls, - block_hashes: list[BlockHash], + block_hashes: BlockHashList, max_length: int, kv_cache_group_ids: list[int], block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + alignment_tokens: int, dcp_world_size: int = 1, pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: @@ -630,12 +674,21 @@ class MambaManager(SingleTypeKVCacheManager): [] for _ in range(len(kv_cache_group_ids)) ) - max_num_blocks = max_length // kv_cache_spec.block_size + block_size = kv_cache_spec.block_size + max_num_blocks = max_length // block_size # Search from right to left and early stop when a match is found. for i in range(max_num_blocks - 1, -1, -1): if cached_block := block_pool.get_cached_block( block_hashes[i], kv_cache_group_ids ): + # When enable Mamba prefix caching, `block_size` will be aligned + # across full attention layers and Mamba layers to ensure the + # prefix hit length aligned at block + if ( + block_size != alignment_tokens # Faster for common case. 
+ and (i + 1) * block_size % alignment_tokens != 0 + ): + continue for computed, cached in zip(computed_blocks, cached_block): # the hit length logic later assumes: # hit_length = len(hit_blocks_other_attn[0]) @@ -708,12 +761,13 @@ class CrossAttentionManager(SingleTypeKVCacheManager): @classmethod def find_longest_cache_hit( cls, - block_hashes: list[BlockHash], + block_hashes: BlockHashList, max_length: int, kv_cache_group_ids: list[int], block_pool: BlockPool, kv_cache_spec: KVCacheSpec, use_eagle: bool, + alignment_tokens: int, dcp_world_size: int = 1, pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: From a1f267687956ae1f0b7c1668b48020055d764619 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:08:57 +0000 Subject: [PATCH 10/24] Scheduled removal of `override_pooler_config` and `disable_log_requests` (#29402) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/config/model.py | 16 ---------------- vllm/engine/arg_utils.py | 29 +---------------------------- vllm/entrypoints/llm.py | 5 ----- vllm/utils/argparse_utils.py | 8 -------- vllm/v1/engine/async_llm.py | 8 -------- 5 files changed, 1 insertion(+), 65 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 14ffdec2e09d1..ce5e824da5c22 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -299,9 +299,6 @@ class ModelConfig: pooler_config: PoolerConfig | None = None """Pooler config which controls the behaviour of output pooling in pooling models.""" - override_pooler_config: dict | PoolerConfig | None = None - """[DEPRECATED] Use `pooler_config` instead. 
This field will be removed in - v0.12.0 or v1.0.0, whichever is sooner.""" # Multimodal config and init vars multimodal_config: MultiModalConfig | None = None @@ -359,7 +356,6 @@ class ModelConfig: "logits_processors", "io_processor_plugin", "pooler_config", - "override_pooler_config", "multimodal_config", "limit_mm_per_prompt", "media_io_kwargs", @@ -648,18 +644,6 @@ class ModelConfig: # Init pooler config if needed if self.runner_type == "pooling": - if self.override_pooler_config is not None: - logger.warning_once( - "`override_pooler_config` is deprecated and will be " - "removed in v0.12.0 or v1.0.0, whichever is sooner. " - "Please use `pooler_config` instead." - ) - - if isinstance(self.override_pooler_config, dict): - self.pooler_config = PoolerConfig(**self.override_pooler_config) - else: - self.pooler_config = self.override_pooler_config - if self.pooler_config is None: self.pooler_config = PoolerConfig() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bdccb15e3f655..696ff3a1f4024 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -29,7 +29,7 @@ import regex as re import torch from pydantic import TypeAdapter, ValidationError from pydantic.fields import FieldInfo -from typing_extensions import TypeIs, deprecated +from typing_extensions import TypeIs import vllm.envs as envs from vllm.attention.backends.registry import AttentionBackendEnum @@ -520,9 +520,6 @@ class EngineArgs: scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls pooler_config: PoolerConfig | None = ModelConfig.pooler_config - override_pooler_config: dict | PoolerConfig | None = ( - ModelConfig.override_pooler_config - ) compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config") worker_cls: str = ParallelConfig.worker_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls @@ -659,11 +656,6 @@ class EngineArgs: ) model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) 
model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"]) - model_group.add_argument( - "--override-pooler-config", - **model_kwargs["override_pooler_config"], - deprecated=True, - ) model_group.add_argument( "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"] ) @@ -1243,7 +1235,6 @@ class EngineArgs: mm_encoder_tp_mode=self.mm_encoder_tp_mode, mm_encoder_attn_backend=self.mm_encoder_attn_backend, pooler_config=self.pooler_config, - override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, override_generation_config=self.override_generation_config, @@ -2047,24 +2038,6 @@ class AsyncEngineArgs(EngineArgs): enable_log_requests: bool = False - @property - @deprecated( - "`disable_log_requests` is deprecated and has been replaced with " - "`enable_log_requests`. This will be removed in v0.12.0. Please use " - "`enable_log_requests` instead." - ) - def disable_log_requests(self) -> bool: - return not self.enable_log_requests - - @disable_log_requests.setter - @deprecated( - "`disable_log_requests` is deprecated and has been replaced with " - "`enable_log_requests`. This will be removed in v0.12.0. Please use " - "`enable_log_requests` instead." - ) - def disable_log_requests(self, value: bool): - self.enable_log_requests = not value - @staticmethod def add_cli_args( parser: FlexibleArgumentParser, async_args_only: bool = False diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 848916dbd8763..1860f383d45fb 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -174,9 +174,6 @@ class LLM: For example, for Phi-3-Vision: `{"num_crops": 4}`. pooler_config: Initialize non-default pooling config for the pooling model. e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. - override_pooler_config: [DEPRECATED] Use `pooler_config` instead. 
This - argument is deprecated and will be removed in v0.12.0 or v1.0.0, - whichever is sooner. compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the mode of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. @@ -214,7 +211,6 @@ class LLM: hf_overrides: HfOverrides | None = None, mm_processor_kwargs: dict[str, Any] | None = None, pooler_config: PoolerConfig | None = None, - override_pooler_config: PoolerConfig | None = None, structured_outputs_config: dict[str, Any] | StructuredOutputsConfig | None = None, @@ -330,7 +326,6 @@ class LLM: hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, pooler_config=pooler_config, - override_pooler_config=override_pooler_config, structured_outputs_config=structured_outputs_instance, compilation_config=compilation_config_instance, logits_processors=logits_processors, diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py index 3d105a3685b37..692e756d19634 100644 --- a/vllm/utils/argparse_utils.py +++ b/vllm/utils/argparse_utils.py @@ -73,14 +73,6 @@ class FlexibleArgumentParser(ArgumentParser): # Enable the deprecated kwarg for Python 3.12 and below def parse_known_args(self, args=None, namespace=None): - if args is not None and "--disable-log-requests" in args: - # Special case warning because the warning below won't trigger - # if –-disable-log-requests because its value is default. - logger.warning_once( - "argument '--disable-log-requests' is deprecated and " - "replaced with '--enable-log-requests'. This will be " - "removed in v0.12.0." 
- ) namespace, args = super().parse_known_args(args, namespace) for action in FlexibleArgumentParser._deprecated: if ( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 55087baadff97..827a2736af284 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -31,7 +31,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_ from vllm.usage.usage_lib import UsageContext from vllm.utils.async_utils import cancel_task_threadsafe from vllm.utils.collection_utils import as_list -from vllm.utils.func_utils import deprecate_kwargs from vllm.utils.math_utils import cdiv from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient @@ -195,12 +194,6 @@ class AsyncLLM(EngineClient): self.profiler = None @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=( - "This argument will have no effect. Use `enable_log_requests` instead." - ), - ) def from_vllm_config( cls, vllm_config: VllmConfig, @@ -213,7 +206,6 @@ class AsyncLLM(EngineClient): client_addresses: dict[str, str] | None = None, client_count: int = 1, client_index: int = 0, - disable_log_requests: bool = True, # Deprecated, will be removed ) -> "AsyncLLM": # Create the LLMEngine. 
return cls( From 0353d2e162cbda776d9dbfe026e65303204a7f1f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:23:45 +0000 Subject: [PATCH 11/24] Fix RoPE related failures in Transformers nightly tests (#29333) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/baichuan.py | 2 +- vllm/model_executor/models/gpt_j.py | 2 +- vllm/model_executor/models/grok1.py | 2 +- vllm/model_executor/models/llama.py | 2 +- vllm/transformers_utils/config.py | 62 ++++++++++++++------------ 5 files changed, 37 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index edf47270e5277..024788918d024 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -233,7 +233,7 @@ class BaiChuanDecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=config.num_attention_heads, position_embedding=position_embedding, - rope_parameters=config.rope_parameters, + rope_parameters=getattr(config, "rope_parameters", None), max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index e94de8952fa63..bd1bfea3c0fef 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -100,7 +100,7 @@ class GPTJAttention(nn.Module): self.head_size, rotary_dim=config.rotary_dim, max_position=max_position_embeddings, - rope_parameters=config.rope_parameters, + rope_parameters=getattr(config, "rope_parameters", None), is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 4bf23cd6fd19a..cfca564920111 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -239,7 +239,7 @@ class 
Grok1DecoderLayer(nn.Module): num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_parameters=config.rope_parameters, + rope_parameters=getattr(config, "rope_parameters", None), cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index eebb9e07fa89d..f6af2bb3b12e9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -262,7 +262,7 @@ class LlamaAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - rope_parameters=config.rope_parameters, + rope_parameters=getattr(config, "rope_parameters", None), is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index a29d92f67f5df..66680f410cb3c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -456,51 +456,55 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" - # Retrieve rope_parameters differently based on Transformers version + # Patch rope_parameters differently based on Transformers version if Version(version("transformers")) >= Version("5.0.0.dev0"): - from transformers.modeling_rope_utils import RopeParameters - - rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( - config, "rope_parameters", None + from transformers.modeling_rope_utils import ( + rope_config_validation, + standardize_rope_params, ) - elif hasattr(config, "rope_parameters"): - # We are in Transformers v4 and rope_parameters - # has already been patched for this config - return + + # When Transformers v5 is installed, legacy rope_theta may be present + # when 
using custom code models written for Transformers v4 + if (rope_theta := getattr(config, "rope_theta", None)) is not None: + standardize_rope_params(config, rope_theta=rope_theta) + rope_config_validation(config) + # Delete rope_theta to avoid confusion in downstream code + del config.rope_theta else: - # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters - rope_theta: float | None = getattr(config, "rope_theta", None) - rope_scaling: dict | None = getattr(config, "rope_scaling", None) - rope_parameters = rope_scaling - # Move rope_theta into rope_parameters - if rope_theta is not None: - rope_parameters = rope_parameters or {"rope_type": "default"} - rope_parameters["rope_theta"] = rope_theta - # Add original_max_position_embeddings if present - if rope_parameters and ( - ompe := getattr(config, "original_max_position_embeddings", None) - ): - rope_parameters["original_max_position_embeddings"] = ompe - # Write back to config - config.rope_parameters = rope_parameters + # When Transformers v4 is installed, legacy rope_scaling may be present + if (rope_scaling := getattr(config, "rope_scaling", None)) is not None: + config.rope_parameters = rope_scaling + # When Transformers v4 is installed, legacy rope_theta may be present + if (rope_theta := getattr(config, "rope_theta", None)) is not None: + if not hasattr(config, "rope_parameters"): + config.rope_parameters = {"rope_type": "default"} + config.rope_parameters["rope_theta"] = rope_theta # No RoPE parameters to patch - if rope_parameters is None: + if not hasattr(config, "rope_parameters"): return + # Add original_max_position_embeddings if present + if ompe := getattr(config, "original_max_position_embeddings", None): + config.rope_parameters["original_max_position_embeddings"] = ompe + # Handle nested rope_parameters in interleaved sliding attention models - if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): - for rope_parameters_layer_type in rope_parameters.values(): + if 
set(config.rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + for rope_parameters_layer_type in config.rope_parameters.values(): patch_rope_parameters_dict(rope_parameters_layer_type) else: - patch_rope_parameters_dict(rope_parameters) + patch_rope_parameters_dict(config.rope_parameters) def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: if "rope_type" in rope_parameters and "type" in rope_parameters: rope_type = rope_parameters["rope_type"] rope_type_legacy = rope_parameters["type"] - if rope_type != rope_type_legacy: + if (rope_type_legacy == "su" and rope_type == "longrope") or ( + rope_type_legacy == "mrope" and rope_type == "default" + ): + pass # No action needed + elif rope_type != rope_type_legacy: raise ValueError( f"Found conflicts between 'rope_type={rope_type}' (modern " f"field) and 'type={rope_type_legacy}' (legacy field). " From b07555d26f4c7ad9a2d1ec45428a9d4287db612c Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Tue, 25 Nov 2025 10:27:26 -0800 Subject: [PATCH 12/24] [responsesAPI][2] parse ResponseFunctionToolCallOutputItem (#29383) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- tests/entrypoints/test_responses_utils.py | 15 +++++++++++++++ vllm/entrypoints/openai/protocol.py | 5 +---- vllm/entrypoints/responses_utils.py | 9 +++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py index 91c818374e3fd..893d806b65742 100644 --- a/tests/entrypoints/test_responses_utils.py +++ b/tests/entrypoints/test_responses_utils.py @@ -2,6 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from openai.types.responses.response_function_tool_call_output_item import ( + ResponseFunctionToolCallOutputItem, +) from openai.types.responses.response_reasoning_item import ( Content, ResponseReasoningItem, @@ -76,6 +79,18 @@ class TestResponsesUtils: == 'Hmm, the user has just started with a 
simple "Hello,"' ) + tool_call_output = ResponseFunctionToolCallOutputItem( + id="temp_id", + type="function_call_output", + call_id="temp", + output="1234", + status="completed", + ) + formatted_item = construct_chat_message_with_tool_call(tool_call_output) + assert formatted_item["role"] == "tool" + assert formatted_item["content"] == "1234" + assert formatted_item["tool_call_id"] == "temp" + item = ResponseReasoningItem( id="lol", summary=[], diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 98a385a1dcd5f..688ea9697d9d6 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -29,7 +29,6 @@ from openai.types.responses import ( ResponseOutputItemAddedEvent, ResponseOutputItemDoneEvent, ResponsePrompt, - ResponseReasoningItem, ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, ResponseStatus, @@ -304,9 +303,7 @@ def get_logits_processors( return None -ResponseInputOutputItem: TypeAlias = ( - ResponseInputItemParam | ResponseReasoningItem | ResponseFunctionToolCall -) +ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem class ResponsesRequest(OpenAIBaseModel): diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index b02c43c7f8246..07abb80ebc9e3 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -10,6 +10,9 @@ from openai.types.chat.chat_completion_message_tool_call_param import ( Function as FunctionCallTool, ) from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem +from openai.types.responses.response_function_tool_call_output_item import ( + ResponseFunctionToolCallOutputItem, +) from openai.types.responses.response_output_message import ResponseOutputMessage from openai.types.responses.response_reasoning_item import ResponseReasoningItem from openai.types.responses.tool import Tool @@ -94,6 +97,12 @@ def 
construct_chat_message_with_tool_call( "role": "assistant", "reasoning": reasoning_content, } + elif isinstance(item, ResponseFunctionToolCallOutputItem): + return ChatCompletionToolMessageParam( + role="tool", + content=item.output, + tool_call_id=item.call_id, + ) elif item.get("type") == "function_call_output": # Append the function call output as a tool message. return ChatCompletionToolMessageParam( From c32a18cbe7342ac0700802b94ae98bbf928a00f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> Date: Tue, 25 Nov 2025 20:23:36 +0100 Subject: [PATCH 13/24] Attempt to fix GPU OOM in a spec-decoding test (#29419) Signed-off-by: Eldar Kurtic <8884008+eldarkurtic@users.noreply.github.com> --- examples/offline_inference/spec_decode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 67a0732459709..29b2e95d262f8 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -133,7 +133,7 @@ def main(args): tensor_parallel_size=args.tp, enable_chunked_prefill=args.enable_chunked_prefill, enforce_eager=args.enforce_eager, - gpu_memory_utilization=0.8, + gpu_memory_utilization=0.9, speculative_config=speculative_config, disable_log_stats=False, max_model_len=args.max_model_len, From e7d776273de379bb6c9fc11ce070c57e0fcd84f9 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Tue, 25 Nov 2025 20:58:56 +0100 Subject: [PATCH 14/24] [Compile] Refactor. 
Move PostGradPassManager out of Compilation config (#29340) Signed-off-by: ilmarkov --- vllm/compilation/backends.py | 33 ++++++++++++++++----------- vllm/compilation/piecewise_backend.py | 2 +- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 2d8dd4c51c7ef..1773913d0b6c6 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -11,6 +11,7 @@ import pprint import time from collections.abc import Callable, Sequence from contextlib import contextmanager +from copy import deepcopy from functools import partial from typing import Any @@ -429,7 +430,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): self.vllm_backend.compiler_manager.compile( submod, args, - self.compilation_config.inductor_compile_config, + self.vllm_backend.inductor_config, self.compilation_config, graph_index=index, num_graphs=len(self.compile_submod_names), @@ -531,6 +532,9 @@ class VllmBackend: sym_tensor_indices: list[int] input_buffers: list[torch.Tensor] compiler_manager: CompilerManager + # Copy of CompilationConfig.inductor_compile_config + + # an entry for PostGradPassManager + inductor_config: dict[str, Any] def __init__( self, @@ -561,25 +565,30 @@ class VllmBackend: self.compilation_config ) + # Deepcopy the inductor config to detach the post-grad custom pass + # from CompilationConfig. + # We want to avoid PostGradPassManager in CompilationConfig because + # in future we need PostGradPassManager.uuid() to be executed + # only at compile time. + self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config) # `torch.compile` is JIT compiled, so we don't need to # do anything here def configure_post_pass(self): - config = self.compilation_config self.pass_manager.configure(self.vllm_config) # Post-grad custom passes are run using the post_grad_custom_post_pass # hook. If a pass for that hook exists, add it to the pass manager. 
- inductor_config = config.inductor_compile_config - if self.pass_key in inductor_config: - if isinstance(inductor_config[self.pass_key], PostGradPassManager): - # PassManager already added to config, make sure it's correct - assert inductor_config[self.pass_key].uuid() == self.pass_manager.uuid() + if self.pass_key in self.inductor_config: + if isinstance(self.inductor_config[self.pass_key], PostGradPassManager): + raise ValueError( + "PostGradPassManager can not be kept in CompilationConfig." + ) else: # Config should automatically wrap all inductor passes - assert isinstance(inductor_config[self.pass_key], InductorPass) - self.pass_manager.add(inductor_config[self.pass_key]) - inductor_config[self.pass_key] = self.pass_manager + assert isinstance(self.inductor_config[self.pass_key], InductorPass) + self.pass_manager.add(self.inductor_config[self.pass_key]) + self.inductor_config[self.pass_key] = self.pass_manager def __call__( self, graph: fx.GraphModule, example_inputs @@ -638,9 +647,7 @@ class VllmBackend: self.compilation_config.local_cache_dir = local_cache_dir # Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE. 
- disable_cache = not is_compile_cache_enabled( - self.compilation_config.inductor_compile_config - ) + disable_cache = not is_compile_cache_enabled(self.inductor_config) if disable_cache: logger.info_once("vLLM's torch.compile cache is disabled.", scope="local") diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 2931580afbbb0..e535d2c461c6e 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -107,7 +107,7 @@ class PiecewiseBackend: entry.runnable = self.vllm_backend.compiler_manager.compile( self.graph, args, - self.compilation_config.inductor_compile_config, + self.vllm_backend.inductor_config, self.compilation_config, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, From 4e57c6587fe062211177f6b5d6785f00c3aea562 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 25 Nov 2025 12:55:24 -0800 Subject: [PATCH 15/24] [Core] Support logprobs with spec decode + async scheduling (#29223) Signed-off-by: Nick Hill --- tests/v1/e2e/test_async_scheduling.py | 7 ++++- vllm/v1/core/sched/scheduler.py | 2 -- vllm/v1/sample/rejection_sampler.py | 14 ++++++++-- vllm/v1/worker/gpu_model_runner.py | 37 ++++++++++++--------------- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 00d93e1ba0b53..945276376d665 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -87,6 +87,11 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): # Set small draft model len to force doesn't-fit-in-drafter case. 
spec_config_short = spec_config | {"max_model_len": 50} + test_sampling_params = [ + dict(), + dict(logprobs=2), + ] + # test_preemption, executor, async_scheduling, # spec_config, test_prefill_chunking test_configs = [ @@ -103,7 +108,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): (True, "uni", True, spec_config_short, True), ] - run_tests(monkeypatch, MTP_MODEL, test_configs, [{}]) + run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params) @dynamo_config.patch(cache_size_limit=16) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index bea2f865bad46..0304a8ec48bf7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1089,8 +1089,6 @@ class Scheduler(SchedulerInterface): and request.sampling_params.logprobs is not None and logprobs ): - # NOTE: once we support N tokens per step (spec decode), - # the outer lists can be of length > 1. new_logprobs = logprobs.slice(req_index, req_index + 1) if new_token_ids and self.structured_output_manager.should_advance(request): diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 926305d25f56b..ccaf07e18c468 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Sequence from dataclasses import replace import torch @@ -204,7 +205,9 @@ class RejectionSampler(nn.Module): def parse_output( output_token_ids: torch.Tensor, vocab_size: int, - ) -> list[list[int]]: + discard_req_indices: Sequence[int] = (), + return_cu_num_tokens: bool = False, + ) -> tuple[list[list[int]], list[int] | None]: """Parse the output of the rejection sampler. 
Args: output_token_ids: The sampled token IDs in shape @@ -212,6 +215,8 @@ class RejectionSampler(nn.Module): replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler and will be filtered out in this function. vocab_size: The size of the vocabulary. + discard_req_indices: Optional row indices to discard tokens in. + return_cu_num_tokens: Whether to also return cumulative token counts. Returns: A list of lists of token IDs. """ @@ -220,10 +225,15 @@ class RejectionSampler(nn.Module): valid_mask = (output_token_ids_np != PLACEHOLDER_TOKEN_ID) & ( output_token_ids_np < vocab_size ) + cu_num_tokens = None + if return_cu_num_tokens: + cu_num_tokens = [0] + valid_mask.sum(axis=1).cumsum().tolist() + if len(discard_req_indices) > 0: + valid_mask[discard_req_indices] = False outputs = [ row[valid_mask[i]].tolist() for i, row in enumerate(output_token_ids_np) ] - return outputs + return outputs, cu_num_tokens def apply_logits_processors( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e78d3c71af77a..bb44c5ad84cc1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -183,7 +183,7 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): self, model_runner_output: ModelRunnerOutput, sampled_token_ids: torch.Tensor, - logprobs_tensors: torch.Tensor | None, + logprobs_tensors: LogprobsTensors | None, invalid_req_indices: list[int], async_output_copy_stream: torch.cuda.Stream, vocab_size: int, @@ -219,28 +219,29 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): This function blocks until the copy is finished. """ + max_gen_len = self.sampled_token_ids_cpu.shape[-1] self.async_copy_ready_event.synchronize() # Release the device tensors once the copy has completed. 
del self._logprobs_tensors del self._sampled_token_ids - max_gen_len = self.sampled_token_ids_cpu.shape[-1] if max_gen_len == 1: valid_sampled_token_ids = self.sampled_token_ids_cpu.tolist() + for i in self._invalid_req_indices: + valid_sampled_token_ids[i].clear() + cu_num_tokens = None else: - valid_sampled_token_ids = RejectionSampler.parse_output( + valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output( self.sampled_token_ids_cpu, self.vocab_size, + self._invalid_req_indices, + return_cu_num_tokens=self._logprobs_tensors_cpu is not None, ) - for i in self._invalid_req_indices: - valid_sampled_token_ids[i].clear() output = self._model_runner_output output.sampled_token_ids = valid_sampled_token_ids if self._logprobs_tensors_cpu: - # NOTE(nick): this will need to be updated to use cu_num_accepted_tokens - # for async sched + spec decode + logprobs compatibility. - output.logprobs = self._logprobs_tensors_cpu.tolists() + output.logprobs = self._logprobs_tensors_cpu.tolists(cu_num_tokens) return output @@ -2597,28 +2598,24 @@ class GPUModelRunner( sampled_token_ids = sampler_output.sampled_token_ids logprobs_tensors = sampler_output.logprobs_tensors invalid_req_indices = [] - cu_num_new_tokens: list[int] | None = None + cu_num_tokens: list[int] | None = None if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] if max_gen_len == 1: # No spec decode tokens. valid_sampled_token_ids = self._to_list(sampled_token_ids) + # Mask out the sampled tokens that should not be sampled. + for i in discard_sampled_tokens_req_indices: + valid_sampled_token_ids[int(i)].clear() else: # Includes spec decode tokens. 
- valid_sampled_token_ids = self.rejection_sampler.parse_output( + valid_sampled_token_ids, cu_num_tokens = RejectionSampler.parse_output( sampled_token_ids, self.input_batch.vocab_size, + discard_sampled_tokens_req_indices, + return_cu_num_tokens=logprobs_tensors is not None, ) - if logprobs_tensors: - # Needed for extracting logprobs when spec decoding. - # This must be done prior to discarding sampled tokens. - cu_num_new_tokens = [0] - for toks in valid_sampled_token_ids: - cu_num_new_tokens.append(cu_num_new_tokens[-1] + len(toks)) - # Mask out the sampled tokens that should not be sampled. - for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[int(i)].clear() else: valid_sampled_token_ids = [] invalid_req_indices = discard_sampled_tokens_req_indices.tolist() @@ -2672,7 +2669,7 @@ class GPUModelRunner( req_state.output_token_ids.extend(sampled_ids) logprobs_lists = ( - logprobs_tensors.tolists(cu_num_new_tokens) + logprobs_tensors.tolists(cu_num_tokens) if not self.use_async_scheduling and logprobs_tensors is not None else None ) From 0abc79482a6d476f58530907443c46134ba6e2e1 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Tue, 25 Nov 2025 16:46:41 -0500 Subject: [PATCH 16/24] [caching] Add enable_prompt_embeds and cpu_offload_gb to compile hashes. (#29435) Signed-off-by: zhxchen17 --- vllm/config/cache.py | 4 +--- vllm/config/model.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index ef6928d8ebd5c..00530846fce00 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -144,7 +144,7 @@ class CacheConfig: kv_offloading_backend: KVOffloadingBackend | None = None """The backend to use for KV cache offloading. 
Supported backends include - 'native' (vLLM native CPU offloading), 'lmcache' This option must be used + 'native' (vLLM native CPU offloading), 'lmcache' This option must be used together with kv_offloading_size.""" def compute_hash(self) -> str: @@ -167,8 +167,6 @@ class CacheConfig: "num_gpu_blocks_override", "enable_prefix_caching", "prefix_caching_hash_algo", - # `cpu_offload_gb` does not use `torch.compile` yet. - "cpu_offload_gb", "cpu_kvcache_space_bytes", "mamba_page_size_padded", # Post-init/derived counters diff --git a/vllm/config/model.py b/vllm/config/model.py index ce5e824da5c22..25972f097f53d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -345,7 +345,6 @@ class ModelConfig: "logprobs_mode", "disable_cascade_attn", "skip_tokenizer_init", - "enable_prompt_embeds", "served_model_name", "config_format", "hf_token", From 7df0289782ab500b2713b7521979c28de2b21cac Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 25 Nov 2025 17:52:31 -0500 Subject: [PATCH 17/24] Change warning logs to debug for unimplemented MXFP4 Linear/Attention (#29441) Signed-off-by: Michael Goin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/model_executor/layers/quantization/mxfp4.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 198feb03be3e4..d975131f7cff7 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -196,9 +196,10 @@ class Mxfp4Config(QuantizationConfig): # TODO: Add support for MXFP4 Linear Method. # MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation # if you are interested in enabling MXFP4 here. - logger.warning_once( + logger.debug_once( "MXFP4 linear layer is not implemented - falling back to " - "UnquantizedLinearMethod." 
+ "UnquantizedLinearMethod.", + scope="local", ) return UnquantizedLinearMethod() elif isinstance(layer, FusedMoE): @@ -208,9 +209,10 @@ class Mxfp4Config(QuantizationConfig): return Mxfp4MoEMethod(layer.moe_config) elif isinstance(layer, Attention): # TODO: Add support for MXFP4 Attention. - logger.warning_once( + logger.debug_once( "MXFP4 attention layer is not implemented. " - "Skipping quantization for this layer." + "Skipping quantization for this layer.", + scope="local", ) return None From de75b0bb701c89c1b2bffe09b23f86446c951f73 Mon Sep 17 00:00:00 2001 From: Andrey Khalyavin Date: Wed, 26 Nov 2025 02:45:58 +0300 Subject: [PATCH 18/24] [BugFix] Fix initialization of draft model. (#29319) Signed-off-by: Andrey Khalyavin Signed-off-by: Tyler Michael Smith Co-authored-by: Tyler Michael Smith --- vllm/v1/worker/gpu_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bb44c5ad84cc1..9f3c34b15e2a8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3460,6 +3460,10 @@ class GPUModelRunner( scope="local", ) prepare_communication_buffer_for_model(self.model) + if (drafter := getattr(self, "drafter", None)) and ( + drafter_model := getattr(drafter, "model", None) + ): + prepare_communication_buffer_for_model(drafter_model) mm_config = self.model_config.multimodal_config self.is_multimodal_pruning_enabled = ( supports_multimodal_pruning(self.get_model()) From d8819c88eb633f1435ef389f87c23a9fd6010917 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:14:23 -0800 Subject: [PATCH 19/24] fix assertion for single world use case (uni) (#29429) Signed-off-by: Lu Fang Co-authored-by: Lucia (Lu) Fang --- vllm/config/parallel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 913e97250d3d3..7ba1da5db3849 
100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -593,9 +593,10 @@ class ParallelConfig: "max_parallel_loading_workers is currently " "not supported and will be ignored." ) - if self.distributed_executor_backend != "mp" and self.nnodes > 1: + if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1: raise ValueError( - "nnodes > 1 can only be set when distributed exectuor backend is mp." + "nnodes > 1 can only be set when distributed executor " + "backend is mp or uni." ) @property From 12866af748f3933b62891f02646526c64395b26f Mon Sep 17 00:00:00 2001 From: Xieyang Xu Date: Tue, 25 Nov 2025 16:20:35 -0800 Subject: [PATCH 20/24] dummy run corner case (#29433) --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9f3c34b15e2a8..d3c61794f8b0d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2789,7 +2789,7 @@ class GPUModelRunner( # returns True. before returning early here we call # dummy run to ensure coordinate_batch_across_dp # is called into to avoid out of sync issues. - self._dummy_run(1) + self._dummy_run(self._get_num_input_tokens(1)) if not has_kv_transfer_group(): # Return empty ModelRunnerOutput if no work to do. return EMPTY_MODEL_RUNNER_OUTPUT From 56531b79ccc746bb579a49411f32be31bc307d4b Mon Sep 17 00:00:00 2001 From: "George D. Torres" <41129492+geodavic@users.noreply.github.com> Date: Tue, 25 Nov 2025 18:50:22 -0600 Subject: [PATCH 21/24] [Misc] Add backup hash algorithm for FIPS constrained environments (#28795) Signed-off-by: George D. Torres Signed-off-by: George D. 
Torres <41129492+geodavic@users.noreply.github.com> Signed-off-by: Russell Bryant Co-authored-by: Russell Bryant --- vllm/compilation/caching.py | 4 ++-- vllm/compilation/compiler_interface.py | 14 +++++++------- vllm/config/device.py | 4 ++-- vllm/config/kv_transfer.py | 4 ++-- vllm/config/load.py | 4 ++-- vllm/config/lora.py | 4 ++-- vllm/config/multimodal.py | 4 ++-- vllm/config/observability.py | 4 ++-- vllm/config/pooler.py | 4 ++-- vllm/config/scheduler.py | 4 ++-- vllm/config/speculative.py | 4 ++-- vllm/config/structured_outputs.py | 4 ++-- vllm/config/vllm.py | 10 +++++----- .../v1/shared_storage_connector.py | 4 ++-- vllm/model_executor/models/registry.py | 4 ++-- vllm/utils/hashing.py | 18 ++++++++++++++++++ 16 files changed, 56 insertions(+), 38 deletions(-) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 6297d9f995aa4..ce482572b401b 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib import inspect import os import pickle @@ -14,6 +13,7 @@ import vllm.envs as envs from vllm.config import VllmConfig, get_current_vllm_config from vllm.config.utils import hash_factors from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash try: from torch._dynamo.aot_compile import SerializableCallable @@ -160,7 +160,7 @@ def _compute_code_hash_with_content(file_contents: dict[str, str]) -> str: # e.g. exec(). We can't actually check these. 
continue hash_content.append(content) - return hashlib.md5( + return safe_hash( "\n".join(hash_content).encode(), usedforsecurity=False ).hexdigest() diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 11cf0f85c1787..7deaba1a99fad 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import copy -import hashlib import os from collections.abc import Callable from contextlib import ExitStack @@ -16,6 +15,7 @@ import torch.fx as fx import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.config import VllmConfig +from vllm.utils.hashing import safe_hash from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -197,9 +197,9 @@ class InductorStandaloneAdaptor(CompilerInterface): def compute_hash(self, vllm_config: VllmConfig) -> str: factors = get_inductor_factors() - hash_str = hashlib.md5( - str(factors).encode(), usedforsecurity=False - ).hexdigest()[:10] + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[ + :10 + ] return hash_str def initialize_cache( @@ -286,9 +286,9 @@ class InductorAdaptor(CompilerInterface): def compute_hash(self, vllm_config: VllmConfig) -> str: factors = get_inductor_factors() - hash_str = hashlib.md5( - str(factors).encode(), usedforsecurity=False - ).hexdigest()[:10] + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[ + :10 + ] return hash_str def initialize_cache( diff --git a/vllm/config/device.py b/vllm/config/device.py index e85cd15de8cf4..85662ddff76b7 100644 --- a/vllm/config/device.py +++ b/vllm/config/device.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from dataclasses import field from typing import Any, Literal @@ -10,6 +9,7 @@ from pydantic import 
ConfigDict, SkipValidation from pydantic.dataclasses import dataclass from vllm.config.utils import config +from vllm.utils.hashing import safe_hash Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"] @@ -45,7 +45,7 @@ class DeviceConfig: # the device/platform information will be summarized # by torch/vllm automatically. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str def __post_init__(self): diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py index dfd7ef63712a3..88f8b91c292bb 100644 --- a/vllm/config/kv_transfer.py +++ b/vllm/config/kv_transfer.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib import uuid from dataclasses import field from typing import Any, Literal, get_args @@ -9,6 +8,7 @@ from typing import Any, Literal, get_args from pydantic.dataclasses import dataclass from vllm.config.utils import config +from vllm.utils.hashing import safe_hash KVProducer = Literal["kv_producer", "kv_both"] KVConsumer = Literal["kv_consumer", "kv_both"] @@ -79,7 +79,7 @@ class KVTransferConfig: # no factors to consider. # this config will not affect the computation graph. 
factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str def __post_init__(self) -> None: diff --git a/vllm/config/load.py b/vllm/config/load.py index e424f8c5edb62..579a0bc31020e 100644 --- a/vllm/config/load.py +++ b/vllm/config/load.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from typing import TYPE_CHECKING, Any from pydantic import Field, field_validator @@ -9,6 +8,7 @@ from pydantic.dataclasses import dataclass from vllm.config.utils import config from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash if TYPE_CHECKING: from vllm.model_executor.model_loader import LoadFormats @@ -104,7 +104,7 @@ class LoadConfig: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str @field_validator("load_format", mode="after") diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 072e0ec2104f5..6a8fd6359aadd 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from typing import TYPE_CHECKING, Any, Literal import torch @@ -11,6 +10,7 @@ from typing_extensions import Self from vllm.config.utils import config from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash if TYPE_CHECKING: from vllm.config import ModelConfig @@ -74,7 +74,7 @@ class LoRAConfig: factors.append(self.fully_sharded_loras) factors.append(self.lora_dtype) - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = 
safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str @model_validator(mode="after") diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 00a81a319bf72..590bc4dcd0760 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from collections.abc import Mapping from typing import TYPE_CHECKING, Any, Literal, TypeAlias @@ -9,6 +8,7 @@ from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config +from vllm.utils.hashing import safe_hash if TYPE_CHECKING: from vllm.attention.backends.registry import AttentionBackendEnum @@ -216,7 +216,7 @@ class MultiModalConfig: if self.mm_encoder_attn_backend is not None else None ] - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str def get_limit_per_prompt(self, modality: str) -> int: diff --git a/vllm/config/observability.py b/vllm/config/observability.py index 564c4f7aed419..ff35e12fe20ed 100644 --- a/vllm/config/observability.py +++ b/vllm/config/observability.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from functools import cached_property from typing import Any, Literal, cast @@ -11,6 +10,7 @@ from pydantic.dataclasses import dataclass from vllm import version from vllm.config.utils import config +from vllm.utils.hashing import safe_hash DetailedTraceModules = Literal["model", "worker", "all"] @@ -78,7 +78,7 @@ class ObservabilityConfig: # no factors to consider. # this config will not affect the computation graph. 
factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str @field_validator("show_hidden_metrics_for_version") diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index 6bece8d0785bd..85950bbcd666f 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from typing import Any from pydantic.dataclasses import dataclass from vllm.config.utils import config from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash logger = init_logger(__name__) @@ -102,7 +102,7 @@ class PoolerConfig: # no factors to consider. # this config will not affect the computation graph. factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index b6078706daacf..2cf42d57ec217 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from collections.abc import Callable from dataclasses import InitVar from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast @@ -12,6 +11,7 @@ from typing_extensions import Self, deprecated from vllm.config.utils import config from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash from vllm.utils.import_utils import resolve_obj_by_qualname if TYPE_CHECKING: @@ -178,7 +178,7 @@ class SchedulerConfig: # no factors to consider. # this config will not affect the computation graph. 
factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str @field_validator("scheduler_cls", "async_scheduling", mode="wrap") diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index d7c019c73d598..80d53a543f149 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast -import hashlib from typing import TYPE_CHECKING, Any, Literal, get_args from pydantic import Field, SkipValidation, model_validator @@ -13,6 +12,7 @@ from vllm.config.model import ModelConfig from vllm.config.parallel import ParallelConfig from vllm.config.utils import config from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash from vllm.utils.import_utils import LazyLoader, has_arctic_inference if TYPE_CHECKING: @@ -162,7 +162,7 @@ class SpeculativeConfig: # Eagle3 affects the computation graph because it returns intermediate # hidden states in addition to the final hidden state. 
factors.append(self.method == "eagle3") - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str @staticmethod diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py index 9530d3d81e15d..1b32675c3dbd2 100644 --- a/vllm/config/structured_outputs.py +++ b/vllm/config/structured_outputs.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from typing import Any, Literal from pydantic import model_validator @@ -9,6 +8,7 @@ from pydantic.dataclasses import dataclass from typing_extensions import Self from vllm.config.utils import config +from vllm.utils.hashing import safe_hash StructuredOutputsBackend = Literal[ "auto", "xgrammar", "guidance", "outlines", "lm-format-enforcer" @@ -58,7 +58,7 @@ class StructuredOutputsConfig: # no factors to consider. # this config will not affect the computation graph. 
factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str @model_validator(mode="after") diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 8a3599416bc72..9342564aa3d3f 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -3,7 +3,6 @@ import copy import getpass -import hashlib import json import os import tempfile @@ -25,6 +24,7 @@ from vllm.config.speculative import EagleModelTypes from vllm.logger import enable_trace_function_call, init_logger from vllm.transformers_utils.runai_utils import is_runai_obj_uri from vllm.utils import random_uuid +from vllm.utils.hashing import safe_hash from .cache import CacheConfig from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode @@ -193,7 +193,7 @@ class VllmConfig: vllm_factors.append("None") if self.additional_config: if isinstance(additional_config := self.additional_config, dict): - additional_config_hash = hashlib.md5( + additional_config_hash = safe_hash( json.dumps(additional_config, sort_keys=True).encode(), usedforsecurity=False, ).hexdigest() @@ -204,9 +204,9 @@ class VllmConfig: vllm_factors.append("None") factors.append(vllm_factors) - hash_str = hashlib.md5( - str(factors).encode(), usedforsecurity=False - ).hexdigest()[:10] + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()[ + :10 + ] return hash_str def pad_for_cudagraph(self, batch_size: int) -> int: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 016d1d45b3593..4611b4d1ff7b8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project -import hashlib import os from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Optional @@ -15,6 +14,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorRole, ) from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput @@ -423,7 +423,7 @@ class SharedStorageConnector(KVConnectorBase_V1): if mm_hashes: mm_str = "-".join(mm_hashes) token_bytes += mm_str.encode("utf-8") - input_ids_hash = hashlib.md5(token_bytes, usedforsecurity=False).hexdigest() + input_ids_hash = safe_hash(token_bytes, usedforsecurity=False).hexdigest() foldername = os.path.join(self._storage_path, input_ids_hash) if create_folder: diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a0d8a78a2ae76..53644f9cb8788 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -5,7 +5,6 @@ Whenever you add an architecture to this page, please also update `tests/models/registry.py` with example HuggingFace models for it. 
""" -import hashlib import importlib import json import os @@ -32,6 +31,7 @@ from vllm.config import ( from vllm.logger import init_logger from vllm.logging_utils import logtime from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module +from vllm.utils.hashing import safe_hash from .interfaces import ( has_inner_state, @@ -654,7 +654,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel): if model_path.exists(): with open(model_path, "rb") as f: - module_hash = hashlib.md5(f.read(), usedforsecurity=False).hexdigest() + module_hash = safe_hash(f.read(), usedforsecurity=False).hexdigest() mi = self._load_modelinfo_from_cache(module_hash) if mi is not None: diff --git a/vllm/utils/hashing.py b/vllm/utils/hashing.py index 49f4f13d115f3..edf1e9cb34e56 100644 --- a/vllm/utils/hashing.py +++ b/vllm/utils/hashing.py @@ -5,6 +5,7 @@ from __future__ import annotations import hashlib import pickle +from _hashlib import HASH, UnsupportedDigestmodError from collections.abc import Callable from typing import Any @@ -61,3 +62,20 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]: return sha256_cbor raise ValueError(f"Unsupported hash function: {hash_fn_name}") + + +def safe_hash(data: bytes, usedforsecurity: bool = True) -> HASH: + """Hash for configs, defaulting to md5 but falling back to sha256 + in FIPS constrained environments. 
+ + Args: + data: bytes + usedforsecurity: Whether the hash is used for security purposes + + Returns: + Hash object + """ + try: + return hashlib.md5(data, usedforsecurity=usedforsecurity) + except (UnsupportedDigestmodError, ValueError): + return hashlib.sha256(data) From 8d6a89dffd9e2e238b3c6ead964783cfdb053756 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 25 Nov 2025 20:19:35 -0500 Subject: [PATCH 22/24] [UX] Suppress gloo log spam (#29250) Signed-off-by: mgoin --- vllm/distributed/parallel_state.py | 4 ++- vllm/distributed/utils.py | 52 ++++++++++++++++-------------- vllm/utils/system_utils.py | 33 +++++++++++++++++++ 3 files changed, 63 insertions(+), 26 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f81612fd1f4a3..69c28e278f2d2 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -51,6 +51,7 @@ from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.network_utils import get_distributed_init_method +from vllm.utils.system_utils import suppress_stdout from vllm.utils.torch_utils import ( direct_register_custom_op, supports_custom_op, @@ -329,7 +330,8 @@ class GroupCoordinator: ) # a group with `gloo` backend, to allow direct coordination between # processes through the CPU. 
- cpu_group = torch.distributed.new_group(ranks, backend="gloo") + with suppress_stdout(): + cpu_group = torch.distributed.new_group(ranks, backend="gloo") if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index debf69c49b7d9..242ce393e4dc8 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -30,6 +30,7 @@ from torch.distributed.rendezvous import rendezvous import vllm.envs as envs from vllm.logger import init_logger from vllm.utils.network_utils import get_tcp_uri +from vllm.utils.system_utils import suppress_stdout from vllm.utils.torch_utils import is_torch_equal_or_newer logger = init_logger(__name__) @@ -427,33 +428,34 @@ def init_gloo_process_group( Stateless init ProcessGroup with gloo backend compatible with different torch versions. """ - if is_torch_equal_or_newer("2.6"): - pg = ProcessGroup( - prefix_store, - group_rank, - group_size, - ) - else: - options = ProcessGroup.Options(backend="gloo") - pg = ProcessGroup( - prefix_store, - group_rank, - group_size, - options, - ) - from torch.distributed.distributed_c10d import ProcessGroupGloo + with suppress_stdout(): + if is_torch_equal_or_newer("2.6"): + pg = ProcessGroup( + prefix_store, + group_rank, + group_size, + ) + else: + options = ProcessGroup.Options(backend="gloo") + pg = ProcessGroup( + prefix_store, + group_rank, + group_size, + options, + ) + from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo( - prefix_store, group_rank, group_size, timeout=timeout - ) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - if is_torch_equal_or_newer("2.6"): - # _set_default_backend is supported in torch >= 2.6 - pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() + backend_class = ProcessGroupGloo( + prefix_store, group_rank, group_size, timeout=timeout + ) + backend_type = 
ProcessGroup.BackendType.GLOO + device = torch.device("cpu") + if is_torch_equal_or_newer("2.6"): + # _set_default_backend is supported in torch >= 2.6 + pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() - pg._register_backend(device, backend_type, backend_class) + pg._register_backend(device, backend_type, backend_class) return pg diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py index cc872040b6c5f..a4eb8f4d4fd7d 100644 --- a/vllm/utils/system_utils.py +++ b/vllm/utils/system_utils.py @@ -56,6 +56,39 @@ def set_env_var(key: str, value: str) -> Iterator[None]: os.environ[key] = old +@contextlib.contextmanager +def suppress_stdout(): + """ + Suppress stdout from C libraries at the file descriptor level. + + Only suppresses stdout, not stderr, to preserve error messages. + Suppression is disabled when VLLM_LOGGING_LEVEL is set to DEBUG. + + Example: + with suppress_stdout(): + # C library calls that would normally print to stdout + torch.distributed.new_group(ranks, backend="gloo") + """ + # Don't suppress if logging level is DEBUG + if envs.VLLM_LOGGING_LEVEL == "DEBUG": + yield + return + + stdout_fd = sys.stdout.fileno() + stdout_dup = os.dup(stdout_fd) + devnull_fd = os.open(os.devnull, os.O_WRONLY) + + try: + sys.stdout.flush() + os.dup2(devnull_fd, stdout_fd) + yield + finally: + sys.stdout.flush() + os.dup2(stdout_dup, stdout_fd) + os.close(stdout_dup) + os.close(devnull_fd) + + # File path utilities From c5ee430328789c37a54526da3d254ae77d53ea4f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 26 Nov 2025 01:57:08 +0000 Subject: [PATCH 23/24] Bump actions/checkout from 4 to 6 (#29293) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/cleanup_pr_body.yml | 2 +- .github/workflows/macos-smoke-test.yml | 2 +- .github/workflows/pre-commit.yml | 2 +- 3 files 
changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index c3e132a536a42..861290ea43c87 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -13,7 +13,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Set up Python uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index a183033c9adde..3a12c4b3a8300 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -12,7 +12,7 @@ jobs: timeout-minutes: 30 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: astral-sh/setup-uv@v7 with: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index e21d13b8161f3..d5e70f30ef638 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -16,7 +16,7 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: "3.12" From 53d7f1f601a12b8fa58aa0bffb7fc27d63d1eb5e Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Tue, 25 Nov 2025 18:21:00 -0800 Subject: [PATCH 24/24] [Kernel] Use pre-allocated output buffer for triton kernel fused_experts (#29219) Signed-off-by: Xin Yang --- .../fused_moe/gpt_oss_triton_kernels_moe.py | 84 ++++++++++++++++--- 1 file changed, 73 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py 
b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index badedfc54c382..128507639fdfd 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk @@ -12,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP, ) +from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.triton_utils import tl, triton from vllm.utils.import_utils import has_triton_kernels @@ -88,14 +90,17 @@ def triton_kernel_moe_forward( gating_output, topk, sm_first=not renormalize ) + output = torch.empty_like(hidden_states) + return triton_kernel_fused_experts( - None, + output, hidden_states, w1, w2, routing_data, gather_idx, scatter_idx, + topk=topk, activation=activation, quant_config=quant_config, apply_router_weight_on_input=apply_router_weight_on_input, @@ -113,6 +118,7 @@ def triton_kernel_fused_experts( routing_data, # RoutingData gather_indx, # GatherIndx scatter_indx, # ScatterIndx + topk: int, activation: str = "silu", quant_config: FusedMoEQuantConfig | None = None, swiglu_alpha: float = 1.702, @@ -120,6 +126,7 @@ def triton_kernel_fused_experts( apply_router_weight_on_input: bool = False, global_num_experts: int = -1, expert_map: torch.Tensor | None = None, + intermediate_cache: torch.Tensor | None = None, a1q_scale: torch.Tensor | None = None, ) -> torch.Tensor: if quant_config is None: @@ -131,14 +138,30 @@ def triton_kernel_fused_experts( assert quant_config.w2_bias is None or quant_config.w2_bias.dtype == torch.float32 # Shape check, only check non-mxfp4 + assert hidden_states.ndim == 2 assert hidden_states.shape[-1] == w1.shape[-2] assert 
w2.shape[-1] == w1.shape[1] + batch_dim = 1 + M, K = hidden_states.shape[-2:] E, _, N = w1.shape if global_num_experts == -1: global_num_experts = E + if intermediate_cache is None: + intermediate_cache = torch.empty( + (batch_dim, M * topk, N // 2), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + # Add batch_dim to output buffer because matmul_ogs expects 3D output + intermediate_cache = _resize_cache( + intermediate_cache, (batch_dim, M * topk, N // 2) + ) + output_tensor = _resize_cache(output_tensor, (batch_dim, M, K)) + act = FusedActivation( FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), (swiglu_alpha, swiglu_limit), @@ -146,7 +169,7 @@ def triton_kernel_fused_experts( ) gammas = routing_data.gate_scal if routing_data else None - intermediate_cache1 = matmul_ogs( + matmul_ogs( hidden_states, w1, quant_config.w1_bias, @@ -155,10 +178,11 @@ def triton_kernel_fused_experts( precision_config=quant_config.w1_precision, gammas=gammas if apply_router_weight_on_input else None, fused_activation=act, + y=intermediate_cache, ) - intermediate_cache3 = matmul_ogs( - intermediate_cache1, + matmul_ogs( + intermediate_cache.view(M * topk, N // 2), w2, quant_config.w2_bias, routing_data, @@ -167,7 +191,8 @@ def triton_kernel_fused_experts( gammas=None if apply_router_weight_on_input else gammas, y=output_tensor, ) - return intermediate_cache3 + output_tensor = output_tensor.view(M, K) + return output_tensor def make_routing_data( @@ -221,6 +246,42 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): def supports_expert_map(self) -> bool: return True + def moe_problem_size( + self, + a1: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + ) -> tuple[int, int, int, int, int]: + """ + Extract the MoE problem size from the given tensor arguments: + - a: The hidden states, input to the MoE layer. + - w1: The first set of expert weights. + - w2: The second set of expert weights. 
+ - topk_ids: The topk ids. + Note: extracting the problem shape from the weight and activation + tensors is not obvious. It needs to be done this way specifically + due to subtle issues with particular kernels, e.g. the int4 kernels + divide the trailing dimension by two, so it's not "correct" to + extract N or K from the trailing dimension of w1 or w2. Similarly, + some kernels transpose the weights, so this needs to be kept in mind. + Note: This implementation covers most cases. However, if experts + require a specialized implementation, like MarlinExperts, they are free + to override this function. + """ + assert w1.dim() == 3 and w2.dim() == 3 + E, _, N = w1.size() + K = a1.size(-1) + + assert a1.dim() == 2 + assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}" + M = a1.size(0) + + assert topk_ids.dim() == 2 + topk = topk_ids.size(1) + + return E, M, N, K, topk + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Weight application and reduction happens in the fused_experts kernel. 
return TopKWeightAndReduceNoOP() @@ -263,8 +324,8 @@ class OAITritonExperts(BaseOAITritonExperts): expert_tokens_meta: mk.ExpertTokensMetadata | None, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # workspace are allocated inside the kernel - workspace1 = (M, K) - workspace2 = (0, 0) + workspace1 = (0, 0) + workspace2 = (M * topk, N // 2) output = (M, K) return (workspace1, workspace2, output) @@ -297,20 +358,21 @@ class OAITritonExperts(BaseOAITritonExperts): topk_ids, topk_weights, local_num_experts ) - experts_output = triton_kernel_fused_experts( - None, + topk = topk_ids.size(1) + triton_kernel_fused_experts( + output, hidden_states, w1, w2, routing_data, gather_indx, scatter_indx, + topk=topk, activation=activation, quant_config=self.quant_config, apply_router_weight_on_input=False, global_num_experts=local_num_experts, expert_map=None, # applied already + intermediate_cache=workspace2, a1q_scale=a1q_scale, ) - - output.copy_(experts_output, non_blocking=True)