From 10ee1c64cfa7c0b7f68e9ee793435c9cafbf821a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 16 Dec 2025 14:28:34 -0500 Subject: [PATCH 01/49] [CI] Generalize gsm8k test args and add Qwen3-Next MTP B200 test (#30723) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 4 +- tests/evals/gsm8k/README.md | 13 ++-- .../DeepSeek-V2-Lite-Instruct-FP8.yaml | 3 +- .../Llama-3-8B-Instruct-nonuniform-CT.yaml | 2 +- .../Llama-3.2-1B-Instruct-INT8-CT.yaml | 2 +- .../gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml | 2 +- .../Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml | 2 +- tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml | 2 +- .../gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml | 3 +- .../configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml | 12 ++++ .../evals/gsm8k/configs/models-blackwell.txt | 1 + tests/evals/gsm8k/conftest.py | 8 +-- tests/evals/gsm8k/test_gsm8k_correctness.py | 70 +++++++++++-------- .../compressed_tensors_moe.py | 11 +-- 14 files changed, 78 insertions(+), 57 deletions(-) create mode 100644 tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9d0b3fdd3a02c..8e6d32f71f220 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -654,7 +654,7 @@ steps: - vllm/model_executor/layers/quantization autorun_on_main: true commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: OpenAI API correctness # 22min timeout_in_minutes: 30 @@ -1064,7 +1064,7 @@ steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt ##### 1 GPU test ##### ##### multi gpus test ##### diff --git 
a/tests/evals/gsm8k/README.md b/tests/evals/gsm8k/README.md index 29c5199e1e87a..dcbfd85bfeee8 100644 --- a/tests/evals/gsm8k/README.md +++ b/tests/evals/gsm8k/README.md @@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation, ### Run tests with pytest (like buildkite) ```bash -pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \ - --config-list-file=configs/models-small.txt \ - --tp-size=1 +pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \ + --config-list-file=configs/models-small.txt ``` ### Run standalone evaluation script @@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct" accuracy_threshold: 0.54 # Minimum expected accuracy num_questions: 1319 # Number of questions (default: full test set) num_fewshot: 5 # Few-shot examples from train set -max_model_len: 4096 # Model context length +server_args: "--max-model-len 4096 --tensor-parallel-size 2" # Server arguments +env: # Environment variables (optional) + VLLM_USE_FLASHINFER_MOE_FP4: "1" ``` + +The `server_args` field accepts any arguments that can be passed to `vllm serve`. + +The `env` field accepts a dictionary of environment variables to set for the server process. 
diff --git a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml index 7ec6a1e0be27f..72fa7e8a38c73 100644 --- a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml +++ b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml @@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" accuracy_threshold: 0.72 num_questions: 1319 num_fewshot: 5 -max_model_len: 4096 - +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml b/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml index caa0448f23d48..b7b59e9dcd5ce 100644 --- a/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml +++ b/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml @@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" accuracy_threshold: 0.74 num_questions: 1319 num_fewshot: 5 -max_model_len: 4096 \ No newline at end of file +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml b/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml index 615aa69a2d2b6..8b3c9ff645e87 100644 --- a/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml +++ b/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml @@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8" accuracy_threshold: 0.31 num_questions: 1319 num_fewshot: 5 -max_model_len: 4096 \ No newline at end of file +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml index 9297bf6ddf2d3..4a1b1948acac8 100644 --- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml +++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml @@ -2,4 +2,4 @@ model_name: 
"nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" accuracy_threshold: 0.45 num_questions: 1319 num_fewshot: 5 -max_model_len: 4096 +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml index 5319ada30f645..5ce3af8be346a 100644 --- a/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +++ b/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml @@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" accuracy_threshold: 0.60 num_questions: 1319 num_fewshot: 5 -max_model_len: 4096 \ No newline at end of file +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml b/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml index c39fb979d98ac..5452ebe753f04 100644 --- a/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml +++ b/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml @@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8" accuracy_threshold: 0.375 num_questions: 1319 num_fewshot: 5 -max_model_len: 4096 \ No newline at end of file +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml index 6b7bdd1e65bb3..f162aa8bfe5b0 100644 --- a/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml +++ b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml @@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4" accuracy_threshold: 0.89 num_questions: 1319 num_fewshot: 5 -max_model_len: 4096 - +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml b/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml new file mode 100644 index 0000000000000..673b473f817eb --- /dev/null +++ b/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml @@ -0,0 +1,12 @@ +model_name: 
"nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4" +accuracy_threshold: 0.75 +num_questions: 1319 +num_fewshot: 5 +server_args: >- + --enforce-eager + --max-model-len 4096 + --tensor-parallel-size 2 + --enable-expert-parallel + --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" diff --git a/tests/evals/gsm8k/configs/models-blackwell.txt b/tests/evals/gsm8k/configs/models-blackwell.txt index 3c9b1084de7bc..39978aa6ffbe9 100644 --- a/tests/evals/gsm8k/configs/models-blackwell.txt +++ b/tests/evals/gsm8k/configs/models-blackwell.txt @@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml Qwen1.5-MoE-W4A16-CT.yaml DeepSeek-V2-Lite-Instruct-FP8.yaml Qwen3-30B-A3B-NVFP4.yaml +Qwen3-Next-80B-A3B-NVFP4-EP2.yaml diff --git a/tests/evals/gsm8k/conftest.py b/tests/evals/gsm8k/conftest.py index 1932a13cdfc63..6f25fe6414af4 100644 --- a/tests/evals/gsm8k/conftest.py +++ b/tests/evals/gsm8k/conftest.py @@ -11,14 +11,12 @@ def pytest_addoption(parser): default="configs/models-small.txt", help="File containing list of config files to test", ) - parser.addoption("--tp-size", default=1, type=int, help="Tensor parallel size") def pytest_generate_tests(metafunc): """Generate test parameters from config files.""" if "config_filename" in metafunc.fixturenames: config_list_file = metafunc.config.getoption("--config-list-file") - tp_size = metafunc.config.getoption("--tp-size") # Handle both relative and absolute paths config_list_path = Path(config_list_file) @@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc): # Generate test parameters if config_files: metafunc.parametrize( - ["config_filename", "tp_size"], - [(config_file, int(tp_size)) for config_file in config_files], - ids=[f"{config_file.stem}-tp{tp_size}" for config_file in config_files], + "config_filename", + config_files, + ids=[config_file.stem for config_file in config_files], ) else: print("No config files found, test will be skipped") diff --git 
a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py index b5d67df7bf3db..ea6715f5cb532 100644 --- a/tests/evals/gsm8k/test_gsm8k_correctness.py +++ b/tests/evals/gsm8k/test_gsm8k_correctness.py @@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script. Replacement for lm-eval-harness with better performance and control. Usage: -pytest -s -v test_gsm8k_correctness.py \ - --config-list-file=configs/models-small.txt \ - --tp-size=1 +pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \ + --config-list-file=configs/models-small.txt """ +import shlex + import yaml from tests.utils import RemoteOpenAIServer from .gsm8k_eval import evaluate_gsm8k -RTOL = 0.08 # Relative tolerance for accuracy comparison +TOL = 0.08 # Absolute tolerance for accuracy comparison -def launch_gsm8k_eval(eval_config, server_url, tp_size): - """Launch GSM8K evaluation using our isolated script.""" +def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict: + """Run GSM8K evaluation using our isolated script.""" # Extract host and port from server URL if "://" in server_url: server_url = server_url.split("://")[1] host_port = server_url.split("/")[0] # Remove path if present if ":" in host_port: - host, port = host_port.split(":") - port = int(port) + host, p = host_port.split(":") + port = int(p) else: host = host_port port = 8000 @@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size): return results -def test_gsm8k_correctness_param(config_filename, tp_size): +def test_gsm8k_correctness(config_filename): """Test GSM8K correctness for a given model configuration.""" eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) - # Server arguments - server_args = [ - "--max-model-len", - str(eval_config.get("max_model_len", 4096)), - "--enforce-eager", - "--trust-remote-code", - "--tensor-parallel-size", - str(tp_size), - ] + # Parse server arguments from config (use shlex to handle quoted 
strings) + server_args_str = eval_config.get("server_args", "") + server_args = shlex.split(server_args_str) if server_args_str else [] + + # Add standard server arguments + server_args.extend( + [ + "--trust-remote-code", + ] + ) env_dict = eval_config.get("env", None) + print(f"Starting GSM8K evaluation for model: {eval_config['model_name']}") + print(f"Expected metric threshold: {eval_config['accuracy_threshold']}") + print(f"Number of questions: {eval_config['num_questions']}") + print(f"Number of few-shot examples: {eval_config['num_fewshot']}") + print(f"Server args: {' '.join(server_args)}") + # Launch server and run evaluation with RemoteOpenAIServer( - eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480 + eval_config["model_name"], + server_args, + env_dict=env_dict, + max_wait_seconds=600, ) as remote_server: server_url = remote_server.url_for("v1") + print(f"Server started at: {server_url}") - results = launch_gsm8k_eval(eval_config, server_url, tp_size) + results = run_gsm8k_eval(eval_config, server_url) - # Check accuracy against threshold - measured_accuracy = results["accuracy"] - expected_accuracy = eval_config["accuracy_threshold"] + measured_metric = results["accuracy"] + expected_metric = eval_config["accuracy_threshold"] print(f"GSM8K Results for {eval_config['model_name']}:") - print(f" Accuracy: {measured_accuracy:.3f}") - print(f" Expected: {expected_accuracy:.3f}") + print(f" Measured metric: {measured_metric:.4f}") + print(f" Expected metric: {expected_metric:.4f}") + print(f" Tolerance: {TOL:.4f}") print(f" Questions: {results['num_questions']}") print(f" Invalid rate: {results['invalid_rate']:.3f}") print(f" Latency: {results['latency']:.1f}s") print(f" QPS: {results['questions_per_second']:.1f}") - # Verify accuracy is within tolerance - assert measured_accuracy >= expected_accuracy - RTOL, ( - f"Accuracy too low: {measured_accuracy:.3f} < " - f"{expected_accuracy:.3f} - {RTOL:.3f}" + # Verify metric is within 
tolerance + assert measured_metric >= expected_metric - TOL, ( + f"GSM8K metric too low: {measured_metric:.4f} < " + f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}" ) print(f"✅ GSM8K test passed for {eval_config['model_name']}") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index f650a6eabbb9c..c302e465aedb7 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -626,17 +626,11 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: + # If no modular kernel is provided, use cutlass_moe_fp4 for TP case + # only (no EP). from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 - assert layer.expert_map is None, ( - "Expert Parallelism / expert_map " - "is currently not supported for " - "CompressedTensorsW4A4Nvfp4MoEMethod." 
- ) assert self.moe_quant_config is not None - - # Cutlass moe takes in activations in BF16/Half precision - # and fp4 quantized weights loaded from the checkpoint return cutlass_moe_fp4( a=x, w1_fp4=layer.w13_weight, @@ -644,6 +638,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, quant_config=self.moe_quant_config, + expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, # TODO(bnell): derive these from arguments m=x.shape[0], From ca702a14dc2d4c5c077dbb8098e66ca244cea185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 16 Dec 2025 20:36:49 +0100 Subject: [PATCH 02/49] [Frontend] Add `max-completion-token` option to transcription/translation endpoints (#30769) Signed-off-by: NickLucche --- .../test_transcription_validation_whisper.py | 32 ++++++++++++++++++ .../openai/test_translation_validation.py | 33 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 6 ++++ vllm/entrypoints/openai/speech_to_text.py | 10 ++++-- 4 files changed, 79 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py index 3c507ee0a3fa7..8bf729c517f7a 100644 --- a/tests/entrypoints/openai/test_transcription_validation_whisper.py +++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py @@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client): ) assert transcription.segments is not None assert len(transcription.segments) > 0 + + +@pytest.mark.asyncio +async def test_audio_with_max_tokens(whisper_client, mary_had_lamb): + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + extra_body={"max_completion_tokens": 1}, + ) + out = json.loads(transcription) + out_text = 
out["text"] + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(MODEL_NAME) + out_tokens = tok(out_text, add_special_tokens=False)["input_ids"] + assert len(out_tokens) == 1 + # max_completion_tokens > max_model_len + transcription = await whisper_client.audio.transcriptions.create( + model=MODEL_NAME, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0, + extra_body={"max_completion_tokens": int(1e6)}, + ) + out = json.loads(transcription) + out_text = out["text"] + out_tokens = tok(out_text, add_special_tokens=False)["input_ids"] + assert len(out_tokens) < 450 # ~Whisper max output len diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index d7d407484f16d..2c577237691ab 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -227,3 +227,36 @@ async def test_long_audio_request(foscolo, client_and_model): ) out = json.loads(translation)["text"].strip().lower() assert out.count("greek sea") == 2 + + +@pytest.mark.asyncio +async def test_audio_with_max_tokens(mary_had_lamb, client_and_model): + client, model_name = client_and_model + transcription = await client.audio.translations.create( + model=model_name, + file=mary_had_lamb, + response_format="text", + temperature=0.0, + extra_body={"max_completion_tokens": 1}, + ) + out = json.loads(transcription) + out_text = out["text"] + print(out_text) + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(model_name) + out_tokens = tok(out_text, add_special_tokens=False)["input_ids"] + assert len(out_tokens) == 1 + # max_completion_tokens > max_model_len + transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + response_format="text", + temperature=0.0, + extra_body={"max_completion_tokens": int(1e6)}, + ) + out = json.loads(transcription) + 
out_text = out["text"] + print(out_text) + out_tokens = tok(out_text, add_special_tokens=False)["input_ids"] + assert len(out_tokens) < 450 # ~Whisper max output len diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a7c4980cd3674..94dde4564ea0c 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2054,6 +2054,9 @@ class TranscriptionRequest(OpenAIBaseModel): presence_penalty: float | None = 0.0 """The presence penalty to use for sampling.""" + + max_completion_tokens: int | None = None + """The maximum number of tokens to generate.""" # --8<-- [end:transcription-sampling-params] # Default sampling parameters for transcription requests. @@ -2300,6 +2303,9 @@ class TranslationRequest(OpenAIBaseModel): # Flattened stream option to simplify form data. stream_include_usage: bool | None = False stream_continuous_usage_stats: bool | None = False + + max_completion_tokens: int | None = None + """The maximum number of tokens to generate.""" # --8<-- [end:translation-extra-params] # Default sampling parameters for translation requests. diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index cea9924ebbaca..df9c06adb105a 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -293,8 +293,14 @@ class OpenAISpeechToText(OpenAIServing): try: # Unlike most decoder-only models, whisper generation length is not # constrained by the size of the input audio, which is mapped to a - # fixed-size log-mel-spectogram. - default_max_tokens = self.model_config.max_model_len + # fixed-size log-mel-spectrogram. Still, allow for fewer tokens to be + # generated by respecting the request's max_completion_tokens field.
+ if request.max_completion_tokens is None: + default_max_tokens = self.model_config.max_model_len + else: + default_max_tokens = min( + self.model_config.max_model_len, request.max_completion_tokens + ) sampling_params = request.to_sampling_params( default_max_tokens, self.default_sampling_params ) From f21f5ea38c6fa0e824bc00d5762d17e049199cd3 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:50:59 -0500 Subject: [PATCH 03/49] [Refactor] Small refactor for group topk (#30562) Signed-off-by: yewentao256 Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- csrc/moe/grouped_topk_kernels.cu | 13 ++++++++++--- tests/v1/determinism/test_batch_invariance.py | 1 - 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index 5fa367abd96f5..7229e420d3fe4 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -446,9 +446,13 @@ __device__ inline T apply_sigmoid(T val) { template __device__ inline T apply_scoring(T val) { - if constexpr (SF == SCORING_SIGMOID) { + if constexpr (SF == SCORING_NONE) { + return val; + } else if constexpr (SF == SCORING_SIGMOID) { return apply_sigmoid(val); } else { + static_assert(SF == SCORING_NONE || SF == SCORING_SIGMOID, + "Unsupported ScoringFunc in apply_scoring"); return val; } } @@ -670,10 +674,13 @@ __global__ void group_idx_and_topk_idx_kernel( if (case_id < num_tokens) { if (if_proceed_next_topk) { + float scale = routed_scaling_factor; + if (renormalize) { + scale /= topk_sum; + } for (int i = lane_id; i < topk; i += WARP_SIZE) { float base = cuda_cast(s_topk_value[i]); - float value = renormalize ? 
(base / topk_sum * routed_scaling_factor) - : (base * routed_scaling_factor); + float value = base * scale; topk_indices[i] = s_topk_idx[i]; topk_values[i] = value; } diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index 1c45e7fe366ff..7a58e1c9bad03 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -188,7 +188,6 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( llm = LLM( model=model_name, tensor_parallel_size=tp_size, - # enable_prefix_caching=False, max_num_seqs=32, max_model_len=8192, dtype="bfloat16", # not everything is supported From 254a7f8fd613d6b6964abc277b73ca1f0b823cdb Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Tue, 16 Dec 2025 13:01:48 -0800 Subject: [PATCH 04/49] [Perf] Do FP4 quant before All gather on flashinfer trtllmgen MOE (#30014) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> --- .../device_communicators/all2all.py | 29 ++++++++++--- .../base_device_communicator.py | 7 +++- .../device_communicators/cuda_communicator.py | 16 +++++--- vllm/distributed/parallel_state.py | 13 ++++-- .../layers/fused_moe/fused_moe_method_base.py | 12 ++++++ vllm/model_executor/layers/fused_moe/layer.py | 41 ++++++++++++++++++- .../layers/quantization/modelopt.py | 25 ++++++++++- .../quantization/utils/flashinfer_fp4_moe.py | 36 +++++++++------- vllm/utils/flashinfer.py | 17 ++++++++ 9 files changed, 165 insertions(+), 31 deletions(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index c40dde26b741f..7a4e81cf967de 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -64,7 +64,12 @@ class NaiveAll2AllManager(All2AllManagerBase): hidden_states: torch.Tensor, router_logits: torch.Tensor, is_sequence_parallel: bool = False, + extra_tensors: 
list[torch.Tensor] | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: + if extra_tensors is not None: + raise NotImplementedError( + "extra_tensors is not supported for NaiveAll2AllManager" + ) sp_size = self.tp_group.world_size if is_sequence_parallel else 1 dp_metadata = get_forward_context().dp_metadata assert dp_metadata is not None @@ -76,6 +81,7 @@ class NaiveAll2AllManager(All2AllManagerBase): router_logits = self.naive_multicast( router_logits, cu_tokens_across_sp_cpu, is_sequence_parallel ) + return hidden_states, router_logits def combine( @@ -113,7 +119,11 @@ class AgRsAll2AllManager(All2AllManagerBase): hidden_states: torch.Tensor, router_logits: torch.Tensor, is_sequence_parallel: bool = False, - ) -> tuple[torch.Tensor, torch.Tensor]: + extra_tensors: list[torch.Tensor] | None = None, + ) -> ( + tuple[torch.Tensor, torch.Tensor] + | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]] + ): """ Gather hidden_states and router_logits from all dp ranks. """ @@ -121,15 +131,22 @@ class AgRsAll2AllManager(All2AllManagerBase): assert dp_metadata is not None sizes = dp_metadata.get_chunk_sizes_across_dp_rank() assert sizes is not None - dist_group = get_ep_group() if is_sequence_parallel else get_dp_group() assert sizes[dist_group.rank_in_group] == hidden_states.shape[0] - hidden_states, router_logits = dist_group.all_gatherv( - [hidden_states, router_logits], + + tensors_to_gather = [hidden_states, router_logits] + if extra_tensors is not None: + tensors_to_gather.extend(extra_tensors) + + gathered_tensors = dist_group.all_gatherv( + tensors_to_gather, dim=0, sizes=sizes, ) - return hidden_states, router_logits + + if extra_tensors is not None: + return (gathered_tensors[0], gathered_tensors[1], gathered_tensors[2:]) + return gathered_tensors[0], gathered_tensors[1] def combine( self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False @@ -204,6 +221,7 @@ class PPLXAll2AllManager(All2AllManagerBase): hidden_states: torch.Tensor, 
router_logits: torch.Tensor, is_sequence_parallel: bool = False, + extra_tensors: list[torch.Tensor] | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError @@ -251,6 +269,7 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase): hidden_states: torch.Tensor, router_logits: torch.Tensor, is_sequence_parallel: bool = False, + extra_tensors: list[torch.Tensor] | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 3a849da70e4cb..caeff54406b59 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import threading +from typing import Any from weakref import WeakValueDictionary import torch @@ -68,7 +69,11 @@ class All2AllManagerBase: hidden_states: torch.Tensor, router_logits: torch.Tensor, is_sequence_parallel: bool = False, - ): + extra_tensors: list[torch.Tensor] | None = None, + ) -> Any: + # Subclasses should either: + # - implement handling for extra_tensors, or + # - raise a clear error if extra_tensors is not supported. 
raise NotImplementedError def set_num_sms(self, num_sms: int): diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index cd9c267beb5b5..9542498c453ec 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -318,17 +318,23 @@ class CudaCommunicator(DeviceCommunicatorBase): return output_list - def dispatch( + def dispatch( # type: ignore[override] self, hidden_states: torch.Tensor, router_logits: torch.Tensor, is_sequence_parallel: bool = False, - ) -> tuple[torch.Tensor, torch.Tensor]: + extra_tensors: list[torch.Tensor] | None = None, + ) -> ( + tuple[torch.Tensor, torch.Tensor] + | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]] + ): assert self.all2all_manager is not None - hidden_states, router_logits = self.all2all_manager.dispatch( - hidden_states, router_logits, is_sequence_parallel + return self.all2all_manager.dispatch( + hidden_states, + router_logits, + is_sequence_parallel, + extra_tensors, # type: ignore[call-arg] ) - return hidden_states, router_logits def combine( self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 338cb1f1814b5..f5ada5a009ec3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1007,10 +1007,17 @@ class GroupCoordinator: hidden_states: torch.Tensor, router_logits: torch.Tensor, is_sequence_parallel: bool = False, - ) -> tuple[torch.Tensor, torch.Tensor]: + extra_tensors: list[torch.Tensor] | None = None, + ) -> ( + tuple[torch.Tensor, torch.Tensor] + | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]] + ): if self.device_communicator is not None: - return self.device_communicator.dispatch( - hidden_states, router_logits, is_sequence_parallel + return self.device_communicator.dispatch( # type: ignore[call-arg] + 
hidden_states, + router_logits, + is_sequence_parallel, + extra_tensors, ) else: return hidden_states, router_logits diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 8c9d8a2777d58..a46e3972ed8e3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -71,6 +71,18 @@ class FusedMoEMethodBase(QuantizeMethodBase): "implementation based on the prepare_finalize" ) + def prepare_dp_allgather_tensor( + self, + layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch.""" + raise NotImplementedError( + "Method 'prepare_dp_allgather_tensor' is not implemented in " + f"{self.__class__.__name__}." + ) + @abstractmethod def get_fused_moe_quant_config( self, layer: torch.nn.Module diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index cc3afade709d9..b39ce415a0f83 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -44,6 +44,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( is_flashinfer_supporting_global_sf, ) from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import ( aux_stream, @@ -1933,10 +1934,46 @@ class FusedMoE(CustomOp): ) with sp_ctx: + extra_tensors = None if do_naive_dispatch_combine: - hidden_states_combined, router_logits = get_ep_group().dispatch( - hidden_states, router_logits, self.is_sequence_parallel + # Avoid circular import + from vllm.model_executor.layers.quantization.modelopt import ( + 
ModelOptNvFp4FusedMoE, ) + + post_quant_allgather = ( + has_flashinfer_trtllm_fused_moe() + and self.quant_method is not None + and self.dp_size > 1 + and self.use_ep + and isinstance(self.quant_method, ModelOptNvFp4FusedMoE) + ) + if post_quant_allgather: + hidden_states_to_dispatch, extra_tensors = ( + self.quant_method.prepare_dp_allgather_tensor( + self, hidden_states, router_logits + ) + ) + else: + hidden_states_to_dispatch = hidden_states + + dispatch_res = get_ep_group().dispatch( + hidden_states_to_dispatch, + router_logits, + self.is_sequence_parallel, + extra_tensors=extra_tensors, + ) + if extra_tensors is not None: + hidden_states_combined, router_logits, extra_tensors_combined = ( + dispatch_res + ) + hidden_states_combined = ( + hidden_states_combined, + extra_tensors_combined[0], + ) + else: + hidden_states_combined, router_logits = dispatch_res + # Run shared experts before matrix multiply. # because matrix multiply maybe modify the hidden_states. if has_separate_shared_experts and not use_shared_experts_stream: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index f71854e6b63c5..d5d7e7bfaae73 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1522,6 +1522,24 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): w2_blockscale_swizzled, requires_grad=False ) + def prepare_dp_allgather_tensor( + self, + layer: FusedMoE, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + """Optionally prepare extra tensors to carry through DP allgather/EP.""" + import flashinfer + + a1_gscale = layer.w13_input_scale_quant + hidden_states_fp4, hidden_states_sf = flashinfer.fp4_quantize( + hidden_states, + a1_gscale, + is_sf_swizzled_layout=False, + ) + extra_tensors: list[torch.Tensor] = [hidden_states_sf] + return hidden_states_fp4, extra_tensors + def 
get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: @@ -1576,8 +1594,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): e_score_correction_bias=layer.e_score_correction_bias, ) + # Hidden_states in select_experts is only used to extract metadata + if isinstance(x, tuple): + x_routing, _ = x + else: + x_routing = x topk_weights, topk_ids, _ = layer.select_experts( - hidden_states=x, + hidden_states=x_routing, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 76bce8a8d98d6..1d410316d6299 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -238,7 +238,7 @@ def prepare_static_weights_for_trtllm_fp4_moe( def flashinfer_trtllm_fp4_moe( layer: torch.nn.Module, - x: torch.Tensor, + x: torch.Tensor | tuple[torch.Tensor, torch.Tensor], router_logits: torch.Tensor, top_k: int, global_num_experts: int, @@ -269,12 +269,16 @@ def flashinfer_trtllm_fp4_moe( from vllm.model_executor.models.llama4 import Llama4MoE # Quantize input to FP4 - a1_gscale = layer.w13_input_scale_quant - (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( - x, - a1_gscale, - is_sf_swizzled_layout=False, - ) + if isinstance(x, tuple): + hidden_states_fp4, hidden_states_scale_linear_fp4 = x + else: + # hidden_states is the already quantized + a1_gscale = layer.w13_input_scale_quant + (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( + x, + a1_gscale, + is_sf_swizzled_layout=False, + ) # Determine routing method type use_llama4_routing = custom_routing_function is Llama4MoE.custom_routing_function @@ -360,13 +364,17 @@ def flashinfer_trtllm_fp4_routed_moe( torch.bfloat16 ).view(torch.int16) - # Quantize input to FP4 - a1_gscale = layer.w13_input_scale_quant - 
(hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( - x, - a1_gscale, - is_sf_swizzled_layout=False, - ) + if isinstance(x, tuple): + # Hidden_states is the already quantized + hidden_states_fp4, hidden_states_scale_linear_fp4 = x + else: + # Quantize input to FP4 + a1_gscale = layer.w13_input_scale_quant + (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( + x, + a1_gscale, + is_sf_swizzled_layout=False, + ) # Call TRT-LLM FP4 block-scale MoE kernel out = flashinfer.fused_moe.trtllm_fp4_block_scale_routed_moe( diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 5019b771f4a14..1c2710be3173b 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -184,6 +184,23 @@ def has_flashinfer_cutedsl() -> bool: ) +@functools.cache +def has_flashinfer_trtllm_fused_moe() -> bool: + """Return `True` if FlashInfer TRTLLM fused MoE is available.""" + if not has_flashinfer_moe(): + return False + required_functions = [ + ("flashinfer.fused_moe", "trtllm_fp8_block_scale_moe"), + ("flashinfer.fused_moe", "trtllm_fp8_per_tensor_scale_moe"), + ("flashinfer.fused_moe", "trtllm_fp4_block_scale_moe"), + ] + for module_name, attr_name in required_functions: + mod = _get_submodule(module_name) + if not mod or not hasattr(mod, attr_name): + return False + return True + + @functools.cache def has_flashinfer_cutlass_fused_moe() -> bool: """Return `True` if FlashInfer CUTLASS fused MoE is available.""" From 9fec0e13d512b6b9082e40297582d8052f434610 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 16 Dec 2025 17:10:16 -0500 Subject: [PATCH 05/49] [Attention] Cache attention metadata builds across hybrid KV-cache groups (#29627) Signed-off-by: Lucas Wilkinson Co-authored-by: Stanislaw Wozniak --- .../attention/test_chunked_local_attention.py | 2 +- .../layers/chunked_local_attention.py | 16 +++++++--- vllm/envs.py | 4 +-- vllm/v1/attention/backends/flash_attn.py | 13 ++++++++ 
vllm/v1/attention/backends/mamba2_attn.py | 27 ++++++++++++++++ vllm/v1/attention/backends/utils.py | 32 ++++++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 24 +++++++++++++- 7 files changed, 105 insertions(+), 13 deletions(-) diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py index faace3473a281..4529c2cfc29b6 100644 --- a/tests/v1/attention/test_chunked_local_attention.py +++ b/tests/v1/attention/test_chunked_local_attention.py @@ -172,7 +172,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData): ) # Call the function - result = make_local_attention_virtual_batches( + result, _ = make_local_attention_virtual_batches( attn_chunk_size, common_attn_metadata, block_size ) diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py index 0ced0028ded9e..7e3794d408332 100644 --- a/vllm/attention/layers/chunked_local_attention.py +++ b/vllm/attention/layers/chunked_local_attention.py @@ -4,7 +4,7 @@ import functools import torch -from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata +from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig @@ -51,11 +51,19 @@ def create_chunked_local_attention_backend( common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, fast_build: bool = False, - ) -> AttentionMetadata: - common_attn_metadata = make_local_attention_virtual_batches( + ): + cm, make_virtual_batches_block_table = make_local_attention_virtual_batches( attention_chunk_size, common_attn_metadata, block_size ) - return super().build(common_prefix_len, common_attn_metadata, fast_build) + metadata = super().build(common_prefix_len, cm, fast_build) + metadata.make_virtual_batches_block_table = make_virtual_batches_block_table + return metadata + + def 
update_block_table( + self, metadata, blk_table: torch.Tensor, slot_mapping: torch.Tensor + ): + blk_table = metadata.make_virtual_batches_block_table(blk_table) + return super().update_block_table(metadata, blk_table, slot_mapping) attn_backend = subclass_attention_backend( name_prefix=prefix, diff --git a/vllm/envs.py b/vllm/envs.py index d0f2798096263..7e072a588591c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -207,7 +207,7 @@ if TYPE_CHECKING: VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False VLLM_ENABLE_CUDAGRAPH_GC: bool = False VLLM_LOOPBACK_IP: str = "" - VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False + VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True VLLM_ENABLE_RESPONSES_API_STORE: bool = False VLLM_USE_TRTLLM_ATTENTION: str | None = None VLLM_NVFP4_GEMM_BACKEND: str | None = None @@ -1430,7 +1430,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # kv-cache memory usage and enable longer contexts) # TODO(lucas): Remove this flag once latency regression is resolved. "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool( - int(os.getenv("VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0")) + int(os.getenv("VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "1")) ), # Enables support for the "store" option in the OpenAI Responses API. 
# When set to 1, vLLM's OpenAI server will retain the input and output diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index f5ad98cf2125c..3445e998d6371 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" +import copy from dataclasses import dataclass from typing import ClassVar @@ -250,6 +251,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad if get_flash_attn_version() == 3 else AttentionCGSupport.UNIFORM_BATCH ) + supports_update_block_table: bool = True def __init__( self, @@ -493,6 +495,17 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad ) return attn_metadata + def update_block_table( + self, + metadata: FlashAttentionMetadata, + blk_table: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> FlashAttentionMetadata: + new_metadata = copy.copy(metadata) + new_metadata.block_table = blk_table + new_metadata.slot_mapping = slot_mapping + return new_metadata + def use_cascade_attention(self, *args, **kwargs) -> bool: return use_cascade_attention(*args, **kwargs) diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index bf1d8f09ab0ac..f923371283aa0 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy import itertools from dataclasses import dataclass @@ -134,6 +135,8 @@ class Mamba2AttentionMetadata: class Mamba2AttentionMetadataBuilder( BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata] ): + supports_update_block_table: bool = True + def __init__( self, kv_cache_spec: AttentionSpec, @@ -346,3 +349,27 @@ class 
Mamba2AttentionMetadataBuilder( num_computed_tokens_p=num_computed_tokens_p, ) return attn_metadata + + def update_block_table( + self, + metadata: Mamba2AttentionMetadata, + blk_table: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> Mamba2AttentionMetadata: + new_metadata = copy.copy(metadata) + prefix_caching = self.vllm_config.cache_config.enable_prefix_caching + state_indices_t = blk_table if prefix_caching else blk_table[:, 0] + num_reqs = blk_table.shape[0] + + # For CUDA graphs, copy to persistent buffer + if ( + metadata.num_prefills == 0 + and num_reqs <= self.decode_cudagraph_max_bs + and self.compilation_config.cudagraph_mode.has_full_cudagraphs() + ): + persistent_state_indices_t = self.state_indices_tensor[:num_reqs] + persistent_state_indices_t.copy_(state_indices_t, non_blocking=True) + state_indices_t = persistent_state_indices_t + + new_metadata.state_indices_tensor = state_indices_t + return new_metadata diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 1cbe929fc57a8..56763f4b52539 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -4,6 +4,7 @@ import abc import enum import functools from abc import abstractmethod +from collections.abc import Callable from dataclasses import dataclass, field, fields, make_dataclass from typing import ( TYPE_CHECKING, @@ -317,6 +318,9 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): # If not, set this to None. Otherwise set it to the query # length that will be pulled into the front of the batch. 
reorder_batch_threshold: int | None = None + # Does this backend/builder support updating the block table in existing + # metadata + supports_update_block_table: bool = False @abstractmethod def __init__( @@ -387,6 +391,21 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]): """ raise NotImplementedError + def update_block_table( + self, + metadata: M, + blk_table: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> M: + """ + Update the block table for the attention metadata. + Faster when theres multiple kv-cache groups that create virtually the + same metadata but just with different block tables. + + Only needs to be implemented if supports_update_block_table is True. + """ + raise NotImplementedError + def build_for_cudagraph_capture( self, common_attn_metadata: CommonAttentionMetadata ) -> M: @@ -603,7 +622,7 @@ def make_local_attention_virtual_batches( attn_chunk_size: int, common_attn_metadata: CommonAttentionMetadata, block_size: int = 0, -) -> CommonAttentionMetadata: +) -> tuple[CommonAttentionMetadata, Callable[[torch.Tensor], torch.Tensor]]: query_start_loc_np = common_attn_metadata.query_start_loc_cpu.numpy() seq_lens_np = common_attn_metadata.seq_lens_cpu.numpy() block_table = common_attn_metadata.block_table_tensor @@ -715,9 +734,12 @@ def make_local_attention_virtual_batches( # tensor first, which recovers perf. 
batch_indices_torch = torch.from_numpy(batch_indices) block_indices_torch = torch.from_numpy(block_indices) - block_table_local = block_table[batch_indices_torch, block_indices_torch].view( - virtual_batches, -1 - ) + + # Save as a lambda so we can return this for update_block_table + make_block_table = lambda block_table: block_table[ + batch_indices_torch, block_indices_torch + ].view(virtual_batches, -1) + block_table_local = make_block_table(block_table) query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local) seq_lens_cpu = torch.from_numpy(seqlens_k_local) @@ -736,7 +758,7 @@ def make_local_attention_virtual_batches( causal=True, _seq_lens_cpu=seq_lens_cpu, _num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local), - ) + ), make_block_table def make_kv_sharing_fast_prefill_common_attn_metadata( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1aa2ec6bb655c..179f713c4d86a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1630,6 +1630,15 @@ class GPUModelRunner( logits_indices ) + # Cache attention metadata builds across hybrid KV-cache groups + # The only thing that changes between different hybrid KV-cache groups when the + # same metadata builder and KVCacheSpec is the same is the block table, so we + # can cache the attention metadata builds and just update the block table using + # `builder.update_block_table` if the builder supports it. 
+ cached_attn_metadata: dict[ + tuple[KVCacheSpec, type[AttentionMetadataBuilder]], AttentionMetadata + ] = {} + def _build_attn_group_metadata( kv_cache_gid: int, attn_gid: int, @@ -1637,13 +1646,15 @@ class GPUModelRunner( ubid: int | None = None, ) -> None: attn_group = self.attn_groups[kv_cache_gid][attn_gid] + builder = attn_group.get_metadata_builder(ubid or 0) + cache_key = (kv_cache_groups[kv_cache_gid].kv_cache_spec, type(builder)) + cascade_attn_prefix_len = ( cascade_attn_prefix_lens[kv_cache_gid][attn_gid] if cascade_attn_prefix_lens else 0 ) - builder = attn_group.get_metadata_builder(ubid or 0) extra_attn_metadata_args = {} if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder): assert ubid is None, "UBatching not supported with GDN yet" @@ -1658,12 +1669,23 @@ class GPUModelRunner( attn_metadata_i = builder.build_for_cudagraph_capture( common_attn_metadata ) + elif ( + cache_key in cached_attn_metadata + and builder.supports_update_block_table + ): + attn_metadata_i = builder.update_block_table( + cached_attn_metadata[cache_key], + common_attn_metadata.block_table_tensor, + common_attn_metadata.slot_mapping, + ) else: attn_metadata_i = builder.build( common_prefix_len=cascade_attn_prefix_len, common_attn_metadata=common_attn_metadata, **extra_attn_metadata_args, ) + if builder.supports_update_block_table: + cached_attn_metadata[cache_key] = attn_metadata_i if ubid is None: assert isinstance(attn_metadata, dict) From f5f51e5931ffd99afe69696b60765b88d3eb13f2 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 16 Dec 2025 14:18:17 -0800 Subject: [PATCH 06/49] [Core][MM] Optimize encoder cache manager by operating with embeddings only (#30475) Signed-off-by: Roger Wang Co-authored-by: Sun Kim --- .../multimodal/processing/test_mllama4.py | 4 +- tests/multimodal/test_utils.py | 92 +++++++++++++++++++ tests/v1/core/test_encoder_cache_manager.py | 79 +++++++++++++++- .../unit/test_ec_example_connector.py | 2 +- 
.../ec_connector/example_connector.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 8 +- vllm/multimodal/inputs.py | 39 +++++++- vllm/multimodal/profiling.py | 32 ++----- vllm/multimodal/registry.py | 2 +- vllm/v1/core/encoder_cache_manager.py | 80 ++++++++-------- vllm/v1/core/sched/scheduler.py | 35 +++++-- vllm/v1/request.py | 6 +- vllm/v1/worker/gpu_model_runner.py | 49 +++------- vllm/v1/worker/utils.py | 6 ++ 14 files changed, 306 insertions(+), 130 deletions(-) diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index e5ff2d1391b62..325159965c803 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -60,12 +60,12 @@ def test_profiling(model_id: str, max_model_len: int): total_num_patches.item() + num_tiles.item() + 3 ) # image start, image, image end - profiled_tokens = profiler.get_mm_max_contiguous_tokens( + profiled_tokens = profiler.get_mm_max_tokens( max_model_len, mm_counts=mm_counts, ) - assert total_tokens == profiled_tokens["image"] + assert total_num_patches == profiled_tokens["image"] assert total_tokens == sum( placeholder.length for placeholder in decoder_dummy_data.multi_modal_placeholders["image"] diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 636cd0ffd445e..02bb1f769baad 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -9,6 +9,7 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory import numpy as np import pytest +import torch from PIL import Image, ImageChops from vllm.multimodal.image import convert_image_mode @@ -410,6 +411,97 @@ def test_argsort_mm_positions(case): assert modality_idxs == expected_modality_idxs +@pytest.mark.parametrize( + "is_embed,expected", + [ + (None, 5), + (torch.tensor([True, True, True, True, True]), 5), + (torch.tensor([False, False, False, False, False]), 0), + (torch.tensor([True, False, True, 
False, True]), 3), + (torch.tensor([True]), 1), + ], +) +def test_placeholder_range_get_num_embeds(is_embed, expected): + length = len(is_embed) if is_embed is not None else 5 + pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed) + assert pr.get_num_embeds == expected + + +@pytest.mark.parametrize( + "is_embed,expected", + [ + (None, None), + ( + torch.tensor([False, True, False, True, True]), + torch.tensor([0, 1, 1, 2, 3]), + ), + (torch.tensor([True, True, True]), torch.tensor([1, 2, 3])), + ], +) +def test_placeholder_range_embeds_cumsum(is_embed, expected): + length = len(is_embed) if is_embed is not None else 5 + pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed) + + if expected is None: + assert pr.embeds_cumsum is None + return + + assert torch.equal(pr.embeds_cumsum, expected) + # cached_property should return the same object on repeated access + assert pr.embeds_cumsum is pr.embeds_cumsum + + +@pytest.mark.parametrize( + "is_embed,start_idx,end_idx,expected", + [ + (None, 2, 4, (2, 4)), + ( + torch.tensor([False, True, False, True, True]), + 3, + 5, + (1, 3), + ), + ( + torch.tensor([False, True, False, True, True]), + 0, + 2, + (0, 1), + ), + ( + torch.tensor([True, False, True, False]), + 2, + 2, + (1, 1), + ), + ], +) +def test_placeholder_range_get_embeds_indices_in_range( + is_embed, start_idx, end_idx, expected +): + length = len(is_embed) if is_embed is not None else 5 + pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed) + assert pr.get_embeds_indices_in_range(start_idx, end_idx) == expected + + +@pytest.mark.parametrize( + "offset,is_embed,expected", + [ + (0, None, [(0, 4)]), + ( + 2, + torch.tensor([False, True, False, True, True]), + [(3, 3), (5, 6)], + ), + (0, torch.tensor([True, True, True, True]), [(0, 3)]), + (0, torch.tensor([False, False, False, False]), []), + ], +) +def test_placeholder_range_extract_embeds_range(offset, is_embed, expected): + length = len(is_embed) if is_embed is not None 
else 5 + pr = PlaceholderRange(offset=offset, length=length, is_embed=is_embed) + assert pr.extract_embeds_range() == expected + + @pytest.mark.asyncio @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) @pytest.mark.parametrize("num_frames", [-1, 32, 1800]) diff --git a/tests/v1/core/test_encoder_cache_manager.py b/tests/v1/core/test_encoder_cache_manager.py index 8a52b5bd78977..511ff48c401ca 100644 --- a/tests/v1/core/test_encoder_cache_manager.py +++ b/tests/v1/core/test_encoder_cache_manager.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +import torch from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange from vllm.v1.core.encoder_cache_manager import EncoderCacheManager @@ -23,7 +24,7 @@ class MockRequest: ) self.mm_features.append(feature) - def get_num_encoder_tokens(self, input_id: int) -> int: + def get_num_encoder_embeds(self, input_id: int) -> int: return self._token_counts[input_id] @@ -162,8 +163,8 @@ def test_schedule_request_multi_images_respect_space_limit(): num_tokens_to_schedule = 0 assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule) - num_tokens_to_schedule += req.get_num_encoder_tokens(0) - compute_budget -= req.get_num_encoder_tokens(0) + num_tokens_to_schedule += req.get_num_encoder_embeds(0) + compute_budget -= req.get_num_encoder_embeds(0) assert not manager.can_allocate(req, 1, compute_budget, num_tokens_to_schedule) @@ -174,7 +175,75 @@ def test_schedule_request_multi_images_respect_compute_limit(): compute_budget = 10 num_tokens_to_schedule = 0 assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule) - num_tokens_to_schedule += req.get_num_encoder_tokens(0) - compute_budget -= req.get_num_encoder_tokens(0) + num_tokens_to_schedule += req.get_num_encoder_embeds(0) + compute_budget -= req.get_num_encoder_embeds(0) assert not manager.can_allocate(req, 1, compute_budget, 
num_tokens_to_schedule) + + +def test_encoder_cache_with_is_embed_mask(): + class MockRequestWithMask(MockRequest): + def get_num_encoder_embeds(self, input_id: int) -> int: + return self.mm_features[input_id].mm_position.get_num_embeds + + is_embed = torch.zeros(100, dtype=torch.bool) + is_embed[torch.tensor([5, 15, 25, 35, 45, 55, 65, 75])] = True + + request = MockRequestWithMask("r1", ["img1"], [100]) + request.mm_features[0] = MultiModalFeatureSpec( + data=None, + modality="image", + identifier="img1", + mm_position=PlaceholderRange(offset=0, length=100, is_embed=is_embed), + ) + + manager = EncoderCacheManager(cache_size=100) + manager.allocate(request, 0) + + assert manager.num_free_slots == 92 + assert "img1" in manager.cached + + old_size = 100 + new_size = request.mm_features[0].mm_position.get_num_embeds + assert new_size == 8 + savings_ratio = old_size / new_size + assert savings_ratio == 12.5 + + +def test_encoder_cache_mask_based_retrieval(): + class MockRequestWithMask(MockRequest): + def get_num_encoder_embeds(self, input_id: int) -> int: + return self.mm_features[input_id].mm_position.get_num_embeds + + is_embed = torch.tensor( + [False, False, True, True, False, True, True, True, False, False] + ) + + request = MockRequestWithMask("r1", ["img1"], [10]) + request.mm_features[0] = MultiModalFeatureSpec( + data=None, + modality="image", + identifier="img1", + mm_position=PlaceholderRange(offset=0, length=10, is_embed=is_embed), + ) + + manager = EncoderCacheManager(cache_size=50) + manager.allocate(request, 0) + + assert request.mm_features[0].mm_position.get_num_embeds == 5 + + start_idx = 2 + end_idx = 8 + num_embeds_before = is_embed[:start_idx].sum().item() + num_embeds_in_range = is_embed[start_idx:end_idx].sum().item() + + assert num_embeds_before == 0 + assert num_embeds_in_range == 5 + + start_idx = 0 + end_idx = 5 + num_embeds_before = is_embed[:start_idx].sum().item() if start_idx > 0 else 0 + num_embeds_in_range = 
is_embed[start_idx:end_idx].sum().item() + + assert num_embeds_before == 0 + assert num_embeds_in_range == 2 diff --git a/tests/v1/ec_connector/unit/test_ec_example_connector.py b/tests/v1/ec_connector/unit/test_ec_example_connector.py index 7e9eb21310031..9ed82e1cef823 100644 --- a/tests/v1/ec_connector/unit/test_ec_example_connector.py +++ b/tests/v1/ec_connector/unit/test_ec_example_connector.py @@ -38,7 +38,7 @@ class MockRequest: ) self.mm_features.append(feature) - def get_num_encoder_tokens(self, input_id: int) -> int: + def get_num_encoder_embeds(self, input_id: int) -> int: assert input_id < len(self._token_counts) return self._token_counts[input_id] diff --git a/vllm/distributed/ec_transfer/ec_connector/example_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py index 5f2eff5a8e6a8..c9aad9e9fc8f3 100644 --- a/vllm/distributed/ec_transfer/ec_connector/example_connector.py +++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py @@ -144,7 +144,7 @@ class ECExampleConnector(ECConnectorBase): Update ECConnector state after encoder cache allocation. """ mm_hash = request.mm_features[index].identifier - num_encoder_token = request.get_num_encoder_tokens(index) + num_encoder_token = request.get_num_encoder_embeds(index) # Insert mm_hash only if this block has not been recorded yet. 
self._mm_datas_need_loads[mm_hash] = num_encoder_token diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c0589986d1fe8..4838f68e06f70 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -713,17 +713,13 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo): mm_counts: Mapping[str, int], ) -> int: target_width, target_height = self.get_image_size_with_most_features() - video_soft_tokens = self.get_num_video_tokens( + num_video_soft_tokens = self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts), image_processor=None, ) - - # NOTE: By default in Qwen3-VL, one video token is converted to - # "<{timestamp} seconds>" (on average 9.5 tokens) + vision_start_token + video_token + vision_end_token # noqa: E501 - formatted_video_soft_tokens = video_soft_tokens * 12.5 - return int(formatted_video_soft_tokens) + return num_video_soft_tokens def _calculate_timestamps( self, indices: list[int] | torch.Tensor, video_fps: float, merge_size: int diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 6b1cbbe24e2e7..fa69818a7b1f8 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass -from functools import partial +from functools import cached_property, partial from itertools import accumulate from typing import ( TYPE_CHECKING, @@ -169,11 +169,42 @@ class PlaceholderRange: between `offset` and `offset + length` to assign embeddings to. 
""" - def get_num_embeds(self) -> int: + @cached_property + def embeds_cumsum(self) -> torch.Tensor | None: if self.is_embed is None: + return None + + return self.is_embed.cumsum(dim=0) + + @cached_property + def get_num_embeds(self) -> int: + if self.embeds_cumsum is None: return self.length - return int(self.is_embed.sum().item()) + return int(self.embeds_cumsum[-1]) + + def get_embeds_indices_in_range( + self, start_idx: int, end_idx: int + ) -> tuple[int, int]: + """ + Returns the starting and ending indices of the embeddings of encoder outputs + in the range of [start_idx, end_idx) in the placeholders. + + For example, given: + PlaceholderRange(offset=2, length=5, is_embed=[False, True, False, True, True]) + + If start_idx=3 and end_idx=5, the output is (1, 3) because we want to get + the second and the third embeddings from the encoder output. + """ + if self.embeds_cumsum is None: + return start_idx, end_idx + + embeds_start_idx = ( + int(self.embeds_cumsum[start_idx - 1]) if start_idx > 0 else 0 + ) + embeds_end_idx = int(self.embeds_cumsum[end_idx - 1]) + + return embeds_start_idx, embeds_end_idx def extract_embeds_range(self) -> list[tuple[int, int]]: """Extract the start and end indices of the embedded region in prompt. @@ -188,7 +219,7 @@ class PlaceholderRange: Returns full placeholder range if `is_embed` is `None`. 
""" if self.is_embed is None: - return [(self.offset, self.offset + self.length)] + return [(self.offset, self.offset + self.length - 1)] mask_i = self.is_embed.int() starts = torch.nonzero( diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index cb70041e9744f..a690948f759e9 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -274,15 +274,11 @@ class MultiModalProfiler(Generic[_I]): def _get_mm_num_tokens( self, mm_inputs: MultiModalInputs, - mm_embeddings_only: bool = True, ) -> Mapping[str, int]: placeholders_by_modality = mm_inputs["mm_placeholders"] return { - modality: sum( - item.get_num_embeds() if mm_embeddings_only else item.length - for item in placeholders - ) + modality: sum(item.get_num_embeds for item in placeholders) for modality, placeholders in placeholders_by_modality.items() } @@ -328,12 +324,15 @@ class MultiModalProfiler(Generic[_I]): multi_modal_placeholders=mm_inputs["mm_placeholders"], ) - def _get_mm_max_tokens( + def get_mm_max_tokens( self, seq_len: int, mm_counts: Mapping[str, int] | None = None, - mm_embeddings_only: bool = True, ) -> Mapping[str, int]: + """ + Returns the maximum number of embeddings per item of each modality, excluding + any break/text tokens in-between multimodal embeddings/encoder outputs. + """ if mm_counts is None: mm_counts = self.get_mm_limits() @@ -349,21 +348,4 @@ class MultiModalProfiler(Generic[_I]): } mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) - return self._get_mm_num_tokens(mm_inputs, mm_embeddings_only=mm_embeddings_only) - - def get_mm_max_contiguous_tokens( - self, - seq_len: int, - mm_counts: Mapping[str, int] | None = None, - ) -> Mapping[str, int]: - """ - Returns the maximum length of the multimodal (image placeholders+text) - tokens, including any break/text tokens in-between image embeddings. - - ` [IMG] [IMG] [IMG] [IMG] [IMG] [IMG] ` - Returns 9, even when the number of image embeddings is 6. 
- - This is important to take into account when profiling and - initializing the encoder cache size. - """ - return self._get_mm_max_tokens(seq_len, mm_counts, mm_embeddings_only=False) + return self._get_mm_num_tokens(mm_inputs) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 00a84f9dec4f7..1e7fe8648ab71 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -164,7 +164,7 @@ class MultiModalRegistry: profiler.get_mm_limits() if profiler_limits is None else profiler_limits ) - return profiler.get_mm_max_contiguous_tokens( + return profiler.get_mm_max_tokens( seq_len, {modality: 1 for modality, limit in profiler_limits.items() if limit > 0}, ) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 50f738713590b..d73c05d2cf80b 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -39,20 +39,26 @@ class EncoderCacheManager: space for new embeddings. Oldest cached embeddings with no request referenced will be first evicted. + NOTE: The EncoderCacheManager operates on the level of multimodal embeddings + instead of encoder tokens (i.e. all tokens that represent the multimodal data + in the input sequence). This means all break/text tokens in-between multimodal + embeddings are not considered with respect to the cache size and the number + of free slots. + Args: cache_size: Limit the size of the cache, measured by the number of - tokens from the input sequence. + encoder embeddings from the input sequence. Attributes: - cache_size: Total cache capacity in encoder tokens. - num_free_slots: Current available cache capacity in encoder tokens. + cache_size: Total cache capacity in encoder embeddings. + num_free_slots: Current available cache capacity in encoder embeddings. num_freeable_slots: Capacity that can be immediately reclaimed by - evicting entries with zero references (in encoder tokens). 
+ evicting entries with zero references (in encoder embeddings). cached: Mapping from mm_hash to a set of request IDs that currently reference the cached entry. If the set is empty, the entry exists but is not referenced by any request and is eligible for reclamation. - freeable: List of tuples (mm_hash, num_tokens) representing entries + freeable: List of tuples (mm_hash, num_encoder_embeds) representing entries whose no current running request is needed and that can be freed to make space when needed. freed: List of mm_hash strings that were actually evicted since the @@ -67,7 +73,7 @@ class EncoderCacheManager: # mm_hash of mm_data => ids of requests that reference the mm_data self.cached: dict[str, set[str]] = {} - # mm_hash of mm_data => num_encoder_tokens of the mm_data + # mm_hash of mm_data => num_encoder_embeds of the mm_data self.freeable: OrderedDict[str, int] = OrderedDict() self.freed: list[str] = [] @@ -93,8 +99,8 @@ class EncoderCacheManager: # Cached but currently not referenced by any request if not self.cached[mm_hash]: - num_tokens = self.freeable.pop(mm_hash) - self.num_freeable_slots -= num_tokens + num_encoder_embeds = self.freeable.pop(mm_hash) + self.num_freeable_slots -= num_encoder_embeds self.cached[mm_hash].add(request.request_id) return True @@ -104,7 +110,7 @@ class EncoderCacheManager: request: Request, input_id: int, encoder_compute_budget: int, - num_tokens_to_schedule: int, + num_embeds_to_schedule: int, ) -> bool: """Check if there's sufficient cache space for a multimodal input. If there is, return True and update EncoderCacheManager state. @@ -121,9 +127,9 @@ class EncoderCacheManager: Args: request: The request containing the multimodal input. input_id: Index of the multimodal input within the request. - encoder_compute_budget: Number of encoder tokens allowed to be + encoder_compute_budget: Number of encoder embeddings allowed to be computed when this method is invoked. 
- num_tokens_to_schedule: Number of tokens already scheduled to be + num_embeds_to_schedule: Number of encoder embeddings already scheduled to be allocated with cache space when this method is invoked. Returns: @@ -134,30 +140,30 @@ class EncoderCacheManager: Note: This method does not allocate physical memory for the encoder output but only the state of EncoderCacheManager. """ - num_tokens = request.get_num_encoder_tokens(input_id) + num_embeds = request.get_num_encoder_embeds(input_id) # Not enough compute budget - if num_tokens > encoder_compute_budget: + if num_embeds > encoder_compute_budget: return False - num_tokens += num_tokens_to_schedule + num_embeds += num_embeds_to_schedule # Enough free slots - if num_tokens <= self.num_free_slots: + if num_embeds <= self.num_free_slots: return True # Not enough reclaimable slots - if num_tokens > self.num_freeable_slots: + if num_embeds > self.num_freeable_slots: return False # Not enough free slots but enough reclaimable slots # NOTE: Eviction takes place here, but physical memory is not freed # until model runner is notified by the scheduler output. - while num_tokens > self.num_free_slots: - mm_hash, num_free_token = self.freeable.popitem(last=False) + while num_embeds > self.num_free_slots: + mm_hash, num_free_embeds = self.freeable.popitem(last=False) del self.cached[mm_hash] self.freed.append(mm_hash) - self.num_free_slots += num_free_token + self.num_free_slots += num_free_embeds return True def allocate(self, request: Request, input_id: int) -> None: @@ -176,16 +182,16 @@ class EncoderCacheManager: if mm_hash not in self.cached: self.cached[mm_hash] = set() - num_encoder_tokens = request.get_num_encoder_tokens(input_id) + num_encoder_embeds = request.get_num_encoder_embeds(input_id) # NOTE: Encoder cache should always have enough space for encoder inputs # that are scheduled since eviction takes place at can_allocate(). 
- assert self.num_free_slots >= num_encoder_tokens - assert self.num_freeable_slots >= num_encoder_tokens + assert self.num_free_slots >= num_encoder_embeds + assert self.num_freeable_slots >= num_encoder_embeds self.cached[mm_hash].add(request_id) - self.num_free_slots -= num_encoder_tokens - self.num_freeable_slots -= num_encoder_tokens + self.num_free_slots -= num_encoder_embeds + self.num_freeable_slots -= num_encoder_embeds def get_cached_input_ids(self, request: Request) -> set[int]: """Get all cached multimodal input IDs for a request. @@ -206,7 +212,7 @@ class EncoderCacheManager: When the reference set for the corresponding `mm_hash` becomes empty, the entry is appended to `freeable` and `num_freeable_slots` is - increased by the number of encoder tokens for that input. + increased by the number of encoder embeddings for that input. The entry is NOT physically freed until capacity is needed (e.g., by `can_allocate`). @@ -218,9 +224,9 @@ class EncoderCacheManager: return self.cached[mm_hash].discard(req_id) if not self.cached[mm_hash]: - num_tokens = request.get_num_encoder_tokens(input_id) - self.freeable[mm_hash] = num_tokens - self.num_freeable_slots += num_tokens + num_encoder_embeds = request.get_num_encoder_embeds(input_id) + self.freeable[mm_hash] = num_encoder_embeds + self.num_freeable_slots += num_encoder_embeds def free(self, request: Request) -> None: """Free all encoder input cache reference held by *request*. 
@@ -361,20 +367,20 @@ class EncoderDecoderCacheManager(EncoderCacheManager): request: Request, input_id: int, encoder_compute_budget: int, - num_tokens_to_schedule: int, + num_embeds_to_schedule: int, ) -> bool: - num_tokens = request.get_num_encoder_tokens(input_id) + num_encoder_embeds = request.get_num_encoder_embeds(input_id) # Not enough compute budget - if num_tokens > encoder_compute_budget: + if num_encoder_embeds > encoder_compute_budget: return False - num_tokens += num_tokens_to_schedule + num_encoder_embeds += num_embeds_to_schedule # Enough free slots - return num_tokens <= self.num_free_slots + return num_encoder_embeds <= self.num_free_slots def allocate(self, request: Request, input_id: int) -> None: - num_encoder_tokens = request.get_num_encoder_tokens(input_id) - self.num_free_slots -= num_encoder_tokens + num_encoder_embeds = request.get_num_encoder_embeds(input_id) + self.num_free_slots -= num_encoder_embeds mm_hash = request.mm_features[input_id].identifier self.freed.append(mm_hash) @@ -392,5 +398,5 @@ class EncoderDecoderCacheManager(EncoderCacheManager): return freed def free_encoder_input(self, request: Request, input_id: int) -> None: - num_tokens = request.get_num_encoder_tokens(input_id) - self.num_free_slots += num_tokens + num_encoder_embeds = request.get_num_encoder_embeds(input_id) + self.num_free_slots += num_encoder_embeds diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 754e0b9d08316..8e835ad096405 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -355,11 +355,11 @@ class Scheduler(SchedulerInterface): if preempted_encoder_inputs: # Restore encoder compute budget if the preempted # request had encoder inputs scheduled in this step. 
- num_tokens_to_restore = sum( - preempted_req.get_num_encoder_tokens(i) + num_embeds_to_restore = sum( + preempted_req.get_num_encoder_embeds(i) for i in preempted_encoder_inputs ) - encoder_compute_budget += num_tokens_to_restore + encoder_compute_budget += num_embeds_to_restore req_index -= 1 else: preempted_req = self.running.pop() @@ -911,10 +911,11 @@ class Scheduler(SchedulerInterface): # multiple encoder inputs per request), we need to create temporary # trackers for accounting at the encoder input level. mm_hashes_to_schedule = set() - num_tokens_to_schedule = 0 + num_embeds_to_schedule = 0 for i, mm_feature in enumerate(mm_features): start_pos = mm_feature.mm_position.offset num_encoder_tokens = mm_feature.mm_position.length + num_encoder_embeds = mm_feature.mm_position.get_num_embeds # The encoder output is needed if the two ranges overlap: # [num_computed_tokens, num_computed_tokens + num_new_tokens) and @@ -970,9 +971,8 @@ class Scheduler(SchedulerInterface): ): num_new_tokens = start_pos - num_computed_tokens break - if not self.encoder_cache_manager.can_allocate( - request, i, encoder_compute_budget, num_tokens_to_schedule + request, i, encoder_compute_budget, num_embeds_to_schedule ): # The encoder cache is full or the encoder budget is exhausted. # NOTE(woosuk): We assume that the encoder input tokens should @@ -992,14 +992,31 @@ class Scheduler(SchedulerInterface): num_new_tokens = 0 break + # Calculate the number of embeddings to schedule in the current range + # of scheduled encoder placholder tokens. + start_idx_rel = max(0, num_computed_tokens - start_pos) + end_idx_rel = min( + num_encoder_tokens, num_computed_tokens + num_new_tokens - start_pos + ) + curr_embeds_start, curr_embeds_end = ( + mm_feature.mm_position.get_embeds_indices_in_range( + start_idx_rel, + end_idx_rel, + ) + ) + # There's no embeddings in the current range of encoder placeholder tokens + # so we can skip the encoder input. 
+ if curr_embeds_end - curr_embeds_start == 0: + continue + if self.ec_connector is not None and remote_cache_has_item[i]: mm_hashes_to_schedule.add(request.mm_features[i].identifier) external_load_encoder_input.append(i) - num_tokens_to_schedule += num_encoder_tokens + num_embeds_to_schedule += num_encoder_embeds continue - num_tokens_to_schedule += num_encoder_tokens - encoder_compute_budget -= num_encoder_tokens + num_embeds_to_schedule += num_encoder_embeds + encoder_compute_budget -= num_encoder_embeds mm_hashes_to_schedule.add(request.mm_features[i].identifier) encoder_inputs_to_schedule.append(i) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index a775e840e841c..f33059b80b894 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -209,10 +209,10 @@ class Request: def get_finished_reason(self) -> FinishReason | None: return RequestStatus.get_finished_reason(self.status) - def get_num_encoder_tokens(self, input_id: int) -> int: + def get_num_encoder_embeds(self, input_id: int) -> int: assert input_id < len(self.mm_features) - num_tokens = self.mm_features[input_id].mm_position.length - return num_tokens + num_embeds = self.mm_features[input_id].mm_position.get_num_embeds + return num_embeds def record_event( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 179f713c4d86a..1db5bc99fff6c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -169,9 +169,7 @@ from .utils import ( MultiModalBudget, add_kv_sharing_layers_to_kv_cache_groups, bind_kv_cache, - gather_mm_placeholders, sanity_check_mm_encoder_outputs, - scatter_mm_placeholders, ) if TYPE_CHECKING: @@ -2209,10 +2207,7 @@ class GPUModelRunner( # Cache the encoder outputs by mm_hash for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs): - self.encoder_cache[mm_hash] = scatter_mm_placeholders( - output, - is_embed=pos_info.is_embed, - ) + self.encoder_cache[mm_hash] = output logger.debug("Finish 
execute for mm hash %s", mm_hash) self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash) @@ -2263,6 +2258,13 @@ class GPUModelRunner( num_encoder_tokens, ) assert start_idx < end_idx + curr_embeds_start, curr_embeds_end = ( + pos_info.get_embeds_indices_in_range(start_idx, end_idx) + ) + # If there are no embeddings in the current range, we skip + # gathering the embeddings. + if curr_embeds_start == curr_embeds_end: + continue mm_hash = mm_feature.identifier encoder_output = self.encoder_cache.get(mm_hash, None) @@ -2270,16 +2272,14 @@ class GPUModelRunner( if (is_embed := pos_info.is_embed) is not None: is_embed = is_embed[start_idx:end_idx] + mm_embeds_item = encoder_output[curr_embeds_start:curr_embeds_end] + else: + mm_embeds_item = encoder_output[start_idx:end_idx] req_start_pos = req_start_idx + start_pos - num_computed_tokens is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = ( True if is_embed is None else is_embed ) - - mm_embeds_item = gather_mm_placeholders( - encoder_output[start_idx:end_idx], - is_embed=is_embed, - ) mm_embeds_req.append(mm_embeds_item) if self.is_multimodal_pruning_enabled and self.uses_mrope: @@ -4508,31 +4508,8 @@ class GPUModelRunner( dummy_encoder_outputs, expected_num_items=max_mm_items_per_batch, ) - - # NOTE: This happens when encoder cache needs to store - # the embeddings that encoder outputs are scattered onto. - # In this case we create dummy embeddings of size - # (max_tokens_for_modality, hidden_size) and scatter - # encoder output into it. 
- encoder_output_shape = dummy_encoder_outputs[0].shape - max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[ - dummy_modality - ] - if encoder_output_shape[0] < max_mm_tokens_per_item: - encoder_hidden_size = encoder_output_shape[-1] - expanded_outputs = [] - for output in dummy_encoder_outputs: - expanded = output.new_zeros( - (max_mm_tokens_per_item, encoder_hidden_size) - ) - num_tokens = output.shape[0] - expanded[:num_tokens].copy_(output) - expanded_outputs.append(expanded) - - dummy_encoder_outputs = expanded_outputs - - # Cache the dummy encoder outputs. - self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) + for i, output in enumerate(dummy_encoder_outputs): + self.encoder_cache[f"tmp_{i}"] = output # Add `is_profile` here to pre-allocate communication buffers hidden_states, last_hidden_states = self._dummy_run( diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index e9c48223d58b9..2e8afec024ce9 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -4,10 +4,12 @@ from collections import defaultdict from dataclasses import dataclass, field import torch +from typing_extensions import deprecated from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.logger import init_logger from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config @@ -17,6 +19,8 @@ from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec +logger = init_logger(__name__) + class MultiModalBudget: """Helper class to calculate budget information for multi-modal models.""" @@ -198,6 +202,7 @@ def 
sanity_check_mm_encoder_outputs( ) +@deprecated("`scatter_mm_placeholders` is deprecated and will be removed in v0.15.0.") def scatter_mm_placeholders( embeds: torch.Tensor, is_embed: torch.Tensor | None, @@ -226,6 +231,7 @@ def scatter_mm_placeholders( return placeholders +@deprecated("`gather_mm_placeholders` is deprecated and will be removed in v0.15.0.") def gather_mm_placeholders( placeholders: torch.Tensor, is_embed: torch.Tensor | None, From eaa82a709a963ab744647a701fe267223ed7b02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20C=C3=A1mpora?= <961215+dcampora@users.noreply.github.com> Date: Tue, 16 Dec 2025 23:21:17 +0100 Subject: [PATCH 07/49] [Bugfix][DSV32] Fix overflow in topk. (#30754) Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com> Signed-off-by: mgoin Co-authored-by: mgoin --- csrc/sampler.cu | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/csrc/sampler.cu b/csrc/sampler.cu index fc2154beff9e0..d458f8e4c1d02 100644 --- a/csrc/sampler.cu +++ b/csrc/sampler.cu @@ -550,8 +550,8 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill( int rowEnd = rowEnds[rowIdx]; // Local pointers to this block - outIndices += rowIdx * topK; - logits += rowIdx * stride0; + outIndices += static_cast(rowIdx) * topK; + logits += static_cast(rowIdx) * stride0; topKPerRowJob( nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK); @@ -576,19 +576,21 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode( // Local pointers to this block if constexpr (!multipleBlocksPerRow && !mergeBlocks) { - outIndices += rowIdx * topK; + outIndices += static_cast(rowIdx) * topK; } else if constexpr (multipleBlocksPerRow) { const auto blockSize = rowEnd / gridDim.y; // 16384 / 2 = 8192 rowStart = blockSize * blockIdx.y; // 8192 * 1 = 8192 rowEnd = gridDim.y == blockIdx.y + 1 ? 
rowEnd : rowStart + blockSize; - outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK; - outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK; + outIndices += + static_cast(rowIdx) * gridDim.y * topK + blockIdx.y * topK; + outLogits += + static_cast(rowIdx) * gridDim.y * topK + blockIdx.y * topK; } else if constexpr (mergeBlocks) { rowEnd = numBlocksToMerge * topK; - indices += rowIdx * numBlocksToMerge * topK; - outIndices += rowIdx * topK; + indices += static_cast(rowIdx) * numBlocksToMerge * topK; + outIndices += static_cast(rowIdx) * topK; } - logits += rowIdx * stride0; + logits += static_cast(rowIdx) * stride0; topKPerRowJob( From ce96857fdd2bf2390aaa2183561fd1a0f5c464c7 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Wed, 17 Dec 2025 06:35:28 +0800 Subject: [PATCH 08/49] [Kernel][Quantization][MoE] add marlin kernel support for turing (sm75) (#29901) Signed-off-by: Jinzhen Lin Co-authored-by: Michael Goin --- CMakeLists.txt | 109 ++++--- csrc/moe/marlin_moe_wna16/.gitignore | 1 + csrc/moe/marlin_moe_wna16/generate_kernels.py | 132 +++++---- csrc/moe/marlin_moe_wna16/marlin_template.h | 208 ++++---------- csrc/moe/marlin_moe_wna16/ops.cu | 54 ++-- csrc/quantization/gptq_marlin/.gitignore | 1 + csrc/quantization/gptq_marlin/dequant.h | 2 +- .../gptq_marlin/generate_kernels.py | 132 +++++---- csrc/quantization/gptq_marlin/gptq_marlin.cu | 68 +++-- csrc/quantization/gptq_marlin/marlin.cuh | 74 ++++- csrc/quantization/gptq_marlin/marlin_mma.h | 269 ++++++++++++++++++ .../gptq_marlin/marlin_template.h | 184 +++--------- .../layers/quantization/awq_marlin.py | 2 +- .../model_executor/layers/quantization/fp8.py | 2 +- .../layers/quantization/gptq_marlin.py | 2 +- .../layers/quantization/modelopt.py | 2 +- 16 files changed, 729 insertions(+), 513 deletions(-) create mode 100644 csrc/quantization/gptq_marlin/marlin_mma.h diff --git a/CMakeLists.txt b/CMakeLists.txt index cd52df86e0346..5ca71f6ba4df0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-357,6 +357,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # marlin arches for fp16 output cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}") + # marlin has limited support for turing + cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}") # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX) cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") # marlin arches for fp8 input @@ -364,8 +366,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0) cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}") + # marlin arches for other files + cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}") - if (MARLIN_ARCHS) + if (MARLIN_OTHER_ARCHS) # # For the Marlin kernels we automatically generate sources for various @@ -406,25 +410,39 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Marlin generation script has not changed, skipping generation.") endif() - file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu") - set_gencode_flags_for_srcs( - SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" - CUDA_ARCHS "${MARLIN_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) - set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} - PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") - endif() - list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) + if (MARLIN_ARCHS) + file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" + CUDA_ARCHS "${MARLIN_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + 
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) - file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu") - set_gencode_flags_for_srcs( - SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}" - CUDA_ARCHS "${MARLIN_BF16_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) - set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC} - PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}" + CUDA_ARCHS "${MARLIN_BF16_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC}) + endif() + + if (MARLIN_SM75_ARCHS) + file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}" + CUDA_ARCHS "${MARLIN_SM75_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC}) endif() - list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC}) if (MARLIN_FP8_ARCHS) file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu") @@ -446,14 +464,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_SRCS}" - CUDA_ARCHS "${MARLIN_ARCHS}") + CUDA_ARCHS "${MARLIN_OTHER_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) - 
set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu" + set_source_files_properties(${MARLIN_SRCS} PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") endif() list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") - message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") + message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" " in CUDA target architectures") @@ -980,12 +998,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # note that we always set `use_atomic_add=False` for moe marlin now, # so we don't need 9.0 for bf16 atomicAdd PTX cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}") + # moe marlin has limited support for turing + cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}") # moe marlin arches for fp8 input # - sm80 doesn't support fp8 computation # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. 
RTX 50x0) cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}") - if (MARLIN_MOE_ARCHS) + # moe marlin arches for other files + cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}") + if (MARLIN_MOE_OTHER_ARCHS) # # For the Marlin MOE kernels we automatically generate sources for various @@ -1026,16 +1048,29 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Marlin MOE generation script has not changed, skipping generation.") endif() - file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu") - list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu") - set_gencode_flags_for_srcs( - SRCS "${MARLIN_MOE_SRC}" - CUDA_ARCHS "${MARLIN_MOE_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) - set_source_files_properties(${MARLIN_MOE_SRC} - PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + if (MARLIN_MOE_ARCHS) + file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_MOE_SRC}" + CUDA_ARCHS "${MARLIN_MOE_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_MOE_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC}) + endif() + + if (MARLIN_MOE_SM75_ARCHS) + file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_MOE_SM75_SRC}" + CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_MOE_SM75_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC}) endif() - list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC}) if (MARLIN_MOE_FP8_ARCHS) file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu") @@ -1049,7 +1084,17 @@ if(VLLM_GPU_LANG 
STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC}) endif() - message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") + set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_MOE_OTHER_SRC}" + CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_MOE_OTHER_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}") + + message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" " in CUDA target architectures") diff --git a/csrc/moe/marlin_moe_wna16/.gitignore b/csrc/moe/marlin_moe_wna16/.gitignore index ba805f9250ece..7dc482a894660 100644 --- a/csrc/moe/marlin_moe_wna16/.gitignore +++ b/csrc/moe/marlin_moe_wna16/.gitignore @@ -1,2 +1,3 @@ sm*_kernel_*.cu kernel_selector.h +kernel_*.cu diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py index 88f1055337fd5..9db03ea149d0c 100644 --- a/csrc/moe/marlin_moe_wna16/generate_kernels.py +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -10,6 +10,8 @@ import jinja2 ARCHS = [] SUPPORT_FP8 = False +SUPPORT_SM75 = False +SUPPORT_SM80 = False for arch in sys.argv[1].split(","): arch = arch[: arch.index(".") + 2].replace(".", "") arch = int(arch) @@ -19,6 +21,10 @@ for arch in sys.argv[1].split(","): # with FP16 MMA, so it cannot achieve any acceleration. 
if arch in [89, 120]: SUPPORT_FP8 = True + if arch >= 80: + SUPPORT_SM80 = True + if arch == 75: + SUPPORT_SM75 = True FILE_HEAD_COMMENT = """ // auto generated by generate_kernels.py @@ -157,6 +163,7 @@ def remove_old_kernels(): def generate_new_kernels(): result_dict = {} + sm_75_result_dict = {} for quant_config in QUANT_CONFIGS: c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"]) @@ -174,6 +181,8 @@ def generate_new_kernels(): s_type = quant_config.get("s_type", c_type) if (a_type, b_type, c_type) not in result_dict: result_dict[(a_type, b_type, c_type)] = [] + if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16": + sm_75_result_dict[(a_type, b_type, c_type)] = [] for group_blocks, m_blocks, thread_configs in itertools.product( all_group_blocks, all_m_blocks, all_thread_configs @@ -197,78 +206,89 @@ def generate_new_kernels(): "thread_k_blocks": thread_k // 16, "thread_n_blocks": thread_n // 16, "m_block_size_8": "true" if m_blocks == 0.5 else "false", - "stages": "pipe_stages", + "stages": 4, "group_blocks": group_blocks, "is_zp_float": "false", } - result_dict[(a_type, b_type, c_type)].append(config) + if SUPPORT_SM80: + result_dict[(a_type, b_type, c_type)].append(config) + if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75: + config_sm75 = config.copy() + config_sm75["stages"] = 2 + sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75) kernel_selector_str = FILE_HEAD_COMMENT - for (a_type, b_type, c_type), config_list in result_dict.items(): - all_template_str_list = [] - for config in config_list: - s_type = config["s_type"] - template_str = jinja2.Template(TEMPLATE).render( - a_type_id=f"vllm::{a_type}.id()", - b_type_id=f"vllm::{b_type}.id()", - c_type_id=f"vllm::{c_type}.id()", - s_type_id=f"vllm::{s_type}.id()", - **config, - ) - all_template_str_list.append(template_str) - - conditions = [ - f"a_type == vllm::{a_type}", - f"b_type == vllm::{b_type}", - f"c_type == vllm::{c_type}", - f"s_type == 
vllm::{s_type}", - f"threads == {config['threads']}", - f"thread_m_blocks == {config['thread_m_blocks']}", - f"thread_n_blocks == {config['thread_n_blocks']}", - f"thread_k_blocks == {config['thread_k_blocks']}", - f"m_block_size_8 == {config['m_block_size_8']}", - f"group_blocks == {config['group_blocks']}", - f"is_zp_float == {config['is_zp_float']}", - ] - conditions = " && ".join(conditions) - - if kernel_selector_str == FILE_HEAD_COMMENT: - kernel_selector_str += f"if ({conditions})\n kernel = " - else: - kernel_selector_str += f"else if ({conditions})\n kernel = " - - kernel_template2 = ( - "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, " - "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, " - "{{thread_n_blocks}}, {{thread_k_blocks}}, " - "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, " - "{{is_zp_float}}>;" - ) - - kernel_selector_str += ( - jinja2.Template(kernel_template2).render( + for result_dict_tmp in [result_dict, sm_75_result_dict]: + for (a_type, b_type, c_type), config_list in result_dict_tmp.items(): + all_template_str_list = [] + if not config_list: + continue + for config in config_list: + s_type = config["s_type"] + template_str = jinja2.Template(TEMPLATE).render( a_type_id=f"vllm::{a_type}.id()", b_type_id=f"vllm::{b_type}.id()", c_type_id=f"vllm::{c_type}.id()", s_type_id=f"vllm::{s_type}.id()", **config, ) - + "\n" - ) + all_template_str_list.append(template_str) - file_content = FILE_HEAD + "\n\n" - file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" - if a_type == "kFE4M3fn": - filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" - else: - filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" + conditions = [ + f"a_type == vllm::{a_type}", + f"b_type == vllm::{b_type}", + f"c_type == vllm::{c_type}", + f"s_type == vllm::{s_type}", + f"threads == {config['threads']}", + f"thread_m_blocks == {config['thread_m_blocks']}", + f"thread_n_blocks == {config['thread_n_blocks']}", + 
f"thread_k_blocks == {config['thread_k_blocks']}", + f"m_block_size_8 == {config['m_block_size_8']}", + f"stages == {config['stages']}", + f"group_blocks == {config['group_blocks']}", + f"is_zp_float == {config['is_zp_float']}", + ] + conditions = " && ".join(conditions) - filename = filename.lower() + if kernel_selector_str == FILE_HEAD_COMMENT: + kernel_selector_str += f"if ({conditions})\n kernel = " + else: + kernel_selector_str += f"else if ({conditions})\n kernel = " - with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: - f.write(file_content) + kernel_template2 = ( + "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, " + "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, " + "{{thread_n_blocks}}, {{thread_k_blocks}}, " + "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, " + "{{is_zp_float}}>;" + ) + + kernel_selector_str += ( + jinja2.Template(kernel_template2).render( + a_type_id=f"vllm::{a_type}.id()", + b_type_id=f"vllm::{b_type}.id()", + c_type_id=f"vllm::{c_type}.id()", + s_type_id=f"vllm::{s_type}.id()", + **config, + ) + + "\n" + ) + + file_content = FILE_HEAD + "\n\n" + file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" + if a_type == "kFE4M3fn": + filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" + elif result_dict_tmp is sm_75_result_dict: + filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" + else: + filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" + + filename = filename.lower() + + with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: + f.write(file_content) if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT: kernel_selector_str += ( diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h index 5b6b2456b4111..138197b76f026 100644 --- a/csrc/moe/marlin_moe_wna16/marlin_template.h +++ b/csrc/moe/marlin_moe_wna16/marlin_template.h @@ -26,6 +26,7 @@ #include 
"quantization/gptq_marlin/marlin.cuh" #include "quantization/gptq_marlin/marlin_dtypes.cuh" #include "quantization/gptq_marlin/dequant.h" +#include "quantization/gptq_marlin/marlin_mma.h" #include "core/scalar_type.hpp" #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ @@ -35,7 +36,7 @@ namespace MARLIN_NAMESPACE_NAME { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 template -__device__ inline void mma( - const typename MarlinScalarType::FragA& a_frag, - const typename MarlinScalarType::FragB& frag_b, - typename MarlinScalarType::FragC& frag_c, int idx = 0) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - using scalar_t = typename MarlinScalarType::scalar_t; - if constexpr (k_size == 16) { - if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]), - "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - 
int32_t* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]), - "r"(c[1]), "r"(c[2]), "r"(c[3])); - } - } else if (k_size == 32) { - if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - int32_t* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3])); - } - } -} - -template -__device__ inline void mma_trans( - const typename MarlinScalarType::FragA& a_frag, - const typename MarlinScalarType::FragB& frag_b, - const typename MarlinScalarType::FragB& frag_b2, - typename MarlinScalarType::FragC& frag_c) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - const uint32_t* b2 = reinterpret_cast(&frag_b2); - float* c = reinterpret_cast(&frag_c); - using scalar_t = typename MarlinScalarType::scalar_t; - if constexpr (k_size == 16) { - if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), 
"r"(a[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]), - "f"(c[3])); - } else if constexpr (std::is_same::value) { - int32_t* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]), - "r"(c[3])); - } - } else { - if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1200 - asm volatile( - "mma.sync.aligned.kind::f8f6f4.m16n8k32.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - #else - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - #endif - } else if constexpr (std::is_same::value) { - 
int32_t* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), - "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3])); - } - } -} - // Instruction for loading a full 16x16 matrix fragment of operand A from shared // memory, directly in tensor core layout. template @@ -439,9 +300,20 @@ __global__ void Marlin( if constexpr (a_type_id == vllm::kFE4M3fn.id()) return; #endif + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + // Turing TensorCore only supports fp16 and int8 + if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id()) + return; + #endif + int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks); + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id(); + #else + constexpr bool use_fp16_accum = false; + #endif using Adtype = MarlinScalarType; using Cdtype = MarlinScalarType; @@ -618,7 +490,22 @@ __global__ void Marlin( } } + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + + if constexpr (moe_block_size >= 16) + local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 16); + if constexpr (moe_block_size >= 8) + local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 8); + if constexpr (moe_block_size >= 4) + local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 4); + if constexpr (moe_block_size >= 2) + local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 2); + + local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 1); + block_num_valid_tokens = local_count; + #else block_num_valid_tokens = __reduce_add_sync(0xffffffff, local_count); + #endif if (lane_id == 0) reinterpret_cast(sh_new)[0] = block_num_valid_tokens; @@ -1018,10 +905,6 @@ __global__ 
void Marlin( constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride) : (stages * s_sh_stage); int4* sh_s = sh_zp + (stages * zp_sh_stage); - // shared memory reused by reduction should be smaller than - // shared memory used by weight. - static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <= - stages * b_sh_stage); int4* sh_a = sh_s + sh_s_size; // Register storage for double buffer of shared memory reads. @@ -1545,11 +1428,13 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { if constexpr (m_block_size_8) { - mma_trans(frag_a[k2][i], frag_b0, frag_b1, - frag_c[i][j][0]); + mma_trans(frag_a[k2][i], frag_b0, frag_b1, + frag_c[i][j][0]); } else { - mma(frag_a[k2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k2][i], frag_b1, frag_c[i][j][1]); + mma(frag_a[k2][i], frag_b0, + frag_c[i][j][0]); + mma(frag_a[k2][i], frag_b1, + frag_c[i][j][1]); } } } @@ -1583,10 +1468,12 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k2][i], frag_b[0], - (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]); - mma(frag_a[k2][i], frag_b[1], - (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]); + mma( + frag_a[k2][i], frag_b[0], + (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]); + mma( + frag_a[k2][i], frag_b[1], + (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]); } if constexpr (group_blocks != -1) { @@ -2132,6 +2019,21 @@ __global__ void Marlin( // While this pattern may not be the most readable, other ways of writing // the loop seemed to noticeably worse performance after compilation. if (slice_iters == 0) { + // convert fp16 accum to fp32 for reduction + if constexpr (use_fp16_accum) { + #pragma unroll + for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 
2 : 4) * 2); i++) { + float* frag_c_part_float = reinterpret_cast(frag_c) + i * 4; + scalar_t* frag_c_part_half = + reinterpret_cast(frag_c_part_float); + + #pragma unroll + for (int i = 3; i >= 0; i--) { + frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]); + } + } + } + if constexpr (is_a_8bit) { float frag_a_s[2 * thread_m_blocks]; diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu index 4fd8fc5c54202..8ac1691220a6b 100644 --- a/csrc/moe/marlin_moe_wna16/ops.cu +++ b/csrc/moe/marlin_moe_wna16/ops.cu @@ -142,7 +142,7 @@ typedef struct { int get_scales_cache_size(thread_config_t const& th_config, int prob_m, int prob_n, int prob_k, int num_bits, int group_size, - bool has_act_order, bool is_k_full) { + bool has_act_order, bool is_k_full, int stages) { bool cache_scales_chunk = has_act_order && !is_k_full; int tb_n = th_config.thread_n; @@ -160,13 +160,13 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m, if (cache_scales_chunk) { int load_groups = - tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K + tb_groups * stages * 2; // Chunk size is 2x pipeline over dim K load_groups = max(load_groups, 32); // We load at least 32 scale groups return load_groups * tb_n * 2; } else { int tb_scales = tb_groups * tb_n * 2; - return tb_scales * pipe_stages; + return tb_scales * stages; } } @@ -174,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, int thread_m_blocks, int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, int has_zp, - int is_zp_float, bool is_a_8bit) { + int is_zp_float, bool is_a_8bit, int stages) { int pack_factor = 32 / num_bits; // Get B size @@ -185,8 +185,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32) 
int sh_block_meta_size = tb_m * 16; - int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2); - int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; + int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2); + int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4; int sh_red_size = tb_m * (tb_n + 8) * 2; int sh_bias_size = tb_n * 2; int tmp_size = @@ -195,8 +195,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, int sh_s_size = get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, - group_size, has_act_order, is_k_full); - int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0; + group_size, has_act_order, is_k_full, stages); + int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0; int sh_zp_size = 0; if (has_zp) { if (is_zp_float) @@ -217,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8, int thread_m_blocks, int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, int has_zp, int is_zp_float, - int max_shared_mem, bool is_a_8bit) { + bool is_a_8bit, int stages, int max_shared_mem) { // Sanity if (th_config.thread_k == -1 || th_config.thread_n == -1 || th_config.num_threads == -1) { @@ -243,7 +243,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8, int cache_size = get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, - is_k_full, has_zp, is_zp_float, is_a_8bit); + is_k_full, has_zp, is_zp_float, is_a_8bit, stages); return cache_size <= max_shared_mem; } @@ -252,7 +252,7 @@ MarlinFuncPtr get_marlin_kernel( const vllm::ScalarType c_type, const vllm::ScalarType s_type, int thread_m_blocks, int thread_n_blocks, int thread_k_blocks, bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks, - int threads, bool is_zp_float) { + int threads, bool is_zp_float, 
int stages) { int num_bits = b_type.size_bits(); auto kernel = MarlinDefault; @@ -266,8 +266,8 @@ exec_config_t determine_exec_config( const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m, int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks, bool m_block_size_8, int num_bits, int group_size, bool has_act_order, - bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms, - bool is_a_8bit) { + bool is_k_full, bool has_zp, bool is_zp_float, bool is_a_8bit, int stages, + int max_shared_mem, int sms) { exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; thread_config_t* thread_configs = thread_m_blocks > 1 ? large_batch_thread_configs @@ -284,15 +284,15 @@ exec_config_t determine_exec_config( if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, - is_k_full, has_zp, is_zp_float, max_shared_mem - 512, - is_a_8bit)) { + is_k_full, has_zp, is_zp_float, is_a_8bit, stages, + max_shared_mem - 512)) { continue; } int cache_size = get_kernel_cache_size( th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, - is_a_8bit); + is_a_8bit, stages); int group_blocks = 0; if (!has_act_order) { @@ -303,7 +303,7 @@ exec_config_t determine_exec_config( get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks, th_config.thread_n / 16, th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, group_blocks, - th_config.num_threads, is_zp_float); + th_config.num_threads, is_zp_float, stages); if (kernel == MarlinDefault) continue; @@ -433,8 +433,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, dev); cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, dev); - TORCH_CHECK(major_capability * 10 + minor_capability >= 80, - "marlin kernel only support Ampere or newer GPUs."); + 
TORCH_CHECK(major_capability * 10 + minor_capability >= 75, + "marlin kernel only support Turing or newer GPUs."); + int stages = 4; + if (major_capability == 7 && minor_capability == 5) { + stages = 2; + TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8, + "Turing only support FP16 or INT8 activation."); + } if (a_type == vllm::kFE4M3fn) { TORCH_CHECK(major_capability * 10 + minor_capability >= 89, "FP8 only support Ada Lovelace or newer GPUs."); @@ -461,8 +467,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, exec_cfg = determine_exec_config( a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts, top_k, thread_m_blocks, m_block_size_8, num_bits, group_size, - has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms, - is_a_8bit); + has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages, + max_shared_mem, sms); thread_tfg = exec_cfg.tb_cfg; } @@ -479,7 +485,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, - max_shared_mem, is_a_8bit), + is_a_8bit, stages, max_shared_mem), "Invalid thread config: thread_m_blocks = ", thread_m_blocks, ", thread_k = ", thread_tfg.thread_k, ", thread_n = ", thread_tfg.thread_n, @@ -493,12 +499,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, int sh_cache_size = get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, - is_k_full, has_zp, is_zp_float, is_a_8bit); + is_k_full, has_zp, is_zp_float, is_a_8bit, stages); auto kernel = get_marlin_kernel( a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks, - num_threads, is_zp_float); + num_threads, is_zp_float, stages); if 
(kernel == MarlinDefault) { TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, diff --git a/csrc/quantization/gptq_marlin/.gitignore b/csrc/quantization/gptq_marlin/.gitignore index ba805f9250ece..7dc482a894660 100644 --- a/csrc/quantization/gptq_marlin/.gitignore +++ b/csrc/quantization/gptq_marlin/.gitignore @@ -1,2 +1,3 @@ sm*_kernel_*.cu kernel_selector.h +kernel_*.cu diff --git a/csrc/quantization/gptq_marlin/dequant.h b/csrc/quantization/gptq_marlin/dequant.h index 26b8d40368aa9..edd97dbfcd8e5 100644 --- a/csrc/quantization/gptq_marlin/dequant.h +++ b/csrc/quantization/gptq_marlin/dequant.h @@ -67,7 +67,7 @@ where `scale_factor * multiplier` can be computed at weight loading. namespace MARLIN_NAMESPACE_NAME { -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 750 // Lookup-table based 3-input logical operation; explicitly used for // dequantization as the compiler does not seem to automatically recognize it in // all cases. diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py index 27ef7271ba41c..24866fc5cd546 100644 --- a/csrc/quantization/gptq_marlin/generate_kernels.py +++ b/csrc/quantization/gptq_marlin/generate_kernels.py @@ -10,6 +10,8 @@ import jinja2 ARCHS = [] SUPPORT_FP8 = False +SUPPORT_SM75 = False +SUPPORT_SM80 = False for arch in sys.argv[1].split(","): arch = arch[: arch.index(".") + 2].replace(".", "") arch = int(arch) @@ -19,6 +21,10 @@ for arch in sys.argv[1].split(","): # with FP16 MMA, so it cannot achieve any acceleration. 
if arch in [89, 120]: SUPPORT_FP8 = True + if arch >= 80: + SUPPORT_SM80 = True + if arch == 75: + SUPPORT_SM75 = True FILE_HEAD_COMMENT = """ // auto generated by generate_kernels.py @@ -166,6 +172,7 @@ def remove_old_kernels(): def generate_new_kernels(): result_dict = {} + sm_75_result_dict = {} for quant_config in QUANT_CONFIGS: c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"]) @@ -184,6 +191,8 @@ def generate_new_kernels(): s_type = quant_config.get("s_type", c_type) if (a_type, b_type, c_type) not in result_dict: result_dict[(a_type, b_type, c_type)] = [] + if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16": + sm_75_result_dict[(a_type, b_type, c_type)] = [] for group_blocks, m_blocks, thread_configs in itertools.product( all_group_blocks, all_m_blocks, all_thread_configs @@ -207,78 +216,89 @@ def generate_new_kernels(): "thread_k_blocks": thread_k // 16, "thread_n_blocks": thread_n // 16, "m_block_size_8": "true" if m_blocks == 0.5 else "false", - "stages": "pipe_stages", + "stages": 4, "group_blocks": group_blocks, "is_zp_float": "true" if is_zp_float else "false", } - result_dict[(a_type, b_type, c_type)].append(config) + if SUPPORT_SM80: + result_dict[(a_type, b_type, c_type)].append(config) + if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75: + config_sm75 = config.copy() + config_sm75["stages"] = 2 + sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75) kernel_selector_str = FILE_HEAD_COMMENT - for (a_type, b_type, c_type), config_list in result_dict.items(): - all_template_str_list = [] - for config in config_list: - s_type = config["s_type"] - template_str = jinja2.Template(TEMPLATE).render( - a_type_id=f"vllm::{a_type}.id()", - b_type_id=f"vllm::{b_type}.id()", - c_type_id=f"vllm::{c_type}.id()", - s_type_id=f"vllm::{s_type}.id()", - **config, - ) - all_template_str_list.append(template_str) - - conditions = [ - f"a_type == vllm::{a_type}", - f"b_type == vllm::{b_type}", - f"c_type == vllm::{c_type}", - 
f"s_type == vllm::{s_type}", - f"threads == {config['threads']}", - f"thread_m_blocks == {config['thread_m_blocks']}", - f"thread_n_blocks == {config['thread_n_blocks']}", - f"thread_k_blocks == {config['thread_k_blocks']}", - f"m_block_size_8 == {config['m_block_size_8']}", - f"group_blocks == {config['group_blocks']}", - f"is_zp_float == {config['is_zp_float']}", - ] - conditions = " && ".join(conditions) - - if kernel_selector_str == FILE_HEAD_COMMENT: - kernel_selector_str += f"if ({conditions})\n kernel = " - else: - kernel_selector_str += f"else if ({conditions})\n kernel = " - - kernel_template2 = ( - "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, " - "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, " - "{{thread_n_blocks}}, {{thread_k_blocks}}, " - "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, " - "{{is_zp_float}}>;" - ) - - kernel_selector_str += ( - jinja2.Template(kernel_template2).render( + for result_dict_tmp in [result_dict, sm_75_result_dict]: + for (a_type, b_type, c_type), config_list in result_dict_tmp.items(): + all_template_str_list = [] + if not config_list: + continue + for config in config_list: + s_type = config["s_type"] + template_str = jinja2.Template(TEMPLATE).render( a_type_id=f"vllm::{a_type}.id()", b_type_id=f"vllm::{b_type}.id()", c_type_id=f"vllm::{c_type}.id()", s_type_id=f"vllm::{s_type}.id()", **config, ) - + "\n" - ) + all_template_str_list.append(template_str) - file_content = FILE_HEAD + "\n\n" - file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" - if a_type == "kFE4M3fn": - filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" - else: - filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" + conditions = [ + f"a_type == vllm::{a_type}", + f"b_type == vllm::{b_type}", + f"c_type == vllm::{c_type}", + f"s_type == vllm::{s_type}", + f"threads == {config['threads']}", + f"thread_m_blocks == {config['thread_m_blocks']}", + f"thread_n_blocks == {config['thread_n_blocks']}", + 
f"thread_k_blocks == {config['thread_k_blocks']}", + f"m_block_size_8 == {config['m_block_size_8']}", + f"stages == {config['stages']}", + f"group_blocks == {config['group_blocks']}", + f"is_zp_float == {config['is_zp_float']}", + ] + conditions = " && ".join(conditions) - filename = filename.lower() + if kernel_selector_str == FILE_HEAD_COMMENT: + kernel_selector_str += f"if ({conditions})\n kernel = " + else: + kernel_selector_str += f"else if ({conditions})\n kernel = " - with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: - f.write(file_content) + kernel_template2 = ( + "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, " + "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, " + "{{thread_n_blocks}}, {{thread_k_blocks}}, " + "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, " + "{{is_zp_float}}>;" + ) + + kernel_selector_str += ( + jinja2.Template(kernel_template2).render( + a_type_id=f"vllm::{a_type}.id()", + b_type_id=f"vllm::{b_type}.id()", + c_type_id=f"vllm::{c_type}.id()", + s_type_id=f"vllm::{s_type}.id()", + **config, + ) + + "\n" + ) + + file_content = FILE_HEAD + "\n\n" + file_content += "\n\n".join(all_template_str_list) + "\n\n}\n" + if a_type == "kFE4M3fn": + filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" + elif result_dict_tmp is sm_75_result_dict: + filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" + else: + filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu" + + filename = filename.lower() + + with open(os.path.join(os.path.dirname(__file__), filename), "w") as f: + f.write(file_content) if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT: kernel_selector_str += ( diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 28ff06559a98a..77f319d53bc52 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -37,7 +37,7 @@ __global__ void 
MarlinDefault(MARLIN_KERNEL_PARAMS){}; using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr, @@ -148,7 +148,7 @@ typedef struct { int get_scales_cache_size(thread_config_t const& th_config, int prob_m, int prob_n, int prob_k, int num_bits, int group_size, - bool has_act_order, bool is_k_full) { + bool has_act_order, bool is_k_full, int stages) { bool cache_scales_chunk = has_act_order && !is_k_full; int tb_n = th_config.thread_n; @@ -166,28 +166,29 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m, if (cache_scales_chunk) { int load_groups = - tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K + tb_groups * stages * 2; // Chunk size is 2x pipeline over dim K load_groups = max(load_groups, 32); // We load at least 32 scale groups return load_groups * tb_n * 2; } else { int tb_scales = tb_groups * tb_n * 2; - return tb_scales * pipe_stages; + return tb_scales * stages; } } int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks, int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, - int has_zp, int is_zp_float) { + int has_zp, bool is_zp_float, bool is_a_8bit, + int stages) { int pack_factor = 32 / num_bits; // Get B size int tb_k = th_config.thread_k; int tb_n = th_config.thread_n; int tb_m = thread_m_blocks * 16; - int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; - int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; + int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 
1 : 2); + int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4; int sh_red_size = tb_m * (tb_n + 8) * 2; int sh_bias_size = tb_n * 2; int tmp_size = @@ -196,8 +197,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks, int sh_s_size = get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, - group_size, has_act_order, is_k_full); - int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0; + group_size, has_act_order, is_k_full, stages); + int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0; int sh_zp_size = 0; if (has_zp) { if (is_zp_float) @@ -217,7 +218,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks, bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks, int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, - int has_zp, int is_zp_float, int max_shared_mem) { + int has_zp, bool is_zp_float, bool is_a_8bit, int stages, + int max_shared_mem) { // Sanity if (th_config.thread_k == -1 || th_config.thread_n == -1 || th_config.num_threads == -1) { @@ -242,7 +244,7 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks, // Check that pipeline fits into cache int cache_size = get_kernel_cache_size( th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, - has_act_order, is_k_full, has_zp, is_zp_float); + has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages); return cache_size <= max_shared_mem; } @@ -251,7 +253,7 @@ MarlinFuncPtr get_marlin_kernel( const vllm::ScalarType c_type, const vllm::ScalarType s_type, int thread_m_blocks, int thread_n_blocks, int thread_k_blocks, bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks, - int threads, bool is_zp_float) { + int threads, bool is_zp_float, int stages) { int num_bits = b_type.size_bits(); auto kernel = MarlinDefault; @@ -265,7 +267,8 @@ exec_config_t 
determine_exec_config( const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m, int prob_n, int prob_k, int thread_m_blocks, bool m_block_size_8, int num_bits, int group_size, bool has_act_order, bool is_k_full, - bool has_zp, bool is_zp_float, int max_shared_mem, int sms) { + bool has_zp, bool is_zp_float, int is_a_8bit, int stages, + int max_shared_mem, int sms) { exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; thread_config_t* thread_configs = thread_m_blocks > 1 ? large_batch_thread_configs @@ -280,13 +283,15 @@ exec_config_t determine_exec_config( if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full, has_zp, - is_zp_float, max_shared_mem - 512)) { + is_zp_float, is_a_8bit, stages, + max_shared_mem - 512)) { continue; } - int cache_size = get_kernel_cache_size( - th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, - group_size, has_act_order, is_k_full, has_zp, is_zp_float); + int cache_size = get_kernel_cache_size(th_config, thread_m_blocks, prob_m, + prob_n, prob_k, num_bits, group_size, + has_act_order, is_k_full, has_zp, + is_zp_float, is_a_8bit, stages); int group_blocks = 0; if (!has_act_order) { @@ -297,14 +302,10 @@ exec_config_t determine_exec_config( get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks, th_config.thread_n / 16, th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp, group_blocks, - th_config.num_threads, is_zp_float); + th_config.num_threads, is_zp_float, stages); if (kernel == MarlinDefault) continue; - // int m_tiles = div_ceil(prob_m, thread_m_blocks * 16); - // int n_tiles = prob_n / th_config.thread_n; - // int k_tiles = prob_k / th_config.thread_k; - return {1, th_config}; } @@ -321,6 +322,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, int group_size, int dev, cudaStream_t stream, int thread_k_init, int thread_n_init, int sms, bool use_atomic_add, bool 
use_fp32_reduce, bool is_zp_float) { + bool is_a_8bit = a_type.size_bits() == 8; TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]"); @@ -389,8 +391,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, dev); cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, dev); - TORCH_CHECK(major_capability * 10 + minor_capability >= 80, - "marlin kernel only support Ampere or newer GPUs."); + TORCH_CHECK(major_capability * 10 + minor_capability >= 75, + "marlin kernel only support Turing or newer GPUs."); + int stages = 4; + if (major_capability == 7 && minor_capability == 5) { + stages = 2; + TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8, + "Turing only support FP16 or INT8 activation."); + } if (a_type == vllm::kFE4M3fn) { TORCH_CHECK( major_capability * 10 + minor_capability == 89 || @@ -431,7 +439,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, exec_cfg = determine_exec_config( a_type, b_type, c_type, s_type, prob_m_split, prob_n, prob_k, thread_m_blocks, m_block_size_8, num_bits, group_size, has_act_order, - is_k_full, has_zp, is_zp_float, max_shared_mem, sms); + is_k_full, has_zp, is_zp_float, is_a_8bit, stages, max_shared_mem, + sms); thread_tfg = exec_cfg.tb_cfg; if (thread_tfg.thread_n != -1) { if (prob_n / thread_tfg.thread_n * @@ -440,7 +449,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, if (is_valid_config({128, 64, 128}, thread_m_blocks, prob_m_split, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float, - max_shared_mem_new)) { + is_a_8bit, stages, max_shared_mem_new)) { thread_tfg = {128, 64, 128}; exec_cfg = {1, thread_tfg}; } @@ -466,7 +475,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, TORCH_CHECK( is_valid_config(thread_tfg, thread_m_blocks, prob_m_split, prob_n, prob_k, num_bits, 
group_size, has_act_order, is_k_full, - has_zp, is_zp_float, max_shared_mem_new), + has_zp, is_zp_float, is_a_8bit, stages, + max_shared_mem_new), "Invalid thread config: thread_m_blocks = ", thread_m_blocks, ", thread_k = ", thread_tfg.thread_k, ", thread_n = ", thread_tfg.thread_n, @@ -475,12 +485,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, ", prob_m_split = ", prob_m_split, ", group_size = ", group_size, ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full, ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float, - ", max_shared_mem_new = ", max_shared_mem_new); + ", stages = ", stages, ", max_shared_mem_new = ", max_shared_mem_new); auto kernel = get_marlin_kernel( a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks, - num_threads, is_zp_float); + num_threads, is_zp_float, stages); if (kernel == MarlinDefault) { TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, diff --git a/csrc/quantization/gptq_marlin/marlin.cuh b/csrc/quantization/gptq_marlin/marlin.cuh index 2505e221322dd..33fe52f605b42 100644 --- a/csrc/quantization/gptq_marlin/marlin.cuh +++ b/csrc/quantization/gptq_marlin/marlin.cuh @@ -1,17 +1,19 @@ #pragma once -#include +#ifndef _marlin_cuh + #define _marlin_cuh + #include -#include -#include -#include -#include -#include -#include + #include + #include + #include + #include + #include + #include -#ifndef MARLIN_NAMESPACE_NAME - #define MARLIN_NAMESPACE_NAME marlin -#endif + #ifndef MARLIN_NAMESPACE_NAME + #define MARLIN_NAMESPACE_NAME marlin + #endif namespace MARLIN_NAMESPACE_NAME { @@ -51,9 +53,51 @@ using I4 = Vec; constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 -// No support for async -#else + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +__device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr, 
+ bool pred = true) { + if (pred) { + reinterpret_cast(smem_ptr)[0] = + reinterpret_cast(glob_ptr)[0]; + } +} + +__device__ inline void cp_async2_ca_pred(void* smem_ptr, const void* glob_ptr, + bool pred = true) { + if (pred) { + reinterpret_cast(smem_ptr)[0] = + reinterpret_cast(glob_ptr)[0]; + } +} + +__device__ inline void cp_async4_ca_pred(void* smem_ptr, const void* glob_ptr, + bool pred = true) { + if (pred) { + reinterpret_cast(smem_ptr)[0] = + reinterpret_cast(glob_ptr)[0]; + } +} + +__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, + bool pred = true) { + if (pred) { + reinterpret_cast(smem_ptr)[0] = + reinterpret_cast(glob_ptr)[0]; + } +} + +__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { + reinterpret_cast(smem_ptr)[0] = + reinterpret_cast(glob_ptr)[0]; +} + +__device__ inline void cp_async_fence() {} + +template +__device__ inline void cp_async_wait() {} + + #else __device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) { @@ -126,6 +170,8 @@ __device__ inline void cp_async_wait() { asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); } -#endif + #endif } // namespace MARLIN_NAMESPACE_NAME + +#endif \ No newline at end of file diff --git a/csrc/quantization/gptq_marlin/marlin_mma.h b/csrc/quantization/gptq_marlin/marlin_mma.h new file mode 100644 index 0000000000000..6ec2aaafc4392 --- /dev/null +++ b/csrc/quantization/gptq_marlin/marlin_mma.h @@ -0,0 +1,269 @@ + +#include "marlin_dtypes.cuh" + +namespace MARLIN_NAMESPACE_NAME { + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. 
+template +__device__ inline void mma( + const typename MarlinScalarType::FragA& a_frag, + const typename MarlinScalarType::FragB& frag_b, + typename MarlinScalarType::FragC& frag_c, int idx = 0) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + using scalar_t = typename MarlinScalarType::scalar_t; + if constexpr (!std::is_same::value || k_size != 16) { + static_assert(!use_fp16_accum); + } + + if constexpr (k_size == 16) { + if constexpr (std::is_same::value && !use_fp16_accum) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(b[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[2]), "r"(a[3]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); +#else + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); +#endif + } else if constexpr (std::is_same::value && + use_fp16_accum) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + uint32_t* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " + "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(a[0]), "r"(a[1]), "r"(b[0]), "r"(c[0]), "r"(c[1])); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " + "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" + : "=r"(c[0]), "=r"(c[1]) + : 
"r"(a[2]), "r"(a[3]), "r"(b[1]), "r"(c[0]), "r"(c[1])); +#else + uint32_t* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " + "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "r"(c[0]), "r"(c[1])); +#endif + } else if constexpr (std::is_same::value) { + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]), + "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + int32_t* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) + : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]), + "r"(c[1]), "r"(c[2]), "r"(c[3])); + } + } else if (k_size == 32) { + if constexpr (std::is_same::value) { + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + int32_t* c = reinterpret_cast(&frag_c); +#if 
defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(a[0]), "r"(b[0]), "r"(c[0]), "r"(c[1])); + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(c[2]), "=r"(c[3]) + : "r"(a[1]), "r"(b[0]), "r"(c[2]), "r"(c[3])); + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(a[2]), "r"(b[1]), "r"(c[0]), "r"(c[1])); + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(c[2]), "=r"(c[3]) + : "r"(a[3]), "r"(b[1]), "r"(c[2]), "r"(c[3])); +#else + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), + "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3])); +#endif + } + } +} + +template +__device__ inline void mma_trans( + const typename MarlinScalarType::FragA& a_frag, + const typename MarlinScalarType::FragB& frag_b, + const typename MarlinScalarType::FragB& frag_b2, + typename MarlinScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + const uint32_t* b2 = reinterpret_cast(&frag_b2); + float* c = reinterpret_cast(&frag_c); + using scalar_t = typename MarlinScalarType::scalar_t; + if constexpr (!std::is_same::value || k_size != 16) { + static_assert(!use_fp16_accum); + } + + if constexpr (k_size == 16) { + if constexpr (std::is_same::value && !use_fp16_accum) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, 
{%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[1]), "r"(b2[1]), "r"(a[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); +#else + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); +#endif + } else if constexpr (std::is_same::value && + use_fp16_accum) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + uint32_t* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " + "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1])); + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " + "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(b[1]), "r"(b2[1]), "r"(a[1]), "r"(c[0]), "r"(c[1])); +#else + uint32_t* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 " + "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "r"(c[0]), "r"(c[1])); +#endif + } else if constexpr (std::is_same::value) { + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + 
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); + } else if constexpr (std::is_same::value) { + int32_t* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]), + "r"(c[3])); + } + } else { + if constexpr (std::is_same::value) { + float* c = reinterpret_cast(&frag_c); + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); + } else if constexpr (std::is_same::value) { + int32_t* c = reinterpret_cast(&frag_c); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(b[0]), "r"(a[0]), "r"(c[0]), "r"(c[1])); + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(c[2]), "=r"(c[3]) + : "r"(b2[1]), "r"(a[0]), "r"(c[2]), "r"(c[3])); + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(c[0]), "=r"(c[1]) + : "r"(b[0]), "r"(a[1]), "r"(c[0]), "r"(c[1])); + asm volatile( + "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(c[2]), "=r"(c[3]) 
+ : "r"(b2[1]), "r"(a[1]), "r"(c[2]), "r"(c[3])); +#else + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), + "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3])); +#endif + } + } +} + +} // namespace MARLIN_NAMESPACE_NAME \ No newline at end of file diff --git a/csrc/quantization/gptq_marlin/marlin_template.h b/csrc/quantization/gptq_marlin/marlin_template.h index 22bb71e482ce8..c7b53696c1223 100644 --- a/csrc/quantization/gptq_marlin/marlin_template.h +++ b/csrc/quantization/gptq_marlin/marlin_template.h @@ -26,6 +26,7 @@ #include "marlin.cuh" #include "marlin_dtypes.cuh" #include "dequant.h" +#include "marlin_mma.h" #include "core/scalar_type.hpp" #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ @@ -35,7 +36,7 @@ namespace MARLIN_NAMESPACE_NAME { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 template -__device__ inline void mma( - const typename MarlinScalarType::FragA& a_frag, - const typename MarlinScalarType::FragB& frag_b, - typename MarlinScalarType::FragC& frag_c, int idx = 0) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - using scalar_t = typename MarlinScalarType::scalar_t; - if constexpr (k_size == 16) { - if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " - 
"{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]), - "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - int32_t* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]), - "r"(c[1]), "r"(c[2]), "r"(c[3])); - } - } else if (k_size == 32) { - if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - int32_t* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), - "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3])); - } - } -} - -template -__device__ inline void mma_trans( - const typename MarlinScalarType::FragA& a_frag, - const typename MarlinScalarType::FragB& frag_b, - const typename MarlinScalarType::FragB& frag_b2, - 
typename MarlinScalarType::FragC& frag_c) { - const uint32_t* a = reinterpret_cast(&a_frag); - const uint32_t* b = reinterpret_cast(&frag_b); - const uint32_t* b2 = reinterpret_cast(&frag_b2); - float* c = reinterpret_cast(&frag_c); - using scalar_t = typename MarlinScalarType::scalar_t; - if constexpr (k_size == 16) { - if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]), - "f"(c[3])); - } else if constexpr (std::is_same::value) { - int32_t* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite " - "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]), - "r"(c[3])); - } - } else { - if constexpr (std::is_same::value) { - float* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, 
{%8,%9}, {%10,%11,%12,%13};\n" - : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), - "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); - } else if constexpr (std::is_same::value) { - int32_t* c = reinterpret_cast(&frag_c); - asm volatile( - "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite " - "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" - : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) - : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), - "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3])); - } - } -} - // Instruction for loading a full 16x16 matrix fragment of operand A from shared // memory, directly in tensor core layout. template @@ -415,6 +285,17 @@ __global__ void Marlin( if constexpr (a_type_id == vllm::kFE4M3fn.id()) return; #endif + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + // Turing TensorCore only supports fp16 and int8 + if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id()) + return; + #endif + + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 + constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id(); + #else + constexpr bool use_fp16_accum = false; + #endif using Adtype = MarlinScalarType; using Cdtype = MarlinScalarType; const int4* A = A0; @@ -873,10 +754,6 @@ __global__ void Marlin( constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride) : (stages * s_sh_stage); int4* sh_s = sh_zp + (stages * zp_sh_stage); - // shared memory reused by reduction should be smaller than - // shared memory used by weight. - static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <= - stages * b_sh_stage); int4* sh_a = sh_s + sh_s_size; // Register storage for double buffer of shared memory reads. 
@@ -1395,11 +1272,13 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { if constexpr (m_block_size_8) { - mma_trans(frag_a[k2][i], frag_b0, frag_b1, - frag_c[i][j][0]); + mma_trans(frag_a[k2][i], frag_b0, frag_b1, + frag_c[i][j][0]); } else { - mma(frag_a[k2][i], frag_b0, frag_c[i][j][0]); - mma(frag_a[k2][i], frag_b1, frag_c[i][j][1]); + mma(frag_a[k2][i], frag_b0, + frag_c[i][j][0]); + mma(frag_a[k2][i], frag_b1, + frag_c[i][j][1]); } } } @@ -1433,10 +1312,12 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < thread_m_blocks; i++) { - mma(frag_a[k2][i], frag_b[0], - (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]); - mma(frag_a[k2][i], frag_b[1], - (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]); + mma( + frag_a[k2][i], frag_b[0], + (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]); + mma( + frag_a[k2][i], frag_b[1], + (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]); } if constexpr (group_blocks != -1) { @@ -1956,6 +1837,21 @@ __global__ void Marlin( // While this pattern may not be the most readable, other ways of writing // the loop seemed to noticeably worse performance after compilation. if (slice_iters == 0) { + // convert fp16 accum to fp32 for reduction + if constexpr (use_fp16_accum) { + #pragma unroll + for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 
2 : 4) * 2); i++) { + float* frag_c_part_float = reinterpret_cast(frag_c) + i * 4; + scalar_t* frag_c_part_half = + reinterpret_cast(frag_c_part_float); + + #pragma unroll + for (int i = 3; i >= 0; i--) { + frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]); + } + } + } + if constexpr (is_a_8bit) { float frag_a_s[2 * thread_m_blocks]; diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 3ed15ed7dd422..314848721a80a 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -121,7 +121,7 @@ class AWQMarlinConfig(QuantizationConfig): @classmethod def get_min_capability(cls) -> int: - return 80 + return 75 @classmethod def get_config_filenames(cls) -> list[str]: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f2b66a2beb6d7..800340ed6043c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -253,7 +253,7 @@ class Fp8Config(QuantizationConfig): @classmethod def get_min_capability(cls) -> int: - return 80 + return 75 @classmethod def get_config_filenames(cls) -> list[str]: diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 6e5dcfe59b2f9..347c7b2008d12 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -181,7 +181,7 @@ class GPTQMarlinConfig(QuantizationConfig): @classmethod def get_min_capability(cls) -> int: - return 80 + return 75 @classmethod def get_config_filenames(cls) -> list[str]: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index d5d7e7bfaae73..aa3937d4c03ff 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ 
b/vllm/model_executor/layers/quantization/modelopt.py @@ -871,7 +871,7 @@ class ModelOptNvFp4Config(ModelOptQuantConfigBase): @classmethod def get_min_capability(cls) -> int: - return 80 + return 75 @classmethod def override_quantization_method( From b6ec077e058e15e5b853793924e6643ec6c579aa Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 16 Dec 2025 17:47:53 -0500 Subject: [PATCH 09/49] [CI] Skip ci failure test (#30804) Signed-off-by: yewentao256 --- tests/compile/distributed/test_fusions_e2e.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index bd326f1157d8f..80086c4e03a9c 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -523,6 +523,8 @@ CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"] list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)), ) @pytest.mark.parametrize("inductor_graph_partition", [True, False]) +# TODO: remove skip after we fix the fusion thoroughly +@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell") def test_rms_group_quant( model_name: str, model_kwargs: dict[str, Any], @@ -562,7 +564,7 @@ def test_rms_group_quant( splitting_ops=splitting_ops, # Common mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig(eliminate_noops=True, enable_fusion=True), + pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True), # Inductor caches custom passes by default as well via uuid inductor_compile_config={"force_disable_caches": True}, ) From 0a1ab1e565fce5070bc1c1b1f3374537e437550c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 16 Dec 2025 17:56:02 -0500 Subject: [PATCH 10/49] [Perf][Kernels] Vectorize `csrc/activations_kernels.cu` (#29512) Signed-off-by: mgoin --- benchmarks/kernels/benchmark_activation.py | 4 +- csrc/activation_kernels.cu | 208 
+++++++++++++++++---- 2 files changed, 175 insertions(+), 37 deletions(-) diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py index 66268b71b3de6..d31e67057d8f6 100644 --- a/benchmarks/kernels/benchmark_activation.py +++ b/benchmarks/kernels/benchmark_activation.py @@ -13,8 +13,8 @@ from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE -batch_size_range = [1, 16, 32, 64, 128] -seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] +batch_size_range = [1, 16, 128] +seq_len_range = [1, 16, 64, 1024, 4096] intermediate_size = [3072, 9728, 12288] configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index a4a880f13cf7e..8268065ef02c8 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -15,19 +15,61 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x, const scalar_t& y) { return act_first ? ACT_FN(x) * y : x * ACT_FN(y); } -// Activation and gating kernel template. +// Check if all pointers are 16-byte aligned for int4 vectorized access +__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) { + return (reinterpret_cast(ptr) & 15) == 0; +} + +// Activation and gating kernel template. 
template __global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] const int d) { + constexpr int VEC_SIZE = 16 / sizeof(scalar_t); const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); - const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = compute(x, y); + const scalar_t* x_ptr = input + token_idx * 2 * d; + const scalar_t* y_ptr = x_ptr + d; + scalar_t* out_ptr = out + token_idx * d; + + // Check alignment for 128-bit vectorized access. + // All three pointers must be 16-byte aligned for safe int4 operations. + const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) && + is_16byte_aligned(out_ptr); + + if (aligned && d >= VEC_SIZE) { + // Fast path: 128-bit vectorized loop + const int4* x_vec = reinterpret_cast(x_ptr); + const int4* y_vec = reinterpret_cast(y_ptr); + int4* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / VEC_SIZE; + const int vec_end = num_vecs * VEC_SIZE; + + for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { + int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r; + auto* xp = reinterpret_cast(&x); + auto* yp = reinterpret_cast(&y); + auto* rp = reinterpret_cast(&r); +#pragma unroll + for (int j = 0; j < VEC_SIZE; j++) { + rp[j] = compute(xp[j], yp[j]); + } + out_vec[i] = r; + } + // Scalar cleanup for remaining elements + for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { + out_ptr[i] = compute(VLLM_LDG(&x_ptr[i]), + VLLM_LDG(&y_ptr[i])); + } + } else { + // Scalar fallback for unaligned data or small d + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&x_ptr[idx]); + const scalar_t y = VLLM_LDG(&y_ptr[idx]); + out_ptr[idx] = compute(x, y); + } } } @@ -120,50 +162,115 @@ template __global__ void act_and_mul_kernel_with_param( 
scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, const float param) { + constexpr int VEC_SIZE = 16 / sizeof(scalar_t); const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); - const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = ACT_FN(x, param) * y; + const scalar_t* x_ptr = input + token_idx * 2 * d; + const scalar_t* y_ptr = x_ptr + d; + scalar_t* out_ptr = out + token_idx * d; + + // Check alignment for 128-bit vectorized access + const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) && + is_16byte_aligned(out_ptr); + + if (aligned && d >= VEC_SIZE) { + // Fast path: 128-bit vectorized loop + const int4* x_vec = reinterpret_cast(x_ptr); + const int4* y_vec = reinterpret_cast(y_ptr); + int4* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / VEC_SIZE; + const int vec_end = num_vecs * VEC_SIZE; + + for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { + int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r; + auto* xp = reinterpret_cast(&x); + auto* yp = reinterpret_cast(&y); + auto* rp = reinterpret_cast(&r); +#pragma unroll + for (int j = 0; j < VEC_SIZE; j++) { + rp[j] = ACT_FN(xp[j], param) * yp[j]; + } + out_vec[i] = r; + } + // Scalar cleanup for remaining elements + for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { + out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]); + } + } else { + // Scalar fallback for unaligned data or small d + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&x_ptr[idx]); + const scalar_t y = VLLM_LDG(&y_ptr[idx]); + out_ptr[idx] = ACT_FN(x, param) * y; + } } } template __device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up, float alpha, float limit) { - // clamp gate: min=None, max=limit - const float gate_f = 
(float)gate; - const float clamped_gate = gate_f > limit ? limit : gate_f; - - // clamp up: min=-limit, max=limit - const float up_f = (float)up; - const float clamped_up = - up_f > limit ? limit : (up_f < -limit ? -limit : up_f); - - // glu = gate * sigmoid(gate * alpha) - const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); - const float glu = clamped_gate * sigmoid_val; - - // (up + 1) * glu - return (T)((clamped_up + 1.0f) * glu); + // Clamp gate to (-inf, limit] and up to [-limit, limit] + const float g = fminf((float)gate, limit); + const float u = fmaxf(fminf((float)up, limit), -limit); + // glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu + return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha))); } +// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...]. template __global__ void swigluoai_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., 2, d] + const scalar_t* __restrict__ input, // [..., 2 * d] (interleaved) const int d, const float alpha, const float limit) { + // For interleaved data: input has 2*d elements per token (gate/up pairs) + // output has d elements per token + constexpr int VEC_SIZE = 16 / sizeof(scalar_t); + constexpr int PAIRS = VEC_SIZE / 2; // Number of gate/up pairs per int4 load const int64_t token_idx = blockIdx.x; - // TODO: Vectorize loads and stores. - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - // gate = x[..., ::2] (even indices) - const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); - // up = x[..., 1::2] (odd indices) - const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); + const scalar_t* in_ptr = input + token_idx * 2 * d; + scalar_t* out_ptr = out + token_idx * d; - out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); + // Check alignment for 128-bit vectorized access on input. + // For output we use int2 (64-bit) which has 8-byte alignment requirement. 
+ const bool in_aligned = is_16byte_aligned(in_ptr); + const bool out_aligned = + (reinterpret_cast(out_ptr) & 7) == 0; // 8-byte for int2 + + if (in_aligned && out_aligned && d >= PAIRS) { + // Fast path: vectorized loop + // Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs + // Each int2 store writes PAIRS output elements + const int4* in_vec = reinterpret_cast(in_ptr); + int2* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / PAIRS; + const int vec_end = num_vecs * PAIRS; + + for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { + int4 v = VLLM_LDG(&in_vec[i]); + int2 r; + auto* vp = reinterpret_cast(&v); + auto* rp = reinterpret_cast(&r); +#pragma unroll + for (int j = 0; j < PAIRS; j++) { + rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit); + } + out_vec[i] = r; + } + // Scalar cleanup for remaining elements + for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { + out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]), + VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit); + } + } else { + // Scalar fallback for unaligned data or small d + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + // gate = x[..., ::2] (even indices) + const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]); + // up = x[..., 1::2] (odd indices) + const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]); + out_ptr[idx] = ACT_FN(gate, up, alpha, limit); + } } } @@ -217,10 +324,41 @@ __global__ void activation_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., d] const int d) { + constexpr int VEC_SIZE = 16 / sizeof(scalar_t); const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); - out[token_idx * d + idx] = ACT_FN(x); + const scalar_t* in_ptr = input + token_idx * d; + scalar_t* out_ptr = out + token_idx * d; + + // Check alignment for 128-bit vectorized access + const bool aligned = is_16byte_aligned(in_ptr) && 
is_16byte_aligned(out_ptr); + + if (aligned && d >= VEC_SIZE) { + // Fast path: 128-bit vectorized loop + const int4* in_vec = reinterpret_cast(in_ptr); + int4* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / VEC_SIZE; + const int vec_end = num_vecs * VEC_SIZE; + + for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { + int4 v = VLLM_LDG(&in_vec[i]), r; + auto* vp = reinterpret_cast(&v); + auto* rp = reinterpret_cast(&r); +#pragma unroll + for (int j = 0; j < VEC_SIZE; j++) { + rp[j] = ACT_FN(vp[j]); + } + out_vec[i] = r; + } + // Scalar cleanup for remaining elements + for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { + out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i])); + } + } else { + // Scalar fallback for unaligned data or small d + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&in_ptr[idx]); + out_ptr[idx] = ACT_FN(x); + } } } From 2410132bb1f9faa5b252fad3f2b83dc926946b08 Mon Sep 17 00:00:00 2001 From: TJian Date: Wed, 17 Dec 2025 07:32:43 +0800 Subject: [PATCH 11/49] [ROCm] [Bugfix] Fix torch sdpa hallucination (#30789) Signed-off-by: tjtanaa --- vllm/attention/ops/vit_attn_wrappers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index 46c7d83dfa5c2..892c4209c01e0 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -16,6 +16,7 @@ import einops import torch import torch.nn.functional as F +from vllm.platforms import current_platform from vllm.utils.torch_utils import direct_register_custom_op @@ -89,6 +90,13 @@ def torch_sdpa_wrapper( v: torch.Tensor, cu_seqlens: torch.Tensor, ) -> torch.Tensor: + # Never remove the contiguous logic for ROCm + # Without it, hallucinations occur with the backend + if current_platform.is_rocm(): + q = q.contiguous() + k = k.contiguous() + v = v.contiguous() + outputs = [] lens = (cu_seqlens[1:] - 
cu_seqlens[:-1]).tolist() From e80455ca8b696452b98d91785175210ed7a1bd41 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 16 Dec 2025 18:40:47 -0500 Subject: [PATCH 12/49] Replace deprecated enable_fusion with fuse_norm_quant in test_rms_group_quant (#30817) Signed-off-by: mgoin From e087fbc393055fb69e9acf71fa124be0190498ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 17 Dec 2025 00:54:45 +0100 Subject: [PATCH 13/49] [MM] Pass FA version in ViT Attn (#30756) Signed-off-by: NickLucche Co-authored-by: Cyrus Leung --- vllm/attention/layers/mm_encoder_attention.py | 6 ++++++ vllm/attention/ops/vit_attn_wrappers.py | 9 ++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/attention/layers/mm_encoder_attention.py index c9107ebcab856..8b3dee1340b9f 100644 --- a/vllm/attention/layers/mm_encoder_attention.py +++ b/vllm/attention/layers/mm_encoder_attention.py @@ -10,6 +10,7 @@ from vllm.attention.ops.vit_attn_wrappers import ( vit_flash_attn_wrapper, vit_torch_sdpa_wrapper, ) +from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import MultiModalConfig from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp @@ -101,6 +102,10 @@ class MMEncoderAttention(CustomOp): self.attn_backend, ) + if self.is_flash_attn_backend: + assert self.flash_attn_varlen_func is not None + self._fa_version = get_flash_attn_version() + logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.") @classmethod @@ -204,6 +209,7 @@ class MMEncoderAttention(CustomOp): max_seqlen=max_seqlen, batch_size=bsz, is_rocm_aiter=(self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA), + fa_version=self._fa_version, ) return output diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index 892c4209c01e0..5a74e1310133d 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ 
b/vllm/attention/ops/vit_attn_wrappers.py @@ -28,11 +28,15 @@ def flash_attn_maxseqlen_wrapper( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, + fa_version: int, ) -> torch.Tensor: + kwargs = {} if is_rocm_aiter: from aiter import flash_attn_varlen_func else: from vllm.attention.utils.fa_utils import flash_attn_varlen_func + + kwargs["fa_version"] = fa_version q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) output = flash_attn_varlen_func( q, @@ -44,6 +48,7 @@ def flash_attn_maxseqlen_wrapper( max_seqlen_k=max_seqlen.item(), dropout_p=0.0, causal=False, + **kwargs, ) context_layer = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size) return context_layer @@ -57,6 +62,7 @@ def flash_attn_maxseqlen_wrapper_fake( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, + fa_version: int, ) -> torch.Tensor: return torch.empty_like(q) @@ -76,9 +82,10 @@ def vit_flash_attn_wrapper( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, + fa_version: int, ) -> torch.Tensor: return torch.ops.vllm.flash_attn_maxseqlen_wrapper( - q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter + q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, fa_version ) From c0a88df7f771a48247a934e8821e6e230b3fc5a4 Mon Sep 17 00:00:00 2001 From: Amr Mahdi Date: Wed, 17 Dec 2025 02:41:57 +0200 Subject: [PATCH 14/49] [docker] Allow kv_connectors install to fail on arm64 (#30806) Signed-off-by: Amr Mahdi --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index ae2624ace67b9..e61021b6eeb85 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -621,7 +621,7 @@ ENV UV_HTTP_TIMEOUT=500 RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \ if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ - uv pip install --system -r /tmp/kv_connectors.txt; \ + uv pip install 
--system -r /tmp/kv_connectors.txt || true; \ fi ENV VLLM_USAGE_SOURCE production-docker-image From f5db6385a19b04e76b5834618305485753e75544 Mon Sep 17 00:00:00 2001 From: "Grzegorz K. Karch" Date: Wed, 17 Dec 2025 02:06:28 +0100 Subject: [PATCH 15/49] Fix nemotron_nas intermediate_size computation (#30795) Signed-off-by: Grzegorz Karch --- vllm/model_executor/models/nemotron_nas.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 19a942a5277cc..83ef5e7e1282d 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -169,10 +169,13 @@ class DeciLMDecoderLayer(nn.Module): self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) if not self._is_no_op_ffn: - ffn_mult = block_config.ffn.ffn_mult - intermediate_size = _ffn_mult_to_intermediate_size( - ffn_mult, config.hidden_size - ) + if hasattr(block_config.ffn, "ffn_mult"): + ffn_mult = block_config.ffn.ffn_mult + intermediate_size = _ffn_mult_to_intermediate_size( + ffn_mult, config.hidden_size + ) + else: + intermediate_size = block_config.ffn.intermediate_size self.mlp = LlamaMLP( hidden_size=self.hidden_size, From 811cdf5197acb4d6ab42250a5b0f822887d1190a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 16 Dec 2025 20:52:14 -0500 Subject: [PATCH 16/49] Update model-hosting-container-standards to 0.1.10 (#30815) Signed-off-by: Michael Goin --- requirements/common.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 31c8fb404f63a..426d281c26704 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -50,5 +50,5 @@ ijson # Required for mistral streaming tool parser setproctitle # Used to set process names for better debugging and monitoring openai-harmony >= 0.0.3 # Required for gpt-oss anthropic == 0.71.0 
-model-hosting-container-standards >= 0.1.9, < 1.0.0 -mcp \ No newline at end of file +model-hosting-container-standards >= 0.1.10, < 1.0.0 +mcp From bb5ac1fe38c9fcc7bafaee47fd45c8d1696ad176 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Wed, 17 Dec 2025 04:21:07 +0000 Subject: [PATCH 17/49] [CPU] Add action to automatically label CPU related PRs (#30678) Signed-off-by: Fadi Arafeh --- .github/mergify.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 3ad79f93bc7ad..3e4e21efe39df 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -235,6 +235,20 @@ pull_request_rules: add: - rocm +- name: label-cpu + description: Automatically apply cpu label + conditions: + - label != stale + - files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.* + actions: + label: + add: + - cpu + assign: + users: + - "fadara01" + - "aditew01" + - name: label-structured-output description: Automatically apply structured-output label conditions: From 44d3b1df3d6416b76d84c360d751b8f5220c0b11 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 17 Dec 2025 12:21:19 +0800 Subject: [PATCH 18/49] [CI/Build] Fix compatibility between #30244 and #30396 (#30787) Signed-off-by: DarkLight1337 --- tests/compile/distributed/test_fusions_e2e.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 80086c4e03a9c..960b5b4bd7ad4 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -564,7 +564,9 @@ def test_rms_group_quant( splitting_ops=splitting_ops, # Common mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True), + pass_config=PassConfig( + fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True + ), # Inductor caches custom passes by default as well via uuid 
inductor_compile_config={"force_disable_caches": True}, ) From 009a773828fee13504ee2976ad02abb6020152c8 Mon Sep 17 00:00:00 2001 From: shanjiaz <43143795+shanjiaz@users.noreply.github.com> Date: Wed, 17 Dec 2025 00:01:04 -0500 Subject: [PATCH 19/49] bump up compressed tensors version to 0.13.0 (#30799) Signed-off-by: shanjiaz Co-authored-by: Dipika Sikka --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 426d281c26704..7c89385da6ba5 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -37,7 +37,7 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.12.2 # required for compressed-tensors +compressed-tensors == 0.13.0 # required for compressed-tensors depyf==0.20.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files From d4d2751732c3ccae162a5a0160c7d4fe05d2779a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 17 Dec 2025 00:29:03 -0500 Subject: [PATCH 20/49] Update note comment for flashinfer attention warmup (#30711) Signed-off-by: mgoin --- vllm/model_executor/warmup/kernel_warmup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 95f5982bc8c7b..98b28d3e5292f 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -49,13 +49,12 @@ def kernel_warmup(worker: 
"Worker"): except NotImplementedError: return False - # NOTE: we add check for empty attn_groups to avoid errors when - # deploying models such as E instances and encoder-only models. - # As for those models, worker.model_runner.attn_groups is empty. - # This change is made during EPD feature development. if ( not worker.model_runner.is_pooling_model and worker.model_runner.attn_groups + # NOTE: This should be `any` instead of `all` but other hybrid attention + # backends don't support this dummy run. Once we remove + # `build_for_cudagraph_capture`, we can change it to `any`. and all( _is_flashinfer_backend(group.backend) for groups in worker.model_runner.attn_groups From 0cd5353644d3d045ab33c7e8e19c182bfd7db911 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 17 Dec 2025 15:25:12 +0800 Subject: [PATCH 21/49] [Bugfix][CPU] Fix CPU backend ROPE dispatch for VL models (#30829) Signed-off-by: jiang1.li Signed-off-by: Li, Jiang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/rotary_embedding/common.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 3e6584dbc3da0..50660c6ecc223 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -264,6 +264,15 @@ class ApplyRotaryEmb(CustomOp): return output + def forward_cpu( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + # TODO (bigPYJ1151): need to enable fused CPU ROPE here + return self.forward_native(x, cos, sin) + def extra_repr(self) -> str: s = f"is_neox_style={self.is_neox_style}" s += f"enable_fp32_compute={self.enable_fp32_compute}" From 4f735babb7353987137b85ec0465e594e9ed1384 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Wed, 17 Dec 2025 16:28:13 +0800 Subject: [PATCH 22/49] [XPU] fix broken 
fp8 online quantization for XPU platform (#30831) Signed-off-by: Yan Ma --- .../layers/quantization/ipex_quant.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 463c74c1c1482..f33ee43727f19 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -27,6 +27,10 @@ from vllm.model_executor.layers.quantization.awq import AWQLinearMethod from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + maybe_create_device_identity, +) +from vllm.model_executor.parameter import ModelWeightParameter from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform @@ -305,6 +309,37 @@ class XPUFp8LinearMethod(Fp8LinearMethod): def __init__(self, quant_config: Fp8Config): super().__init__(quant_config) + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + maybe_create_device_identity() + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + layer.weight_block_size = None + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + 
weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + def process_weights_after_loading(self, layer: Module) -> None: # If checkpoint not serialized fp8, quantize the weights. if not self.quant_config.is_checkpoint_fp8_serialized: From 20fda431515d19a883cc962d3a1fa727f225e82d Mon Sep 17 00:00:00 2001 From: Robin <863579016@qq.com> Date: Wed, 17 Dec 2025 16:37:57 +0800 Subject: [PATCH 23/49] [Bugfix][Frontend] Prevent IndexError in MiniMax M2 tool parser during streaming extraction (#30555) Signed-off-by: WangErXiao <863579016@qq.com> --- tests/tool_use/test_minimax_m2_tool_parser.py | 119 ++++++++++++++++++ vllm/tool_parsers/minimax_m2_tool_parser.py | 22 +++- 2 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 tests/tool_use/test_minimax_m2_tool_parser.py diff --git a/tests/tool_use/test_minimax_m2_tool_parser.py b/tests/tool_use/test_minimax_m2_tool_parser.py new file mode 100644 index 0000000000000..cf1835b1928b4 --- /dev/null +++ b/tests/tool_use/test_minimax_m2_tool_parser.py @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest + +from vllm.tool_parsers.minimax_m2_tool_parser import ( + MinimaxM2ToolParser, +) + +pytestmark = pytest.mark.cpu_test + + +class FakeTokenizer: + """Minimal fake tokenizer that exposes the attributes used by the + parser: a truthy model_tokenizer marker and a vocab mapping for the + special tokens. 
+ """ + + def __init__(self): + self.model_tokenizer = True + # The parser will look up start/end tokens by their literal strings + self.vocab = { + "": 1, + "": 2, + } + + def get_vocab(self): + return self.vocab + + +@pytest.fixture +def minimax_m2_tool_parser(): + return MinimaxM2ToolParser(FakeTokenizer()) + + +def test_extract_tool_calls_streaming_incremental(minimax_m2_tool_parser): + parser = minimax_m2_tool_parser + parser._reset_streaming_state() + chunks = [ + "", + '', + '', + "Seattle", + "", + ] + previous = "" + for chunk in chunks: + current = previous + chunk + delta = chunk + parser.extract_tool_calls_streaming( + previous_text=previous, + current_text=current, + delta_text=delta, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + previous = current + + assert len(parser.prev_tool_call_arr) == 1 + entry = parser.prev_tool_call_arr[0] + + assert entry["name"] == "get_weather" + args = entry["arguments"] + assert args["city"] == "Seattle" + + +def test_streaming_minimax_m2_multiple_invokes(minimax_m2_tool_parser): + parser = minimax_m2_tool_parser + parser._reset_streaming_state() + + chunks = [ + "", + '', + '', + '["technology", "events"]', + '', + '["OpenAI", "latest", "release"]', + "", + '', + '', + '["technology", "events"]', + '', + '["Gemini", "latest", "release"]', + "", + "", + ] + previous = "" + for chunk in chunks: + current = previous + chunk + delta = chunk + parser.extract_tool_calls_streaming( + previous_text=previous, + current_text=current, + delta_text=delta, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + previous = current + + assert len(parser.prev_tool_call_arr) == 2 + + for entry, expect_model in zip(parser.prev_tool_call_arr, ["OpenAI", "Gemini"]): + assert entry["name"] == "search_web" + args = json.dumps(entry["arguments"]) + assert "technology" in args and "events" in args + assert expect_model in args + + # check streamed_args_for_tool 
for serving_chat.py + for index in range(2): + expected_call = parser.prev_tool_call_arr[index].get("arguments", {}) + expected_call = json.dumps(expected_call) + actual_call = parser.streamed_args_for_tool[index] + assert expected_call == actual_call diff --git a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py index dcb2b64f6e73c..a1ab75f548bfc 100644 --- a/vllm/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/tool_parsers/minimax_m2_tool_parser.py @@ -122,6 +122,8 @@ class MinimaxM2ToolParser(ToolParser): self.streaming_request = None # Clear previous tool call history to avoid state pollution self.prev_tool_call_arr.clear() + # Reset streamed args tracking + self.streamed_args_for_tool.clear() def _extract_name(self, name_str: str) -> str: """Extract name from quoted string.""" @@ -421,9 +423,12 @@ class MinimaxM2ToolParser(ToolParser): self.prev_tool_call_arr.append( { "name": self.current_function_name, - "arguments": "{}", # Placeholder, will be updated later + "arguments": {}, # Placeholder, will be updated later } ) + # Initialize streamed_args_for_tool for this tool call + if len(self.streamed_args_for_tool) <= self.current_tool_index: + self.streamed_args_for_tool.append("") # Send header with function info return DeltaMessage( @@ -445,6 +450,9 @@ class MinimaxM2ToolParser(ToolParser): # Send opening brace if not sent yet if self.in_function and not self.json_started: self.json_started = True + # Update streamed_args_for_tool for opening brace + if self.current_tool_index < len(self.streamed_args_for_tool): + self.streamed_args_for_tool[self.current_tool_index] += "{" return DeltaMessage( tool_calls=[ DeltaToolCall( @@ -493,7 +501,7 @@ class MinimaxM2ToolParser(ToolParser): args = parsed_tool.function.arguments self.prev_tool_call_arr[self.current_tool_index][ "arguments" - ] = args + ] = json.loads(args) except Exception: pass # Ignore parsing errors during streaming @@ -505,7 +513,9 @@ class 
MinimaxM2ToolParser(ToolParser): ) ] ) - + # Update streamed_args_for_tool for closing brace + if self.current_tool_index < len(self.streamed_args_for_tool): + self.streamed_args_for_tool[self.current_tool_index] += "}" # Reset state for next tool self.json_closed = True self.in_function = False @@ -630,7 +640,11 @@ class MinimaxM2ToolParser(ToolParser): ) self.param_count += 1 - + # Update streamed_args_for_tool for this tool call + if self.current_tool_index < len(self.streamed_args_for_tool): + self.streamed_args_for_tool[self.current_tool_index] += ( + json_fragment + ) return DeltaMessage( tool_calls=[ DeltaToolCall( From a9e15c21efbbc5b4a7a1e69e40378fdfe1acdcb7 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Wed, 17 Dec 2025 10:48:53 +0200 Subject: [PATCH 24/49] [Mamba] Removed disable cascade attn in MambaModelConfig (#30712) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> --- vllm/model_executor/models/config.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 4b08472538db4..a3624b1cfa5f2 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -308,12 +308,6 @@ class MambaModelConfig(VerifyAndUpdateConfig): if cache_config.mamba_block_size is None: cache_config.mamba_block_size = model_config.max_model_len - # TODO(tdoublep): remove once cascade attention is supported - logger.info( - "Disabling cascade attention since it is not supported for hybrid models." 
- ) - model_config.disable_cascade_attn = True - class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): @classmethod From 3b1d440ede42855f031ba72af4817583e5dddba0 Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Wed, 17 Dec 2025 17:43:00 +0800 Subject: [PATCH 25/49] CustomOp: grouped topk (#29575) Signed-off-by: Xinyu Chen --- tests/kernels/moe/test_grouped_topk.py | 10 ++-- .../layers/fused_moe/__init__.py | 4 +- .../layers/fused_moe/fused_moe.py | 52 +++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 23 +++++--- 4 files changed, 75 insertions(+), 14 deletions(-) diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py index 662e0723b7583..d26fe50b815b4 100644 --- a/tests/kernels/moe/test_grouped_topk.py +++ b/tests/kernels/moe/test_grouped_topk.py @@ -9,8 +9,8 @@ import pytest import torch from vllm.model_executor.layers.fused_moe.fused_moe import ( + GroupedTopk, fused_grouped_topk, - grouped_topk, ) from vllm.platforms import current_platform @@ -50,15 +50,17 @@ def test_grouped_topk( with monkeypatch.context() as m: m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0") - baseline_topk_weights, baseline_topk_ids = grouped_topk( - hidden_states=hidden_states, - gating_output=gating_output, + grouped_topk = GroupedTopk( topk=topk, renormalize=renormalize, num_expert_group=num_expert_group, topk_group=topk_group, scoring_func=scoring_func, routed_scaling_factor=routed_scaling_factor, + ) + baseline_topk_weights, baseline_topk_ids = grouped_topk( + hidden_states=hidden_states, + gating_output=gating_output, e_score_correction_bias=e_score_correction_bias, ) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index d71cfc5ad8200..8fee4038b60b8 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -77,11 +77,11 @@ if HAS_TRITON: BatchedTritonExperts, ) from 
vllm.model_executor.layers.fused_moe.fused_moe import ( + GroupedTopk, TritonExperts, fused_experts, fused_topk, get_config_file_name, - grouped_topk, ) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, @@ -91,7 +91,7 @@ if HAS_TRITON: "fused_topk", "fused_experts", "get_config_file_name", - "grouped_topk", + "GroupedTopk", "cutlass_moe_fp8", "cutlass_moe_fp4", "cutlass_moe_w4a8_fp8", diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b286c3bc6fc07..20782e2712f27 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -16,6 +16,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) @@ -1286,6 +1287,57 @@ def grouped_topk( return topk_weights.to(torch.float32), topk_ids.to(torch.int32) +@CustomOp.register("grouped_topk") +class GroupedTopk(CustomOp): + """GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model.""" + + def __init__( + self, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + ) -> None: + super().__init__() + self.native_impl = grouped_topk + self.topk = topk + self.renormalize = renormalize + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.scoring_func = scoring_func + self.routed_scaling_factor = routed_scaling_factor + + def forward_native( + self, + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + e_score_correction_bias: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return self.native_impl( + hidden_states, + gating_output, + 
self.topk, + self.renormalize, + self.num_expert_group, + self.topk_group, + self.scoring_func, + self.routed_scaling_factor, + e_score_correction_bias, + ) + + def forward_cuda( + self, + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + e_score_correction_bias: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return self.forward_native( + hidden_states, gating_output, e_score_correction_bias + ) + + @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def eplb_map_to_physical_and_record( topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b39ce415a0f83..db97d6eb88ea5 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -67,7 +67,7 @@ else: return topk_ids eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record -from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk +from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_grouped_topk, ) @@ -1594,19 +1594,26 @@ class FusedMoE(CustomOp): grouped_topk_impl = partial( rocm_aiter_grouped_topk, num_fused_shared_experts=self.num_fused_shared_experts, + topk=self.top_k, + renormalize=self.renormalize, + num_expert_group=self.num_expert_group, + topk_group=self.topk_group, + scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, ) else: - grouped_topk_impl = grouped_topk + grouped_topk_impl = GroupedTopk( + topk=self.top_k, + renormalize=self.renormalize, + num_expert_group=self.num_expert_group, + topk_group=self.topk_group, + scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, + ) topk_weights, topk_ids = grouped_topk_impl( hidden_states=hidden_states, gating_output=router_logits, - topk=self.top_k, - 
renormalize=self.renormalize, - num_expert_group=self.num_expert_group, - topk_group=self.topk_group, - scoring_func=self.scoring_func, - routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.e_score_correction_bias, ) elif self.e_score_correction_bias is not None: From f4e884f2224a25612eaeaeac2a854c1dd330c144 Mon Sep 17 00:00:00 2001 From: Sheng Lin Date: Wed, 17 Dec 2025 17:52:58 +0800 Subject: [PATCH 26/49] [NIXL][Bugfix] Fix NIXL/RDMA registration failure over CuMemAllocator (#29569) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Somoku Co-authored-by: Nicolò Lucchesi --- csrc/cumem_allocator.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp index 78dc840a98b67..6c2c18a6602d2 100644 --- a/csrc/cumem_allocator.cpp +++ b/csrc/cumem_allocator.cpp @@ -107,6 +107,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, prop.location.id = device; prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; +#ifndef USE_ROCM + int flag = 0; + CUDA_CHECK(cuDeviceGetAttribute( + &flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, + device)); + if (flag) { // support GPUDirect RDMA if possible + prop.allocFlags.gpuDirectRDMACapable = 1; + } +#endif + #ifndef USE_ROCM // Allocate memory using cuMemCreate CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); From 4c054d89aa5972014ba7e13c0accb0ab631b5638 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Wed, 17 Dec 2025 17:53:02 +0800 Subject: [PATCH 27/49] [Doc][ResponsesAPI] add documentation (#30840) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- docs/serving/openai_compatible_server.md | 27 ++++++++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 18 ++++++++++++---- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md 
index 0e29204f8947c..6a08f872def15 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -47,6 +47,8 @@ We currently support the following OpenAI APIs: - [Completions API](#completions-api) (`/v1/completions`) - Only applicable to [text generation models](../models/generative_models.md). - *Note: `suffix` parameter is not supported.* +- [Responses API](#responses-api) (`/v1/responses`) + - Only applicable to [text generation models](../models/generative_models.md). - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template). - *Note: `user` parameter is ignored.* @@ -229,6 +231,31 @@ The following extra parameters are supported: --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" ``` +### Responses API + +Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses); +you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. + +Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/online_serving/openai_responses_client_with_tools.py) + +#### Extra parameters + +The following extra parameters in the request object are supported: + +??? code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:responses-extra-params" + ``` + +The following extra parameters in the response object are supported: + +??? 
code + + ```python + --8<-- "vllm/entrypoints/openai/protocol.py:responses-response-extra-params" + ``` + ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 94dde4564ea0c..a3c347cb1bd3f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1654,13 +1654,23 @@ class ResponsesResponse(OpenAIBaseModel): usage: ResponseUsage | None = None user: str | None = None - # --8<-- [start:responses-extra-params] + # --8<-- [start:responses-response-extra-params] # These are populated when enable_response_messages is set to True # NOTE: custom serialization is needed # see serialize_input_messages and serialize_output_messages - input_messages: ResponseInputOutputMessage | None = None - output_messages: ResponseInputOutputMessage | None = None - # --8<-- [end:responses-extra-params] + input_messages: ResponseInputOutputMessage | None = Field( + default=None, + description=( + "If enable_response_messages, we can show raw token input to model." + ), + ) + output_messages: ResponseInputOutputMessage | None = Field( + default=None, + description=( + "If enable_response_messages, we can show raw token output of model." 
+ ), + ) + # --8<-- [end:responses-response-extra-params] # NOTE: openAI harmony doesn't serialize TextContent properly, # TODO: this fixes for TextContent, but need to verify for tools etc From a100152288c8ec50336aea842f0b3d8e36624024 Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Wed, 17 Dec 2025 01:54:21 -0800 Subject: [PATCH 28/49] [Kernels][FI] Skip trtllm attention when num_kv_heads=1 (#30842) Signed-off-by: Ye (Charlotte) Qi --- .../test_flashinfer_trtllm_attention.py | 35 +++++++++++++++++++ vllm/utils/flashinfer.py | 22 +++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 06a7085a82ba0..220d827b9d5fa 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline( torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol), f"{torch.max(torch.abs(output - output_trtllm))}", ) + + +def test_trtllm_attention_rejects_num_kv_heads_1() -> None: + """Test that TRTLLM attention correctly rejects num_kv_heads=1. + + When num_kv_heads=1 (MQA), the KV cache strides become degenerate + (stride_heads == stride_batch), which causes CUDA's cuTensorMapEncodeTiled + to fail because TMA descriptors cannot handle degenerate 4D tensors with + singleton dimensions. + + This test verifies that can_use_trtllm_attention returns False for + num_kv_heads=1 configurations. 
+ """ + from vllm.utils.flashinfer import can_use_trtllm_attention + + # num_kv_heads=1 should be rejected + assert not can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1), ( + "can_use_trtllm_attention should return False for num_kv_heads=1" + ) + assert not can_use_trtllm_attention(num_qo_heads=32, num_kv_heads=1), ( + "can_use_trtllm_attention should return False for num_kv_heads=1" + ) + + # num_kv_heads > 1 should be accepted (if platform supports it) + # Note: This may return False on non-Blackwell platforms, which is fine + result_kv8 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=8) + result_kv1 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1) + + # Even if platform doesn't support TRTLLM, num_kv_heads=1 should never + # return True when num_kv_heads > 1 returns True + if result_kv8: + assert not result_kv1, ( + "If TRTLLM is supported for num_kv_heads=8, " + "it must be rejected for num_kv_heads=1" + ) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 1c2710be3173b..6bbe02348eaf1 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -305,7 +305,18 @@ def can_use_trtllm_attention(num_qo_heads: int, num_kv_heads: int) -> bool: if force_use_trtllm_attention() is False: return False has_trtllm = supports_trtllm_attention() - return has_trtllm and (num_qo_heads % num_kv_heads == 0) + # num_kv_heads=1 is not supported due to TMA descriptor building limitations. + # When num_kv_heads=1, the KV cache strides become degenerate (stride_heads == + # stride_batch), which causes CUDA's cuTensorMapEncodeTiled to fail because + # TMA descriptors cannot handle degenerate 4D tensors with singleton dimensions. + # See: https://fburl.com/352mrydz + if has_trtllm and num_kv_heads == 1: + logger.warning_once( + "TRTLLM attention does not support num_kv_heads=1. " + "This configuration causes TMA descriptor building to fail due to " + "degenerate tensor strides. Falling back to FlashInfer attention." 
+ ) + return has_trtllm and (num_qo_heads % num_kv_heads == 0) and (num_kv_heads != 1) def use_trtllm_attention( @@ -355,6 +366,15 @@ def use_trtllm_attention( ) return False + # num_kv_heads=1 is not supported + if num_kv_heads == 1: + if force_use_trtllm: + logger.warning_once( + "TRTLLM attention does not support num_kv_heads=1, " + "but --attention-config.use_trtllm_attention is set to 1" + ) + return False + if has_spec and not is_prefill: # Speculative decoding requires TRTLLM attention for decodes logger.info_once("Using TRTLLM attention (enabled for speculative decoding).") From 519ef9a91111d2d6f8545c8a6b2c1a28d87309fa Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 17 Dec 2025 04:55:30 -0500 Subject: [PATCH 29/49] [UX] Make `vllm bench serve` discover model by default and use --input-len (#30816) Signed-off-by: mgoin --- tests/benchmarks/test_serve_cli.py | 9 ++-- vllm/benchmarks/serve.py | 83 +++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 90d685c966d3e..c579b38069864 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -19,21 +19,18 @@ def server(): @pytest.mark.benchmark def test_bench_serve(server): + # Test default model detection and input/output len command = [ "vllm", "bench", "serve", - "--model", - MODEL_NAME, "--host", server.host, "--port", str(server.port), - "--dataset-name", - "random", - "--random-input-len", + "--input-len", "32", - "--random-output-len", + "--output-len", "4", "--num-prompts", "5", diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index f5d8ea5a975a9..12756d1700c9f 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -10,8 +10,10 @@ On the client side, run: vllm bench serve \ --backend \ --label \ - --model \ + --model \ --dataset-name \ + --input-len \ + --output-len \ --request-rate \ --num-prompts """ @@ -57,6 +59,33 
@@ TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) a ) +async def get_first_model_from_server( + base_url: str, headers: dict | None = None +) -> str: + """Fetch the first model from the server's /v1/models endpoint.""" + models_url = f"{base_url}/v1/models" + async with aiohttp.ClientSession() as session: + try: + async with session.get(models_url, headers=headers) as response: + response.raise_for_status() + data = await response.json() + if "data" in data and len(data["data"]) > 0: + return data["data"][0]["id"] + else: + raise ValueError( + f"No models found on the server at {base_url}. " + "Make sure the server is running and has models loaded." + ) + except (aiohttp.ClientError, json.JSONDecodeError) as e: + raise RuntimeError( + f"Failed to fetch models from server at {models_url}. " + "Check that:\n" + "1. The server is running\n" + "2. The server URL is correct\n" + f"Error: {e}" + ) from e + + class TaskType(Enum): GENERATION = "generation" POOLING = "pooling" @@ -1025,8 +1054,26 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--model", type=str, - required=True, - help="Name of the model.", + required=False, + default=None, + help="Name of the model. If not specified, will fetch the first model " + "from the server's /v1/models endpoint.", + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="General input length for datasets. Maps to dataset-specific " + "input length arguments (e.g., --random-input-len, --sonnet-input-len). " + "If not specified, uses dataset defaults.", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="General output length for datasets. Maps to dataset-specific " + "output length arguments (e.g., --random-output-len, --sonnet-output-len). 
" + "If not specified, uses dataset defaults.", ) parser.add_argument( "--tokenizer", @@ -1332,10 +1379,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: raise ValueError("For exponential ramp-up, the start RPS cannot be 0.") label = args.label - model_id = args.model - model_name = args.served_model_name - tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode if args.base_url is not None: api_url = f"{args.base_url}{args.endpoint}" @@ -1356,6 +1399,18 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: else: raise ValueError("Invalid header format. Please use KEY=VALUE format.") + # Fetch model from server if not specified + if args.model is None: + print("Model not specified, fetching first model from server...") + model_id = await get_first_model_from_server(base_url, headers) + print(f"Using model: {model_id}") + else: + model_id = args.model + + model_name = args.served_model_name + tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id + tokenizer_mode = args.tokenizer_mode + tokenizer = get_tokenizer( tokenizer_id, tokenizer_mode=tokenizer_mode, @@ -1368,6 +1423,20 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: "'--dataset-path' if required." 
) + # Map general --input-len and --output-len to all dataset-specific arguments + if args.input_len is not None: + args.random_input_len = args.input_len + args.sonnet_input_len = args.input_len + + if args.output_len is not None: + args.random_output_len = args.output_len + args.sonnet_output_len = args.output_len + args.sharegpt_output_len = args.output_len + args.custom_output_len = args.output_len + args.hf_output_len = args.output_len + args.spec_bench_output_len = args.output_len + args.prefix_repetition_output_len = args.output_len + # when using random datasets, default to ignoring EOS # so generation runs to the requested length if ( From 177c391db2ad8dfc05906473525d4ae0a55549e0 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 17 Dec 2025 04:55:56 -0500 Subject: [PATCH 30/49] [compile] Disable aot when eager backend is used. (#30810) Signed-off-by: zhxchen17 --- vllm/compilation/decorators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index d1ee995ee8959..40bde97ac61d8 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -435,7 +435,10 @@ def _support_torch_compile( return self.aot_compiled_fn(self, *args, **kwargs) if self.compiled: - assert not envs.VLLM_USE_AOT_COMPILE + assert ( + not envs.VLLM_USE_AOT_COMPILE + or self.vllm_config.compilation_config.backend == "eager" + ) return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) # This is the path for the first compilation. 
@@ -508,7 +511,11 @@ def _support_torch_compile( _torch27_patch_tensor_subclasses(), torch._inductor.config.patch(**inductor_config_patches), ): - if envs.VLLM_USE_AOT_COMPILE: + use_aot_compile = envs.VLLM_USE_AOT_COMPILE + if self.vllm_config.compilation_config.backend == "eager": + logger.warning("Detected eager backend, disabling AOT compile.") + use_aot_compile = False + if use_aot_compile: self.aot_compiled_fn = self.aot_compile(*args, **kwargs) output = self.aot_compiled_fn(self, *args, **kwargs) assert aot_compilation_path is not None From 9db1db5949f7abd4b03cd0231450f81bfeeaba0f Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 17 Dec 2025 04:56:24 -0500 Subject: [PATCH 31/49] [compile] Ignore VLLM_FORCE_AOT_LOAD from cache factors (#30809) Signed-off-by: zhxchen17 --- vllm/envs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/envs.py b/vllm/envs.py index 7e072a588591c..2f8158d88d6c5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1654,6 +1654,7 @@ def compile_factors() -> dict[str, object]: "VLLM_CI_USE_S3", "VLLM_MODEL_REDIRECT_PATH", "VLLM_HOST_IP", + "VLLM_FORCE_AOT_LOAD", "S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY", "S3_ENDPOINT_URL", From 7b966ae2ba73b5391937907bfd8aaf63af033ff1 Mon Sep 17 00:00:00 2001 From: danielafrimi <45691845+danielafrimi@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:56:38 +0200 Subject: [PATCH 32/49] [Fix]Load kv-cache dtype from hf_quant_config.json automatically (fix for reverted PR) (#30785) Signed-off-by: <> Co-authored-by: root --- vllm/engine/arg_utils.py | 9 ++++- vllm/utils/torch_utils.py | 75 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca19e468914c7..03720bd2516d4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -93,6 +93,7 @@ from vllm.transformers_utils.utils import is_cloud_storage from vllm.utils.argparse_utils import FlexibleArgumentParser from 
vllm.utils.mem_constants import GiB_bytes from vllm.utils.network_utils import get_ip +from vllm.utils.torch_utils import resolve_kv_cache_dtype_string from vllm.v1.sample.logits_processor import LogitsProcessor if TYPE_CHECKING: @@ -106,6 +107,7 @@ else: LoadFormats = Any UsageContext = Any + logger = init_logger(__name__) # object is used to allow for special typing forms @@ -1361,12 +1363,17 @@ class EngineArgs: f"dcp_size={self.decode_context_parallel_size}." ) + # Resolve "auto" kv_cache_dtype to actual value from model config + resolved_cache_dtype = resolve_kv_cache_dtype_string( + self.kv_cache_dtype, model_config + ) + cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, kv_cache_memory_bytes=self.kv_cache_memory_bytes, swap_space=self.swap_space, - cache_dtype=self.kv_cache_dtype, + cache_dtype=resolved_cache_dtype, is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, sliding_window=sliding_window, diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index c97efce312b56..b82e0171b7f7f 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -24,6 +24,10 @@ else: ModelConfig = object IntermediateTensors = object +import logging + +logger = logging.getLogger(__name__) + STR_DTYPE_TO_TORCH_DTYPE = { "float32": torch.float32, @@ -49,6 +53,13 @@ TORCH_DTYPE_TO_NUMPY_DTYPE = { } +MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP = { + # TODO: Add more modelopt kv cache dtype + # mappings here when it supported by some attention backend + # (for example supports nvfp4). + "fp8": "fp8_e4m3", +} + T = TypeVar("T") @@ -194,6 +205,70 @@ def get_kv_cache_torch_dtype( return torch_dtype +def get_kv_cache_quant_algo_string(quant_cfg: dict[str, Any]) -> str | None: + """Get the KV cache quantization algorithm string from the quantization config. + + Maps various FP8 format names to vLLM's standard cache dtype strings. 
+ Returns None if no kv_cache_quant_algo is specified. + Returns "auto" if the value is not recognized/supported. + """ + # Mapping from model config values to vLLM cache_dtype strings + + quant_method = quant_cfg.get("quant_method", "") + if quant_method.startswith("modelopt"): + quantization_inner = quant_cfg.get("quantization", quant_cfg) + # Check if quant config is specified and use kv cache quant algo + kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get( + "kv_cache_quant_algo" + ) + if isinstance(kv_algo, str): + kv_algo_lower = kv_algo.lower() + + # Try to map to vLLM's standard format + if kv_algo_lower in MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP: + return MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP[kv_algo_lower] + else: + # Unknown/unsupported format - return "auto" as safe fallback + logger.warning( + "WARNING: Unknown kv_cache_quant_algo '%s' in model " + "config. Supported values: %s. Falling back to 'auto'.", + kv_algo, + list(MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP.keys()), + ) + return "auto" + return None + + +def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None: + """Get the KV cache quantization algorithm dtype from the quantization config.""" + kv_algo_str = get_kv_cache_quant_algo_string(quant_cfg) + if kv_algo_str is not None and kv_algo_str != "auto": + # Only convert if we have a valid dtype string (not "auto" fallback) + return STR_DTYPE_TO_TORCH_DTYPE[kv_algo_str] + return None + + +def resolve_kv_cache_dtype_string( + kv_cache_dtype: str, model_config: ModelConfig +) -> str: + """Resolve 'auto' kv_cache_dtype to the actual string value from model config. + Returns the resolved cache_dtype string. 
+ """ + if kv_cache_dtype != "auto": + return kv_cache_dtype + + hf_cfg = getattr(model_config, "hf_config", None) + if hf_cfg is not None: + quant_cfg = getattr(hf_cfg, "quantization_config", None) + if quant_cfg is not None: + kv_algo_str = get_kv_cache_quant_algo_string(quant_cfg) + if kv_algo_str is not None: + return kv_algo_str + + # Default to auto (will be handled by downstream code) + return "auto" + + def kv_cache_dtype_str_to_dtype( kv_cache_dtype: str, model_config: ModelConfig ) -> torch.dtype: From 53cd7f868b3632cbbe982cffaee8e16fb49dd694 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 17 Dec 2025 05:00:12 -0500 Subject: [PATCH 33/49] [compile] Recompile graph module during Dynamo cache loading. (#30743) Signed-off-by: Zhengxu Chen --- vllm/compilation/caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index ce482572b401b..fc02a08f74265 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -104,6 +104,7 @@ class VllmSerializableFunction(SerializableCallable): state = pickle.loads(data) fake_mode = FakeTensorMode(shape_env=ShapeEnv()) state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode) + state["graph_module"].recompile() state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode) vllm_backend = VllmBackend(get_current_vllm_config(), state["prefix"]) From f284d7bd0c55f929fa7912936b1d247089679191 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 17 Dec 2025 05:00:35 -0500 Subject: [PATCH 34/49] [Bug] Fix AttributeError: 'ColumnParallelLinear' object has no attribute `weight_scale_inv` (#30823) Signed-off-by: yewentao256 --- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py 
b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index ea68745585160..bdc3d1fc7232d 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1437,14 +1437,17 @@ def maybe_post_process_fp8_weight_block(layer: torch.nn.Module): layer.orig_dtype, layer.weight ) if should_use_deepgemm: + scale_attr = ( + "weight_scale_inv" if hasattr(layer, "weight_scale_inv") else "weight_scale" + ) dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block( wq=layer.weight.data, - ws=layer.weight_scale_inv.data, + ws=getattr(layer, scale_attr).data, quant_block_shape=tuple(layer.weight_block_size), use_e8m0=is_deep_gemm_e8m0_used(), ) replace_parameter(layer, "weight", dg_weight) - replace_parameter(layer, "weight_scale_inv", dg_weight_scale) + replace_parameter(layer, scale_attr, dg_weight_scale) def expert_weight_is_col_major(x: torch.Tensor) -> bool: From 9ad5b2171002522772de0a0cc71b747068ec8862 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 17 Dec 2025 18:27:30 +0800 Subject: [PATCH 35/49] [Refactor] [4/N] Move VLLM_SERVER_DEV endpoints into the serve directory (#30749) Signed-off-by: chaunceyjiang --- .../scripts/hardware_ci/run-amd-test.sh | 1 - .buildkite/test-amd.yaml | 37 ++++--- .buildkite/test-pipeline.yaml | 34 ++++--- .buildkite/test_areas/entrypoints.yaml | 23 ++++- .buildkite/test_areas/tool_use.yaml | 13 --- tests/entrypoints/instrumentator/__init__.py | 0 .../test_metrics.py | 5 +- tests/entrypoints/rpc/__init__.py | 0 .../{openai => rpc}/test_collective_rpc.py | 2 +- tests/entrypoints/sleep/__init__.py | 0 .../{openai => sleep}/test_sleep.py | 2 +- vllm/entrypoints/openai/api_server.py | 98 +------------------ vllm/entrypoints/serve/__init__.py | 29 ++++++ vllm/entrypoints/serve/cache/__init__.py | 0 vllm/entrypoints/serve/cache/api_router.py | 61 ++++++++++++ .../serve/instrumentator/server_info.py | 40 ++++++++ vllm/entrypoints/serve/rpc/__init__.py | 0 
vllm/entrypoints/serve/rpc/api_router.py | 61 ++++++++++++ vllm/entrypoints/serve/sleep/api_router.py | 4 - 19 files changed, 259 insertions(+), 151 deletions(-) delete mode 100644 .buildkite/test_areas/tool_use.yaml create mode 100644 tests/entrypoints/instrumentator/__init__.py rename tests/entrypoints/{openai => instrumentator}/test_metrics.py (99%) create mode 100644 tests/entrypoints/rpc/__init__.py rename tests/entrypoints/{openai => rpc}/test_collective_rpc.py (96%) create mode 100644 tests/entrypoints/sleep/__init__.py rename tests/entrypoints/{openai => sleep}/test_sleep.py (98%) create mode 100644 vllm/entrypoints/serve/cache/__init__.py create mode 100644 vllm/entrypoints/serve/cache/api_router.py create mode 100644 vllm/entrypoints/serve/instrumentator/server_info.py create mode 100644 vllm/entrypoints/serve/rpc/__init__.py create mode 100644 vllm/entrypoints/serve/rpc/api_router.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 864eb470bb0a7..08da34d81d117 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then --ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_completion.py \ - --ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/test_lora_adapters.py \ --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 3c9b8cbedcf06..e8f99100a8de0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -128,7 +128,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode 
--ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -148,7 +148,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 100min +- label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 @@ -162,10 +162,28 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + 
source_file_dependencies: + - vllm/ + - tests/entrypoints/sleep + - tests/entrypoints/rpc + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/sleep + - pytest -v -s tool_use + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] @@ -751,17 +769,6 @@ steps: # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - pytest -s entrypoints/openai/correctness/ -- label: OpenAI-Compatible Tool Use # 23 min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s tool_use ##### models test ##### diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8e6d32f71f220..b4de630b09417 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -114,7 +114,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -132,7 +132,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 
100min +- label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" @@ -144,10 +144,26 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/sleep + - tests/entrypoints/rpc + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/sleep + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s tool_use + - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] @@ -666,16 +682,6 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: OpenAI-Compatible Tool Use # 23 min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - fast_check: false - source_file_dependencies: - - vllm/ - - 
tests/tool_use - commands: - - pytest -v -s tool_use - ##### models test ##### - label: Basic Models Tests (Initialization) diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 0a789be943f37..5b16ea9c1ad07 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -10,7 +10,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration (LLM) timeout_in_minutes: 40 @@ -25,7 +25,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration (API Server) +- label: Entrypoints Integration (API Server 1) timeout_in_minutes: 130 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -34,11 +34,26 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai 
--ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration (API Server 2) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/tool_use + - tests/entrypoints/sleep + - tests/entrypoints/instrumentator + - tests/entrypoints/rpc + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s entrypoints/instrumentator + - pytest -v -s entrypoints/sleep + - pytest -v -s tool_use + - label: Entrypoints Integration (Pooling) timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml deleted file mode 100644 index 69527a1214229..0000000000000 --- a/.buildkite/test_areas/tool_use.yaml +++ /dev/null @@ -1,13 +0,0 @@ -group: Tool use -depends_on: - - image-build -steps: -- label: OpenAI-Compatible Tool Use - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s tool_use diff --git a/tests/entrypoints/instrumentator/__init__.py b/tests/entrypoints/instrumentator/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/instrumentator/test_metrics.py similarity index 99% rename from tests/entrypoints/openai/test_metrics.py rename to tests/entrypoints/instrumentator/test_metrics.py index 65a6fd20bd0d1..9f2ad105a380b 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/instrumentator/test_metrics.py @@ -14,11 +14,10 @@ import requests from prometheus_client.parser import 
text_string_to_metric_families from transformers import AutoTokenizer +from tests.conftest import LocalAssetServer +from tests.utils import RemoteOpenAIServer from vllm import version -from ...conftest import LocalAssetServer -from ...utils import RemoteOpenAIServer - MODELS = { "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct", diff --git a/tests/entrypoints/rpc/__init__.py b/tests/entrypoints/rpc/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_collective_rpc.py b/tests/entrypoints/rpc/test_collective_rpc.py similarity index 96% rename from tests/entrypoints/openai/test_collective_rpc.py rename to tests/entrypoints/rpc/test_collective_rpc.py index cbd6b02f05dce..56d93a427315f 100644 --- a/tests/entrypoints/openai/test_collective_rpc.py +++ b/tests/entrypoints/rpc/test_collective_rpc.py @@ -37,7 +37,7 @@ def server(): "--max-num-seqs", "128", "--worker-extension-cls", - "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension", + "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension", ] with RemoteOpenAIServer( MODEL_NAME, diff --git a/tests/entrypoints/sleep/__init__.py b/tests/entrypoints/sleep/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/sleep/test_sleep.py similarity index 98% rename from tests/entrypoints/openai/test_sleep.py rename to tests/entrypoints/sleep/test_sleep.py index 5f94ac6da2c25..260dcd00bae91 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/sleep/test_sleep.py @@ -4,7 +4,7 @@ import requests from prometheus_client.parser import text_string_to_metric_families -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer MODEL_NAME = "meta-llama/Llama-3.2-1B" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 
5d0eacae34dd7..bca9571e39344 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,21 +17,20 @@ from argparse import Namespace from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Annotated, Any, Literal +from typing import Annotated, Any import model_hosting_container_standards.sagemaker as sagemaker_standards import pydantic import uvloop -from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request +from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, Response, StreamingResponse +from fastapi.responses import JSONResponse, StreamingResponse from starlette.concurrency import iterate_in_threadpool from starlette.datastructures import URL, Headers, MutableHeaders, State from starlette.types import ASGIApp, Message, Receive, Scope, Send import vllm.envs as envs -from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.anthropic.protocol import ( @@ -639,97 +638,6 @@ async def create_translations( return StreamingResponse(content=generator, media_type="text/event-stream") -if envs.VLLM_SERVER_DEV_MODE: - logger.warning( - "SECURITY WARNING: Development endpoints are enabled! " - "This should NOT be used in production!" 
- ) - - PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) - - @router.get("/server_info") - async def show_server_info( - raw_request: Request, - config_format: Annotated[Literal["text", "json"], Query()] = "text", - ): - vllm_config: VllmConfig = raw_request.app.state.vllm_config - server_info = { - "vllm_config": str(vllm_config) - if config_format == "text" - else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str) - # fallback=str is needed to handle e.g. torch.dtype - } - return JSONResponse(content=server_info) - - @router.post("/reset_prefix_cache") - async def reset_prefix_cache( - raw_request: Request, - reset_running_requests: bool = Query(default=False), - reset_external: bool = Query(default=False), - ): - """ - Reset the local prefix cache. - - Optionally, if the query parameter `reset_external=true` - also resets the external (connector-managed) prefix cache. - - Note that we currently do not check if the prefix cache - is successfully reset in the API server. - - Example: - POST /reset_prefix_cache?reset_external=true - """ - logger.info("Resetting prefix cache...") - - await engine_client(raw_request).reset_prefix_cache( - reset_running_requests, reset_external - ) - return Response(status_code=200) - - @router.post("/reset_mm_cache") - async def reset_mm_cache(raw_request: Request): - """ - Reset the multi-modal cache. Note that we currently do not check if the - multi-modal cache is successfully reset in the API server. 
- """ - logger.info("Resetting multi-modal cache...") - await engine_client(raw_request).reset_mm_cache() - return Response(status_code=200) - - @router.post("/collective_rpc") - async def collective_rpc(raw_request: Request): - try: - body = await raw_request.json() - except json.JSONDecodeError as e: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail=f"JSON decode error: {e}", - ) from e - method = body.get("method") - if method is None: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail="Missing 'method' in request body", - ) - # For security reason, only serialized string args/kwargs are passed. - # User-defined `method` is responsible for deserialization if needed. - args: list[str] = body.get("args", []) - kwargs: dict[str, str] = body.get("kwargs", {}) - timeout: float | None = body.get("timeout") - results = await engine_client(raw_request).collective_rpc( - method=method, timeout=timeout, args=tuple(args), kwargs=kwargs - ) - if results is None: - return Response(status_code=200) - response: list[Any] = [] - for result in results: - if result is None or isinstance(result, dict | list): - response.append(result) - else: - response.append(str(result)) - return JSONResponse(content={"results": response}) - - def load_log_config(log_config_file: str | None) -> dict | None: if not log_config_file: return None diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py index c4fcc92db931f..260fd44a02ccb 100644 --- a/vllm/entrypoints/serve/__init__.py +++ b/vllm/entrypoints/serve/__init__.py @@ -4,8 +4,19 @@ from fastapi import FastAPI +import vllm.envs as envs +from vllm.logger import init_logger + +logger = init_logger(__name__) + def register_vllm_serve_api_routers(app: FastAPI): + if envs.VLLM_SERVER_DEV_MODE: + logger.warning( + "SECURITY WARNING: Development endpoints are enabled! " + "This should NOT be used in production!" 
+ ) + from vllm.entrypoints.serve.lora.api_router import ( attach_router as attach_lora_router, ) @@ -29,6 +40,18 @@ def register_vllm_serve_api_routers(app: FastAPI): attach_sleep_router(app) + from vllm.entrypoints.serve.rpc.api_router import ( + attach_router as attach_rpc_router, + ) + + attach_rpc_router(app) + + from vllm.entrypoints.serve.cache.api_router import ( + attach_router as attach_cache_router, + ) + + attach_cache_router(app) + from vllm.entrypoints.serve.tokenize.api_router import ( attach_router as attach_tokenize_router, ) @@ -58,3 +81,9 @@ def register_vllm_serve_api_routers(app: FastAPI): ) attach_health_router(app) + + from vllm.entrypoints.serve.instrumentator.server_info import ( + attach_router as attach_server_info_router, + ) + + attach_server_info_router(app) diff --git a/vllm/entrypoints/serve/cache/__init__.py b/vllm/entrypoints/serve/cache/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/cache/api_router.py b/vllm/entrypoints/serve/cache/api_router.py new file mode 100644 index 0000000000000..d659895463273 --- /dev/null +++ b/vllm/entrypoints/serve/cache/api_router.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from fastapi import APIRouter, FastAPI, Query, Request +from fastapi.responses import Response + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +@router.post("/reset_prefix_cache") +async def reset_prefix_cache( + raw_request: Request, + reset_running_requests: bool = Query(default=False), + reset_external: bool = Query(default=False), +): + """ + Reset the local prefix cache. 
+ + Optionally, if the query parameter `reset_external=true` + also resets the external (connector-managed) prefix cache. + + Note that we currently do not check if the prefix cache + is successfully reset in the API server. + + Example: + POST /reset_prefix_cache?reset_external=true + """ + logger.info("Resetting prefix cache...") + + await engine_client(raw_request).reset_prefix_cache( + reset_running_requests, reset_external + ) + return Response(status_code=200) + + +@router.post("/reset_mm_cache") +async def reset_mm_cache(raw_request: Request): + """ + Reset the multi-modal cache. Note that we currently do not check if the + multi-modal cache is successfully reset in the API server. + """ + logger.info("Resetting multi-modal cache...") + await engine_client(raw_request).reset_mm_cache() + return Response(status_code=200) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/instrumentator/server_info.py b/vllm/entrypoints/serve/instrumentator/server_info.py new file mode 100644 index 0000000000000..1a69dfacae1c2 --- /dev/null +++ b/vllm/entrypoints/serve/instrumentator/server_info.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from typing import Annotated, Literal + +import pydantic +from fastapi import APIRouter, FastAPI, Query, Request +from fastapi.responses import JSONResponse + +import vllm.envs as envs +from vllm.config import VllmConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +router = APIRouter() +PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) + + +@router.get("/server_info") +async def show_server_info( + raw_request: Request, + config_format: Annotated[Literal["text", "json"], Query()] = "text", +): + vllm_config: VllmConfig = raw_request.app.state.vllm_config + server_info = { + "vllm_config": str(vllm_config) + if config_format 
== "text" + else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str) + # fallback=str is needed to handle e.g. torch.dtype + } + return JSONResponse(content=server_info) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/rpc/__init__.py b/vllm/entrypoints/serve/rpc/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/rpc/api_router.py b/vllm/entrypoints/serve/rpc/api_router.py new file mode 100644 index 0000000000000..54f582c408d54 --- /dev/null +++ b/vllm/entrypoints/serve/rpc/api_router.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from http import HTTPStatus +from typing import Any + +from fastapi import APIRouter, FastAPI, HTTPException, Request +from fastapi.responses import JSONResponse, Response + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +@router.post("/collective_rpc") +async def collective_rpc(raw_request: Request): + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}", + ) from e + method = body.get("method") + if method is None: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail="Missing 'method' in request body", + ) + # For security reason, only serialized string args/kwargs are passed. + # User-defined `method` is responsible for deserialization if needed. 
+ args: list[str] = body.get("args", []) + kwargs: dict[str, str] = body.get("kwargs", {}) + timeout: float | None = body.get("timeout") + results = await engine_client(raw_request).collective_rpc( + method=method, timeout=timeout, args=tuple(args), kwargs=kwargs + ) + if results is None: + return Response(status_code=200) + response: list[Any] = [] + for result in results: + if result is None or isinstance(result, dict | list): + response.append(result) + else: + response.append(str(result)) + return JSONResponse(content={"results": response}) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/sleep/api_router.py b/vllm/entrypoints/serve/sleep/api_router.py index bc01e185315c8..c0e4c3028b2ea 100644 --- a/vllm/entrypoints/serve/sleep/api_router.py +++ b/vllm/entrypoints/serve/sleep/api_router.py @@ -52,9 +52,5 @@ async def is_sleeping(raw_request: Request): def attach_router(app: FastAPI): if not envs.VLLM_SERVER_DEV_MODE: return - logger.warning( - "SECURITY WARNING: Development endpoints are enabled! " - "This should NOT be used in production!" - ) app.include_router(router) From 4bf6c2366818a1eeae257e06ec337039e6895f13 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 17 Dec 2025 02:30:56 -0800 Subject: [PATCH 36/49] [ci] Sync test areas yaml file with test-pipeline (#30862) Signed-off-by: Kevin H. 
Luu --- .buildkite/test_areas/e2e_integration.yaml | 19 +------------------ .buildkite/test_areas/lm_eval.yaml | 4 ++-- .buildkite/test_areas/lora.yaml | 2 ++ .buildkite/test_areas/models_basic.yaml | 2 ++ .buildkite/test_areas/pytorch.yaml | 4 +++- 5 files changed, 10 insertions(+), 21 deletions(-) diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index 93d389815edac..2e0857986c3fa 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -32,6 +32,7 @@ steps: - label: Prime-RL Integration (2 GPUs) timeout_in_minutes: 30 optional: true + soft_fail: true num_gpus: 2 working_dir: "/vllm-workspace" source_file_dependencies: @@ -39,21 +40,3 @@ steps: - .buildkite/scripts/run-prime-rl-test.sh commands: - bash .buildkite/scripts/run-prime-rl-test.sh - -- label: DeepSeek V2-Lite Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index 9af43e0c375a8..e2498512bdef7 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -9,7 +9,7 @@ steps: - vllm/model_executor/layers/quantization autorun_on_main: true commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: LM Eval Large Models (4 GPUs)(A100) gpu: a100 @@ -43,4 +43,4 @@ steps: - 
csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index 809b4138f44ba..59ade40cc8f52 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -22,6 +22,8 @@ steps: # FIXIT: find out which code initialize cuda before running the test # before the fix, we need to use spawn to test it - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # Alot of these tests are on the edge of OOMing + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # There is some Tensor Parallelism related processing logic in LoRA that # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index 39a5d51c48833..2a86596a6d603 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -9,6 +9,7 @@ steps: source_file_dependencies: - vllm/ - tests/models/test_initialization.py + - tests/models/registry.py commands: # Run a subset of model initialization tests - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset @@ -20,6 +21,7 @@ steps: source_file_dependencies: - vllm/model_executor/models/ - tests/models/test_initialization.py + - tests/models/registry.py commands: # Only when vLLM model source is modified - test initialization of a large # subset of supported models (the complement of the small subset in the above diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index 703c82eb1a91b..332d5202d8338 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -13,7 +13,9 @@ steps: # tests covered elsewhere. 
# Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Smoke Test timeout_in_minutes: 30 From 84896fda22d3de74398a88b5769c98eef14258f1 Mon Sep 17 00:00:00 2001 From: baoqian426 <1354987947@qq.com> Date: Wed, 17 Dec 2025 19:32:34 +0800 Subject: [PATCH 37/49] [Bugfix] deepseek-V3.2 self.weights_proj has no bias (#30841) Signed-off-by: baoqian <1354987947@qq.com> Signed-off-by: baoqian426 <1354987947@qq.com> --- vllm/model_executor/models/deepseek_v2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 146124153c79d..6670143cda250 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -835,7 +835,11 @@ class Indexer(nn.Module): ) self.k_norm = LayerNorm(self.head_dim, eps=1e-6) self.weights_proj = ReplicatedLinear( - hidden_size, self.n_head, quant_config=None, prefix=f"{prefix}.weights_proj" + hidden_size, + self.n_head, + bias=False, + quant_config=None, + prefix=f"{prefix}.weights_proj", ) self.softmax_scale = self.head_dim**-0.5 From fb980eb2fdd15f81d4c5695347bdea308bb5515e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:33:50 +0000 Subject: [PATCH 38/49] Fix lazy import (#30858) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/structured_output/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py 
index cb5ad99cfbdf7..74df0fa067670 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -20,9 +20,9 @@ from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput if TYPE_CHECKING: import outlines_core as oc + import transformers.convert_slow_tokenizer as convert_slow_tokenizer import transformers.file_utils as file_utils import xgrammar as xgr - from transformers.convert_slow_tokenizer import bytes_to_unicode from vllm.tokenizers import TokenizerLike from vllm.v1.worker.gpu_input_batch import InputBatch @@ -30,8 +30,8 @@ else: xgr = LazyLoader("xgr", globals(), "xgrammar") oc = LazyLoader("oc", globals(), "outlines_core") file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils") - bytes_to_unicode = LazyLoader( - "bytes_to_unicode", globals(), "transformers.convert_slow_tokenizer" + convert_slow_tokenizer = LazyLoader( + "convert_slow_tokenizer", globals(), "transformers.convert_slow_tokenizer" ) TokenizerLike = object @@ -202,7 +202,9 @@ def _reduced_vocabulary( A Dict of token string -> equivalent token ids """ - unicode_to_bytes = {v: k for k, v in bytes_to_unicode().items()} + unicode_to_bytes = { + v: k for k, v in convert_slow_tokenizer.bytes_to_unicode().items() + } def convert_token_to_string(token: str) -> str: string = tokenizer.convert_tokens_to_string([token]) From 6482e3895baa483fb30227648aa4721f09699cba Mon Sep 17 00:00:00 2001 From: Hank_ <37239608+ILikeIneine@users.noreply.github.com> Date: Wed, 17 Dec 2025 19:58:16 +0800 Subject: [PATCH 39/49] chores: adjust the attn register param order (#30688) Signed-off-by: Hank --- vllm/attention/backends/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index eaa0fa1d5db39..ed0021db204ac 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -201,8 +201,8 @@ _MAMBA_ATTN_OVERRIDES: 
dict[MambaAttentionBackendEnum, str] = {} def register_backend( backend: AttentionBackendEnum | MambaAttentionBackendEnum, - is_mamba: bool = False, class_path: str | None = None, + is_mamba: bool = False, ) -> Callable[[type], type]: """Register or override a backend implementation. From 6e9dbcc50e35af75ec76bf033ee6402697c02609 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 17 Dec 2025 03:58:43 -0800 Subject: [PATCH 40/49] [Fix] uniform decode batch check (#30747) Signed-off-by: Jialin Ouyang --- tests/v1/worker/test_gpu_model_runner.py | 84 ++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 45 ++++++++++--- 2 files changed, 121 insertions(+), 8 deletions(-) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 7b8c4268a5237..59f1ac705829f 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1110,3 +1110,87 @@ def test_hybrid_cache_integration(model_runner, dist_init): runner._update_states(scheduler_output) assert _is_req_scheduled(runner, req_id) assert _is_req_state_block_table_match(runner, req_id) + + +def test_is_uniform_decode() -> None: + # Normal + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=2, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=15, + ) + # Spec decoding + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=5, + uniform_decode_query_len=5, + num_tokens=30, + num_reqs=6, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=5, + uniform_decode_query_len=4, + num_tokens=30, + num_reqs=6, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=5, 
+ uniform_decode_query_len=5, + num_tokens=30, + num_reqs=7, + ) + # Force uniform decode + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + force_uniform_decode=True, + ) + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=2, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + force_uniform_decode=True, + ) + assert GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=15, + force_uniform_decode=True, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + force_uniform_decode=False, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=2, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=16, + force_uniform_decode=False, + ) + assert not GPUModelRunner._is_uniform_decode( + max_num_scheduled_tokens=1, + uniform_decode_query_len=1, + num_tokens=16, + num_reqs=15, + force_uniform_decode=False, + ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1db5bc99fff6c..a44150432434b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2777,6 +2777,27 @@ class GPUModelRunner( **model_kwargs, ) + @staticmethod + def _is_uniform_decode( + max_num_scheduled_tokens: int, + uniform_decode_query_len: int, + num_tokens: int, + num_reqs: int, + force_uniform_decode: bool | None = None, + ) -> bool: + """ + Checks if it's a decode batch with same amount scheduled tokens + across all requests. 
+ """ + return ( + ( + (max_num_scheduled_tokens == uniform_decode_query_len) + and (num_tokens == max_num_scheduled_tokens * num_reqs) + ) + if force_uniform_decode is None + else force_uniform_decode + ) + def _determine_batch_execution_and_padding( self, num_tokens: int, @@ -2798,14 +2819,12 @@ class GPUModelRunner( torch.Tensor | None, CUDAGraphStat | None, ]: - num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens) - uniform_decode = ( - ( - (max_num_scheduled_tokens == self.uniform_decode_query_len) - and (num_tokens_padded == max_num_scheduled_tokens * num_reqs) - ) - if force_uniform_decode is None - else force_uniform_decode + uniform_decode = self._is_uniform_decode( + max_num_scheduled_tokens=max_num_scheduled_tokens, + uniform_decode_query_len=self.uniform_decode_query_len, + num_tokens=num_tokens, + num_reqs=num_reqs, + force_uniform_decode=force_uniform_decode, ) # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output # is present). Also, chunked-prefill is disabled, so batch are uniform. 
@@ -2819,6 +2838,7 @@ class GPUModelRunner( else force_has_lora ) + num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens) dispatch_cudagraph = ( lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch( num_tokens=num_tokens, @@ -2834,6 +2854,15 @@ class GPUModelRunner( num_tokens_padded, use_cascade_attn or has_encoder_output ) num_tokens_padded = batch_descriptor.num_tokens + if self.compilation_config.pass_config.enable_sp: + assert ( + batch_descriptor.num_tokens + % self.vllm_config.parallel_config.tensor_parallel_size + == 0 + ), ( + "Sequence parallelism requires num_tokens to be " + "a multiple of tensor parallel size" + ) # Extra coordination when running data-parallel since we need to coordinate # across ranks From 9e67c4ce985b0b8852603cfe3fcaf8f37de137ed Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 17 Dec 2025 20:14:45 +0800 Subject: [PATCH 41/49] [Docs] fix function name (#30748) Signed-off-by: rongfu.leng --- docs/design/plugin_system.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index b0ca2dad23d5b..0fd448c2153c3 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -109,7 +109,7 @@ Every plugin has three parts: - `init_device`: This function is called to set up the device for the worker. - `initialize_cache`: This function is called to set cache config for the worker. - `load_model`: This function is called to load the model weights to device. - - `get_kv_cache_spaces`: This function is called to generate the kv cache spaces for the model. + - `get_kv_cache_spec`: This function is called to generate the kv cache spec for the model. - `determine_available_memory`: This function is called to profiles the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs. 
- `initialize_from_config`: This function is called to allocate device KV cache with the specified kv_cache_config - `execute_model`: This function is called every step to inference the model. From b7b6a60aca0405b2d6b2ed6fd13853635f000b5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E9=91=AB=E5=B4=A7?= <50285788+SongDI911@users.noreply.github.com> Date: Wed, 17 Dec 2025 23:10:59 +0800 Subject: [PATCH 42/49] Adapt the old parameter enable_thinking in chat_template_kwargs (#30852) Signed-off-by: xinsong.gao <1418762819@qq.com> Co-authored-by: Chauncey --- vllm/reasoning/deepseek_v3_reasoning_parser.py | 2 ++ vllm/tokenizers/deepseek_v32.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py index 6604f70badbcf..4e6758586bf42 100644 --- a/vllm/reasoning/deepseek_v3_reasoning_parser.py +++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py @@ -26,6 +26,8 @@ class DeepSeekV3ReasoningParser(ReasoningParser): chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {} thinking = bool(chat_kwargs.pop("thinking", False)) + enable_thinking = bool(chat_kwargs.pop("enable_thinking", False)) + thinking = thinking or enable_thinking if thinking: self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs) diff --git a/vllm/tokenizers/deepseek_v32.py b/vllm/tokenizers/deepseek_v32.py index bf279a5cf67c5..d519b61ddb76d 100644 --- a/vllm/tokenizers/deepseek_v32.py +++ b/vllm/tokenizers/deepseek_v32.py @@ -50,6 +50,8 @@ class DeepseekV32Tokenizer(CachedHfTokenizer): **kwargs, ) -> str | list[int]: thinking = kwargs.get("thinking", False) + enable_thinking = kwargs.get("enable_thinking", False) + thinking = thinking or enable_thinking thinking_mode = "thinking" if not thinking: thinking_mode = "chat" From 196cdc3224112df7f68c901fe4c5314875a65be8 Mon Sep 17 00:00:00 2001 From: KimHyemin <102578109+www-spam@users.noreply.github.com> Date: Thu, 18 Dec 2025 00:11:18 +0900 Subject: 
[PATCH 43/49] [Model] Gemma3: Support untied word embeddings (#30827) Signed-off-by: www-spam --- vllm/model_executor/models/gemma3.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 40f6d100c767e..70f72b5cb9beb 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -39,7 +39,10 @@ from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name, @@ -532,12 +535,20 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): super().__init__() self.config = config - # currently all existing Gemma models have `tie_word_embeddings` enabled - assert config.tie_word_embeddings self.quant_config = quant_config self.model = Gemma3Model( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) + + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + if config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + self.logits_processor = LogitsProcessor( config.vocab_size, soft_cap=config.final_logit_softcapping ) @@ -565,7 +576,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self, hidden_states: torch.Tensor, ) -> torch.Tensor | None: - logits = self.logits_processor(self.model.embed_tokens, hidden_states) + logits = self.logits_processor(self.lm_head, 
hidden_states) return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: From 2497228ad4427310bc55427f6db404a00de4fd78 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 17 Dec 2025 23:32:17 +0800 Subject: [PATCH 44/49] [Chore] Factor out logic for requesting initial memory (#30868) Signed-off-by: DarkLight1337 --- vllm/utils/mem_utils.py | 31 +++++++++++++++++++++++++++---- vllm/v1/worker/gpu_worker.py | 20 ++++---------------- vllm/v1/worker/utils.py | 26 +++++++++++++++++++++++++- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py index e2517b935bf28..bf6d7846573b9 100644 --- a/vllm/utils/mem_utils.py +++ b/vllm/utils/mem_utils.py @@ -66,27 +66,43 @@ class MemorySnapshot: torch_memory: int = 0 non_torch_memory: int = 0 timestamp: float = 0.0 + + device: torch.types.Device = None auto_measure: bool = True def __post_init__(self) -> None: + if self.device is None: + from vllm.platforms import current_platform + + device_fn = current_platform.current_device + assert device_fn is not None + self.device_ = torch.device(device_fn()) + else: + self.device_ = torch.device(self.device) + if self.auto_measure: self.measure() def measure(self) -> None: from vllm.platforms import current_platform + device = self.device_ + # we measure the torch peak memory usage via allocated_bytes, # rather than `torch.cuda.memory_reserved()` . # After `torch.cuda.reset_peak_memory_stats()`, # `torch.cuda.memory_reserved()` will keep growing, and only shrink # when we call `torch.cuda.empty_cache()` or OOM happens. 
- self.torch_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0) + self.torch_peak = torch.cuda.memory_stats(device).get( + "allocated_bytes.all.peak", 0 + ) - self.free_memory, self.total_memory = torch.cuda.mem_get_info() + self.free_memory, self.total_memory = torch.cuda.mem_get_info(device) shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark if ( current_platform.is_cuda() - and current_platform.get_device_capability() in shared_sysmem_device_mem_sms + and current_platform.get_device_capability(device.index) + in shared_sysmem_device_mem_sms ): # On UMA (Orin, Thor and Spark) platform, # where both CPU and GPU rely on system memory, @@ -106,12 +122,18 @@ class MemorySnapshot: # torch.cuda.memory_reserved() is how many bytes # PyTorch gets from cuda (by calling cudaMalloc, etc.) # this is used to measure the non-torch memory usage - self.torch_memory = torch.cuda.memory_reserved() + self.torch_memory = torch.cuda.memory_reserved(device) self.non_torch_memory = self.cuda_memory - self.torch_memory self.timestamp = time.time() def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": + if self.device_ != other.device_: + raise ValueError( + "The two snapshots should be from the same device! " + f"Found: {self.device_} vs. 
{other.device_}" + ) + return MemorySnapshot( torch_peak=self.torch_peak - other.torch_peak, free_memory=self.free_memory - other.free_memory, @@ -120,6 +142,7 @@ class MemorySnapshot: torch_memory=self.torch_memory - other.torch_memory, non_torch_memory=self.non_torch_memory - other.non_torch_memory, timestamp=self.timestamp - other.timestamp, + device=self.device_, auto_measure=False, ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 1e13650cd083e..bc71351d2cc55 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -56,6 +56,8 @@ from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.worker_base import WorkerBase from vllm.v1.worker.workspace import init_workspace_manager +from .utils import request_memory + logger = init_logger(__name__) if TYPE_CHECKING: @@ -237,22 +239,8 @@ class Worker(WorkerBase): torch.cuda.empty_cache() # take current memory snapshot - self.init_snapshot = MemorySnapshot() - self.requested_memory = ( - self.init_snapshot.total_memory - * self.cache_config.gpu_memory_utilization - ) - if self.init_snapshot.free_memory < self.requested_memory: - GiB = lambda b: round(b / GiB_bytes, 2) - raise ValueError( - f"Free memory on device " - f"({GiB(self.init_snapshot.free_memory)}/" - f"{GiB(self.init_snapshot.total_memory)} GiB) on startup " - f"is less than desired GPU memory utilization " - f"({self.cache_config.gpu_memory_utilization}, " - f"{GiB(self.requested_memory)} GiB). Decrease GPU memory " - f"utilization or reduce GPU memory used by other processes." 
- ) + self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) + self.requested_memory = request_memory(init_snapshot, self.cache_config) else: raise RuntimeError(f"Not support device type: {self.device_config.device}") diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 2e8afec024ce9..31ccf7f157468 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -8,13 +8,15 @@ from typing_extensions import deprecated from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_utils import MemorySnapshot from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec @@ -248,6 +250,28 @@ def gather_mm_placeholders( return placeholders[is_embed] +def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float: + """ + Calculate the amount of memory required by vLLM, then validate + that the current amount of free memory is sufficient for that. 
+ """ + requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization + + if init_snapshot.free_memory < requested_memory: + GiB = lambda b: round(b / GiB_bytes, 2) + raise ValueError( + f"Free memory on device {init_snapshot.device_} " + f"({GiB(init_snapshot.free_memory)}/" + f"{GiB(init_snapshot.total_memory)} GiB) on startup " + f"is less than desired GPU memory utilization " + f"({cache_config.gpu_memory_utilization}, " + f"{GiB(requested_memory)} GiB). Decrease GPU memory " + f"utilization or reduce GPU memory used by other processes." + ) + + return requested_memory + + def add_kv_sharing_layers_to_kv_cache_groups( shared_kv_cache_layers: dict[str, str], kv_cache_groups: list[KVCacheGroupSpec], From 9ca8cb38fd68142627c9649756f1ddc5432c8b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 17 Dec 2025 18:49:56 +0100 Subject: [PATCH 45/49] [CI][Bugfix] Fix flaky `tests/entrypoints/openai/test_audio.py::test_chat_streaming_audio` (#30878) Signed-off-by: NickLucche --- tests/entrypoints/openai/test_audio.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index a2d8993441fcd..4cf864bdb2de9 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -254,7 +254,9 @@ async def test_single_chat_session_input_audio( async def test_chat_streaming_audio( client: openai.AsyncOpenAI, model_name: str, audio_url: str ): - messages = dummy_messages_from_audio_url(audio_url) + messages = dummy_messages_from_audio_url( + audio_url, "What's a short title for this audio?" 
+ ) # test single completion chat_completion = await client.chat.completions.create( From 7eb6cb6c18a948fb49824154cb3ece1e32d12cf8 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 17 Dec 2025 12:49:59 -0500 Subject: [PATCH 46/49] [Attention] Update tests to remove deprecated env vars (#30563) Signed-off-by: Matthew Bonanni --- .../scripts/hardware_ci/run-xpu-test.sh | 2 +- .../test_basic_correctness.py | 85 +++++------ tests/compile/distributed/test_fusions_e2e.py | 9 +- .../fullgraph/test_basic_correctness.py | 82 ++++++----- .../compile/fullgraph/test_full_cudagraph.py | 13 +- tests/compile/fullgraph/test_full_graph.py | 7 +- tests/distributed/test_context_parallel.py | 4 +- tests/distributed/test_pp_cudagraph.py | 26 ++-- tests/engine/test_arg_utils.py | 135 +++++++++++++++++- tests/entrypoints/openai/test_serving_chat.py | 13 +- .../attention/test_attention_selector.py | 52 +++---- .../attention/test_rocm_attention_selector.py | 60 +++++--- tests/kernels/test_flex_attention.py | 95 ++++++------ .../generation/test_granite_speech.py | 12 +- tests/models/multimodal/pooling/conftest.py | 24 ++-- .../models/multimodal/pooling/test_siglip.py | 8 ++ tests/models/quantization/test_fp8.py | 3 +- tests/models/test_initialization.py | 12 +- .../test_rocm_attention_backends_selection.py | 12 +- tests/v1/attention/utils.py | 47 +++--- tests/v1/cudagraph/test_cudagraph_mode.py | 33 +---- tests/v1/determinism/test_batch_invariance.py | 25 ++-- .../test_online_batch_invariance.py | 5 +- tests/v1/e2e/test_async_scheduling.py | 22 +-- tests/v1/e2e/test_cascade_attention.py | 29 ++-- tests/v1/e2e/test_spec_decode.py | 43 +++--- .../nixl_integration/run_accuracy_test.sh | 22 ++- .../tp_config_sweep_accuracy_test.sh | 12 +- .../kv_connector/unit/test_nixl_connector.py | 6 +- tests/v1/kv_connector/unit/utils.py | 4 + tests/v1/kv_offload/test_cpu_offloading.py | 15 +- tests/v1/spec_decode/test_eagle.py | 19 ++- tests/v1/spec_decode/test_max_len.py | 89 ++++++------ 
vllm/v1/attention/backends/rocm_attn.py | 2 +- 34 files changed, 580 insertions(+), 447 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index dfc9db512d1e9..85b554e5e8646 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -39,7 +39,7 @@ docker run \ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager - VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN cd tests pytest -v -s v1/core pytest -v -s v1/engine diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 9e1cc309edd1d..68b5cd5101d5d 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -67,7 +67,6 @@ def _fix_prompt_embed_outputs( @pytest.mark.parametrize("model_executor", ["uni", "mp"]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) def test_models( - monkeypatch: pytest.MonkeyPatch, hf_runner, model: str, backend: str, @@ -77,48 +76,46 @@ def test_models( model_executor: str, enable_prompt_embeds: bool, ) -> None: - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", backend) + # 5042 tokens for gemma2 + # gemma2 has alternating sliding window size of 4096 + # we need a prompt with more than 4096 
tokens to test the sliding window + prompt = ( + "The following numbers of the sequence " + + ", ".join(str(i) for i in range(1024)) + + " are:" + ) + example_prompts = [prompt] - # 5042 tokens for gemma2 - # gemma2 has alternating sliding window size of 4096 - # we need a prompt with more than 4096 tokens to test the sliding window - prompt = ( - "The following numbers of the sequence " - + ", ".join(str(i) for i in range(1024)) - + " are:" - ) - example_prompts = [prompt] + with hf_runner(model) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + if enable_prompt_embeds: + with torch.no_grad(): + prompt_embeds = hf_model.get_prompt_embeddings(example_prompts) - with hf_runner(model) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - if enable_prompt_embeds: - with torch.no_grad(): - prompt_embeds = hf_model.get_prompt_embeddings(example_prompts) + with VllmRunner( + model, + max_model_len=8192, + enforce_eager=enforce_eager, + enable_prompt_embeds=enable_prompt_embeds, + gpu_memory_utilization=0.7, + async_scheduling=async_scheduling, + distributed_executor_backend=model_executor, + attention_config={"backend": backend}, + ) as vllm_model: + if enable_prompt_embeds: + vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens) + vllm_outputs = _fix_prompt_embed_outputs( + vllm_outputs, hf_model, example_prompts + ) + else: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - with VllmRunner( - model, - max_model_len=8192, - enforce_eager=enforce_eager, - enable_prompt_embeds=enable_prompt_embeds, - gpu_memory_utilization=0.7, - async_scheduling=async_scheduling, - distributed_executor_backend=model_executor, - ) as vllm_model: - if enable_prompt_embeds: - vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens) - vllm_outputs = _fix_prompt_embed_outputs( - vllm_outputs, hf_model, example_prompts - ) - else: - vllm_outputs = 
vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -161,12 +158,6 @@ def test_models_distributed( ): # noqa pytest.skip("enable_prompt_embeds does not work with ray compiled dag.") - if attention_backend: - monkeypatch_context.setenv( - "VLLM_ATTENTION_BACKEND", - attention_backend, - ) - for k, v in extra_env.items(): monkeypatch_context.setenv(k, v) @@ -178,6 +169,7 @@ def test_models_distributed( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method # (the default method). + attention_config = {"backend": attention_backend} if attention_backend else None with vllm_runner( model, dtype=dtype, @@ -185,6 +177,7 @@ def test_models_distributed( distributed_executor_backend=distributed_executor_backend, enable_prompt_embeds=enable_prompt_embeds, gpu_memory_utilization=0.7, + attention_config=attention_config, ) as vllm_model: if enable_prompt_embeds: with hf_runner(model, dtype=dtype) as hf_model: diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 960b5b4bd7ad4..28ab2cee71a6a 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -208,7 +208,8 @@ def test_attn_quant( # To capture subprocess logs, we need to know whether spawn or fork is used. # Force spawn as it is more general. 
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + model_kwargs["attention_config"] = {"backend": backend.name} compilation_config = CompilationConfig( # Testing properties @@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm( # To capture subprocess logs, we need to know whether spawn or fork is used. # Force spawn as it is more general. monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + model_kwargs["attention_config"] = {"backend": backend.name} compilation_config = CompilationConfig( # Testing properties @@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp( # To capture subprocess logs, we need to know whether spawn or fork is used. # Force spawn as it is more general. monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + model_kwargs["attention_config"] = {"backend": backend.name} compilation_config = CompilationConfig( # Testing properties diff --git a/tests/compile/fullgraph/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py index f2e58b5cc423e..d062ed221ff59 100644 --- a/tests/compile/fullgraph/test_basic_correctness.py +++ b/tests/compile/fullgraph/test_basic_correctness.py @@ -89,7 +89,6 @@ class TestSetting: ], ) def test_compile_correctness( - monkeypatch: pytest.MonkeyPatch, test_setting: TestSetting, ): # this test is run under multiple suits, with different GPUs. 
@@ -107,49 +106,48 @@ def test_compile_correctness( f"{cuda_device_count_stateless()}" ) - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - final_args = [ - *model_args, - "-pp", - str(pp_size), - "-tp", - str(tp_size), - "-cc.cudagraph_mode=none", - ] + final_args = [ + *model_args, + "-pp", + str(pp_size), + "-tp", + str(tp_size), + "-cc.cudagraph_mode=none", + f"--attention-backend={attn_backend}", + ] - all_args: list[list[str]] = [] - all_envs: list[dict[str, str] | None] = [] + all_args: list[list[str]] = [] + all_envs: list[dict[str, str] | None] = [] - for comp_mode in [ - CompilationMode.STOCK_TORCH_COMPILE, - CompilationMode.DYNAMO_TRACE_ONCE, - CompilationMode.VLLM_COMPILE, - ]: - for mode in [CompilationMode.NONE, comp_mode]: - all_args.append( - final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"] - ) - - # inductor will change the output, so we only compare if the output - # is close, not exactly the same. - compare_all_settings( - model, - all_args, - all_envs, - method=method if method != "generate" else "generate_close", + for comp_mode in [ + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, + ]: + for mode in [CompilationMode.NONE, comp_mode]: + all_args.append( + final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"] ) - all_envs.clear() - all_args.clear() - for mode in [ - CompilationMode.NONE, - CompilationMode.STOCK_TORCH_COMPILE, - CompilationMode.DYNAMO_TRACE_ONCE, - CompilationMode.VLLM_COMPILE, - ]: - all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"]) - all_envs.append({}) - all_envs.append({}) + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. 
+ compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close", + ) + all_envs.clear() + all_args.clear() - compare_all_settings(model, all_args * 3, all_envs, method=method) + for mode in [ + CompilationMode.NONE, + CompilationMode.STOCK_TORCH_COMPILE, + CompilationMode.DYNAMO_TRACE_ONCE, + CompilationMode.VLLM_COMPILE, + ]: + all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"]) + all_envs.append({}) + all_envs.append({}) + + compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/fullgraph/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py index c6d4b5272dbcf..4ce6abfe3e46d 100644 --- a/tests/compile/fullgraph/test_full_cudagraph.py +++ b/tests/compile/fullgraph/test_full_cudagraph.py @@ -74,7 +74,6 @@ def llm_pair(request): # Force native sampler to avoid potential nondeterminism in FlashInfer # when per-request generators are not used in V1. "VLLM_USE_FLASHINFER_SAMPLER": "0", - **backend_config.env_vars, } with temporary_environ(env_vars): full = LLM( @@ -170,16 +169,10 @@ class TestFullCUDAGraph: @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") def test_full_cudagraph_with_invalid_backend(): - with ( - temporary_environ( - { - "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION", - # Flex_Attention is not supported with full cuda graph - } - ), - pytest.raises(RuntimeError), - ): + # Flex_Attention is not supported with full cuda graph + with pytest.raises(RuntimeError): LLM( model="Qwen/Qwen2-1.5B-Instruct", compilation_config=CompilationConfig(cudagraph_mode="FULL"), + attention_config={"backend": "FLEX_ATTENTION"}, ) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 3cd1d4be2ebdc..22af2d57f4f3d 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -197,20 +197,19 @@ def 
test_custom_compile_config( ], ) def test_fp8_kv_scale_compile( - monkeypatch: pytest.MonkeyPatch, compilation_mode: int, model: str, backend: AttentionBackendEnum | None, ): - if backend: - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) - model_kwargs = { "quantization": "fp8", "kv_cache_dtype": "fp8_e4m3", "calculate_kv_scales": True, "max_model_len": 512, } + if backend: + model_kwargs["attention_config"] = {"backend": backend.name} + run_model(compilation_mode, model, **model_kwargs) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index aa47f28a34dd5..a286309217719 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -219,14 +219,12 @@ def _test_cp_gsm8k( ] ) - server_env = {} if attn_backend: - server_env["VLLM_ATTENTION_BACKEND"] = attn_backend + server_args.append(f"--attention-backend={attn_backend}") with RemoteOpenAIServer( model_id, server_args, - env_dict=server_env, max_wait_seconds=720, ) as remote_server: host = f"http://{remote_server.host}" diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 2f2b43cb4cc2b..34ae305c2d2c1 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -20,23 +20,21 @@ from ..utils import compare_two_settings, create_new_process_for_each_test ) @create_new_process_for_each_test() def test_pp_cudagraph( - monkeypatch: pytest.MonkeyPatch, PP_SIZE: int, MODEL_NAME: str, ATTN_BACKEND: LiteralString, ): - with monkeypatch.context() as m: - cudagraph_args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--pipeline-parallel-size", - str(PP_SIZE), - "--distributed-executor-backend", - "mp", - ] - m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND) + cudagraph_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + 
"--pipeline-parallel-size", + str(PP_SIZE), + "--distributed-executor-backend", + "mp", + f"--attention-backend={ATTN_BACKEND}", + ] - eager_args = cudagraph_args + ["--enforce-eager"] + eager_args = cudagraph_args + ["--enforce-eager"] - compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) + compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index c2cf77ffa12b6..25a5e00cc0e16 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -9,7 +9,7 @@ from typing import Annotated, Literal import pytest -from vllm.config import CompilationConfig, config +from vllm.config import AttentionConfig, CompilationConfig, config from vllm.engine.arg_utils import ( EngineArgs, contains_type, @@ -298,6 +298,139 @@ def test_compilation_config(): ) +def test_attention_config(): + from vllm.attention.backends.registry import AttentionBackendEnum + + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + + # default value + args = parser.parse_args([]) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_config == AttentionConfig() + + # set backend via dot notation + args = parser.parse_args(["--attention-config.backend", "FLASH_ATTN"]) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_config.backend is not None + assert engine_args.attention_config.backend.name == "FLASH_ATTN" + + # set backend via --attention-backend shorthand + args = parser.parse_args(["--attention-backend", "FLASHINFER"]) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_backend is not None + assert engine_args.attention_backend == "FLASHINFER" + + # set all fields via dot notation + args = parser.parse_args( + [ + "--attention-config.backend", + "FLASH_ATTN", + "--attention-config.flash_attn_version", + "3", + 
"--attention-config.use_prefill_decode_attention", + "true", + "--attention-config.flash_attn_max_num_splits_for_cuda_graph", + "16", + "--attention-config.use_cudnn_prefill", + "true", + "--attention-config.use_trtllm_ragged_deepseek_prefill", + "true", + "--attention-config.use_trtllm_attention", + "true", + "--attention-config.disable_flashinfer_prefill", + "true", + "--attention-config.disable_flashinfer_q_quantization", + "true", + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_config.backend is not None + assert engine_args.attention_config.backend.name == "FLASH_ATTN" + assert engine_args.attention_config.flash_attn_version == 3 + assert engine_args.attention_config.use_prefill_decode_attention is True + assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 16 + assert engine_args.attention_config.use_cudnn_prefill is True + assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is True + assert engine_args.attention_config.use_trtllm_attention is True + assert engine_args.attention_config.disable_flashinfer_prefill is True + assert engine_args.attention_config.disable_flashinfer_q_quantization is True + + # set to string form of a dict with all fields + args = parser.parse_args( + [ + "--attention-config=" + '{"backend": "FLASHINFER", "flash_attn_version": 2, ' + '"use_prefill_decode_attention": false, ' + '"flash_attn_max_num_splits_for_cuda_graph": 8, ' + '"use_cudnn_prefill": false, ' + '"use_trtllm_ragged_deepseek_prefill": false, ' + '"use_trtllm_attention": false, ' + '"disable_flashinfer_prefill": false, ' + '"disable_flashinfer_q_quantization": false}', + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + assert engine_args.attention_config.backend is not None + assert engine_args.attention_config.backend.name == "FLASHINFER" + assert engine_args.attention_config.flash_attn_version == 2 + assert 
engine_args.attention_config.use_prefill_decode_attention is False + assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 8 + assert engine_args.attention_config.use_cudnn_prefill is False + assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is False + assert engine_args.attention_config.use_trtllm_attention is False + assert engine_args.attention_config.disable_flashinfer_prefill is False + assert engine_args.attention_config.disable_flashinfer_q_quantization is False + + # test --attention-backend flows into VllmConfig.attention_config + args = parser.parse_args( + [ + "--model", + "facebook/opt-125m", + "--attention-backend", + "FLASH_ATTN", + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + vllm_config = engine_args.create_engine_config() + assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASH_ATTN + + # test --attention-config.backend flows into VllmConfig.attention_config + args = parser.parse_args( + [ + "--model", + "facebook/opt-125m", + "--attention-config.backend", + "FLASHINFER", + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + vllm_config = engine_args.create_engine_config() + assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASHINFER + + # test --attention-backend and --attention-config.backend are mutually exclusive + args = parser.parse_args( + [ + "--model", + "facebook/opt-125m", + "--attention-backend", + "FLASH_ATTN", + "--attention-config.backend", + "FLASHINFER", + ] + ) + assert args is not None + engine_args = EngineArgs.from_cli_args(args) + with pytest.raises(ValueError, match="mutually exclusive"): + engine_args.create_engine_config() + + def test_prefix_cache_default(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) args = parser.parse_args([]) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 
444275e061c61..2befa40d636da 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -76,15 +76,10 @@ def default_server_args(with_tool_parser: bool): @pytest.fixture(scope="module") -def gptoss_server( - monkeypatch_module: pytest.MonkeyPatch, default_server_args: list[str] -): - with monkeypatch_module.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") - with RemoteOpenAIServer( - GPT_OSS_MODEL_NAME, default_server_args - ) as remote_server: - yield remote_server +def gptoss_server(default_server_args: list[str]): + server_args = default_server_args + ["--attention-backend=TRITON_ATTN"] + with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, server_args) as remote_server: + yield remote_server @pytest_asyncio.fixture diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index c959b2f4bb03c..d62acc2022d10 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -6,7 +6,9 @@ from unittest.mock import patch import pytest import torch +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform @@ -73,18 +75,18 @@ def generate_params(): @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params()) -def test_env( +def test_backend_selection( device: str, name: str, use_mla: bool, block_size: int, - monkeypatch: pytest.MonkeyPatch, ): """Test attention backend selection with valid device-backend pairs.""" - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", name) - m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0") + # Create AttentionConfig with 
the specified backend + attention_config = AttentionConfig(backend=AttentionBackendEnum[name]) + vllm_config = VllmConfig(attention_config=attention_config) + with set_current_vllm_config(vllm_config): if device == "cpu": with patch("vllm.platforms.current_platform", CpuPlatform()): backend = get_attn_backend(16, torch.float16, None, block_size) @@ -217,27 +219,32 @@ def test_env( @pytest.mark.parametrize("device", ["cpu", "cuda"]) def test_fp32_fallback(device: str): """Test attention backend selection with fp32.""" - if device == "cpu": - with patch("vllm.platforms.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16) - assert backend.get_name() == "CPU_ATTN" + # Use default config (no backend specified) + vllm_config = VllmConfig() - elif device == "cuda": - with patch("vllm.platforms.current_platform", CudaPlatform()): - backend = get_attn_backend(16, torch.float32, None, 16) - assert backend.get_name() == "FLEX_ATTENTION" + with set_current_vllm_config(vllm_config): + if device == "cpu": + with patch("vllm.platforms.current_platform", CpuPlatform()): + backend = get_attn_backend(16, torch.float32, None, 16) + assert backend.get_name() == "CPU_ATTN" + + elif device == "cuda": + with patch("vllm.platforms.current_platform", CudaPlatform()): + backend = get_attn_backend(16, torch.float32, None, 16) + assert backend.get_name() == "FLEX_ATTENTION" def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" pytest.skip( "Skipping as current backend selector does not " - "handle fallbacks when a backend is set via env var." + "handle fallbacks when a backend is explicitly set." 
) - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN") + attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN) + vllm_config = VllmConfig(attention_config=attention_config) + with set_current_vllm_config(vllm_config): # Unsupported CUDA arch monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5)) backend = get_attn_backend(16, torch.float16, None, 16) @@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch): assert backend.get_name() != "FLASH_ATTN" -def test_invalid_env(monkeypatch: pytest.MonkeyPatch): +def test_invalid_backend(): """Test that invalid attention backend names raise ValueError.""" with ( - monkeypatch.context() as m, - patch("vllm.platforms.current_platform", CudaPlatform()), + pytest.raises(ValueError), ): - m.setenv("VLLM_ATTENTION_BACKEND", "INVALID") - - # Should raise ValueError for invalid backend - with pytest.raises(ValueError) as exc_info: - get_attn_backend(32, torch.float16, None, 16) - assert "Invalid value 'INVALID'" in str(exc_info.value) + # Invalid backend name should raise ValueError when creating enum + AttentionConfig(backend=AttentionBackendEnum["INVALID"]) diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index b61058081c0b2..f97d475eb47d7 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -4,7 +4,9 @@ import pytest import torch +from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config from vllm.platforms.rocm import RocmPlatform @@ -16,40 +18,56 @@ def clear_cache(): @pytest.mark.skip(reason="Skipped for now. 
Should be revisited.") def test_selector(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_ATTN") + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform()) - # Set the current platform to ROCm using monkeypatch - monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform()) + # Test standard ROCm attention + attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN) + vllm_config = VllmConfig(attention_config=attention_config) - # Test standard ROCm attention + with set_current_vllm_config(vllm_config): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN" - # MLA test for deepseek related + # MLA test for deepseek related + # Change the attention backend to triton MLA + attention_config = AttentionConfig(backend=AttentionBackendEnum.TRITON_MLA) + vllm_config = VllmConfig(attention_config=attention_config) - # change the attention backend to triton MLA - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_MLA") + with set_current_vllm_config(vllm_config): backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True) assert backend.get_name() == "TRITON_MLA" - # If attention backend is None - # If use_mla is true - # The selected backend is triton MLA - m.setenv("VLLM_ATTENTION_BACKEND", "") + # If attention backend is None + # If use_mla is true + # The selected backend is triton MLA + attention_config = AttentionConfig(backend=None) + vllm_config = VllmConfig(attention_config=attention_config) + + with set_current_vllm_config(vllm_config): backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True) assert backend.get_name() == "TRITON_MLA" - # change the attention backend to AITER MLA - m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_MLA") + # Change the attention 
backend to AITER MLA + attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_AITER_MLA) + vllm_config = VllmConfig(attention_config=attention_config) + + with set_current_vllm_config(vllm_config): backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True) assert backend.get_name() == "ROCM_AITER_MLA" - # If attention backend is None - # If use_mla is true - # If VLLM_ROCM_USE_AITER is enabled - # The selected backend is ROCM_AITER_MLA - m.setenv("VLLM_ATTENTION_BACKEND", "") + # If attention backend is None + # If use_mla is true + # If VLLM_ROCM_USE_AITER is enabled + # The selected backend is ROCM_AITER_MLA + with monkeypatch.context() as m: m.setenv("VLLM_ROCM_USE_AITER", "1") - backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True) - assert backend.get_name() == "ROCM_AITER_MLA" + + attention_config = AttentionConfig(backend=None) + vllm_config = VllmConfig(attention_config=attention_config) + + with set_current_vllm_config(vllm_config): + backend = get_attn_backend( + 576, torch.bfloat16, "auto", 1, False, use_mla=True + ) + assert backend.get_name() == "ROCM_AITER_MLA" diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py index ae33f422d3732..f6987d54399d2 100644 --- a/tests/kernels/test_flex_attention.py +++ b/tests/kernels/test_flex_attention.py @@ -37,7 +37,7 @@ def set_seed(seed): not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, reason="CUDA not available or PyTorch version < 2.7", ) -def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): +def test_flex_attention_vs_default_backend(vllm_runner): """Test that FlexAttention produces the same outputs as the default backend. 
This test compares the outputs from the FlexAttention backend with @@ -54,35 +54,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ] # Run with flex attention - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") - - set_seed(seed) - with vllm_runner( - model_name, - runner="generate", - tensor_parallel_size=1, - num_gpu_blocks_override=128, - enforce_eager=True, - ) as llm_flex: - output_flex = llm_flex.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs - ) + set_seed(seed) + with vllm_runner( + model_name, + runner="generate", + tensor_parallel_size=1, + num_gpu_blocks_override=128, + enforce_eager=True, + attention_config={"backend": "FLEX_ATTENTION"}, + ) as llm_flex: + output_flex = llm_flex.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs + ) # Run with default backend - with monkeypatch.context() as m: - set_seed(seed) - with vllm_runner( - model_name, - runner="generate", - tensor_parallel_size=1, - num_gpu_blocks_override=128, - enforce_eager=True, - gpu_memory_utilization=0.85, - ) as llm_default: - output_default = llm_default.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs - ) + set_seed(seed) + with vllm_runner( + model_name, + runner="generate", + tensor_parallel_size=1, + num_gpu_blocks_override=128, + enforce_eager=True, + gpu_memory_utilization=0.85, + ) as llm_default: + output_default = llm_default.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs + ) check_logprobs_close( outputs_0_lst=output_flex, @@ -96,7 +93,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, reason="CUDA not available or PyTorch version < 2.7", ) -def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): +def test_encoder_flex_attention_vs_default_backend(vllm_runner): """Test that FlexAttention produces the same outputs as the default backend. 
This test compares the outputs from the FlexAttention backend with @@ -110,30 +107,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ] # Run with flex attention - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") - with vllm_runner( - model_name, - runner="pooling", - dtype=torch.bfloat16, - tensor_parallel_size=1, - max_model_len=100, - enforce_eager=True, - ) as llm_flex: - flex_outputs = llm_flex.embed(prompts) + with vllm_runner( + model_name, + runner="pooling", + dtype=torch.bfloat16, + tensor_parallel_size=1, + max_model_len=100, + enforce_eager=True, + attention_config={"backend": "FLEX_ATTENTION"}, + ) as llm_flex: + flex_outputs = llm_flex.embed(prompts) # Run with default backend - with ( - monkeypatch.context() as m, - vllm_runner( - model_name, - runner="pooling", - dtype=torch.bfloat16, - tensor_parallel_size=1, - max_model_len=100, - enforce_eager=True, - ) as llm_default, - ): + with vllm_runner( + model_name, + runner="pooling", + dtype=torch.bfloat16, + tensor_parallel_size=1, + max_model_len=100, + enforce_eager=True, + ) as llm_default: default_outputs = llm_default.embed(prompts) check_embeddings_close( diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index f528a993f8551..489743c5a29b3 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME models = [MODEL_NAME] -@pytest.fixture(autouse=True) -def set_attention_backend_for_rocm(monkeypatch): +@pytest.fixture +def granite_speech_attention_config(): + """Return attention config for Granite Speech tests on ROCm.""" if current_platform.is_rocm(): - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") + return {"backend": "TRITON_ATTN"} + return None def run_test( @@ -53,6 +55,7 @@ def run_test( num_logprobs: int, 
tensor_parallel_size: int, distributed_executor_backend: str | None = None, + attention_config: dict | None = None, ): """Inference result should be the same between hf and vllm. @@ -80,6 +83,7 @@ def run_test( enable_lora=True, max_lora_rank=64, enforce_eager=True, + attention_config=attention_config, ) as vllm_model: lora_request = LoRARequest("audio", 1, audio_lora_path) vllm_outputs_per_case = [ @@ -131,6 +135,7 @@ def test_models( vllm_runner, model: str, audio_assets: AudioTestAssets, + granite_speech_attention_config, dtype: str, max_model_len: int, max_tokens: int, @@ -157,4 +162,5 @@ def test_models( max_tokens=max_tokens, num_logprobs=num_logprobs, tensor_parallel_size=1, + attention_config=granite_speech_attention_config, ) diff --git a/tests/models/multimodal/pooling/conftest.py b/tests/models/multimodal/pooling/conftest.py index c5f40cb42ca2a..401bc39b4b109 100644 --- a/tests/models/multimodal/pooling/conftest.py +++ b/tests/models/multimodal/pooling/conftest.py @@ -2,23 +2,17 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Pytest configuration for vLLM pooling tests.""" -import os -import warnings +import pytest from vllm.platforms import current_platform -def pytest_collection_modifyitems(config, items): - """Set FLEX_ATTENTION backend for SigLIP tests on ROCm.""" - if not current_platform.is_rocm(): - return +@pytest.fixture +def siglip_attention_config(): + """Return attention config for SigLIP tests on ROCm. - siglip_tests = [item for item in items if "test_siglip" in item.nodeid] - - if siglip_tests: - os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION" - warnings.warn( - "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests", - UserWarning, - stacklevel=1, - ) + On ROCm, SigLIP tests require FLEX_ATTENTION backend. 
+ """ + if current_platform.is_rocm(): + return {"backend": "FLEX_ATTENTION"} + return None diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py index 72886cbf7f323..0b8cd33ccfb9d 100644 --- a/tests/models/multimodal/pooling/test_siglip.py +++ b/tests/models/multimodal/pooling/test_siglip.py @@ -38,6 +38,7 @@ def _run_test( *, dtype: str, tokenization_kwargs: dict[str, Any] | None = None, + attention_config: dict[str, Any] | None = None, ) -> None: if tokenization_kwargs is None: tokenization_kwargs = {} @@ -49,6 +50,7 @@ def _run_test( enforce_eager=True, max_model_len=64, gpu_memory_utilization=0.7, + attention_config=attention_config, ) as vllm_model: vllm_outputs = vllm_model.embed( input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs @@ -90,6 +92,7 @@ def test_models_text( hf_runner, vllm_runner, image_assets, + siglip_attention_config, model: str, dtype: str, ) -> None: @@ -108,6 +111,7 @@ def test_models_text( "padding": "max_length", "max_length": 64, }, # siglip2 was trained with this padding setting. 
+ attention_config=siglip_attention_config, ) @@ -117,6 +121,7 @@ def test_models_image( hf_runner, vllm_runner, image_assets, + siglip_attention_config, model: str, dtype: str, ) -> None: @@ -133,6 +138,7 @@ def test_models_image( input_images, model, dtype=dtype, + attention_config=siglip_attention_config, ) @@ -141,6 +147,7 @@ def test_models_image( def test_models_text_image_no_crash( vllm_runner, image_assets, + siglip_attention_config, model: str, dtype: str, ) -> None: @@ -154,6 +161,7 @@ def test_models_text_image_no_crash( enforce_eager=True, max_model_len=64, gpu_memory_utilization=0.7, + attention_config=siglip_attention_config, ) as vllm_model: with pytest.raises(ValueError, match="not both"): vllm_model.embed(texts, images=images) diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index 7dfedaf2799d4..f3b85ba0ee394 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -75,7 +75,6 @@ def test_models( with monkeypatch.context() as m: m.setenv("TOKENIZERS_PARALLELISM", "true") - m.setenv("VLLM_ATTENTION_BACKEND", backend) MAX_MODEL_LEN = 1024 NUM_LOG_PROBS = 8 @@ -86,6 +85,7 @@ def test_models( tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype="auto", + attention_config={"backend": backend}, ) as vllm_model: baseline_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS @@ -97,6 +97,7 @@ def test_models( tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, + attention_config={"backend": backend}, ) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 8c4bd6eaa2dd8..0a573847bf913 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -108,11 +108,12 @@ def 
can_initialize( patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1), monkeypatch.context() as m, ): - if model_arch == "GptOssForCausalLM": - # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU - # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when - # L4 supports FA3. - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") + # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU + # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when + # L4 supports FA3. + attention_config = ( + {"backend": "TRITON_ATTN"} if model_arch == "GptOssForCausalLM" else None + ) if model_arch == "WhisperForConditionalGeneration": m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") @@ -143,6 +144,7 @@ def can_initialize( else "vllm", hf_overrides=hf_overrides_fn, max_num_seqs=model_info.max_num_seqs, + attention_config=attention_config, ) diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py index 77790be6f892b..d8c747056faf6 100644 --- a/tests/v1/attention/test_rocm_attention_backends_selection.py +++ b/tests/v1/attention/test_rocm_attention_backends_selection.py @@ -94,26 +94,20 @@ def mock_on_gfx9(): None, AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(), ), - # Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 - ( - {"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"}, - None, - AttentionBackendEnum.ROCM_ATTN.get_path(), - ), - # Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN + # Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN ( {"VLLM_ROCM_USE_AITER": "1"}, "TRITON_ATTN", AttentionBackendEnum.TRITON_ATTN.get_path(), ), - # Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0 + # Test Case 10: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0 # (explicitly disabled) ( {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"}, None, AttentionBackendEnum.TRITON_ATTN.get_path(), ), - # Test Case 12: 
VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN + # Test Case 11: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN ( {"VLLM_ROCM_USE_AITER": "1"}, "ROCM_ATTN", diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 4dcaf9d908690..031436a030908 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -249,8 +249,8 @@ def create_dummy_kv_cache( @dataclass class BackendConfig: name: str - env_vars: dict - comp_config: dict # compilation config + attention_config: dict + comp_config: dict specific_gpu_arch: tuple | None = None @@ -259,10 +259,10 @@ full_cg_backend_configs = { # FA3 on Hopper "FA3": BackendConfig( name="FA3", - env_vars={ - "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", - "VLLM_FLASH_ATTN_VERSION": "3", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + attention_config={ + "backend": "FLASH_ATTN", + "flash_attn_version": 3, + "flash_attn_max_num_splits_for_cuda_graph": 16, }, comp_config={ "cudagraph_mode": "FULL", @@ -272,9 +272,7 @@ full_cg_backend_configs = { # FlashMLA on Hopper "FlashMLA": BackendConfig( name="FlashMLA", - env_vars={ - "VLLM_ATTENTION_BACKEND": "FLASHMLA", - }, + attention_config={"backend": "FLASHMLA"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, @@ -283,9 +281,7 @@ full_cg_backend_configs = { # Cutlass MLA on Blackwell "CutlassMLA": BackendConfig( name="CutlassMLA", - env_vars={ - "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", - }, + attention_config={"backend": "CUTLASS_MLA"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, @@ -294,9 +290,7 @@ full_cg_backend_configs = { # FlashInfer MLA on Blackwell "FlashInferMLA": BackendConfig( name="FlashInferMLA", - env_vars={ - "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA", - }, + attention_config={"backend": "FLASHINFER_MLA"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, @@ -305,9 +299,9 @@ full_cg_backend_configs = { # FlashAttention MLA on Hopper "FlashAttentionMLA": BackendConfig( name="FlashAttentionMLA", - env_vars={ - 
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + attention_config={ + "backend": "FLASH_ATTN_MLA", + "flash_attn_max_num_splits_for_cuda_graph": 16, }, comp_config={ "cudagraph_mode": "FULL_DECODE_ONLY", @@ -317,10 +311,10 @@ full_cg_backend_configs = { # FA2 "FA2": BackendConfig( name="FA2", - env_vars={ - "VLLM_ATTENTION_BACKEND": "FLASH_ATTN", - "VLLM_FLASH_ATTN_VERSION": "2", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16", + attention_config={ + "backend": "FLASH_ATTN", + "flash_attn_version": 2, + "flash_attn_max_num_splits_for_cuda_graph": 16, }, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", @@ -329,7 +323,7 @@ full_cg_backend_configs = { # Triton Attention "TritonAttn": BackendConfig( name="TritonAttn", - env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"}, + attention_config={"backend": "TRITON_ATTN"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, @@ -337,14 +331,17 @@ full_cg_backend_configs = { # FlashInfer "FlashInfer": BackendConfig( name="FlashInfer", - env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"}, + attention_config={"backend": "FLASHINFER"}, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", }, ), "RocmAttn": BackendConfig( name="RocmAttn", - env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"}, + attention_config={ + "backend": "ROCM_ATTN", + "use_prefill_decode_attention": True, + }, comp_config={ "cudagraph_mode": "FULL", }, diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index b1895e83b8b37..f4f74d16c7019 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import contextlib -import os import weakref from contextlib import ExitStack @@ -13,26 +11,6 @@ from vllm import LLM from vllm.config import CompilationConfig, CompilationMode from 
vllm.platforms import current_platform - -@contextlib.contextmanager -def temporary_environ(env_vars): - """ - Temporarily set environment variables and restore them afterward. - We have to do this vs monkeypatch because monkeypatch doesn't work - with "module" scoped fixtures. - """ - original_env = {k: os.environ.get(k) for k in env_vars} - try: - os.environ.update(env_vars) - yield - finally: - for k, v in original_env.items(): - if v is None: - os.environ.pop(k, None) - else: - os.environ[k] = v - - # test attention backend and cudagraph_mode combo # (backend_name, cudagraph_mode, supported) if current_platform.is_rocm(): @@ -68,9 +46,9 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte ): pytest.skip("Only Hopper GPUs support FA3 and FlashMLA") - env_vars = backend_configs[backend_name].env_vars + attention_config = backend_config.attention_config - with temporary_environ(env_vars), ExitStack() as stack: + with ExitStack() as stack: if not supported: stack.enter_context(pytest.raises(Exception)) @@ -80,6 +58,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte trust_remote_code=True, gpu_memory_utilization=0.45, max_model_len=1024, + attention_config=attention_config, compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode ), @@ -122,9 +101,10 @@ combo_cases_2 = [ def test_cudagraph_compilation_combo( backend_name, cudagraph_mode, compilation_mode, supported ): - env_vars = backend_configs[backend_name].env_vars + backend_config = backend_configs[backend_name] + attention_config = backend_config.attention_config - with temporary_environ(env_vars), ExitStack() as stack: + with ExitStack() as stack: if not supported: stack.enter_context(pytest.raises(Exception)) @@ -134,6 +114,7 @@ def test_cudagraph_compilation_combo( trust_remote_code=True, gpu_memory_utilization=0.45, max_model_len=1024, + attention_config=attention_config, 
compilation_config=CompilationConfig( mode=compilation_mode, cudagraph_mode=cudagraph_mode ), diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index 7a58e1c9bad03..61fb5f07303b4 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90() BACKENDS, ) def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( - backend, monkeypatch: pytest.MonkeyPatch + backend, ): """ Ensures that the same request (the 'needle' prompt) yields identical output @@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) + attention_config = {"backend": backend} # Allow overrides from environment (useful for CI tuning) # "facebook/opt-125m" is too small, doesn't reliably test determinism model = resolve_model_name(backend) @@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( max_num_seqs=max_batch_size, gpu_memory_utilization=gpu_mem_util, max_model_len=max_model_len, + attention_config=attention_config, ) # Baseline generation for the needle prompt alone. 
@@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( max_num_seqs=max_batch_size, gpu_memory_utilization=gpu_mem_util, max_model_len=max_model_len, + attention_config=attention_config, ) mismatches = 0 @@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( BACKENDS, ) def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( - backend, monkeypatch: pytest.MonkeyPatch + backend, ): - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = resolve_model_name(backend) @@ -193,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( dtype="bfloat16", # not everything is supported gpu_memory_utilization=0.9, enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config={"backend": backend}, ) # Use more realistic prompts for better token generation @@ -381,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( "backend", BACKENDS, ) -def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): +def test_simple_generation(backend): """ Simple test that runs the model with a basic prompt and prints the output. Useful for quick smoke testing and debugging. """ - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) model = resolve_model_name(backend) llm = LLM( @@ -398,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): dtype="bfloat16", enable_prefix_caching=False, enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config={"backend": backend}, ) prompt = "the capital of france is" @@ -444,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail( The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). 
""" - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - # CRITICAL: Disable batch invariance for this test monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0") monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False) @@ -465,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail( max_model_len=8192, dtype="bfloat16", enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config={"backend": backend}, ) # build ragged prompts to change shapes significantly across BS=1 vs BS=N @@ -649,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail( @skip_unsupported @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) def test_decode_logprobs_match_prefill_logprobs( - backend, monkeypatch: pytest.MonkeyPatch + backend, ): """ Test that verifies decode logprobs match prefill logprobs. @@ -664,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs( This ensures that the logprobs from decode are consistent with what we would get if we ran prefill on each prefix. 
""" - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = resolve_model_name(backend) @@ -689,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs( max_model_len=8192, dtype="bfloat16", enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config={"backend": backend}, ) # Use a few test prompts @@ -920,6 +919,7 @@ def LLM_with_max_seqs( max_num_seqs: int, gpu_memory_utilization: float, max_model_len: int, + attention_config: dict | None = None, ) -> LLM: """ Helper to construct an LLM with a specific max_num_seqs (batch-size limit) @@ -934,6 +934,7 @@ def LLM_with_max_seqs( tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), enable_prefix_caching=False, enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, + attention_config=attention_config, # Enable for MOE models # enable_expert_parallel=True, ) diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py index 5e3b997364949..52c8103b2f1ce 100644 --- a/tests/v1/determinism/test_online_batch_invariance.py +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -136,11 +136,9 @@ def _compare_bs1_vs_bsn_single_process( @skip_unsupported @pytest.mark.parametrize("backend", BACKENDS) def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( - backend: str, monkeypatch: pytest.MonkeyPatch + backend: str, ) -> None: random.seed(int(os.getenv("VLLM_TEST_SEED", "12345"))) - # Override backend for this test (and the RemoteOpenAIServer child process). 
- monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) model_name = resolve_model_name(backend) prompts_all = [_random_prompt(10, 50) for _ in range(32)] @@ -156,6 +154,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( server_args: list[str] = [ "--max-model-len=8192", "--max-num-seqs=32", + f"--attention-backend={backend}", ] if tp_size: server_args += ["-tp", tp_size] diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 5cef9b33c9984..61e56c079a3b5 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -142,16 +142,17 @@ def run_tests( """Test consistency of combos of async scheduling, preemption, uni/multiproc executor with spec decoding.""" - with monkeypatch.context() as m: - # avoid precision errors - if current_platform.is_rocm(): - if is_testing_with_spec_decoding: - # Use TRITON_ATTN for spec decoding test for consistency - m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") - else: - m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA") + # Determine attention config based on platform + if current_platform.is_rocm(): + if is_testing_with_spec_decoding: + # Use TRITON_ATTN for spec decoding test for consistency + attention_config = {"backend": "TRITON_ATTN"} else: - m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") + attention_config = {"backend": "ROCM_AITER_FA"} + else: + attention_config = {"backend": "FLEX_ATTENTION"} + + with monkeypatch.context() as m: # lock matmul precision to full FP32 (IEEE) m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee") # m.setenv("VLLM_BATCH_INVARIANT", "1") @@ -174,6 +175,7 @@ def run_tests( spec_config, test_prefill_chunking=test_prefill_chunking, is_testing_with_spec_decoding=is_testing_with_spec_decoding, + attention_config=attention_config, ) outputs.append(test_results) @@ -262,6 +264,7 @@ def run_test( spec_config: dict[str, Any] | None, test_prefill_chunking: bool, is_testing_with_spec_decoding: bool = False, + 
attention_config: dict[str, Any] | None = None, ): spec_decoding = spec_config is not None cache_arg: dict[str, Any] = ( @@ -301,6 +304,7 @@ def run_test( dtype=dtype, speculative_config=spec_config, disable_log_stats=False, + attention_config=attention_config, **cache_arg, ) as vllm_model: results = [] diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py index 0fcb97fe63055..a7be981805c0d 100644 --- a/tests/v1/e2e/test_cascade_attention.py +++ b/tests/v1/e2e/test_cascade_attention.py @@ -10,7 +10,7 @@ from ...utils import create_new_process_for_each_test @create_new_process_for_each_test() @pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"]) -def test_cascade_attention(example_system_message, monkeypatch, attn_backend): +def test_cascade_attention(example_system_message, attn_backend): prompt = "\n: Implement fibonacci sequence in Python.\n:" if attn_backend == "FLASHINFER": @@ -19,19 +19,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend): "needs investigation. See issue #25679." ) - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + llm = LLM( + model="Qwen/Qwen2-1.5B-Instruct", attention_config={"backend": attn_backend} + ) + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) - llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") - sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + # No cascade attention. + single_prompt = [example_system_message + prompt] + responses = llm.generate(single_prompt, sampling_params) + ref_output = responses[0].outputs[0].text - # No cascade attention. - single_prompt = [example_system_message + prompt] - responses = llm.generate(single_prompt, sampling_params) - ref_output = responses[0].outputs[0].text - - # (Probably) Use cascade attention. 
- prompts = [example_system_message + prompt] * 64 - responses = llm.generate(prompts, sampling_params) - for response in responses: - assert response.outputs[0].text == ref_output + # (Probably) Use cascade attention. + prompts = [example_system_message + prompt] * 64 + responses = llm.generate(prompts, sampling_params) + for response in responses: + assert response.outputs[0].text == ref_output diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index fcfc8bdce12e9..a25114a4d96cb 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -438,25 +438,26 @@ def test_eagle_correctness( should be the same when using eagle speculative decoding. model_setup: (method, model_name, eagle_model_name, tp_size) """ + # Determine attention config + # Scout requires default backend selection because vision encoder has + # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back + # to Flex Attn + if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN": + if current_platform.is_rocm(): + # TODO: Enable Flex Attn for spec_decode on ROCm + pytest.skip("Flex Attn for spec_decode not supported on ROCm currently") + attention_config = None # Let it fall back to default + else: + attention_config = {"backend": attn_backend} + + if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): + pytest.skip( + "TRITON_ATTN does not support " + "multi-token eagle spec decode on current platform" + ) + with monkeypatch.context() as m: - if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN": - # Scout requires default backend selection - # because vision encoder has head_dim 88 being incompatible - # with FLASH_ATTN and needs to fall back to Flex Attn - - # pass if not ROCm - if current_platform.is_rocm(): - # TODO: Enable Flex Attn for spec_decode on ROCm - pytest.skip("Flex Attn for spec_decode not supported on ROCm currently") - else: - m.setenv("VLLM_MLA_DISABLE", "1") - 
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - - if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): - pytest.skip( - "TRITON_ATTN does not support " - "multi-token eagle spec decode on current platform" - ) + m.setenv("VLLM_MLA_DISABLE", "1") if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): if "deepseek" in model_setup[1].lower(): @@ -471,7 +472,10 @@ def test_eagle_correctness( max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len ref_llm = LLM( - model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size + model=model_name, + max_model_len=max_model_len, + tensor_parallel_size=tp_size, + attention_config=attention_config, ) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm @@ -492,6 +496,7 @@ def test_eagle_correctness( max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, model_impl=model_impl, + attention_config=attention_config, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 453ccc81eb14a..c2c38f51c5003 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -3,21 +3,29 @@ set -xe # Parse command line arguments KV_BUFFER_DEVICE="cuda" # Default to cuda +ATTENTION_BACKEND="" # Default to empty (use vllm default) while [[ $# -gt 0 ]]; do case $1 in --kv_buffer_device) KV_BUFFER_DEVICE="$2" shift 2 ;; + --attention-backend) + ATTENTION_BACKEND="$2" + shift 2 + ;; *) echo "Unknown option $1" - echo "Usage: $0 [--kv_buffer_device ]" + echo "Usage: $0 [--kv_buffer_device ] [--attention-backend ]" exit 1 ;; esac done echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE" +if [[ -n "$ATTENTION_BACKEND" ]]; then + echo "Using attention backend: $ATTENTION_BACKEND" +fi 
DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then @@ -148,6 +156,11 @@ run_tests_for_model() { --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '$KV_CONFIG'" + # Add attention backend config if specified + if [[ -n "$ATTENTION_BACKEND" ]]; then + BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND" + fi + if [ -n "$model_args" ]; then FULL_CMD="$BASE_CMD $model_args" else @@ -188,7 +201,12 @@ run_tests_for_model() { --block-size ${DECODE_BLOCK_SIZE} \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --kv-transfer-config '$KV_CONFIG'" - + + # Add attention backend config if specified + if [[ -n "$ATTENTION_BACKEND" ]]; then + BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND" + fi + # DP-EP attention mode if [[ -z "$DP_EP" ]]; then BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE" diff --git a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh index 9308c81da0635..8199fd516cd43 100755 --- a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh @@ -15,14 +15,14 @@ configs=( run_tests() { local label=$1 - local extra_env=$2 + local extra_args=$2 echo "=== Running tests (${label}) ===" for cfg in "${configs[@]}"; do - echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}" + echo "-> Running with ${cfg} ${extra_args:+and ${extra_args}}" # Use 'env' to safely set variables without eval - if ! env ${extra_env} ${cfg} bash "${SCRIPT}"; then - echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}" + if ! 
env ${cfg} bash "${SCRIPT}" ${extra_args}; then + echo "❌ Test failed for config: ${cfg} ${extra_args:+(${extra_args})}" exit 1 fi done @@ -34,8 +34,8 @@ run_tests "default backend" "" # Check if FLASHINFER is set (non-empty) if [[ -n "${FLASHINFER:-}" ]]; then - echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER" - run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER" + echo "FLASHINFER is set, rerunning with --attention-backend FLASHINFER" + run_tests "FLASHINFER backend" "--attention-backend FLASHINFER" else echo "FLASHINFER not set, skipping FLASHINFER runs." fi diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 66804fa671c7c..25f4308079595 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1132,7 +1132,7 @@ def _run_abort_timeout_test(llm: LLM, timeout: int): "TRITON_ATTN", ], ) -def test_register_kv_caches(dist_init, attn_backend, monkeypatch): +def test_register_kv_caches(dist_init, attn_backend): """ Test that register_kv_caches() properly calls nixl_wrapper methods with correct data. 
@@ -1144,9 +1144,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch): block layout info """ - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - - vllm_config = create_vllm_config() + vllm_config = create_vllm_config(attention_backend=attn_backend) # Import the appropriate backend based on the parameter if attn_backend == "FLASH_ATTN": diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 5cdb1f84b30d4..3a0dbb8e43b52 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -11,6 +11,7 @@ import torch from vllm import SamplingParams from vllm.config import ( + AttentionConfig, CacheConfig, DeviceConfig, KVTransferConfig, @@ -94,6 +95,7 @@ def create_vllm_config( dtype: str = "float16", cache_dtype: str = "auto", hf_overrides: dict[str, Any] | None = None, + attention_backend: str | None = None, ) -> VllmConfig: """Initialize VllmConfig For Testing.""" model_config = ModelConfig( @@ -124,12 +126,14 @@ def create_vllm_config( enable_permute_local_kv=enable_permute_local_kv, kv_connector_extra_config=kv_connector_extra_config or {}, ) + attention_config = AttentionConfig(backend=attention_backend) return VllmConfig( scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config, kv_transfer_config=kv_transfer_config, device_config=DeviceConfig("cpu"), + attention_config=attention_config, ) diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index 57474a3dc01e7..1ac5e5b8cdc57 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -13,7 +13,6 @@ from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import KVEventsConfig, KVTransferConfig from vllm.distributed.kv_events import BlockStored, KVEventBatch from vllm.platforms import current_platform -from vllm.utils.system_utils import set_env_var CPU_BLOCK_SIZES = [48] ATTN_BACKENDS 
= ["FLASH_ATTN"] @@ -180,13 +179,13 @@ def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None: topic="test", ) - with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend): - llm = LLM( - model="meta-llama/Llama-3.2-1B-Instruct", - gpu_memory_utilization=0.5, - kv_events_config=kv_events_config, - kv_transfer_config=kv_transfer_config, - ) + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + gpu_memory_utilization=0.5, + kv_events_config=kv_events_config, + kv_transfer_config=kv_transfer_config, + attention_config={"backend": attn_backend}, + ) events_endpoint = events_endpoint.replace("*", "127.0.0.1") subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 55e9b4d0660f5..f63cd3a6e42aa 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -15,6 +15,7 @@ from tests.v1.attention.utils import ( ) from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ( + AttentionConfig, CacheConfig, DeviceConfig, ModelConfig, @@ -38,6 +39,7 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" def _create_proposer( method: str, num_speculative_tokens: int, + attention_backend: str | None = None, speculative_token_tree: list[tuple[int, ...]] | None = None, ) -> EagleProposer: model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100) @@ -70,6 +72,7 @@ def _create_proposer( max_model_len=model_config.max_model_len, is_encoder_decoder=model_config.is_encoder_decoder, ), + attention_config=AttentionConfig(backend=attention_backend), ) return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type) @@ -331,8 +334,6 @@ def test_load_model( use_distinct_lm_head, monkeypatch, ): - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): pytest.skip( "TRITON_ATTN does not 
support " @@ -394,7 +395,9 @@ def test_load_model( assert not isinstance(target_model, SupportsMultiModal) # Create proposer using the helper function - proposer = _create_proposer(method, num_speculative_tokens=8) + proposer = _create_proposer( + method, num_speculative_tokens=8, attention_backend=attn_backend + ) # Call the method under test proposer.load_model(target_model) @@ -420,8 +423,6 @@ def test_load_model( @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8]) def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): pytest.skip( "TRITON_ATTN does not support " @@ -449,7 +450,9 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): seq_lens = [seq_len_1, seq_len_2] # Create proposer first so we can use its actual hidden_size - proposer = _create_proposer("eagle", num_speculative_tokens) + proposer = _create_proposer( + "eagle", num_speculative_tokens, attention_backend=attn_backend + ) # Get the hidden_size from the proposer to ensure consistency hidden_size = proposer.hidden_size @@ -622,7 +625,9 @@ def test_propose_tree(spec_token_tree): # Create proposer first so we can use its actual hidden_size. proposer = _create_proposer( - "eagle", num_speculative_tokens, speculative_token_tree=spec_token_tree + "eagle", + num_speculative_tokens, + speculative_token_tree=spec_token_tree, ) # Get the hidden_size from the proposer to ensure consistency. 
hidden_size = proposer.hidden_size diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 15a6bd2659ea9..42991f9f1ae03 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -38,53 +38,48 @@ def test_ngram_max_len(num_speculative_tokens: int): def test_eagle_max_len( monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str ): - with monkeypatch.context() as m: - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) - - if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): - pytest.skip( - "TRITON_ATTN does not support " - "multi-token eagle spec decode on current platform" - ) - - if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): - m.setenv("VLLM_ROCM_USE_AITER", "1") - - llm = LLM( - model="meta-llama/Meta-Llama-3-8B-Instruct", - enforce_eager=True, # For faster initialization. - speculative_config={ - "method": "eagle", - "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", - "num_speculative_tokens": num_speculative_tokens, - "max_model_len": 80, - }, - max_model_len=200, + if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm(): + pytest.skip( + "TRITON_ATTN does not support " + "multi-token eagle spec decode on current platform" ) - sampling_params = SamplingParams(max_tokens=200, ignore_eos=True) - outputs = llm.generate(_PROMPTS, sampling_params) - for o in outputs: - assert o.outputs[0].finish_reason == "length", ( - "This test is only meaningful if the output " - "is truncated due to max length" - ) - sampling_params = SamplingParams( - max_tokens=200, - structured_outputs=StructuredOutputsParams( - regex="^" + "a b c d e " * 15 + "$" - ), + if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm(): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + + llm = LLM( + model="meta-llama/Meta-Llama-3-8B-Instruct", + enforce_eager=True, # For faster initialization. 
+ speculative_config={ + "method": "eagle", + "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", + "num_speculative_tokens": num_speculative_tokens, + "max_model_len": 80, + }, + max_model_len=200, + attention_config={"backend": attn_backend}, + ) + sampling_params = SamplingParams(max_tokens=200, ignore_eos=True) + outputs = llm.generate(_PROMPTS, sampling_params) + for o in outputs: + assert o.outputs[0].finish_reason == "length", ( + "This test is only meaningful if the output is truncated due to max length" ) - output = llm.generate(_PROMPTS, sampling_params) - for o in output: - assert o.prompt_token_ids is not None - assert ( - len(o.prompt_token_ids) - < 80 - < len(o.prompt_token_ids) + len(o.outputs[0].token_ids) - <= 200 - ), ( - "This test is only meaningful if the output " - "is longer than the eagle max length" - ) - assert o.outputs[0].text == "a b c d e " * 15 + + sampling_params = SamplingParams( + max_tokens=200, + structured_outputs=StructuredOutputsParams(regex="^" + "a b c d e " * 15 + "$"), + ) + output = llm.generate(_PROMPTS, sampling_params) + for o in output: + assert o.prompt_token_ids is not None + assert ( + len(o.prompt_token_ids) + < 80 + < len(o.prompt_token_ids) + len(o.outputs[0].token_ids) + <= 200 + ), ( + "This test is only meaningful if the output " + "is longer than the eagle max length" + ) + assert o.outputs[0].text == "a b c d e " * 15 diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index e2410a70b1a63..e231c600cba7a 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -165,7 +165,7 @@ class RocmAttentionBackend(AttentionBackend): raise ValueError( f"Head size {head_size} is not supported by {attn_type}. " f"Supported head sizes are: {cls.get_supported_head_sizes()}. " - "Set --attention-config.backend=FLEX_ATTENTION to use " + "Set --attention-backend=FLEX_ATTENTION to use " "FlexAttention backend which supports all head sizes." 
) From e3a0f21e6ce78268865cafcdc3dc58c7a80dbc57 Mon Sep 17 00:00:00 2001 From: Xunzhuo Date: Thu, 18 Dec 2025 02:45:56 +0800 Subject: [PATCH 47/49] [docs]: add ecosystem projects sr in docs/governance (#30844) Signed-off-by: bitliu --- docs/governance/committers.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/governance/committers.md b/docs/governance/committers.md index c9428027da953..2f0780a08978b 100644 --- a/docs/governance/committers.md +++ b/docs/governance/committers.md @@ -181,3 +181,4 @@ If you have PRs touching the area, please feel free to ping the area owner for r - Ascend NPU: [@wangxiyuan](https://github.com/wangxiyuan) and [see more details](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html#maintainers) - Intel Gaudi HPU [@xuechendi](https://github.com/xuechendi) and [@kzawora-intel](https://github.com/kzawora-intel) +- Semantic Router: [@xunzhuo](https://github.com/xunzhuo), [@rootfs](https://github.com/rootfs) and [see more details](https://vllm-semantic-router.com/community/team) From e06d0bf0aa2af11220b5c3aa5ccc8f999d0e3161 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 17 Dec 2025 15:20:22 -0500 Subject: [PATCH 48/49] 2.9.1 PyTorch release update (#28495) --- .buildkite/test-amd.yaml | 2 +- .buildkite/test-pipeline.yaml | 2 +- CMakeLists.txt | 4 ++-- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/cuda.txt | 6 +++--- requirements/rocm-build.txt | 8 ++++---- requirements/test.in | 6 +++--- requirements/test.txt | 8 ++++---- vllm/model_executor/layers/conv.py | 2 +- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e8f99100a8de0..6df373632d730 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -740,7 +740,7 @@ steps: # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - 
- uv pip install --system torchao==0.13.0 + - uv pip install --system torchao==0.14.1 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b4de630b09417..8e3bcfe4a36bc 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -658,7 +658,7 @@ steps: # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129 - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ca71f6ba4df0..a14496e035d9a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,8 +56,8 @@ endif() # requirements.txt files and should be kept consistent. 
The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1") # # Try to find python package with an executable that exactly matches diff --git a/pyproject.toml b/pyproject.toml index a250ab6567f12..c03f96dd7acd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging>=24.2", "setuptools>=77.0.3,<81.0.0", "setuptools-scm>=8.0", - "torch == 2.9.0", + "torch == 2.9.1", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index 23ff8d4fdc1c0..3756371638bad 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,7 +4,7 @@ ninja packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 -torch==2.9.0 +torch==2.9.1 wheel jinja2>=3.1.6 regex diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 462f18ef7159b..1417fb99120bc 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding # Dependencies for NVIDIA GPUs ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.9.0 -torchaudio==2.9.0 +torch==2.9.1 +torchaudio==2.9.1 # These must be updated alongside torch -torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +torchvision==0.24.1 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.5.3 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index b977e80be067f..54af9d995c4a2 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -2,11 +2,11 @@ -r common.txt --extra-index-url https://download.pytorch.org/whl/rocm6.4 -torch==2.9.0 -torchvision==0.24.0 -torchaudio==2.9.0 +torch==2.9.1 +torchvision==0.24.1 +torchaudio==2.9.1 -triton==3.5.0 +triton==3.5.1 cmake>=3.26.1,<4 packaging>=24.2 setuptools>=77.0.3,<80.0.0 diff --git a/requirements/test.in b/requirements/test.in index dfae5b75821f8..55452ce83f232 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -24,9 +24,9 @@ soundfile # required for audio tests jiwer # required for audio tests tblib # for pickling test exceptions timm >=1.0.17 # required for internvl and gemma3n-mm test -torch==2.9.0 -torchaudio==2.9.0 -torchvision==0.24.0 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[image,audio] >= 1.8.5 # required for voxtral test diff --git a/requirements/test.txt b/requirements/test.txt index 571194e05c1ba..ea2093e4347fe 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1123,7 +1123,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.9.0+cu129 +torch==2.9.1+cu129 # via # -r requirements/test.in # accelerate @@ -1152,7 +1152,7 @@ torch==2.9.0+cu129 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.9.0+cu129 +torchaudio==2.9.1+cu129 # via # -r requirements/test.in # encodec @@ -1165,7 +1165,7 @@ torchmetrics==1.7.4 # pytorch-lightning # terratorch # torchgeo -torchvision==0.24.0+cu129 +torchvision==0.24.1+cu129 # via # -r requirements/test.in # lightly @@ -1206,7 +1206,7 @@ transformers==4.57.3 # 
transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.5.0 +triton==3.5.1 # via torch tritonclient==2.51.0 # via diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py index 8d51e5bd9920a..1cd02698b3863 100644 --- a/vllm/model_executor/layers/conv.py +++ b/vllm/model_executor/layers/conv.py @@ -251,6 +251,6 @@ class Conv3dLayer(ConvLayerBase): # See: https://github.com/vllm-project/vllm/issues/27406 # and https://github.com/pytorch/pytorch/issues/166122 # By default, we use CUDNN's convolution ops with optimization. - if self.enable_linear and is_torch_equal("2.9.0"): + if self.enable_linear and (is_torch_equal("2.9.0") or is_torch_equal("2.9.1")): return self._forward_mulmat(x) return self._forward_conv(x) From e3fc374a9a69dddb16885d810f1e28d3fdd39ebd Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 17 Dec 2025 18:00:59 -0500 Subject: [PATCH 49/49] [BugFix] Workspace allocation during profile run : DeepEPHighThroughput + DeepGEMM (#30899) --- vllm/model_executor/layers/fused_moe/modular_kernel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 484314091cb15..b0834e861338f 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -795,7 +795,10 @@ class FusedMoEModularKernel(torch.nn.Module): top_k, global_num_experts, local_num_experts, - expert_tokens_meta, + # expert_tokens_meta help in allocating optimal/minimal + # amount of workspace. Mark it None, so we allocate for + # the worst-case scenario. + expert_tokens_meta=None, ) )