From 10ee1c64cfa7c0b7f68e9ee793435c9cafbf821a Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 16 Dec 2025 14:28:34 -0500
Subject: [PATCH 01/49] [CI] Generalize gsm8k test args and add Qwen3-Next MTP
 B200 test (#30723)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 .buildkite/test-pipeline.yaml                 |  4 +-
 tests/evals/gsm8k/README.md                   | 13 ++--
 .../DeepSeek-V2-Lite-Instruct-FP8.yaml        |  3 +-
 .../Llama-3-8B-Instruct-nonuniform-CT.yaml    |  2 +-
 .../Llama-3.2-1B-Instruct-INT8-CT.yaml        |  2 +-
 .../gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml   |  2 +-
 .../Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml   |  2 +-
 tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml |  2 +-
 .../gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml    |  3 +-
 .../configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml | 12 ++++
 .../evals/gsm8k/configs/models-blackwell.txt  |  1 +
 tests/evals/gsm8k/conftest.py                 |  8 +--
 tests/evals/gsm8k/test_gsm8k_correctness.py   | 70 +++++++++++--------
 .../compressed_tensors_moe.py                 | 11 +--
 14 files changed, 78 insertions(+), 57 deletions(-)
 create mode 100644 tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 9d0b3fdd3a02c..8e6d32f71f220 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -654,7 +654,7 @@ steps:
   - vllm/model_executor/layers/quantization
   autorun_on_main: true
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
 - label: OpenAI API correctness # 22min
   timeout_in_minutes: 30
@@ -1064,7 +1064,7 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
 
 #####  1 GPU test  #####
 #####  multi gpus test  #####
diff --git a/tests/evals/gsm8k/README.md b/tests/evals/gsm8k/README.md
index 29c5199e1e87a..dcbfd85bfeee8 100644
--- a/tests/evals/gsm8k/README.md
+++ b/tests/evals/gsm8k/README.md
@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
 ### Run tests with pytest (like buildkite)
 
 ```bash
-pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+    --config-list-file=configs/models-small.txt
 ```
 
 ### Run standalone evaluation script
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
 accuracy_threshold: 0.54  # Minimum expected accuracy
 num_questions: 1319       # Number of questions (default: full test set)
 num_fewshot: 5            # Few-shot examples from train set
-max_model_len: 4096       # Model context length
+server_args: "--max-model-len 4096 --tensor-parallel-size 2"  # Server arguments
+env:                      # Environment variables (optional)
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
 ```
+
+The `server_args` field accepts any arguments that can be passed to `vllm serve`.
+
+The `env` field accepts a dictionary of environment variables to set for the server process.
diff --git a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
index 7ec6a1e0be27f..72fa7e8a38c73 100644
--- a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
+++ b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 accuracy_threshold: 0.72
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
-
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml b/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
index caa0448f23d48..b7b59e9dcd5ce 100644
--- a/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+++ b/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 accuracy_threshold: 0.74
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml b/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
index 615aa69a2d2b6..8b3c9ff645e87 100644
--- a/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+++ b/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
 accuracy_threshold: 0.31
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
index 9297bf6ddf2d3..4a1b1948acac8 100644
--- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 accuracy_threshold: 0.45
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
index 5319ada30f645..5ce3af8be346a 100644
--- a/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 accuracy_threshold: 0.60
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml b/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
index c39fb979d98ac..5452ebe753f04 100644
--- a/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
 accuracy_threshold: 0.375
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
\ No newline at end of file
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
index 6b7bdd1e65bb3..f162aa8bfe5b0 100644
--- a/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
 accuracy_threshold: 0.89
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
-
+server_args: "--enforce-eager --max-model-len 4096"
diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml b/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
new file mode 100644
index 0000000000000..673b473f817eb
--- /dev/null
+++ b/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
@@ -0,0 +1,12 @@
+model_name: "nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
+accuracy_threshold: 0.75
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
diff --git a/tests/evals/gsm8k/configs/models-blackwell.txt b/tests/evals/gsm8k/configs/models-blackwell.txt
index 3c9b1084de7bc..39978aa6ffbe9 100644
--- a/tests/evals/gsm8k/configs/models-blackwell.txt
+++ b/tests/evals/gsm8k/configs/models-blackwell.txt
@@ -3,3 +3,4 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-CT.yaml
 DeepSeek-V2-Lite-Instruct-FP8.yaml
 Qwen3-30B-A3B-NVFP4.yaml
+Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
diff --git a/tests/evals/gsm8k/conftest.py b/tests/evals/gsm8k/conftest.py
index 1932a13cdfc63..6f25fe6414af4 100644
--- a/tests/evals/gsm8k/conftest.py
+++ b/tests/evals/gsm8k/conftest.py
@@ -11,14 +11,12 @@ def pytest_addoption(parser):
         default="configs/models-small.txt",
         help="File containing list of config files to test",
     )
-    parser.addoption("--tp-size", default=1, type=int, help="Tensor parallel size")
 
 
 def pytest_generate_tests(metafunc):
     """Generate test parameters from config files."""
     if "config_filename" in metafunc.fixturenames:
         config_list_file = metafunc.config.getoption("--config-list-file")
-        tp_size = metafunc.config.getoption("--tp-size")
 
         # Handle both relative and absolute paths
         config_list_path = Path(config_list_file)
@@ -55,9 +53,9 @@ def pytest_generate_tests(metafunc):
         # Generate test parameters
         if config_files:
             metafunc.parametrize(
-                ["config_filename", "tp_size"],
-                [(config_file, int(tp_size)) for config_file in config_files],
-                ids=[f"{config_file.stem}-tp{tp_size}" for config_file in config_files],
+                "config_filename",
+                config_files,
+                ids=[config_file.stem for config_file in config_files],
             )
         else:
             print("No config files found, test will be skipped")
diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py
index b5d67df7bf3db..ea6715f5cb532 100644
--- a/tests/evals/gsm8k/test_gsm8k_correctness.py
+++ b/tests/evals/gsm8k/test_gsm8k_correctness.py
@@ -5,30 +5,31 @@ GSM8K evaluation using vLLM server and isolated GSM8K script.
 Replacement for lm-eval-harness with better performance and control.
 
 Usage:
-pytest -s -v test_gsm8k_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+    --config-list-file=configs/models-small.txt
 """
 
+import shlex
+
 import yaml
 
 from tests.utils import RemoteOpenAIServer
 
 from .gsm8k_eval import evaluate_gsm8k
 
-RTOL = 0.08  # Relative tolerance for accuracy comparison
+TOL = 0.08  # Absolute tolerance for accuracy comparison
 
 
-def launch_gsm8k_eval(eval_config, server_url, tp_size):
-    """Launch GSM8K evaluation using our isolated script."""
+def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
+    """Run GSM8K evaluation using our isolated script."""
     # Extract host and port from server URL
     if "://" in server_url:
         server_url = server_url.split("://")[1]
 
     host_port = server_url.split("/")[0]  # Remove path if present
     if ":" in host_port:
-        host, port = host_port.split(":")
-        port = int(port)
+        host, p = host_port.split(":")
+        port = int(p)
     else:
         host = host_port
         port = 8000
@@ -48,46 +49,57 @@ def launch_gsm8k_eval(eval_config, server_url, tp_size):
     return results
 
 
-def test_gsm8k_correctness_param(config_filename, tp_size):
+def test_gsm8k_correctness(config_filename):
     """Test GSM8K correctness for a given model configuration."""
     eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
 
-    # Server arguments
-    server_args = [
-        "--max-model-len",
-        str(eval_config.get("max_model_len", 4096)),
-        "--enforce-eager",
-        "--trust-remote-code",
-        "--tensor-parallel-size",
-        str(tp_size),
-    ]
+    # Parse server arguments from config (use shlex to handle quoted strings)
+    server_args_str = eval_config.get("server_args", "")
+    server_args = shlex.split(server_args_str) if server_args_str else []
+
+    # Add standard server arguments
+    server_args.extend(
+        [
+            "--trust-remote-code",
+        ]
+    )
 
     env_dict = eval_config.get("env", None)
 
+    print(f"Starting GSM8K evaluation for model: {eval_config['model_name']}")
+    print(f"Expected metric threshold: {eval_config['accuracy_threshold']}")
+    print(f"Number of questions: {eval_config['num_questions']}")
+    print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
+    print(f"Server args: {' '.join(server_args)}")
+
     # Launch server and run evaluation
     with RemoteOpenAIServer(
-        eval_config["model_name"], server_args, env_dict=env_dict, max_wait_seconds=480
+        eval_config["model_name"],
+        server_args,
+        env_dict=env_dict,
+        max_wait_seconds=600,
     ) as remote_server:
         server_url = remote_server.url_for("v1")
+        print(f"Server started at: {server_url}")
 
-        results = launch_gsm8k_eval(eval_config, server_url, tp_size)
+        results = run_gsm8k_eval(eval_config, server_url)
 
-        # Check accuracy against threshold
-        measured_accuracy = results["accuracy"]
-        expected_accuracy = eval_config["accuracy_threshold"]
+        measured_metric = results["accuracy"]
+        expected_metric = eval_config["accuracy_threshold"]
 
         print(f"GSM8K Results for {eval_config['model_name']}:")
-        print(f"  Accuracy: {measured_accuracy:.3f}")
-        print(f"  Expected: {expected_accuracy:.3f}")
+        print(f"  Measured metric: {measured_metric:.4f}")
+        print(f"  Expected metric: {expected_metric:.4f}")
+        print(f"  Tolerance: {TOL:.4f}")
         print(f"  Questions: {results['num_questions']}")
         print(f"  Invalid rate: {results['invalid_rate']:.3f}")
         print(f"  Latency: {results['latency']:.1f}s")
         print(f"  QPS: {results['questions_per_second']:.1f}")
 
-        # Verify accuracy is within tolerance
-        assert measured_accuracy >= expected_accuracy - RTOL, (
-            f"Accuracy too low: {measured_accuracy:.3f} < "
-            f"{expected_accuracy:.3f} - {RTOL:.3f}"
+        # Verify metric is within tolerance
+        assert measured_metric >= expected_metric - TOL, (
+            f"GSM8K metric too low: {measured_metric:.4f} < "
+            f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
         )
 
         print(f"✅ GSM8K test passed for {eval_config['model_name']}")
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index f650a6eabbb9c..c302e465aedb7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -626,17 +626,11 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
         else:
+            # If no modular kernel is provided, use cutlass_moe_fp4 for TP case
+            # only (no EP).
             from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 
-            assert layer.expert_map is None, (
-                "Expert Parallelism / expert_map "
-                "is currently not supported for "
-                "CompressedTensorsW4A4Nvfp4MoEMethod."
-            )
             assert self.moe_quant_config is not None
-
-            # Cutlass moe takes in activations in BF16/Half precision
-            # and fp4 quantized weights loaded from the checkpoint
             return cutlass_moe_fp4(
                 a=x,
                 w1_fp4=layer.w13_weight,
@@ -644,6 +638,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 quant_config=self.moe_quant_config,
+                expert_map=layer.expert_map,
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 # TODO(bnell): derive these from arguments
                 m=x.shape[0],

From ca702a14dc2d4c5c077dbb8098e66ca244cea185 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Tue, 16 Dec 2025 20:36:49 +0100
Subject: [PATCH 02/49] [Frontend] Add `max-completion-token` option to
 transcription/translation endpoints (#30769)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 .../test_transcription_validation_whisper.py  | 32 ++++++++++++++++++
 .../openai/test_translation_validation.py     | 33 +++++++++++++++++++
 vllm/entrypoints/openai/protocol.py           |  6 ++++
 vllm/entrypoints/openai/speech_to_text.py     | 10 ++++--
 4 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai/test_transcription_validation_whisper.py b/tests/entrypoints/openai/test_transcription_validation_whisper.py
index 3c507ee0a3fa7..8bf729c517f7a 100644
--- a/tests/entrypoints/openai/test_transcription_validation_whisper.py
+++ b/tests/entrypoints/openai/test_transcription_validation_whisper.py
@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
     )
     assert transcription.segments is not None
     assert len(transcription.segments) > 0
+
+
+@pytest.mark.asyncio
+async def test_audio_with_max_tokens(whisper_client, mary_had_lamb):
+    transcription = await whisper_client.audio.transcriptions.create(
+        model=MODEL_NAME,
+        file=mary_had_lamb,
+        language="en",
+        response_format="text",
+        temperature=0.0,
+        extra_body={"max_completion_tokens": 1},
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    from transformers import AutoTokenizer
+
+    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
+    assert len(out_tokens) == 1
+    # max_completion_tokens > max_model_len
+    transcription = await whisper_client.audio.transcriptions.create(
+        model=MODEL_NAME,
+        file=mary_had_lamb,
+        language="en",
+        response_format="text",
+        temperature=0.0,
+        extra_body={"max_completion_tokens": int(1e6)},
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
+    assert len(out_tokens) < 450  # ~Whisper max output len
diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py
index d7d407484f16d..2c577237691ab 100644
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -227,3 +227,36 @@ async def test_long_audio_request(foscolo, client_and_model):
     )
     out = json.loads(translation)["text"].strip().lower()
     assert out.count("greek sea") == 2
+
+
+@pytest.mark.asyncio
+async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
+    client, model_name = client_and_model
+    transcription = await client.audio.translations.create(
+        model=model_name,
+        file=mary_had_lamb,
+        response_format="text",
+        temperature=0.0,
+        extra_body={"max_completion_tokens": 1},
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    print(out_text)
+    from transformers import AutoTokenizer
+
+    tok = AutoTokenizer.from_pretrained(model_name)
+    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
+    assert len(out_tokens) == 1
+    # max_completion_tokens > max_model_len
+    transcription = await client.audio.transcriptions.create(
+        model=model_name,
+        file=mary_had_lamb,
+        response_format="text",
+        temperature=0.0,
+        extra_body={"max_completion_tokens": int(1e6)},
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    print(out_text)
+    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
+    assert len(out_tokens) < 450  # ~Whisper max output len
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index a7c4980cd3674..94dde4564ea0c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -2054,6 +2054,9 @@ class TranscriptionRequest(OpenAIBaseModel):
 
     presence_penalty: float | None = 0.0
     """The presence penalty to use for sampling."""
+
+    max_completion_tokens: int | None = None
+    """The maximum number of tokens to generate."""
     # --8<-- [end:transcription-sampling-params]
 
     # Default sampling parameters for transcription requests.
@@ -2300,6 +2303,9 @@ class TranslationRequest(OpenAIBaseModel):
     # Flattened stream option to simplify form data.
     stream_include_usage: bool | None = False
     stream_continuous_usage_stats: bool | None = False
+
+    max_completion_tokens: int | None = None
+    """The maximum number of tokens to generate."""
     # --8<-- [end:translation-extra-params]
 
     # Default sampling parameters for translation requests.
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index cea9924ebbaca..df9c06adb105a 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -293,8 +293,14 @@ class OpenAISpeechToText(OpenAIServing):
         try:
             # Unlike most decoder-only models, whisper generation length is not
             # constrained by the size of the input audio, which is mapped to a
-            # fixed-size log-mel-spectogram.
-            default_max_tokens = self.model_config.max_model_len
+            # fixed-size log-mel-spectogram. Still, allow for fewer tokens to be
+            # generated by respecting the extra completion tokens arg.
+            if request.max_completion_tokens is None:
+                default_max_tokens = self.model_config.max_model_len
+            else:
+                default_max_tokens = min(
+                    self.model_config.max_model_len, request.max_completion_tokens
+                )
             sampling_params = request.to_sampling_params(
                 default_max_tokens, self.default_sampling_params
             )

From f21f5ea38c6fa0e824bc00d5762d17e049199cd3 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Tue, 16 Dec 2025 14:50:59 -0500
Subject: [PATCH 03/49] [Refactor] Small refactor for group topk (#30562)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
---
 csrc/moe/grouped_topk_kernels.cu              | 13 ++++++++++---
 tests/v1/determinism/test_batch_invariance.py |  1 -
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
index 5fa367abd96f5..7229e420d3fe4 100644
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -446,9 +446,13 @@ __device__ inline T apply_sigmoid(T val) {
 
 template <ScoringFunc SF, typename T>
 __device__ inline T apply_scoring(T val) {
-  if constexpr (SF == SCORING_SIGMOID) {
+  if constexpr (SF == SCORING_NONE) {
+    return val;
+  } else if constexpr (SF == SCORING_SIGMOID) {
     return apply_sigmoid(val);
   } else {
+    static_assert(SF == SCORING_NONE || SF == SCORING_SIGMOID,
+                  "Unsupported ScoringFunc in apply_scoring");
     return val;
   }
 }
@@ -670,10 +674,13 @@ __global__ void group_idx_and_topk_idx_kernel(
 
   if (case_id < num_tokens) {
     if (if_proceed_next_topk) {
+      float scale = routed_scaling_factor;
+      if (renormalize) {
+        scale /= topk_sum;
+      }
       for (int i = lane_id; i < topk; i += WARP_SIZE) {
         float base = cuda_cast<float, T>(s_topk_value[i]);
-        float value = renormalize ? (base / topk_sum * routed_scaling_factor)
-                                  : (base * routed_scaling_factor);
+        float value = base * scale;
         topk_indices[i] = s_topk_idx[i];
         topk_values[i] = value;
       }
diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py
index 1c45e7fe366ff..7a58e1c9bad03 100644
--- a/tests/v1/determinism/test_batch_invariance.py
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -188,7 +188,6 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
     llm = LLM(
         model=model_name,
         tensor_parallel_size=tp_size,
-        # enable_prefix_caching=False,
         max_num_seqs=32,
         max_model_len=8192,
         dtype="bfloat16",  # not everything is supported

From 254a7f8fd613d6b6964abc277b73ca1f0b823cdb Mon Sep 17 00:00:00 2001
From: jiahanc <173873397+jiahanc@users.noreply.github.com>
Date: Tue, 16 Dec 2025 13:01:48 -0800
Subject: [PATCH 04/49] [Perf] Do FP4 quant before All gather on flashinfer
 trtllmgen MOE  (#30014)

Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
---
 .../device_communicators/all2all.py           | 29 ++++++++++---
 .../base_device_communicator.py               |  7 +++-
 .../device_communicators/cuda_communicator.py | 16 +++++---
 vllm/distributed/parallel_state.py            | 13 ++++--
 .../layers/fused_moe/fused_moe_method_base.py | 12 ++++++
 vllm/model_executor/layers/fused_moe/layer.py | 41 ++++++++++++++++++-
 .../layers/quantization/modelopt.py           | 25 ++++++++++-
 .../quantization/utils/flashinfer_fp4_moe.py  | 36 +++++++++-------
 vllm/utils/flashinfer.py                      | 17 ++++++++
 9 files changed, 165 insertions(+), 31 deletions(-)

diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index c40dde26b741f..7a4e81cf967de 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -64,7 +64,12 @@ class NaiveAll2AllManager(All2AllManagerBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
+        extra_tensors: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        if extra_tensors is not None:
+            raise NotImplementedError(
+                "extra_tensors is not supported for NaiveAll2AllManager"
+            )
         sp_size = self.tp_group.world_size if is_sequence_parallel else 1
         dp_metadata = get_forward_context().dp_metadata
         assert dp_metadata is not None
@@ -76,6 +81,7 @@ class NaiveAll2AllManager(All2AllManagerBase):
         router_logits = self.naive_multicast(
             router_logits, cu_tokens_across_sp_cpu, is_sequence_parallel
         )
+
         return hidden_states, router_logits
 
     def combine(
@@ -113,7 +119,11 @@ class AgRsAll2AllManager(All2AllManagerBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        extra_tensors: list[torch.Tensor] | None = None,
+    ) -> (
+        tuple[torch.Tensor, torch.Tensor]
+        | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]
+    ):
         """
         Gather hidden_states and router_logits from all dp ranks.
         """
@@ -121,15 +131,22 @@ class AgRsAll2AllManager(All2AllManagerBase):
         assert dp_metadata is not None
         sizes = dp_metadata.get_chunk_sizes_across_dp_rank()
         assert sizes is not None
-
         dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
         assert sizes[dist_group.rank_in_group] == hidden_states.shape[0]
-        hidden_states, router_logits = dist_group.all_gatherv(
-            [hidden_states, router_logits],
+
+        tensors_to_gather = [hidden_states, router_logits]
+        if extra_tensors is not None:
+            tensors_to_gather.extend(extra_tensors)
+
+        gathered_tensors = dist_group.all_gatherv(
+            tensors_to_gather,
             dim=0,
             sizes=sizes,
         )
-        return hidden_states, router_logits
+
+        if extra_tensors is not None:
+            return (gathered_tensors[0], gathered_tensors[1], gathered_tensors[2:])
+        return gathered_tensors[0], gathered_tensors[1]
 
     def combine(
         self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
@@ -204,6 +221,7 @@ class PPLXAll2AllManager(All2AllManagerBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
+        extra_tensors: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
 
@@ -251,6 +269,7 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
+        extra_tensors: list[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
 
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index 3a849da70e4cb..caeff54406b59 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
+from typing import Any
 from weakref import WeakValueDictionary
 
 import torch
@@ -68,7 +69,11 @@ class All2AllManagerBase:
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
-    ):
+        extra_tensors: list[torch.Tensor] | None = None,
+    ) -> Any:
+        # Subclasses should either:
+        # - implement handling for extra_tensors, or
+        # - raise a clear error if extra_tensors is not supported.
         raise NotImplementedError
 
     def set_num_sms(self, num_sms: int):
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index cd9c267beb5b5..9542498c453ec 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -318,17 +318,23 @@ class CudaCommunicator(DeviceCommunicatorBase):
 
         return output_list
 
-    def dispatch(
+    def dispatch(  # type: ignore[override]
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        extra_tensors: list[torch.Tensor] | None = None,
+    ) -> (
+        tuple[torch.Tensor, torch.Tensor]
+        | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]
+    ):
         assert self.all2all_manager is not None
-        hidden_states, router_logits = self.all2all_manager.dispatch(
-            hidden_states, router_logits, is_sequence_parallel
+        return self.all2all_manager.dispatch(
+            hidden_states,
+            router_logits,
+            is_sequence_parallel,
+            extra_tensors,  # type: ignore[call-arg]
         )
-        return hidden_states, router_logits
 
     def combine(
         self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 338cb1f1814b5..f5ada5a009ec3 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1007,10 +1007,17 @@ class GroupCoordinator:
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         is_sequence_parallel: bool = False,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        extra_tensors: list[torch.Tensor] | None = None,
+    ) -> (
+        tuple[torch.Tensor, torch.Tensor]
+        | tuple[torch.Tensor, torch.Tensor, list[torch.Tensor]]
+    ):
         if self.device_communicator is not None:
-            return self.device_communicator.dispatch(
-                hidden_states, router_logits, is_sequence_parallel
+            return self.device_communicator.dispatch(  # type: ignore[call-arg]
+                hidden_states,
+                router_logits,
+                is_sequence_parallel,
+                extra_tensors,
             )
         else:
             return hidden_states, router_logits
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
index 8c9d8a2777d58..a46e3972ed8e3 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -71,6 +71,18 @@ class FusedMoEMethodBase(QuantizeMethodBase):
             "implementation based on the prepare_finalize"
         )
 
+    def prepare_dp_allgather_tensor(
+        self,
+        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch."""
+        raise NotImplementedError(
+            "Method 'prepare_dp_allgather_tensor' is not implemented in "
+            f"{self.__class__.__name__}."
+        )
+
     @abstractmethod
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index cc3afade709d9..b39ce415a0f83 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -44,6 +44,7 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
     is_flashinfer_supporting_global_sf,
 )
 from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import (
     aux_stream,
@@ -1933,10 +1934,46 @@ class FusedMoE(CustomOp):
         )
 
         with sp_ctx:
+            extra_tensors = None
             if do_naive_dispatch_combine:
-                hidden_states_combined, router_logits = get_ep_group().dispatch(
-                    hidden_states, router_logits, self.is_sequence_parallel
+                # Avoid circular import
+                from vllm.model_executor.layers.quantization.modelopt import (
+                    ModelOptNvFp4FusedMoE,
                 )
+
+                post_quant_allgather = (
+                    has_flashinfer_trtllm_fused_moe()
+                    and self.quant_method is not None
+                    and self.dp_size > 1
+                    and self.use_ep
+                    and isinstance(self.quant_method, ModelOptNvFp4FusedMoE)
+                )
+                if post_quant_allgather:
+                    hidden_states_to_dispatch, extra_tensors = (
+                        self.quant_method.prepare_dp_allgather_tensor(
+                            self, hidden_states, router_logits
+                        )
+                    )
+                else:
+                    hidden_states_to_dispatch = hidden_states
+
+                dispatch_res = get_ep_group().dispatch(
+                    hidden_states_to_dispatch,
+                    router_logits,
+                    self.is_sequence_parallel,
+                    extra_tensors=extra_tensors,
+                )
+                if extra_tensors is not None:
+                    hidden_states_combined, router_logits, extra_tensors_combined = (
+                        dispatch_res
+                    )
+                    hidden_states_combined = (
+                        hidden_states_combined,
+                        extra_tensors_combined[0],
+                    )
+                else:
+                    hidden_states_combined, router_logits = dispatch_res
+
             # Run shared experts before matrix multiply.
             # because matrix multiply maybe modify the hidden_states.
             if has_separate_shared_experts and not use_shared_experts_stream:
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index f71854e6b63c5..d5d7e7bfaae73 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1522,6 +1522,24 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 w2_blockscale_swizzled, requires_grad=False
             )
 
+    def prepare_dp_allgather_tensor(
+        self,
+        layer: FusedMoE,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        """Optionally prepare extra tensors to carry through DP allgather/EP."""
+        import flashinfer
+
+        a1_gscale = layer.w13_input_scale_quant
+        hidden_states_fp4, hidden_states_sf = flashinfer.fp4_quantize(
+            hidden_states,
+            a1_gscale,
+            is_sf_swizzled_layout=False,
+        )
+        extra_tensors: list[torch.Tensor] = [hidden_states_sf]
+        return hidden_states_fp4, extra_tensors
+
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
@@ -1576,8 +1594,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 e_score_correction_bias=layer.e_score_correction_bias,
             )
 
+        # Hidden_states in select_experts is only used to extract metadata
+        if isinstance(x, tuple):
+            x_routing, _ = x
+        else:
+            x_routing = x
         topk_weights, topk_ids, _ = layer.select_experts(
-            hidden_states=x,
+            hidden_states=x_routing,
             router_logits=router_logits,
         )
 
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index 76bce8a8d98d6..1d410316d6299 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -238,7 +238,7 @@ def prepare_static_weights_for_trtllm_fp4_moe(
 
 def flashinfer_trtllm_fp4_moe(
     layer: torch.nn.Module,
-    x: torch.Tensor,
+    x: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
     router_logits: torch.Tensor,
     top_k: int,
     global_num_experts: int,
@@ -269,12 +269,16 @@ def flashinfer_trtllm_fp4_moe(
     from vllm.model_executor.models.llama4 import Llama4MoE
 
     # Quantize input to FP4
-    a1_gscale = layer.w13_input_scale_quant
-    (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
-        x,
-        a1_gscale,
-        is_sf_swizzled_layout=False,
-    )
+    if isinstance(x, tuple):
+        hidden_states_fp4, hidden_states_scale_linear_fp4 = x
+    else:
+        # hidden_states is the already quantized
+        a1_gscale = layer.w13_input_scale_quant
+        (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
+            x,
+            a1_gscale,
+            is_sf_swizzled_layout=False,
+        )
 
     # Determine routing method type
     use_llama4_routing = custom_routing_function is Llama4MoE.custom_routing_function
@@ -360,13 +364,17 @@ def flashinfer_trtllm_fp4_routed_moe(
         torch.bfloat16
     ).view(torch.int16)
 
-    # Quantize input to FP4
-    a1_gscale = layer.w13_input_scale_quant
-    (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
-        x,
-        a1_gscale,
-        is_sf_swizzled_layout=False,
-    )
+    if isinstance(x, tuple):
+        # Hidden_states is the already quantized
+        hidden_states_fp4, hidden_states_scale_linear_fp4 = x
+    else:
+        # Quantize input to FP4
+        a1_gscale = layer.w13_input_scale_quant
+        (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize(
+            x,
+            a1_gscale,
+            is_sf_swizzled_layout=False,
+        )
 
     # Call TRT-LLM FP4 block-scale MoE kernel
     out = flashinfer.fused_moe.trtllm_fp4_block_scale_routed_moe(
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 5019b771f4a14..1c2710be3173b 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -184,6 +184,23 @@ def has_flashinfer_cutedsl() -> bool:
     )
 
 
+@functools.cache
+def has_flashinfer_trtllm_fused_moe() -> bool:
+    """Return `True` if FlashInfer TRTLLM fused MoE is available."""
+    if not has_flashinfer_moe():
+        return False
+    required_functions = [
+        ("flashinfer.fused_moe", "trtllm_fp8_block_scale_moe"),
+        ("flashinfer.fused_moe", "trtllm_fp8_per_tensor_scale_moe"),
+        ("flashinfer.fused_moe", "trtllm_fp4_block_scale_moe"),
+    ]
+    for module_name, attr_name in required_functions:
+        mod = _get_submodule(module_name)
+        if not mod or not hasattr(mod, attr_name):
+            return False
+    return True
+
+
 @functools.cache
 def has_flashinfer_cutlass_fused_moe() -> bool:
     """Return `True` if FlashInfer CUTLASS fused MoE is available."""

From 9fec0e13d512b6b9082e40297582d8052f434610 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 16 Dec 2025 17:10:16 -0500
Subject: [PATCH 05/49] [Attention] Cache attention metadata builds across
 hybrid KV-cache groups (#29627)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Stanislaw Wozniak <stw@zurich.ibm.com>
---
 .../attention/test_chunked_local_attention.py |  2 +-
 .../layers/chunked_local_attention.py         | 16 +++++++---
 vllm/envs.py                                  |  4 +--
 vllm/v1/attention/backends/flash_attn.py      | 13 ++++++++
 vllm/v1/attention/backends/mamba2_attn.py     | 27 ++++++++++++++++
 vllm/v1/attention/backends/utils.py           | 32 ++++++++++++++++---
 vllm/v1/worker/gpu_model_runner.py            | 24 +++++++++++++-
 7 files changed, 105 insertions(+), 13 deletions(-)

diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py
index faace3473a281..4529c2cfc29b6 100644
--- a/tests/v1/attention/test_chunked_local_attention.py
+++ b/tests/v1/attention/test_chunked_local_attention.py
@@ -172,7 +172,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
     )
 
     # Call the function
-    result = make_local_attention_virtual_batches(
+    result, _ = make_local_attention_virtual_batches(
         attn_chunk_size, common_attn_metadata, block_size
     )
 
diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py
index 0ced0028ded9e..7e3794d408332 100644
--- a/vllm/attention/layers/chunked_local_attention.py
+++ b/vllm/attention/layers/chunked_local_attention.py
@@ -4,7 +4,7 @@ import functools
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
+from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
@@ -51,11 +51,19 @@ def create_chunked_local_attention_backend(
             common_prefix_len: int,
             common_attn_metadata: CommonAttentionMetadata,
             fast_build: bool = False,
-        ) -> AttentionMetadata:
-            common_attn_metadata = make_local_attention_virtual_batches(
+        ):
+            cm, make_virtual_batches_block_table = make_local_attention_virtual_batches(
                 attention_chunk_size, common_attn_metadata, block_size
             )
-            return super().build(common_prefix_len, common_attn_metadata, fast_build)
+            metadata = super().build(common_prefix_len, cm, fast_build)
+            metadata.make_virtual_batches_block_table = make_virtual_batches_block_table
+            return metadata
+
+        def update_block_table(
+            self, metadata, blk_table: torch.Tensor, slot_mapping: torch.Tensor
+        ):
+            blk_table = metadata.make_virtual_batches_block_table(blk_table)
+            return super().update_block_table(metadata, blk_table, slot_mapping)
 
     attn_backend = subclass_attention_backend(
         name_prefix=prefix,
diff --git a/vllm/envs.py b/vllm/envs.py
index d0f2798096263..7e072a588591c 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -207,7 +207,7 @@ if TYPE_CHECKING:
     VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
-    VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True
     VLLM_ENABLE_RESPONSES_API_STORE: bool = False
     VLLM_USE_TRTLLM_ATTENTION: str | None = None
     VLLM_NVFP4_GEMM_BACKEND: str | None = None
@@ -1430,7 +1430,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # kv-cache memory usage and enable longer contexts)
     # TODO(lucas): Remove this flag once latency regression is resolved.
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": lambda: bool(
-        int(os.getenv("VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))
+        int(os.getenv("VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "1"))
     ),
     # Enables support for the "store" option in the OpenAI Responses API.
     # When set to 1, vLLM's OpenAI server will retain the input and output
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index f5ad98cf2125c..3445e998d6371 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashAttention."""
 
+import copy
 from dataclasses import dataclass
 from typing import ClassVar
 
@@ -250,6 +251,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
         if get_flash_attn_version() == 3
         else AttentionCGSupport.UNIFORM_BATCH
     )
+    supports_update_block_table: bool = True
 
     def __init__(
         self,
@@ -493,6 +495,17 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
         )
         return attn_metadata
 
+    def update_block_table(
+        self,
+        metadata: FlashAttentionMetadata,
+        blk_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
+    ) -> FlashAttentionMetadata:
+        new_metadata = copy.copy(metadata)
+        new_metadata.block_table = blk_table
+        new_metadata.slot_mapping = slot_mapping
+        return new_metadata
+
     def use_cascade_attention(self, *args, **kwargs) -> bool:
         return use_cascade_attention(*args, **kwargs)
 
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index bf1d8f09ab0ac..f923371283aa0 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
 import itertools
 from dataclasses import dataclass
 
@@ -134,6 +135,8 @@ class Mamba2AttentionMetadata:
 class Mamba2AttentionMetadataBuilder(
     BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]
 ):
+    supports_update_block_table: bool = True
+
     def __init__(
         self,
         kv_cache_spec: AttentionSpec,
@@ -346,3 +349,27 @@ class Mamba2AttentionMetadataBuilder(
             num_computed_tokens_p=num_computed_tokens_p,
         )
         return attn_metadata
+
+    def update_block_table(
+        self,
+        metadata: Mamba2AttentionMetadata,
+        blk_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
+    ) -> Mamba2AttentionMetadata:
+        new_metadata = copy.copy(metadata)
+        prefix_caching = self.vllm_config.cache_config.enable_prefix_caching
+        state_indices_t = blk_table if prefix_caching else blk_table[:, 0]
+        num_reqs = blk_table.shape[0]
+
+        # For CUDA graphs, copy to persistent buffer
+        if (
+            metadata.num_prefills == 0
+            and num_reqs <= self.decode_cudagraph_max_bs
+            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        ):
+            persistent_state_indices_t = self.state_indices_tensor[:num_reqs]
+            persistent_state_indices_t.copy_(state_indices_t, non_blocking=True)
+            state_indices_t = persistent_state_indices_t
+
+        new_metadata.state_indices_tensor = state_indices_t
+        return new_metadata
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 1cbe929fc57a8..56763f4b52539 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -4,6 +4,7 @@ import abc
 import enum
 import functools
 from abc import abstractmethod
+from collections.abc import Callable
 from dataclasses import dataclass, field, fields, make_dataclass
 from typing import (
     TYPE_CHECKING,
@@ -317,6 +318,9 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     # If not, set this to None. Otherwise set it to the query
     # length that will be pulled into the front of the batch.
     reorder_batch_threshold: int | None = None
+    # Does this backend/builder support updating the block table in existing
+    # metadata
+    supports_update_block_table: bool = False
 
     @abstractmethod
     def __init__(
@@ -387,6 +391,21 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
         """
         raise NotImplementedError
 
+    def update_block_table(
+        self,
+        metadata: M,
+        blk_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
+    ) -> M:
+        """
+        Update the block table for the attention metadata.
+        Faster when theres multiple kv-cache groups that create virtually the
+        same metadata but just with different block tables.
+
+        Only needs to be implemented if supports_update_block_table is True.
+        """
+        raise NotImplementedError
+
     def build_for_cudagraph_capture(
         self, common_attn_metadata: CommonAttentionMetadata
     ) -> M:
@@ -603,7 +622,7 @@ def make_local_attention_virtual_batches(
     attn_chunk_size: int,
     common_attn_metadata: CommonAttentionMetadata,
     block_size: int = 0,
-) -> CommonAttentionMetadata:
+) -> tuple[CommonAttentionMetadata, Callable[[torch.Tensor], torch.Tensor]]:
     query_start_loc_np = common_attn_metadata.query_start_loc_cpu.numpy()
     seq_lens_np = common_attn_metadata.seq_lens_cpu.numpy()
     block_table = common_attn_metadata.block_table_tensor
@@ -715,9 +734,12 @@ def make_local_attention_virtual_batches(
     # tensor first, which recovers perf.
     batch_indices_torch = torch.from_numpy(batch_indices)
     block_indices_torch = torch.from_numpy(block_indices)
-    block_table_local = block_table[batch_indices_torch, block_indices_torch].view(
-        virtual_batches, -1
-    )
+
+    # Save as a lambda so we can return this for update_block_table
+    make_block_table = lambda block_table: block_table[
+        batch_indices_torch, block_indices_torch
+    ].view(virtual_batches, -1)
+    block_table_local = make_block_table(block_table)
 
     query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local)
     seq_lens_cpu = torch.from_numpy(seqlens_k_local)
@@ -736,7 +758,7 @@ def make_local_attention_virtual_batches(
         causal=True,
         _seq_lens_cpu=seq_lens_cpu,
         _num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
-    )
+    ), make_block_table
 
 
 def make_kv_sharing_fast_prefill_common_attn_metadata(
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 1aa2ec6bb655c..179f713c4d86a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1630,6 +1630,15 @@ class GPUModelRunner(
                 logits_indices
             )
 
+        # Cache attention metadata builds across hybrid KV-cache groups
+        # The only thing that changes between different hybrid KV-cache groups when the
+        # same metadata builder and KVCacheSpec is the same is the block table, so we
+        # can cache the attention metadata builds and just update the block table using
+        # `builder.update_block_table` if the builder supports it.
+        cached_attn_metadata: dict[
+            tuple[KVCacheSpec, type[AttentionMetadataBuilder]], AttentionMetadata
+        ] = {}
+
         def _build_attn_group_metadata(
             kv_cache_gid: int,
             attn_gid: int,
@@ -1637,13 +1646,15 @@ class GPUModelRunner(
             ubid: int | None = None,
         ) -> None:
             attn_group = self.attn_groups[kv_cache_gid][attn_gid]
+            builder = attn_group.get_metadata_builder(ubid or 0)
+            cache_key = (kv_cache_groups[kv_cache_gid].kv_cache_spec, type(builder))
+
             cascade_attn_prefix_len = (
                 cascade_attn_prefix_lens[kv_cache_gid][attn_gid]
                 if cascade_attn_prefix_lens
                 else 0
             )
 
-            builder = attn_group.get_metadata_builder(ubid or 0)
             extra_attn_metadata_args = {}
             if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
                 assert ubid is None, "UBatching not supported with GDN yet"
@@ -1658,12 +1669,23 @@ class GPUModelRunner(
                 attn_metadata_i = builder.build_for_cudagraph_capture(
                     common_attn_metadata
                 )
+            elif (
+                cache_key in cached_attn_metadata
+                and builder.supports_update_block_table
+            ):
+                attn_metadata_i = builder.update_block_table(
+                    cached_attn_metadata[cache_key],
+                    common_attn_metadata.block_table_tensor,
+                    common_attn_metadata.slot_mapping,
+                )
             else:
                 attn_metadata_i = builder.build(
                     common_prefix_len=cascade_attn_prefix_len,
                     common_attn_metadata=common_attn_metadata,
                     **extra_attn_metadata_args,
                 )
+                if builder.supports_update_block_table:
+                    cached_attn_metadata[cache_key] = attn_metadata_i
 
             if ubid is None:
                 assert isinstance(attn_metadata, dict)

From f5f51e5931ffd99afe69696b60765b88d3eb13f2 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Tue, 16 Dec 2025 14:18:17 -0800
Subject: [PATCH 06/49] [Core][MM] Optimize encoder cache manager by operating
 with embeddings only (#30475)

Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Sun Kim <sunytokki@gmail.com>
---
 .../multimodal/processing/test_mllama4.py     |  4 +-
 tests/multimodal/test_utils.py                | 92 +++++++++++++++++++
 tests/v1/core/test_encoder_cache_manager.py   | 79 +++++++++++++++-
 .../unit/test_ec_example_connector.py         |  2 +-
 .../ec_connector/example_connector.py         |  2 +-
 vllm/model_executor/models/qwen3_vl.py        |  8 +-
 vllm/multimodal/inputs.py                     | 39 +++++++-
 vllm/multimodal/profiling.py                  | 32 ++-----
 vllm/multimodal/registry.py                   |  2 +-
 vllm/v1/core/encoder_cache_manager.py         | 80 ++++++++--------
 vllm/v1/core/sched/scheduler.py               | 35 +++++--
 vllm/v1/request.py                            |  6 +-
 vllm/v1/worker/gpu_model_runner.py            | 49 +++-------
 vllm/v1/worker/utils.py                       |  6 ++
 14 files changed, 306 insertions(+), 130 deletions(-)

diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
index e5ff2d1391b62..325159965c803 100644
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -60,12 +60,12 @@ def test_profiling(model_id: str, max_model_len: int):
         total_num_patches.item() + num_tiles.item() + 3
     )  # image start, image, image end
 
-    profiled_tokens = profiler.get_mm_max_contiguous_tokens(
+    profiled_tokens = profiler.get_mm_max_tokens(
         max_model_len,
         mm_counts=mm_counts,
     )
 
-    assert total_tokens == profiled_tokens["image"]
+    assert total_num_patches == profiled_tokens["image"]
     assert total_tokens == sum(
         placeholder.length
         for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 636cd0ffd445e..02bb1f769baad 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -9,6 +9,7 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory
 
 import numpy as np
 import pytest
+import torch
 from PIL import Image, ImageChops
 
 from vllm.multimodal.image import convert_image_mode
@@ -410,6 +411,97 @@ def test_argsort_mm_positions(case):
     assert modality_idxs == expected_modality_idxs
 
 
+@pytest.mark.parametrize(
+    "is_embed,expected",
+    [
+        (None, 5),
+        (torch.tensor([True, True, True, True, True]), 5),
+        (torch.tensor([False, False, False, False, False]), 0),
+        (torch.tensor([True, False, True, False, True]), 3),
+        (torch.tensor([True]), 1),
+    ],
+)
+def test_placeholder_range_get_num_embeds(is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+    assert pr.get_num_embeds == expected
+
+
+@pytest.mark.parametrize(
+    "is_embed,expected",
+    [
+        (None, None),
+        (
+            torch.tensor([False, True, False, True, True]),
+            torch.tensor([0, 1, 1, 2, 3]),
+        ),
+        (torch.tensor([True, True, True]), torch.tensor([1, 2, 3])),
+    ],
+)
+def test_placeholder_range_embeds_cumsum(is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+
+    if expected is None:
+        assert pr.embeds_cumsum is None
+        return
+
+    assert torch.equal(pr.embeds_cumsum, expected)
+    # cached_property should return the same object on repeated access
+    assert pr.embeds_cumsum is pr.embeds_cumsum
+
+
+@pytest.mark.parametrize(
+    "is_embed,start_idx,end_idx,expected",
+    [
+        (None, 2, 4, (2, 4)),
+        (
+            torch.tensor([False, True, False, True, True]),
+            3,
+            5,
+            (1, 3),
+        ),
+        (
+            torch.tensor([False, True, False, True, True]),
+            0,
+            2,
+            (0, 1),
+        ),
+        (
+            torch.tensor([True, False, True, False]),
+            2,
+            2,
+            (1, 1),
+        ),
+    ],
+)
+def test_placeholder_range_get_embeds_indices_in_range(
+    is_embed, start_idx, end_idx, expected
+):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+    assert pr.get_embeds_indices_in_range(start_idx, end_idx) == expected
+
+
+@pytest.mark.parametrize(
+    "offset,is_embed,expected",
+    [
+        (0, None, [(0, 4)]),
+        (
+            2,
+            torch.tensor([False, True, False, True, True]),
+            [(3, 3), (5, 6)],
+        ),
+        (0, torch.tensor([True, True, True, True]), [(0, 3)]),
+        (0, torch.tensor([False, False, False, False]), []),
+    ],
+)
+def test_placeholder_range_extract_embeds_range(offset, is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=offset, length=length, is_embed=is_embed)
+    assert pr.extract_embeds_range() == expected
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
diff --git a/tests/v1/core/test_encoder_cache_manager.py b/tests/v1/core/test_encoder_cache_manager.py
index 8a52b5bd78977..511ff48c401ca 100644
--- a/tests/v1/core/test_encoder_cache_manager.py
+++ b/tests/v1/core/test_encoder_cache_manager.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
+import torch
 
 from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange
 from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
@@ -23,7 +24,7 @@ class MockRequest:
             )
             self.mm_features.append(feature)
 
-    def get_num_encoder_tokens(self, input_id: int) -> int:
+    def get_num_encoder_embeds(self, input_id: int) -> int:
         return self._token_counts[input_id]
 
 
@@ -162,8 +163,8 @@ def test_schedule_request_multi_images_respect_space_limit():
 
     num_tokens_to_schedule = 0
     assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule)
-    num_tokens_to_schedule += req.get_num_encoder_tokens(0)
-    compute_budget -= req.get_num_encoder_tokens(0)
+    num_tokens_to_schedule += req.get_num_encoder_embeds(0)
+    compute_budget -= req.get_num_encoder_embeds(0)
 
     assert not manager.can_allocate(req, 1, compute_budget, num_tokens_to_schedule)
 
@@ -174,7 +175,75 @@ def test_schedule_request_multi_images_respect_compute_limit():
     compute_budget = 10
     num_tokens_to_schedule = 0
     assert manager.can_allocate(req, 0, compute_budget, num_tokens_to_schedule)
-    num_tokens_to_schedule += req.get_num_encoder_tokens(0)
-    compute_budget -= req.get_num_encoder_tokens(0)
+    num_tokens_to_schedule += req.get_num_encoder_embeds(0)
+    compute_budget -= req.get_num_encoder_embeds(0)
 
     assert not manager.can_allocate(req, 1, compute_budget, num_tokens_to_schedule)
+
+
+def test_encoder_cache_with_is_embed_mask():
+    class MockRequestWithMask(MockRequest):
+        def get_num_encoder_embeds(self, input_id: int) -> int:
+            return self.mm_features[input_id].mm_position.get_num_embeds
+
+    is_embed = torch.zeros(100, dtype=torch.bool)
+    is_embed[torch.tensor([5, 15, 25, 35, 45, 55, 65, 75])] = True
+
+    request = MockRequestWithMask("r1", ["img1"], [100])
+    request.mm_features[0] = MultiModalFeatureSpec(
+        data=None,
+        modality="image",
+        identifier="img1",
+        mm_position=PlaceholderRange(offset=0, length=100, is_embed=is_embed),
+    )
+
+    manager = EncoderCacheManager(cache_size=100)
+    manager.allocate(request, 0)
+
+    assert manager.num_free_slots == 92
+    assert "img1" in manager.cached
+
+    old_size = 100
+    new_size = request.mm_features[0].mm_position.get_num_embeds
+    assert new_size == 8
+    savings_ratio = old_size / new_size
+    assert savings_ratio == 12.5
+
+
+def test_encoder_cache_mask_based_retrieval():
+    class MockRequestWithMask(MockRequest):
+        def get_num_encoder_embeds(self, input_id: int) -> int:
+            return self.mm_features[input_id].mm_position.get_num_embeds
+
+    is_embed = torch.tensor(
+        [False, False, True, True, False, True, True, True, False, False]
+    )
+
+    request = MockRequestWithMask("r1", ["img1"], [10])
+    request.mm_features[0] = MultiModalFeatureSpec(
+        data=None,
+        modality="image",
+        identifier="img1",
+        mm_position=PlaceholderRange(offset=0, length=10, is_embed=is_embed),
+    )
+
+    manager = EncoderCacheManager(cache_size=50)
+    manager.allocate(request, 0)
+
+    assert request.mm_features[0].mm_position.get_num_embeds == 5
+
+    start_idx = 2
+    end_idx = 8
+    num_embeds_before = is_embed[:start_idx].sum().item()
+    num_embeds_in_range = is_embed[start_idx:end_idx].sum().item()
+
+    assert num_embeds_before == 0
+    assert num_embeds_in_range == 5
+
+    start_idx = 0
+    end_idx = 5
+    num_embeds_before = is_embed[:start_idx].sum().item() if start_idx > 0 else 0
+    num_embeds_in_range = is_embed[start_idx:end_idx].sum().item()
+
+    assert num_embeds_before == 0
+    assert num_embeds_in_range == 2
diff --git a/tests/v1/ec_connector/unit/test_ec_example_connector.py b/tests/v1/ec_connector/unit/test_ec_example_connector.py
index 7e9eb21310031..9ed82e1cef823 100644
--- a/tests/v1/ec_connector/unit/test_ec_example_connector.py
+++ b/tests/v1/ec_connector/unit/test_ec_example_connector.py
@@ -38,7 +38,7 @@ class MockRequest:
             )
             self.mm_features.append(feature)
 
-    def get_num_encoder_tokens(self, input_id: int) -> int:
+    def get_num_encoder_embeds(self, input_id: int) -> int:
         assert input_id < len(self._token_counts)
         return self._token_counts[input_id]
 
diff --git a/vllm/distributed/ec_transfer/ec_connector/example_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py
index 5f2eff5a8e6a8..c9aad9e9fc8f3 100644
--- a/vllm/distributed/ec_transfer/ec_connector/example_connector.py
+++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py
@@ -144,7 +144,7 @@ class ECExampleConnector(ECConnectorBase):
         Update ECConnector state after encoder cache allocation.
         """
         mm_hash = request.mm_features[index].identifier
-        num_encoder_token = request.get_num_encoder_tokens(index)
+        num_encoder_token = request.get_num_encoder_embeds(index)
         # Insert mm_hash only if this block has not been recorded yet.
         self._mm_datas_need_loads[mm_hash] = num_encoder_token
 
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index c0589986d1fe8..4838f68e06f70 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -713,17 +713,13 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         mm_counts: Mapping[str, int],
     ) -> int:
         target_width, target_height = self.get_image_size_with_most_features()
-        video_soft_tokens = self.get_num_video_tokens(
+        num_video_soft_tokens = self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
             num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts),
             image_processor=None,
         )
-
-        # NOTE: By default in Qwen3-VL, one video token is converted to
-        # "<{timestamp} seconds>" (on average 9.5 tokens) + vision_start_token + video_token + vision_end_token # noqa: E501
-        formatted_video_soft_tokens = video_soft_tokens * 12.5
-        return int(formatted_video_soft_tokens)
+        return num_video_soft_tokens
 
     def _calculate_timestamps(
         self, indices: list[int] | torch.Tensor, video_fps: float, merge_size: int
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 6b1cbbe24e2e7..fa69818a7b1f8 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
-from functools import partial
+from functools import cached_property, partial
 from itertools import accumulate
 from typing import (
     TYPE_CHECKING,
@@ -169,11 +169,42 @@ class PlaceholderRange:
     between `offset` and `offset + length` to assign embeddings to.
     """
 
-    def get_num_embeds(self) -> int:
+    @cached_property
+    def embeds_cumsum(self) -> torch.Tensor | None:
         if self.is_embed is None:
+            return None
+
+        return self.is_embed.cumsum(dim=0)
+
+    @cached_property
+    def get_num_embeds(self) -> int:
+        if self.embeds_cumsum is None:
             return self.length
 
-        return int(self.is_embed.sum().item())
+        return int(self.embeds_cumsum[-1])
+
+    def get_embeds_indices_in_range(
+        self, start_idx: int, end_idx: int
+    ) -> tuple[int, int]:
+        """
+        Returns the starting and ending indices of the embeddings of encoder outputs
+        in the range of [start_idx, end_idx) in the placeholders.
+
+        For example, given:
+        PlaceholderRange(offset=2, length=5, is_embed=[False, True, False, True, True])
+
+        If start_idx=3 and end_idx=5, the output is (1, 3) because we want to get
+        the second and the third embeddings from the encoder output.
+        """
+        if self.embeds_cumsum is None:
+            return start_idx, end_idx
+
+        embeds_start_idx = (
+            int(self.embeds_cumsum[start_idx - 1]) if start_idx > 0 else 0
+        )
+        embeds_end_idx = int(self.embeds_cumsum[end_idx - 1])
+
+        return embeds_start_idx, embeds_end_idx
 
     def extract_embeds_range(self) -> list[tuple[int, int]]:
         """Extract the start and end indices of the embedded region in prompt.
@@ -188,7 +219,7 @@ class PlaceholderRange:
             Returns full placeholder range if `is_embed` is `None`.
         """
         if self.is_embed is None:
-            return [(self.offset, self.offset + self.length)]
+            return [(self.offset, self.offset + self.length - 1)]
 
         mask_i = self.is_embed.int()
         starts = torch.nonzero(
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index cb70041e9744f..a690948f759e9 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -274,15 +274,11 @@ class MultiModalProfiler(Generic[_I]):
     def _get_mm_num_tokens(
         self,
         mm_inputs: MultiModalInputs,
-        mm_embeddings_only: bool = True,
     ) -> Mapping[str, int]:
         placeholders_by_modality = mm_inputs["mm_placeholders"]
 
         return {
-            modality: sum(
-                item.get_num_embeds() if mm_embeddings_only else item.length
-                for item in placeholders
-            )
+            modality: sum(item.get_num_embeds for item in placeholders)
             for modality, placeholders in placeholders_by_modality.items()
         }
 
@@ -328,12 +324,15 @@ class MultiModalProfiler(Generic[_I]):
             multi_modal_placeholders=mm_inputs["mm_placeholders"],
         )
 
-    def _get_mm_max_tokens(
+    def get_mm_max_tokens(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int] | None = None,
-        mm_embeddings_only: bool = True,
     ) -> Mapping[str, int]:
+        """
+        Returns the maximum number of embeddings per item of each modality, excluding
+        any break/text tokens in-between multimodal embeddings/encoder outputs.
+        """
         if mm_counts is None:
             mm_counts = self.get_mm_limits()
 
@@ -349,21 +348,4 @@ class MultiModalProfiler(Generic[_I]):
             }
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
-        return self._get_mm_num_tokens(mm_inputs, mm_embeddings_only=mm_embeddings_only)
-
-    def get_mm_max_contiguous_tokens(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int] | None = None,
-    ) -> Mapping[str, int]:
-        """
-        Returns the maximum length of the multimodal (image placeholders+text)
-        tokens, including any break/text tokens in-between image embeddings.
-
-        `<im_start> [IMG] [IMG] [IMG] <row_break> [IMG] [IMG] [IMG] <im_end>`
-        Returns 9, even when the number of image embeddings is 6.
-
-        This is important to take into account when profiling and
-        initializing the encoder cache size.
-        """
-        return self._get_mm_max_tokens(seq_len, mm_counts, mm_embeddings_only=False)
+        return self._get_mm_num_tokens(mm_inputs)
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 00a84f9dec4f7..1e7fe8648ab71 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -164,7 +164,7 @@ class MultiModalRegistry:
             profiler.get_mm_limits() if profiler_limits is None else profiler_limits
         )
 
-        return profiler.get_mm_max_contiguous_tokens(
+        return profiler.get_mm_max_tokens(
             seq_len,
             {modality: 1 for modality, limit in profiler_limits.items() if limit > 0},
         )
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index 50f738713590b..d73c05d2cf80b 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -39,20 +39,26 @@ class EncoderCacheManager:
     space for new embeddings.
     Oldest cached embeddings with no request referenced will be first evicted.
 
+    NOTE: The EncoderCacheManager operates on the level of multimodal embeddings
+    instead of encoder tokens (i.e. all tokens that represent the multimodal data
+    in the input sequence). This means all break/text tokens in-between multimodal
+    embeddings are not considered with respect to the cache size and the number
+    of free slots.
+
     Args:
         cache_size: Limit the size of the cache, measured by the number of
-                    tokens from the input sequence.
+                    encoder embeddings from the input sequence.
 
     Attributes:
-        cache_size: Total cache capacity in encoder tokens.
-        num_free_slots: Current available cache capacity in encoder tokens.
+        cache_size: Total cache capacity in encoder embeddings.
+        num_free_slots: Current available cache capacity in encoder embeddings.
         num_freeable_slots: Capacity that can be immediately reclaimed by
-            evicting entries with zero references (in encoder tokens).
+            evicting entries with zero references (in encoder embeddings).
         cached: Mapping from mm_hash to a set of request IDs that currently
             reference the cached entry. If the set is empty, the entry exists
             but is not referenced by any request and is eligible for
             reclamation.
-        freeable: List of tuples (mm_hash, num_tokens) representing entries
+        freeable: List of tuples (mm_hash, num_encoder_embeds) representing entries
             whose no current running request is needed and that can be freed to
             make space when needed.
         freed: List of mm_hash strings that were actually evicted since the
@@ -67,7 +73,7 @@ class EncoderCacheManager:
         # mm_hash of mm_data => ids of requests that reference the mm_data
         self.cached: dict[str, set[str]] = {}
 
-        # mm_hash of mm_data => num_encoder_tokens of the mm_data
+        # mm_hash of mm_data => num_encoder_embeds of the mm_data
         self.freeable: OrderedDict[str, int] = OrderedDict()
         self.freed: list[str] = []
 
@@ -93,8 +99,8 @@ class EncoderCacheManager:
 
         # Cached but currently not referenced by any request
         if not self.cached[mm_hash]:
-            num_tokens = self.freeable.pop(mm_hash)
-            self.num_freeable_slots -= num_tokens
+            num_encoder_embeds = self.freeable.pop(mm_hash)
+            self.num_freeable_slots -= num_encoder_embeds
 
         self.cached[mm_hash].add(request.request_id)
         return True
@@ -104,7 +110,7 @@ class EncoderCacheManager:
         request: Request,
         input_id: int,
         encoder_compute_budget: int,
-        num_tokens_to_schedule: int,
+        num_embeds_to_schedule: int,
     ) -> bool:
         """Check if there's sufficient cache space for a multimodal input.
         If there is, return True and update EncoderCacheManager state.
@@ -121,9 +127,9 @@ class EncoderCacheManager:
         Args:
             request: The request containing the multimodal input.
             input_id: Index of the multimodal input within the request.
-            encoder_compute_budget: Number of encoder tokens allowed to be
+            encoder_compute_budget: Number of encoder embeddings allowed to be
                 computed when this method is invoked.
-            num_tokens_to_schedule: Number of tokens already scheduled to be
+            num_embeds_to_schedule: Number of encoder embeddings already scheduled to be
                 allocated with cache space when this method is invoked.
 
         Returns:
@@ -134,30 +140,30 @@ class EncoderCacheManager:
         Note: This method does not allocate physical memory for the encoder
         output but only the state of EncoderCacheManager.
         """
-        num_tokens = request.get_num_encoder_tokens(input_id)
+        num_embeds = request.get_num_encoder_embeds(input_id)
 
         # Not enough compute budget
-        if num_tokens > encoder_compute_budget:
+        if num_embeds > encoder_compute_budget:
             return False
 
-        num_tokens += num_tokens_to_schedule
+        num_embeds += num_embeds_to_schedule
 
         # Enough free slots
-        if num_tokens <= self.num_free_slots:
+        if num_embeds <= self.num_free_slots:
             return True
 
         # Not enough reclaimable slots
-        if num_tokens > self.num_freeable_slots:
+        if num_embeds > self.num_freeable_slots:
             return False
 
         # Not enough free slots but enough reclaimable slots
         # NOTE: Eviction takes place here, but physical memory is not freed
         # until model runner is notified by the scheduler output.
-        while num_tokens > self.num_free_slots:
-            mm_hash, num_free_token = self.freeable.popitem(last=False)
+        while num_embeds > self.num_free_slots:
+            mm_hash, num_free_embeds = self.freeable.popitem(last=False)
             del self.cached[mm_hash]
             self.freed.append(mm_hash)
-            self.num_free_slots += num_free_token
+            self.num_free_slots += num_free_embeds
         return True
 
     def allocate(self, request: Request, input_id: int) -> None:
@@ -176,16 +182,16 @@ class EncoderCacheManager:
         if mm_hash not in self.cached:
             self.cached[mm_hash] = set()
 
-        num_encoder_tokens = request.get_num_encoder_tokens(input_id)
+        num_encoder_embeds = request.get_num_encoder_embeds(input_id)
 
         # NOTE: Encoder cache should always have enough space for encoder inputs
         # that are scheduled since eviction takes place at can_allocate().
-        assert self.num_free_slots >= num_encoder_tokens
-        assert self.num_freeable_slots >= num_encoder_tokens
+        assert self.num_free_slots >= num_encoder_embeds
+        assert self.num_freeable_slots >= num_encoder_embeds
 
         self.cached[mm_hash].add(request_id)
-        self.num_free_slots -= num_encoder_tokens
-        self.num_freeable_slots -= num_encoder_tokens
+        self.num_free_slots -= num_encoder_embeds
+        self.num_freeable_slots -= num_encoder_embeds
 
     def get_cached_input_ids(self, request: Request) -> set[int]:
         """Get all cached multimodal input IDs for a request.
@@ -206,7 +212,7 @@ class EncoderCacheManager:
 
         When the reference set for the corresponding `mm_hash` becomes empty,
         the entry is appended to `freeable` and `num_freeable_slots` is
-        increased by the number of encoder tokens for that input.
+        increased by the number of encoder embeddings for that input.
 
         The entry is NOT physically freed until capacity is needed (e.g., by
         `can_allocate`).
@@ -218,9 +224,9 @@ class EncoderCacheManager:
             return
         self.cached[mm_hash].discard(req_id)
         if not self.cached[mm_hash]:
-            num_tokens = request.get_num_encoder_tokens(input_id)
-            self.freeable[mm_hash] = num_tokens
-            self.num_freeable_slots += num_tokens
+            num_encoder_embeds = request.get_num_encoder_embeds(input_id)
+            self.freeable[mm_hash] = num_encoder_embeds
+            self.num_freeable_slots += num_encoder_embeds
 
     def free(self, request: Request) -> None:
         """Free all encoder input cache reference held by *request*.
@@ -361,20 +367,20 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         request: Request,
         input_id: int,
         encoder_compute_budget: int,
-        num_tokens_to_schedule: int,
+        num_embeds_to_schedule: int,
     ) -> bool:
-        num_tokens = request.get_num_encoder_tokens(input_id)
+        num_encoder_embeds = request.get_num_encoder_embeds(input_id)
         # Not enough compute budget
-        if num_tokens > encoder_compute_budget:
+        if num_encoder_embeds > encoder_compute_budget:
             return False
 
-        num_tokens += num_tokens_to_schedule
+        num_encoder_embeds += num_embeds_to_schedule
         # Enough free slots
-        return num_tokens <= self.num_free_slots
+        return num_encoder_embeds <= self.num_free_slots
 
     def allocate(self, request: Request, input_id: int) -> None:
-        num_encoder_tokens = request.get_num_encoder_tokens(input_id)
-        self.num_free_slots -= num_encoder_tokens
+        num_encoder_embeds = request.get_num_encoder_embeds(input_id)
+        self.num_free_slots -= num_encoder_embeds
 
         mm_hash = request.mm_features[input_id].identifier
         self.freed.append(mm_hash)
@@ -392,5 +398,5 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
         return freed
 
     def free_encoder_input(self, request: Request, input_id: int) -> None:
-        num_tokens = request.get_num_encoder_tokens(input_id)
-        self.num_free_slots += num_tokens
+        num_encoder_embeds = request.get_num_encoder_embeds(input_id)
+        self.num_free_slots += num_encoder_embeds
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 754e0b9d08316..8e835ad096405 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -355,11 +355,11 @@ class Scheduler(SchedulerInterface):
                             if preempted_encoder_inputs:
                                 # Restore encoder compute budget if the preempted
                                 # request had encoder inputs scheduled in this step.
-                                num_tokens_to_restore = sum(
-                                    preempted_req.get_num_encoder_tokens(i)
+                                num_embeds_to_restore = sum(
+                                    preempted_req.get_num_encoder_embeds(i)
                                     for i in preempted_encoder_inputs
                                 )
-                                encoder_compute_budget += num_tokens_to_restore
+                                encoder_compute_budget += num_embeds_to_restore
                             req_index -= 1
                     else:
                         preempted_req = self.running.pop()
@@ -911,10 +911,11 @@ class Scheduler(SchedulerInterface):
         # multiple encoder inputs per request), we need to create temporary
         # trackers for accounting at the encoder input level.
         mm_hashes_to_schedule = set()
-        num_tokens_to_schedule = 0
+        num_embeds_to_schedule = 0
         for i, mm_feature in enumerate(mm_features):
             start_pos = mm_feature.mm_position.offset
             num_encoder_tokens = mm_feature.mm_position.length
+            num_encoder_embeds = mm_feature.mm_position.get_num_embeds
 
             # The encoder output is needed if the two ranges overlap:
             # [num_computed_tokens, num_computed_tokens + num_new_tokens) and
@@ -970,9 +971,8 @@ class Scheduler(SchedulerInterface):
             ):
                 num_new_tokens = start_pos - num_computed_tokens
                 break
-
             if not self.encoder_cache_manager.can_allocate(
-                request, i, encoder_compute_budget, num_tokens_to_schedule
+                request, i, encoder_compute_budget, num_embeds_to_schedule
             ):
                 # The encoder cache is full or the encoder budget is exhausted.
                 # NOTE(woosuk): We assume that the encoder input tokens should
@@ -992,14 +992,31 @@ class Scheduler(SchedulerInterface):
                     num_new_tokens = 0
                 break
 
+            # Calculate the number of embeddings to schedule in the current range
+            # of scheduled encoder placholder tokens.
+            start_idx_rel = max(0, num_computed_tokens - start_pos)
+            end_idx_rel = min(
+                num_encoder_tokens, num_computed_tokens + num_new_tokens - start_pos
+            )
+            curr_embeds_start, curr_embeds_end = (
+                mm_feature.mm_position.get_embeds_indices_in_range(
+                    start_idx_rel,
+                    end_idx_rel,
+                )
+            )
+            # There's no embeddings in the current range of encoder placeholder tokens
+            # so we can skip the encoder input.
+            if curr_embeds_end - curr_embeds_start == 0:
+                continue
+
             if self.ec_connector is not None and remote_cache_has_item[i]:
                 mm_hashes_to_schedule.add(request.mm_features[i].identifier)
                 external_load_encoder_input.append(i)
-                num_tokens_to_schedule += num_encoder_tokens
+                num_embeds_to_schedule += num_encoder_embeds
                 continue
 
-            num_tokens_to_schedule += num_encoder_tokens
-            encoder_compute_budget -= num_encoder_tokens
+            num_embeds_to_schedule += num_encoder_embeds
+            encoder_compute_budget -= num_encoder_embeds
             mm_hashes_to_schedule.add(request.mm_features[i].identifier)
             encoder_inputs_to_schedule.append(i)
 
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index a775e840e841c..f33059b80b894 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -209,10 +209,10 @@ class Request:
     def get_finished_reason(self) -> FinishReason | None:
         return RequestStatus.get_finished_reason(self.status)
 
-    def get_num_encoder_tokens(self, input_id: int) -> int:
+    def get_num_encoder_embeds(self, input_id: int) -> int:
         assert input_id < len(self.mm_features)
-        num_tokens = self.mm_features[input_id].mm_position.length
-        return num_tokens
+        num_embeds = self.mm_features[input_id].mm_position.get_num_embeds
+        return num_embeds
 
     def record_event(
         self,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 179f713c4d86a..1db5bc99fff6c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -169,9 +169,7 @@ from .utils import (
     MultiModalBudget,
     add_kv_sharing_layers_to_kv_cache_groups,
     bind_kv_cache,
-    gather_mm_placeholders,
     sanity_check_mm_encoder_outputs,
-    scatter_mm_placeholders,
 )
 
 if TYPE_CHECKING:
@@ -2209,10 +2207,7 @@ class GPUModelRunner(
 
         # Cache the encoder outputs by mm_hash
         for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs):
-            self.encoder_cache[mm_hash] = scatter_mm_placeholders(
-                output,
-                is_embed=pos_info.is_embed,
-            )
+            self.encoder_cache[mm_hash] = output
             logger.debug("Finish execute for mm hash %s", mm_hash)
             self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash)
 
@@ -2263,6 +2258,13 @@ class GPUModelRunner(
                     num_encoder_tokens,
                 )
                 assert start_idx < end_idx
+                curr_embeds_start, curr_embeds_end = (
+                    pos_info.get_embeds_indices_in_range(start_idx, end_idx)
+                )
+                # If there are no embeddings in the current range, we skip
+                # gathering the embeddings.
+                if curr_embeds_start == curr_embeds_end:
+                    continue
 
                 mm_hash = mm_feature.identifier
                 encoder_output = self.encoder_cache.get(mm_hash, None)
@@ -2270,16 +2272,14 @@ class GPUModelRunner(
 
                 if (is_embed := pos_info.is_embed) is not None:
                     is_embed = is_embed[start_idx:end_idx]
+                    mm_embeds_item = encoder_output[curr_embeds_start:curr_embeds_end]
+                else:
+                    mm_embeds_item = encoder_output[start_idx:end_idx]
 
                 req_start_pos = req_start_idx + start_pos - num_computed_tokens
                 is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = (
                     True if is_embed is None else is_embed
                 )
-
-                mm_embeds_item = gather_mm_placeholders(
-                    encoder_output[start_idx:end_idx],
-                    is_embed=is_embed,
-                )
                 mm_embeds_req.append(mm_embeds_item)
 
             if self.is_multimodal_pruning_enabled and self.uses_mrope:
@@ -4508,31 +4508,8 @@ class GPUModelRunner(
                         dummy_encoder_outputs,
                         expected_num_items=max_mm_items_per_batch,
                     )
-
-                    # NOTE: This happens when encoder cache needs to store
-                    # the embeddings that encoder outputs are scattered onto.
-                    # In this case we create dummy embeddings of size
-                    # (max_tokens_for_modality, hidden_size) and scatter
-                    # encoder output into it.
-                    encoder_output_shape = dummy_encoder_outputs[0].shape
-                    max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[
-                        dummy_modality
-                    ]
-                    if encoder_output_shape[0] < max_mm_tokens_per_item:
-                        encoder_hidden_size = encoder_output_shape[-1]
-                        expanded_outputs = []
-                        for output in dummy_encoder_outputs:
-                            expanded = output.new_zeros(
-                                (max_mm_tokens_per_item, encoder_hidden_size)
-                            )
-                            num_tokens = output.shape[0]
-                            expanded[:num_tokens].copy_(output)
-                            expanded_outputs.append(expanded)
-
-                        dummy_encoder_outputs = expanded_outputs
-
-                    # Cache the dummy encoder outputs.
-                    self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
+                    for i, output in enumerate(dummy_encoder_outputs):
+                        self.encoder_cache[f"tmp_{i}"] = output
 
         # Add `is_profile` here to pre-allocate communication buffers
         hidden_states, last_hidden_states = self._dummy_run(
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index e9c48223d58b9..2e8afec024ce9 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -4,10 +4,12 @@ from collections import defaultdict
 from dataclasses import dataclass, field
 
 import torch
+from typing_extensions import deprecated
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
 from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import MultiModalEmbeddings
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.multimodal.cache import processor_only_cache_from_config
@@ -17,6 +19,8 @@ from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
 from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
 
+logger = init_logger(__name__)
+
 
 class MultiModalBudget:
     """Helper class to calculate budget information for multi-modal models."""
@@ -198,6 +202,7 @@ def sanity_check_mm_encoder_outputs(
     )
 
 
+@deprecated("`scatter_mm_placeholders` is deprecated and will be removed in v0.15.0.")
 def scatter_mm_placeholders(
     embeds: torch.Tensor,
     is_embed: torch.Tensor | None,
@@ -226,6 +231,7 @@ def scatter_mm_placeholders(
     return placeholders
 
 
+@deprecated("`gather_mm_placeholders` is deprecated and will be removed in v0.15.0.")
 def gather_mm_placeholders(
     placeholders: torch.Tensor,
     is_embed: torch.Tensor | None,

From eaa82a709a963ab744647a701fe267223ed7b02b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20C=C3=A1mpora?=
 <961215+dcampora@users.noreply.github.com>
Date: Tue, 16 Dec 2025 23:21:17 +0100
Subject: [PATCH 07/49] [Bugfix][DSV32] Fix overflow in topk. (#30754)

Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
---
 csrc/sampler.cu | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/csrc/sampler.cu b/csrc/sampler.cu
index fc2154beff9e0..d458f8e4c1d02 100644
--- a/csrc/sampler.cu
+++ b/csrc/sampler.cu
@@ -550,8 +550,8 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill(
   int rowEnd = rowEnds[rowIdx];
 
   // Local pointers to this block
-  outIndices += rowIdx * topK;
-  logits += rowIdx * stride0;
+  outIndices += static_cast<int64_t>(rowIdx) * topK;
+  logits += static_cast<int64_t>(rowIdx) * stride0;
 
   topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort>(
       nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK);
@@ -576,19 +576,21 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(
 
   // Local pointers to this block
   if constexpr (!multipleBlocksPerRow && !mergeBlocks) {
-    outIndices += rowIdx * topK;
+    outIndices += static_cast<int64_t>(rowIdx) * topK;
   } else if constexpr (multipleBlocksPerRow) {
     const auto blockSize = rowEnd / gridDim.y;  // 16384 / 2 = 8192
     rowStart = blockSize * blockIdx.y;          // 8192 * 1 = 8192
     rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize;
-    outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK;
-    outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK;
+    outIndices +=
+        static_cast<int64_t>(rowIdx) * gridDim.y * topK + blockIdx.y * topK;
+    outLogits +=
+        static_cast<int64_t>(rowIdx) * gridDim.y * topK + blockIdx.y * topK;
   } else if constexpr (mergeBlocks) {
     rowEnd = numBlocksToMerge * topK;
-    indices += rowIdx * numBlocksToMerge * topK;
-    outIndices += rowIdx * topK;
+    indices += static_cast<int64_t>(rowIdx) * numBlocksToMerge * topK;
+    outIndices += static_cast<int64_t>(rowIdx) * topK;
   }
-  logits += rowIdx * stride0;
+  logits += static_cast<int64_t>(rowIdx) * stride0;
 
   topKPerRowJob<kNumThreadsPerBlock, kNumBins, useRadixSort,
                 multipleBlocksPerRow, mergeBlocks>(

From ce96857fdd2bf2390aaa2183561fd1a0f5c464c7 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Date: Wed, 17 Dec 2025 06:35:28 +0800
Subject: [PATCH 08/49] [Kernel][Quantization][MoE] add marlin kernel support
 for turing (sm75) (#29901)

Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
---
 CMakeLists.txt                                | 109 ++++---
 csrc/moe/marlin_moe_wna16/.gitignore          |   1 +
 csrc/moe/marlin_moe_wna16/generate_kernels.py | 132 +++++----
 csrc/moe/marlin_moe_wna16/marlin_template.h   | 208 ++++----------
 csrc/moe/marlin_moe_wna16/ops.cu              |  54 ++--
 csrc/quantization/gptq_marlin/.gitignore      |   1 +
 csrc/quantization/gptq_marlin/dequant.h       |   2 +-
 .../gptq_marlin/generate_kernels.py           | 132 +++++----
 csrc/quantization/gptq_marlin/gptq_marlin.cu  |  68 +++--
 csrc/quantization/gptq_marlin/marlin.cuh      |  74 ++++-
 csrc/quantization/gptq_marlin/marlin_mma.h    | 269 ++++++++++++++++++
 .../gptq_marlin/marlin_template.h             | 184 +++---------
 .../layers/quantization/awq_marlin.py         |   2 +-
 .../model_executor/layers/quantization/fp8.py |   2 +-
 .../layers/quantization/gptq_marlin.py        |   2 +-
 .../layers/quantization/modelopt.py           |   2 +-
 16 files changed, 729 insertions(+), 513 deletions(-)
 create mode 100644 csrc/quantization/gptq_marlin/marlin_mma.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd52df86e0346..5ca71f6ba4df0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -357,6 +357,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # marlin arches for fp16 output
   cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # marlin has limited support for turing
+  cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
   cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
   # marlin arches for fp8 input
@@ -364,8 +366,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
   # so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
   cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+  # marlin arches for other files
+  cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
 
-  if (MARLIN_ARCHS)
+  if (MARLIN_OTHER_ARCHS)
 
     #
     # For the Marlin kernels we automatically generate sources for various
@@ -406,25 +410,39 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       message(STATUS "Marlin generation script has not changed, skipping generation.")
     endif()
 
-    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
-    endif()
-    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+    if (MARLIN_ARCHS)
+      file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 
-    file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
-      CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
+    endif()
+
+    if (MARLIN_SM75_ARCHS) 
+      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_SM75_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
     endif()
-    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
 
     if (MARLIN_FP8_ARCHS) 
       file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
@@ -446,14 +464,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
     set_gencode_flags_for_srcs(
       SRCS "${MARLIN_SRCS}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
+      CUDA_ARCHS "${MARLIN_OTHER_ARCHS}")
     if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+      set_source_files_properties(${MARLIN_SRCS}
         PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
     endif()
     list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
 
-    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
+    message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}")
   else()
     message(STATUS "Not building Marlin kernels as no compatible archs found"
                    " in CUDA target architectures")
@@ -980,12 +998,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # note that we always set `use_atomic_add=False` for moe marlin now,
   # so we don't need 9.0 for bf16 atomicAdd PTX
   cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # moe marlin has limited support for turing
+  cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}")
   # moe marlin arches for fp8 input
   # - sm80 doesn't support fp8 computation
   # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
   # so we only enable fp8 computation for SM89 (e.g. RTX 40x0)  and 12.0 (e.g. RTX 50x0)
   cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
-  if (MARLIN_MOE_ARCHS)
+  # moe marlin arches for other files
+  cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
+  if (MARLIN_MOE_OTHER_ARCHS)
 
     #
     # For the Marlin MOE kernels we automatically generate sources for various
@@ -1026,16 +1048,29 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
     endif()
 
-    file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
-    list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_MOE_SRC}"
-      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
-    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MARLIN_MOE_SRC}
-        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    if (MARLIN_MOE_ARCHS)
+      file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
+    endif()
+
+    if (MARLIN_MOE_SM75_ARCHS) 
+      file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_SM75_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_SM75_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC})
     endif()
-    list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
 
     if (MARLIN_MOE_FP8_ARCHS)
       file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
@@ -1049,7 +1084,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
     endif()
 
-    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
+    set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_MOE_OTHER_SRC}"
+      CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_MOE_OTHER_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}")
+
+    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}")
   else()
     message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
                    " in CUDA target architectures")
diff --git a/csrc/moe/marlin_moe_wna16/.gitignore b/csrc/moe/marlin_moe_wna16/.gitignore
index ba805f9250ece..7dc482a894660 100644
--- a/csrc/moe/marlin_moe_wna16/.gitignore
+++ b/csrc/moe/marlin_moe_wna16/.gitignore
@@ -1,2 +1,3 @@
 sm*_kernel_*.cu
 kernel_selector.h
+kernel_*.cu
diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py
index 88f1055337fd5..9db03ea149d0c 100644
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -10,6 +10,8 @@ import jinja2
 
 ARCHS = []
 SUPPORT_FP8 = False
+SUPPORT_SM75 = False
+SUPPORT_SM80 = False
 for arch in sys.argv[1].split(","):
     arch = arch[: arch.index(".") + 2].replace(".", "")
     arch = int(arch)
@@ -19,6 +21,10 @@ for arch in sys.argv[1].split(","):
     # with FP16 MMA, so it cannot achieve any acceleration.
     if arch in [89, 120]:
         SUPPORT_FP8 = True
+    if arch >= 80:
+        SUPPORT_SM80 = True
+    if arch == 75:
+        SUPPORT_SM75 = True
 
 FILE_HEAD_COMMENT = """
 // auto generated by generate_kernels.py
@@ -157,6 +163,7 @@ def remove_old_kernels():
 
 def generate_new_kernels():
     result_dict = {}
+    sm_75_result_dict = {}
 
     for quant_config in QUANT_CONFIGS:
         c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
@@ -174,6 +181,8 @@ def generate_new_kernels():
             s_type = quant_config.get("s_type", c_type)
             if (a_type, b_type, c_type) not in result_dict:
                 result_dict[(a_type, b_type, c_type)] = []
+                if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16":
+                    sm_75_result_dict[(a_type, b_type, c_type)] = []
 
             for group_blocks, m_blocks, thread_configs in itertools.product(
                 all_group_blocks, all_m_blocks, all_thread_configs
@@ -197,78 +206,89 @@ def generate_new_kernels():
                     "thread_k_blocks": thread_k // 16,
                     "thread_n_blocks": thread_n // 16,
                     "m_block_size_8": "true" if m_blocks == 0.5 else "false",
-                    "stages": "pipe_stages",
+                    "stages": 4,
                     "group_blocks": group_blocks,
                     "is_zp_float": "false",
                 }
 
-                result_dict[(a_type, b_type, c_type)].append(config)
+                if SUPPORT_SM80:
+                    result_dict[(a_type, b_type, c_type)].append(config)
+                if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75:
+                    config_sm75 = config.copy()
+                    config_sm75["stages"] = 2
+                    sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75)
 
     kernel_selector_str = FILE_HEAD_COMMENT
 
-    for (a_type, b_type, c_type), config_list in result_dict.items():
-        all_template_str_list = []
-        for config in config_list:
-            s_type = config["s_type"]
-            template_str = jinja2.Template(TEMPLATE).render(
-                a_type_id=f"vllm::{a_type}.id()",
-                b_type_id=f"vllm::{b_type}.id()",
-                c_type_id=f"vllm::{c_type}.id()",
-                s_type_id=f"vllm::{s_type}.id()",
-                **config,
-            )
-            all_template_str_list.append(template_str)
-
-            conditions = [
-                f"a_type == vllm::{a_type}",
-                f"b_type == vllm::{b_type}",
-                f"c_type == vllm::{c_type}",
-                f"s_type == vllm::{s_type}",
-                f"threads == {config['threads']}",
-                f"thread_m_blocks == {config['thread_m_blocks']}",
-                f"thread_n_blocks == {config['thread_n_blocks']}",
-                f"thread_k_blocks == {config['thread_k_blocks']}",
-                f"m_block_size_8 == {config['m_block_size_8']}",
-                f"group_blocks == {config['group_blocks']}",
-                f"is_zp_float == {config['is_zp_float']}",
-            ]
-            conditions = " && ".join(conditions)
-
-            if kernel_selector_str == FILE_HEAD_COMMENT:
-                kernel_selector_str += f"if ({conditions})\n  kernel = "
-            else:
-                kernel_selector_str += f"else if ({conditions})\n  kernel = "
-
-            kernel_template2 = (
-                "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
-                "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
-                "{{thread_n_blocks}}, {{thread_k_blocks}}, "
-                "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
-                "{{is_zp_float}}>;"
-            )
-
-            kernel_selector_str += (
-                jinja2.Template(kernel_template2).render(
+    for result_dict_tmp in [result_dict, sm_75_result_dict]:
+        for (a_type, b_type, c_type), config_list in result_dict_tmp.items():
+            all_template_str_list = []
+            if not config_list:
+                continue
+            for config in config_list:
+                s_type = config["s_type"]
+                template_str = jinja2.Template(TEMPLATE).render(
                     a_type_id=f"vllm::{a_type}.id()",
                     b_type_id=f"vllm::{b_type}.id()",
                     c_type_id=f"vllm::{c_type}.id()",
                     s_type_id=f"vllm::{s_type}.id()",
                     **config,
                 )
-                + "\n"
-            )
+                all_template_str_list.append(template_str)
 
-        file_content = FILE_HEAD + "\n\n"
-        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
-        if a_type == "kFE4M3fn":
-            filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
-        else:
-            filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+                conditions = [
+                    f"a_type == vllm::{a_type}",
+                    f"b_type == vllm::{b_type}",
+                    f"c_type == vllm::{c_type}",
+                    f"s_type == vllm::{s_type}",
+                    f"threads == {config['threads']}",
+                    f"thread_m_blocks == {config['thread_m_blocks']}",
+                    f"thread_n_blocks == {config['thread_n_blocks']}",
+                    f"thread_k_blocks == {config['thread_k_blocks']}",
+                    f"m_block_size_8 == {config['m_block_size_8']}",
+                    f"stages == {config['stages']}",
+                    f"group_blocks == {config['group_blocks']}",
+                    f"is_zp_float == {config['is_zp_float']}",
+                ]
+                conditions = " && ".join(conditions)
 
-        filename = filename.lower()
+                if kernel_selector_str == FILE_HEAD_COMMENT:
+                    kernel_selector_str += f"if ({conditions})\n  kernel = "
+                else:
+                    kernel_selector_str += f"else if ({conditions})\n  kernel = "
 
-        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
-            f.write(file_content)
+                kernel_template2 = (
+                    "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
+                    "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
+                    "{{thread_n_blocks}}, {{thread_k_blocks}}, "
+                    "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
+                    "{{is_zp_float}}>;"
+                )
+
+                kernel_selector_str += (
+                    jinja2.Template(kernel_template2).render(
+                        a_type_id=f"vllm::{a_type}.id()",
+                        b_type_id=f"vllm::{b_type}.id()",
+                        c_type_id=f"vllm::{c_type}.id()",
+                        s_type_id=f"vllm::{s_type}.id()",
+                        **config,
+                    )
+                    + "\n"
+                )
+
+            file_content = FILE_HEAD + "\n\n"
+            file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+            if a_type == "kFE4M3fn":
+                filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+            elif result_dict_tmp is sm_75_result_dict:
+                filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+            else:
+                filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+
+            filename = filename.lower()
+
+            with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+                f.write(file_content)
 
     if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
         kernel_selector_str += (
diff --git a/csrc/moe/marlin_moe_wna16/marlin_template.h b/csrc/moe/marlin_moe_wna16/marlin_template.h
index 5b6b2456b4111..138197b76f026 100644
--- a/csrc/moe/marlin_moe_wna16/marlin_template.h
+++ b/csrc/moe/marlin_moe_wna16/marlin_template.h
@@ -26,6 +26,7 @@
 #include "quantization/gptq_marlin/marlin.cuh"
 #include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "quantization/gptq_marlin/dequant.h"
+#include "quantization/gptq_marlin/marlin_mma.h"
 #include "core/scalar_type.hpp"
 
 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
@@ -35,7 +36,7 @@
 
 namespace MARLIN_NAMESPACE_NAME {
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
 
 template <typename scalar_t,  // compute dtype, half or nv_float16
           const vllm::ScalarTypeId b_type_id,  // weight MarlinScalarType id
@@ -84,146 +85,6 @@ __global__ void Marlin(
 
 #else
 
-// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
-// output/accumulation.
-template <vllm::ScalarTypeId type_id, int k_size = 16>
-__device__ inline void mma(
-    const typename MarlinScalarType<type_id>::FragA& a_frag,
-    const typename MarlinScalarType<type_id>::FragB& frag_b,
-    typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
-  if constexpr (k_size == 16) {
-    if constexpr (std::is_same<scalar_t, half>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
-            "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
-            "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  } else if (k_size == 32) {
-    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  }
-}
-
-template <vllm::ScalarTypeId type_id, int k_size = 16>
-__device__ inline void mma_trans(
-    const typename MarlinScalarType<type_id>::FragA& a_frag,
-    const typename MarlinScalarType<type_id>::FragB& frag_b,
-    const typename MarlinScalarType<type_id>::FragB& frag_b2,
-    typename MarlinScalarType<type_id>::FragC& frag_c) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
-  float* c = reinterpret_cast<float*>(&frag_c);
-  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
-  if constexpr (k_size == 16) {
-    if constexpr (std::is_same<scalar_t, half>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
-            "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
-            "r"(c[3]));
-    }
-  } else {
-    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1200
-      asm volatile(
-          "mma.sync.aligned.kind::f8f6f4.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-  #else
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-  #endif
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  }
-}
-
 // Instruction for loading a full 16x16 matrix fragment of operand A from shared
 // memory, directly in tensor core layout.
 template <int count, vllm::ScalarTypeId type_id>
@@ -439,9 +300,20 @@ __global__ void Marlin(
   if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
   #endif
 
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+  // Turing TensorCore only supports fp16 and int8
+  if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id())
+    return;
+  #endif
+
   int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
   constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
 
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+  constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id();
+  #else
+  constexpr bool use_fp16_accum = false;
+  #endif
   using Adtype = MarlinScalarType<a_type_id>;
   using Cdtype = MarlinScalarType<c_type_id>;
 
@@ -618,7 +490,22 @@ __global__ void Marlin(
         }
       }
 
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+
+      if constexpr (moe_block_size >= 16)
+        local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 16);
+      if constexpr (moe_block_size >= 8)
+        local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 8);
+      if constexpr (moe_block_size >= 4)
+        local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 4);
+      if constexpr (moe_block_size >= 2)
+        local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 2);
+
+      local_count += __shfl_down_sync(0xFFFFFFFF, local_count, 1);
+      block_num_valid_tokens = local_count;
+  #else
       block_num_valid_tokens = __reduce_add_sync(0xffffffff, local_count);
+  #endif
 
       if (lane_id == 0)
         reinterpret_cast<int*>(sh_new)[0] = block_num_valid_tokens;
@@ -1018,10 +905,6 @@ __global__ void Marlin(
   constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
                                           : (stages * s_sh_stage);
   int4* sh_s = sh_zp + (stages * zp_sh_stage);
-  // shared memory reused by reduction should be smaller than
-  // shared memory used by weight.
-  static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
-                stages * b_sh_stage);
   int4* sh_a = sh_s + sh_s_size;
 
   // Register storage for double buffer of shared memory reads.
@@ -1545,11 +1428,13 @@ __global__ void Marlin(
   #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
         if constexpr (m_block_size_8) {
-          mma_trans<a_type_id>(frag_a[k2][i], frag_b0, frag_b1,
-                               frag_c[i][j][0]);
+          mma_trans<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0, frag_b1,
+                                               frag_c[i][j][0]);
         } else {
-          mma<a_type_id>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
-          mma<a_type_id>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
+          mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0,
+                                         frag_c[i][j][0]);
+          mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b1,
+                                         frag_c[i][j][1]);
         }
       }
     }
@@ -1583,10 +1468,12 @@ __global__ void Marlin(
 
   #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
-        mma<a_type_id, 32>(frag_a[k2][i], frag_b[0],
-                           (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
-        mma<a_type_id, 32>(frag_a[k2][i], frag_b[1],
-                           (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
+        mma<a_type_id, false, 32>(
+            frag_a[k2][i], frag_b[0],
+            (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
+        mma<a_type_id, false, 32>(
+            frag_a[k2][i], frag_b[1],
+            (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
       }
 
       if constexpr (group_blocks != -1) {
@@ -2132,6 +2019,21 @@ __global__ void Marlin(
     // While this pattern may not be the most readable, other ways of writing
     // the loop seemed to noticeably worse performance after compilation.
     if (slice_iters == 0) {
+      // convert fp16 accum to fp32 for reduction
+      if constexpr (use_fp16_accum) {
+  #pragma unroll
+        for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 2 : 4) * 2); i++) {
+          float* frag_c_part_float = reinterpret_cast<float*>(frag_c) + i * 4;
+          scalar_t* frag_c_part_half =
+              reinterpret_cast<scalar_t*>(frag_c_part_float);
+
+  #pragma unroll
+          for (int i = 3; i >= 0; i--) {
+            frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]);
+          }
+        }
+      }
+
       if constexpr (is_a_8bit) {
         float frag_a_s[2 * thread_m_blocks];
 
diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu
index 4fd8fc5c54202..8ac1691220a6b 100644
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@@ -142,7 +142,7 @@ typedef struct {
 
 int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
                           int prob_n, int prob_k, int num_bits, int group_size,
-                          bool has_act_order, bool is_k_full) {
+                          bool has_act_order, bool is_k_full, int stages) {
   bool cache_scales_chunk = has_act_order && !is_k_full;
 
   int tb_n = th_config.thread_n;
@@ -160,13 +160,13 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
 
   if (cache_scales_chunk) {
     int load_groups =
-        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
+        tb_groups * stages * 2;          // Chunk size is 2x pipeline over dim K
     load_groups = max(load_groups, 32);  // We load at least 32 scale groups
     return load_groups * tb_n * 2;
   } else {
     int tb_scales = tb_groups * tb_n * 2;
 
-    return tb_scales * pipe_stages;
+    return tb_scales * stages;
   }
 }
 
@@ -174,7 +174,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
                           int thread_m_blocks, int prob_m, int prob_n,
                           int prob_k, int num_bits, int group_size,
                           bool has_act_order, bool is_k_full, int has_zp,
-                          int is_zp_float, bool is_a_8bit) {
+                          int is_zp_float, bool is_a_8bit, int stages) {
   int pack_factor = 32 / num_bits;
 
   // Get B size
@@ -185,8 +185,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
   // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
   // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
   int sh_block_meta_size = tb_m * 16;
-  int sh_a_size = pipe_stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
-  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
+  int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
+  int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4;
   int sh_red_size = tb_m * (tb_n + 8) * 2;
   int sh_bias_size = tb_n * 2;
   int tmp_size =
@@ -195,8 +195,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
 
   int sh_s_size =
       get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
-                            group_size, has_act_order, is_k_full);
-  int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
+                            group_size, has_act_order, is_k_full, stages);
+  int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0;
   int sh_zp_size = 0;
   if (has_zp) {
     if (is_zp_float)
@@ -217,7 +217,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
                      int thread_m_blocks, int prob_m, int prob_n, int prob_k,
                      int num_bits, int group_size, bool has_act_order,
                      bool is_k_full, int has_zp, int is_zp_float,
-                     int max_shared_mem, bool is_a_8bit) {
+                     bool is_a_8bit, int stages, int max_shared_mem) {
   // Sanity
   if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
       th_config.num_threads == -1) {
@@ -243,7 +243,7 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
   int cache_size =
       get_kernel_cache_size(th_config, m_block_size_8, thread_m_blocks, prob_m,
                             prob_n, prob_k, num_bits, group_size, has_act_order,
-                            is_k_full, has_zp, is_zp_float, is_a_8bit);
+                            is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
   return cache_size <= max_shared_mem;
 }
 
@@ -252,7 +252,7 @@ MarlinFuncPtr get_marlin_kernel(
     const vllm::ScalarType c_type, const vllm::ScalarType s_type,
     int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
     bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
-    int threads, bool is_zp_float) {
+    int threads, bool is_zp_float, int stages) {
   int num_bits = b_type.size_bits();
   auto kernel = MarlinDefault;
 
@@ -266,8 +266,8 @@ exec_config_t determine_exec_config(
     const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
     int prob_n, int prob_k, int num_experts, int top_k, int thread_m_blocks,
     bool m_block_size_8, int num_bits, int group_size, bool has_act_order,
-    bool is_k_full, bool has_zp, bool is_zp_float, int max_shared_mem, int sms,
-    bool is_a_8bit) {
+    bool is_k_full, bool has_zp, bool is_zp_float, bool is_a_8bit, int stages,
+    int max_shared_mem, int sms) {
   exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
   thread_config_t* thread_configs = thread_m_blocks > 1
                                         ? large_batch_thread_configs
@@ -284,15 +284,15 @@ exec_config_t determine_exec_config(
 
     if (!is_valid_config(th_config, m_block_size_8, thread_m_blocks, prob_m,
                          prob_n, prob_k, num_bits, group_size, has_act_order,
-                         is_k_full, has_zp, is_zp_float, max_shared_mem - 512,
-                         is_a_8bit)) {
+                         is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
+                         max_shared_mem - 512)) {
       continue;
     }
 
     int cache_size = get_kernel_cache_size(
         th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
         num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
-        is_a_8bit);
+        is_a_8bit, stages);
 
     int group_blocks = 0;
     if (!has_act_order) {
@@ -303,7 +303,7 @@ exec_config_t determine_exec_config(
         get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
                           th_config.thread_n / 16, th_config.thread_k / 16,
                           m_block_size_8, has_act_order, has_zp, group_blocks,
-                          th_config.num_threads, is_zp_float);
+                          th_config.num_threads, is_zp_float, stages);
 
     if (kernel == MarlinDefault) continue;
 
@@ -433,8 +433,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
                          dev);
   cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
                          dev);
-  TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
-              "marlin kernel only support Ampere or newer GPUs.");
+  TORCH_CHECK(major_capability * 10 + minor_capability >= 75,
+              "marlin kernel only support Turing or newer GPUs.");
+  int stages = 4;
+  if (major_capability == 7 && minor_capability == 5) {
+    stages = 2;
+    TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8,
+                "Turing only support FP16 or INT8 activation.");
+  }
   if (a_type == vllm::kFE4M3fn) {
     TORCH_CHECK(major_capability * 10 + minor_capability >= 89,
                 "FP8 only support Ada Lovelace or newer GPUs.");
@@ -461,8 +467,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
     exec_cfg = determine_exec_config(
         a_type, b_type, c_type, s_type, prob_m, prob_n, prob_k, num_experts,
         top_k, thread_m_blocks, m_block_size_8, num_bits, group_size,
-        has_act_order, is_k_full, has_zp, is_zp_float, max_shared_mem, sms,
-        is_a_8bit);
+        has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages,
+        max_shared_mem, sms);
     thread_tfg = exec_cfg.tb_cfg;
   }
 
@@ -479,7 +485,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
   TORCH_CHECK(is_valid_config(thread_tfg, m_block_size_8, thread_m_blocks,
                               prob_m, prob_n, prob_k, num_bits, group_size,
                               has_act_order, is_k_full, has_zp, is_zp_float,
-                              max_shared_mem, is_a_8bit),
+                              is_a_8bit, stages, max_shared_mem),
               "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
               ", thread_k = ", thread_tfg.thread_k,
               ", thread_n = ", thread_tfg.thread_n,
@@ -493,12 +499,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
   int sh_cache_size =
       get_kernel_cache_size(thread_tfg, m_block_size_8, thread_m_blocks, prob_m,
                             prob_n, prob_k, num_bits, group_size, has_act_order,
-                            is_k_full, has_zp, is_zp_float, is_a_8bit);
+                            is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
 
   auto kernel = get_marlin_kernel(
       a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
       thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
-      num_threads, is_zp_float);
+      num_threads, is_zp_float, stages);
 
   if (kernel == MarlinDefault) {
     TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
diff --git a/csrc/quantization/gptq_marlin/.gitignore b/csrc/quantization/gptq_marlin/.gitignore
index ba805f9250ece..7dc482a894660 100644
--- a/csrc/quantization/gptq_marlin/.gitignore
+++ b/csrc/quantization/gptq_marlin/.gitignore
@@ -1,2 +1,3 @@
 sm*_kernel_*.cu
 kernel_selector.h
+kernel_*.cu
diff --git a/csrc/quantization/gptq_marlin/dequant.h b/csrc/quantization/gptq_marlin/dequant.h
index 26b8d40368aa9..edd97dbfcd8e5 100644
--- a/csrc/quantization/gptq_marlin/dequant.h
+++ b/csrc/quantization/gptq_marlin/dequant.h
@@ -67,7 +67,7 @@ where `scale_factor * multiplier` can be computed at weight loading.
 
 namespace MARLIN_NAMESPACE_NAME {
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 750
 // Lookup-table based 3-input logical operation; explicitly used for
 // dequantization as the compiler does not seem to automatically recognize it in
 // all cases.
diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py
index 27ef7271ba41c..24866fc5cd546 100644
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@@ -10,6 +10,8 @@ import jinja2
 
 ARCHS = []
 SUPPORT_FP8 = False
+SUPPORT_SM75 = False
+SUPPORT_SM80 = False
 for arch in sys.argv[1].split(","):
     arch = arch[: arch.index(".") + 2].replace(".", "")
     arch = int(arch)
@@ -19,6 +21,10 @@ for arch in sys.argv[1].split(","):
     # with FP16 MMA, so it cannot achieve any acceleration.
     if arch in [89, 120]:
         SUPPORT_FP8 = True
+    if arch >= 80:
+        SUPPORT_SM80 = True
+    if arch == 75:
+        SUPPORT_SM75 = True
 
 FILE_HEAD_COMMENT = """
 // auto generated by generate_kernels.py
@@ -166,6 +172,7 @@ def remove_old_kernels():
 
 def generate_new_kernels():
     result_dict = {}
+    sm_75_result_dict = {}
 
     for quant_config in QUANT_CONFIGS:
         c_types = quant_config.get("c_type", ["kFloat16", "kBFloat16"])
@@ -184,6 +191,8 @@ def generate_new_kernels():
             s_type = quant_config.get("s_type", c_type)
             if (a_type, b_type, c_type) not in result_dict:
                 result_dict[(a_type, b_type, c_type)] = []
+                if a_type in ["kFloat16", "kS8"] and c_type == "kFloat16":
+                    sm_75_result_dict[(a_type, b_type, c_type)] = []
 
             for group_blocks, m_blocks, thread_configs in itertools.product(
                 all_group_blocks, all_m_blocks, all_thread_configs
@@ -207,78 +216,89 @@ def generate_new_kernels():
                     "thread_k_blocks": thread_k // 16,
                     "thread_n_blocks": thread_n // 16,
                     "m_block_size_8": "true" if m_blocks == 0.5 else "false",
-                    "stages": "pipe_stages",
+                    "stages": 4,
                     "group_blocks": group_blocks,
                     "is_zp_float": "true" if is_zp_float else "false",
                 }
 
-                result_dict[(a_type, b_type, c_type)].append(config)
+                if SUPPORT_SM80:
+                    result_dict[(a_type, b_type, c_type)].append(config)
+                if (a_type, b_type, c_type) in sm_75_result_dict and SUPPORT_SM75:
+                    config_sm75 = config.copy()
+                    config_sm75["stages"] = 2
+                    sm_75_result_dict[(a_type, b_type, c_type)].append(config_sm75)
 
     kernel_selector_str = FILE_HEAD_COMMENT
 
-    for (a_type, b_type, c_type), config_list in result_dict.items():
-        all_template_str_list = []
-        for config in config_list:
-            s_type = config["s_type"]
-            template_str = jinja2.Template(TEMPLATE).render(
-                a_type_id=f"vllm::{a_type}.id()",
-                b_type_id=f"vllm::{b_type}.id()",
-                c_type_id=f"vllm::{c_type}.id()",
-                s_type_id=f"vllm::{s_type}.id()",
-                **config,
-            )
-            all_template_str_list.append(template_str)
-
-            conditions = [
-                f"a_type == vllm::{a_type}",
-                f"b_type == vllm::{b_type}",
-                f"c_type == vllm::{c_type}",
-                f"s_type == vllm::{s_type}",
-                f"threads == {config['threads']}",
-                f"thread_m_blocks == {config['thread_m_blocks']}",
-                f"thread_n_blocks == {config['thread_n_blocks']}",
-                f"thread_k_blocks == {config['thread_k_blocks']}",
-                f"m_block_size_8 == {config['m_block_size_8']}",
-                f"group_blocks == {config['group_blocks']}",
-                f"is_zp_float == {config['is_zp_float']}",
-            ]
-            conditions = " && ".join(conditions)
-
-            if kernel_selector_str == FILE_HEAD_COMMENT:
-                kernel_selector_str += f"if ({conditions})\n  kernel = "
-            else:
-                kernel_selector_str += f"else if ({conditions})\n  kernel = "
-
-            kernel_template2 = (
-                "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
-                "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
-                "{{thread_n_blocks}}, {{thread_k_blocks}}, "
-                "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
-                "{{is_zp_float}}>;"
-            )
-
-            kernel_selector_str += (
-                jinja2.Template(kernel_template2).render(
+    for result_dict_tmp in [result_dict, sm_75_result_dict]:
+        for (a_type, b_type, c_type), config_list in result_dict_tmp.items():
+            all_template_str_list = []
+            if not config_list:
+                continue
+            for config in config_list:
+                s_type = config["s_type"]
+                template_str = jinja2.Template(TEMPLATE).render(
                     a_type_id=f"vllm::{a_type}.id()",
                     b_type_id=f"vllm::{b_type}.id()",
                     c_type_id=f"vllm::{c_type}.id()",
                     s_type_id=f"vllm::{s_type}.id()",
                     **config,
                 )
-                + "\n"
-            )
+                all_template_str_list.append(template_str)
 
-        file_content = FILE_HEAD + "\n\n"
-        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
-        if a_type == "kFE4M3fn":
-            filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
-        else:
-            filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+                conditions = [
+                    f"a_type == vllm::{a_type}",
+                    f"b_type == vllm::{b_type}",
+                    f"c_type == vllm::{c_type}",
+                    f"s_type == vllm::{s_type}",
+                    f"threads == {config['threads']}",
+                    f"thread_m_blocks == {config['thread_m_blocks']}",
+                    f"thread_n_blocks == {config['thread_n_blocks']}",
+                    f"thread_k_blocks == {config['thread_k_blocks']}",
+                    f"m_block_size_8 == {config['m_block_size_8']}",
+                    f"stages == {config['stages']}",
+                    f"group_blocks == {config['group_blocks']}",
+                    f"is_zp_float == {config['is_zp_float']}",
+                ]
+                conditions = " && ".join(conditions)
 
-        filename = filename.lower()
+                if kernel_selector_str == FILE_HEAD_COMMENT:
+                    kernel_selector_str += f"if ({conditions})\n  kernel = "
+                else:
+                    kernel_selector_str += f"else if ({conditions})\n  kernel = "
 
-        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
-            f.write(file_content)
+                kernel_template2 = (
+                    "Marlin<{{a_type_id}}, {{b_type_id}}, {{c_type_id}}, "
+                    "{{s_type_id}}, {{threads}}, {{thread_m_blocks}}, "
+                    "{{thread_n_blocks}}, {{thread_k_blocks}}, "
+                    "{{m_block_size_8}}, {{stages}}, {{group_blocks}}, "
+                    "{{is_zp_float}}>;"
+                )
+
+                kernel_selector_str += (
+                    jinja2.Template(kernel_template2).render(
+                        a_type_id=f"vllm::{a_type}.id()",
+                        b_type_id=f"vllm::{b_type}.id()",
+                        c_type_id=f"vllm::{c_type}.id()",
+                        s_type_id=f"vllm::{s_type}.id()",
+                        **config,
+                    )
+                    + "\n"
+                )
+
+            file_content = FILE_HEAD + "\n\n"
+            file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+            if a_type == "kFE4M3fn":
+                filename = f"sm89_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+            elif result_dict_tmp is sm_75_result_dict:
+                filename = f"sm75_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+            else:
+                filename = f"sm80_kernel_{a_type[1:]}_{b_type[1:]}_{c_type[1:]}.cu"
+
+            filename = filename.lower()
+
+            with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+                f.write(file_content)
 
     if not SUPPORT_FP8 and kernel_selector_str != FILE_HEAD_COMMENT:
         kernel_selector_str += (
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
index 28ff06559a98a..77f319d53bc52 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -37,7 +37,7 @@ __global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
 
 using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
 
 __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                     int const* __restrict__ perm_int_ptr,
@@ -148,7 +148,7 @@ typedef struct {
 
 int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
                           int prob_n, int prob_k, int num_bits, int group_size,
-                          bool has_act_order, bool is_k_full) {
+                          bool has_act_order, bool is_k_full, int stages) {
   bool cache_scales_chunk = has_act_order && !is_k_full;
 
   int tb_n = th_config.thread_n;
@@ -166,28 +166,29 @@ int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
 
   if (cache_scales_chunk) {
     int load_groups =
-        tb_groups * pipe_stages * 2;     // Chunk size is 2x pipeline over dim K
+        tb_groups * stages * 2;          // Chunk size is 2x pipeline over dim K
     load_groups = max(load_groups, 32);  // We load at least 32 scale groups
     return load_groups * tb_n * 2;
   } else {
     int tb_scales = tb_groups * tb_n * 2;
 
-    return tb_scales * pipe_stages;
+    return tb_scales * stages;
   }
 }
 
 int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
                           int prob_m, int prob_n, int prob_k, int num_bits,
                           int group_size, bool has_act_order, bool is_k_full,
-                          int has_zp, int is_zp_float) {
+                          int has_zp, bool is_zp_float, bool is_a_8bit,
+                          int stages) {
   int pack_factor = 32 / num_bits;
 
   // Get B size
   int tb_k = th_config.thread_k;
   int tb_n = th_config.thread_n;
   int tb_m = thread_m_blocks * 16;
-  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
-  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
+  int sh_a_size = stages * (tb_m * tb_k) * (is_a_8bit ? 1 : 2);
+  int sh_b_size = stages * (tb_k * tb_n / pack_factor) * 4;
   int sh_red_size = tb_m * (tb_n + 8) * 2;
   int sh_bias_size = tb_n * 2;
   int tmp_size =
@@ -196,8 +197,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
 
   int sh_s_size =
       get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
-                            group_size, has_act_order, is_k_full);
-  int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
+                            group_size, has_act_order, is_k_full, stages);
+  int sh_g_idx_size = has_act_order && !is_k_full ? stages * tb_k / 4 : 0;
   int sh_zp_size = 0;
   if (has_zp) {
     if (is_zp_float)
@@ -217,7 +218,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
 bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
                      int prob_m, int prob_n, int prob_k, int num_bits,
                      int group_size, bool has_act_order, bool is_k_full,
-                     int has_zp, int is_zp_float, int max_shared_mem) {
+                     int has_zp, bool is_zp_float, bool is_a_8bit, int stages,
+                     int max_shared_mem) {
   // Sanity
   if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
       th_config.num_threads == -1) {
@@ -242,7 +244,7 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
   // Check that pipeline fits into cache
   int cache_size = get_kernel_cache_size(
       th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
-      has_act_order, is_k_full, has_zp, is_zp_float);
+      has_act_order, is_k_full, has_zp, is_zp_float, is_a_8bit, stages);
   return cache_size <= max_shared_mem;
 }
 
@@ -251,7 +253,7 @@ MarlinFuncPtr get_marlin_kernel(
     const vllm::ScalarType c_type, const vllm::ScalarType s_type,
     int thread_m_blocks, int thread_n_blocks, int thread_k_blocks,
     bool m_block_size_8, bool has_act_order, bool has_zp, int group_blocks,
-    int threads, bool is_zp_float) {
+    int threads, bool is_zp_float, int stages) {
   int num_bits = b_type.size_bits();
   auto kernel = MarlinDefault;
 
@@ -265,7 +267,8 @@ exec_config_t determine_exec_config(
     const vllm::ScalarType& c_type, const vllm::ScalarType& s_type, int prob_m,
     int prob_n, int prob_k, int thread_m_blocks, bool m_block_size_8,
     int num_bits, int group_size, bool has_act_order, bool is_k_full,
-    bool has_zp, bool is_zp_float, int max_shared_mem, int sms) {
+    bool has_zp, bool is_zp_float, int is_a_8bit, int stages,
+    int max_shared_mem, int sms) {
   exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
   thread_config_t* thread_configs = thread_m_blocks > 1
                                         ? large_batch_thread_configs
@@ -280,13 +283,15 @@ exec_config_t determine_exec_config(
 
     if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k,
                          num_bits, group_size, has_act_order, is_k_full, has_zp,
-                         is_zp_float, max_shared_mem - 512)) {
+                         is_zp_float, is_a_8bit, stages,
+                         max_shared_mem - 512)) {
       continue;
     }
 
-    int cache_size = get_kernel_cache_size(
-        th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits,
-        group_size, has_act_order, is_k_full, has_zp, is_zp_float);
+    int cache_size = get_kernel_cache_size(th_config, thread_m_blocks, prob_m,
+                                           prob_n, prob_k, num_bits, group_size,
+                                           has_act_order, is_k_full, has_zp,
+                                           is_zp_float, is_a_8bit, stages);
 
     int group_blocks = 0;
     if (!has_act_order) {
@@ -297,14 +302,10 @@ exec_config_t determine_exec_config(
         get_marlin_kernel(a_type, b_type, c_type, s_type, thread_m_blocks,
                           th_config.thread_n / 16, th_config.thread_k / 16,
                           m_block_size_8, has_act_order, has_zp, group_blocks,
-                          th_config.num_threads, is_zp_float);
+                          th_config.num_threads, is_zp_float, stages);
 
     if (kernel == MarlinDefault) continue;
 
-    // int m_tiles = div_ceil(prob_m, thread_m_blocks * 16);
-    // int n_tiles = prob_n / th_config.thread_n;
-    // int k_tiles = prob_k / th_config.thread_k;
-
     return {1, th_config};
   }
 
@@ -321,6 +322,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
                int group_size, int dev, cudaStream_t stream, int thread_k_init,
                int thread_n_init, int sms, bool use_atomic_add,
                bool use_fp32_reduce, bool is_zp_float) {
+  bool is_a_8bit = a_type.size_bits() == 8;
   TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
               ", ", prob_n, ", ", prob_k, "]");
 
@@ -389,8 +391,14 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
                          dev);
   cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor,
                          dev);
-  TORCH_CHECK(major_capability * 10 + minor_capability >= 80,
-              "marlin kernel only support Ampere or newer GPUs.");
+  TORCH_CHECK(major_capability * 10 + minor_capability >= 75,
+              "marlin kernel only support Turing or newer GPUs.");
+  int stages = 4;
+  if (major_capability == 7 && minor_capability == 5) {
+    stages = 2;
+    TORCH_CHECK(a_type == vllm::kFloat16 || a_type == vllm::kS8,
+                "Turing only support FP16 or INT8 activation.");
+  }
   if (a_type == vllm::kFE4M3fn) {
     TORCH_CHECK(
         major_capability * 10 + minor_capability == 89 ||
@@ -431,7 +439,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
       exec_cfg = determine_exec_config(
           a_type, b_type, c_type, s_type, prob_m_split, prob_n, prob_k,
           thread_m_blocks, m_block_size_8, num_bits, group_size, has_act_order,
-          is_k_full, has_zp, is_zp_float, max_shared_mem, sms);
+          is_k_full, has_zp, is_zp_float, is_a_8bit, stages, max_shared_mem,
+          sms);
       thread_tfg = exec_cfg.tb_cfg;
       if (thread_tfg.thread_n != -1) {
         if (prob_n / thread_tfg.thread_n *
@@ -440,7 +449,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
           if (is_valid_config({128, 64, 128}, thread_m_blocks, prob_m_split,
                               prob_n, prob_k, num_bits, group_size,
                               has_act_order, is_k_full, has_zp, is_zp_float,
-                              max_shared_mem_new)) {
+                              is_a_8bit, stages, max_shared_mem_new)) {
             thread_tfg = {128, 64, 128};
             exec_cfg = {1, thread_tfg};
           }
@@ -466,7 +475,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
     TORCH_CHECK(
         is_valid_config(thread_tfg, thread_m_blocks, prob_m_split, prob_n,
                         prob_k, num_bits, group_size, has_act_order, is_k_full,
-                        has_zp, is_zp_float, max_shared_mem_new),
+                        has_zp, is_zp_float, is_a_8bit, stages,
+                        max_shared_mem_new),
         "Invalid thread config: thread_m_blocks = ", thread_m_blocks,
         ", thread_k = ", thread_tfg.thread_k,
         ", thread_n = ", thread_tfg.thread_n,
@@ -475,12 +485,12 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
         ", prob_m_split = ", prob_m_split, ", group_size = ", group_size,
         ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
         ", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
-        ", max_shared_mem_new = ", max_shared_mem_new);
+        ", stages = ", stages, ", max_shared_mem_new = ", max_shared_mem_new);
 
     auto kernel = get_marlin_kernel(
         a_type, b_type, c_type, s_type, thread_m_blocks, thread_n_blocks,
         thread_k_blocks, m_block_size_8, has_act_order, has_zp, group_blocks,
-        num_threads, is_zp_float);
+        num_threads, is_zp_float, stages);
 
     if (kernel == MarlinDefault) {
       TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
diff --git a/csrc/quantization/gptq_marlin/marlin.cuh b/csrc/quantization/gptq_marlin/marlin.cuh
index 2505e221322dd..33fe52f605b42 100644
--- a/csrc/quantization/gptq_marlin/marlin.cuh
+++ b/csrc/quantization/gptq_marlin/marlin.cuh
@@ -1,17 +1,19 @@
 #pragma once
 
-#include <torch/all.h>
+#ifndef _marlin_cuh
+  #define _marlin_cuh
+  #include <torch/all.h>
 
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#include <iostream>
+  #include <ATen/cuda/CUDAContext.h>
+  #include <c10/cuda/CUDAGuard.h>
+  #include <cuda.h>
+  #include <cuda_fp16.h>
+  #include <cuda_runtime.h>
+  #include <iostream>
 
-#ifndef MARLIN_NAMESPACE_NAME
-  #define MARLIN_NAMESPACE_NAME marlin
-#endif
+  #ifndef MARLIN_NAMESPACE_NAME
+    #define MARLIN_NAMESPACE_NAME marlin
+  #endif
 
 namespace MARLIN_NAMESPACE_NAME {
 
@@ -51,9 +53,51 @@ using I4 = Vec<int, 4>;
 
 constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-// No support for async
-#else
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+__device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  if (pred) {
+    reinterpret_cast<int32_t*>(smem_ptr)[0] =
+        reinterpret_cast<const int32_t*>(glob_ptr)[0];
+  }
+}
+
+__device__ inline void cp_async2_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  if (pred) {
+    reinterpret_cast<int64_t*>(smem_ptr)[0] =
+        reinterpret_cast<const int64_t*>(glob_ptr)[0];
+  }
+}
+
+__device__ inline void cp_async4_ca_pred(void* smem_ptr, const void* glob_ptr,
+                                         bool pred = true) {
+  if (pred) {
+    reinterpret_cast<int4*>(smem_ptr)[0] =
+        reinterpret_cast<const int4*>(glob_ptr)[0];
+  }
+}
+
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {
+  if (pred) {
+    reinterpret_cast<int4*>(smem_ptr)[0] =
+        reinterpret_cast<const int4*>(glob_ptr)[0];
+  }
+}
+
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  reinterpret_cast<int4*>(smem_ptr)[0] =
+      reinterpret_cast<const int4*>(glob_ptr)[0];
+}
+
+__device__ inline void cp_async_fence() {}
+
+template <int n>
+__device__ inline void cp_async_wait() {}
+
+  #else
 
 __device__ inline void cp_async1_ca_pred(void* smem_ptr, const void* glob_ptr,
                                          bool pred = true) {
@@ -126,6 +170,8 @@ __device__ inline void cp_async_wait() {
   asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
 }
 
-#endif
+  #endif
 
 }  // namespace MARLIN_NAMESPACE_NAME
+
+#endif
\ No newline at end of file
diff --git a/csrc/quantization/gptq_marlin/marlin_mma.h b/csrc/quantization/gptq_marlin/marlin_mma.h
new file mode 100644
index 0000000000000..6ec2aaafc4392
--- /dev/null
+++ b/csrc/quantization/gptq_marlin/marlin_mma.h
@@ -0,0 +1,269 @@
+
+#include "marlin_dtypes.cuh"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
+// output/accumulation.
+template <vllm::ScalarTypeId type_id, bool use_fp16_accum, int k_size = 16>
+__device__ inline void mma(
+    const typename MarlinScalarType<type_id>::FragA& a_frag,
+    const typename MarlinScalarType<type_id>::FragB& frag_b,
+    typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
+  if constexpr (!std::is_same<scalar_t, half>::value || k_size != 16) {
+    static_assert(!use_fp16_accum);
+  }
+
+  if constexpr (k_size == 16) {
+    if constexpr (std::is_same<scalar_t, half>::value && !use_fp16_accum) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(b[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[2]), "r"(a[3]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+#else
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+#endif
+    } else if constexpr (std::is_same<scalar_t, half>::value &&
+                         use_fp16_accum) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[0]), "r"(a[1]), "r"(b[0]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[2]), "r"(a[3]), "r"(b[1]), "r"(c[0]), "r"(c[1]));
+#else
+      uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "r"(c[0]), "r"(c[1]));
+#endif
+    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
+            "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
+      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
+            "r"(c[1]), "r"(c[2]), "r"(c[3]));
+    }
+  } else if (k_size == 32) {
+    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
+      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[0]), "r"(b[0]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[2]), "=r"(c[3])
+          : "r"(a[1]), "r"(b[0]), "r"(c[2]), "r"(c[3]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(a[2]), "r"(b[1]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[2]), "=r"(c[3])
+          : "r"(a[3]), "r"(b[1]), "r"(c[2]), "r"(c[3]));
+#else
+      asm volatile(
+          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
+#endif
+    }
+  }
+}
+
+template <vllm::ScalarTypeId type_id, bool use_fp16_accum, int k_size = 16>
+__device__ inline void mma_trans(
+    const typename MarlinScalarType<type_id>::FragA& a_frag,
+    const typename MarlinScalarType<type_id>::FragB& frag_b,
+    const typename MarlinScalarType<type_id>::FragB& frag_b2,
+    typename MarlinScalarType<type_id>::FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
+  float* c = reinterpret_cast<float*>(&frag_c);
+  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
+  if constexpr (!std::is_same<scalar_t, half>::value || k_size != 16) {
+    static_assert(!use_fp16_accum);
+  }
+
+  if constexpr (k_size == 16) {
+    if constexpr (std::is_same<scalar_t, half>::value && !use_fp16_accum) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[1]), "r"(b2[1]), "r"(a[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+#else
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+#endif
+    } else if constexpr (std::is_same<scalar_t, half>::value &&
+                         use_fp16_accum) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3}, {%4}, {%5,%6};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[1]), "r"(b2[1]), "r"(a[1]), "r"(c[0]), "r"(c[1]));
+#else
+      uint32_t* c = reinterpret_cast<uint32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 "
+          "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "r"(c[0]), "r"(c[1]));
+#endif
+    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
+            "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
+      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
+          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
+            "r"(c[3]));
+    }
+  } else {
+    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
+      float* c = reinterpret_cast<float*>(&frag_c);
+      asm volatile(
+          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
+      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[2]), "=r"(c[3])
+          : "r"(b2[1]), "r"(a[0]), "r"(c[2]), "r"(c[3]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[0]), "=r"(c[1])
+          : "r"(b[0]), "r"(a[1]), "r"(c[0]), "r"(c[1]));
+      asm volatile(
+          "mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1}, {%2}, {%3}, {%4,%5};\n"
+          : "=r"(c[2]), "=r"(c[3])
+          : "r"(b2[1]), "r"(a[1]), "r"(c[2]), "r"(c[3]));
+#else
+      asm volatile(
+          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
+          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
+          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
+            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
+#endif
+    }
+  }
+}
+
+}  // namespace MARLIN_NAMESPACE_NAME
\ No newline at end of file
diff --git a/csrc/quantization/gptq_marlin/marlin_template.h b/csrc/quantization/gptq_marlin/marlin_template.h
index 22bb71e482ce8..c7b53696c1223 100644
--- a/csrc/quantization/gptq_marlin/marlin_template.h
+++ b/csrc/quantization/gptq_marlin/marlin_template.h
@@ -26,6 +26,7 @@
 #include "marlin.cuh"
 #include "marlin_dtypes.cuh"
 #include "dequant.h"
+#include "marlin_mma.h"
 #include "core/scalar_type.hpp"
 
 #define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
@@ -35,7 +36,7 @@
 
 namespace MARLIN_NAMESPACE_NAME {
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
 
 template <typename scalar_t,  // compute dtype, half or nv_float16
           const vllm::ScalarTypeId b_type_id,  // weight MarlinScalarType id
@@ -75,137 +76,6 @@ __global__ void Marlin(
 
 #else
 
-// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
-// output/accumulation.
-template <vllm::ScalarTypeId type_id, int k_size = 16>
-__device__ inline void mma(
-    const typename MarlinScalarType<type_id>::FragA& a_frag,
-    const typename MarlinScalarType<type_id>::FragB& frag_b,
-    typename MarlinScalarType<type_id>::FragC& frag_c, int idx = 0) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
-  if constexpr (k_size == 16) {
-    if constexpr (std::is_same<scalar_t, half>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "f"(c[0]),
-            "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(a[idx * 2]), "r"(a[idx * 2 + 1]), "r"(b[idx]), "r"(c[0]),
-            "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  } else if (k_size == 32) {
-    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
-            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  }
-}
-
-template <vllm::ScalarTypeId type_id, int k_size = 16>
-__device__ inline void mma_trans(
-    const typename MarlinScalarType<type_id>::FragA& a_frag,
-    const typename MarlinScalarType<type_id>::FragB& frag_b,
-    const typename MarlinScalarType<type_id>::FragB& frag_b2,
-    typename MarlinScalarType<type_id>::FragC& frag_c) {
-  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
-  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
-  const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
-  float* c = reinterpret_cast<float*>(&frag_c);
-  using scalar_t = typename MarlinScalarType<type_id>::scalar_t;
-  if constexpr (k_size == 16) {
-    if constexpr (std::is_same<scalar_t, half>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "f"(c[0]), "f"(c[1]), "f"(c[2]),
-            "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(a[0]), "r"(c[0]), "r"(c[1]), "r"(c[2]),
-            "r"(c[3]));
-    }
-  } else {
-    if constexpr (std::is_same<scalar_t, __nv_fp8_e4m3>::value) {
-      float* c = reinterpret_cast<float*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
-    } else if constexpr (std::is_same<scalar_t, int8_t>::value) {
-      int32_t* c = reinterpret_cast<int32_t*>(&frag_c);
-      asm volatile(
-          "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite "
-          "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
-          : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3])
-          : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]),
-            "r"(c[0]), "r"(c[1]), "r"(c[2]), "r"(c[3]));
-    }
-  }
-}
-
 // Instruction for loading a full 16x16 matrix fragment of operand A from shared
 // memory, directly in tensor core layout.
 template <int count, vllm::ScalarTypeId type_id>
@@ -415,6 +285,17 @@ __global__ void Marlin(
   if constexpr (a_type_id == vllm::kFE4M3fn.id()) return;
   #endif
 
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+  // Turing TensorCore only supports fp16 and int8
+  if constexpr (a_type_id != vllm::kFloat16.id() && a_type_id != vllm::kS8.id())
+    return;
+  #endif
+
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+  constexpr bool use_fp16_accum = a_type_id == vllm::kFloat16.id();
+  #else
+  constexpr bool use_fp16_accum = false;
+  #endif
   using Adtype = MarlinScalarType<a_type_id>;
   using Cdtype = MarlinScalarType<c_type_id>;
   const int4* A = A0;
@@ -873,10 +754,6 @@ __global__ void Marlin(
   constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
                                           : (stages * s_sh_stage);
   int4* sh_s = sh_zp + (stages * zp_sh_stage);
-  // shared memory reused by reduction should be smaller than
-  // shared memory used by weight.
-  static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
-                stages * b_sh_stage);
   int4* sh_a = sh_s + sh_s_size;
 
   // Register storage for double buffer of shared memory reads.
@@ -1395,11 +1272,13 @@ __global__ void Marlin(
   #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
         if constexpr (m_block_size_8) {
-          mma_trans<a_type_id>(frag_a[k2][i], frag_b0, frag_b1,
-                               frag_c[i][j][0]);
+          mma_trans<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0, frag_b1,
+                                               frag_c[i][j][0]);
         } else {
-          mma<a_type_id>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
-          mma<a_type_id>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
+          mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b0,
+                                         frag_c[i][j][0]);
+          mma<a_type_id, use_fp16_accum>(frag_a[k2][i], frag_b1,
+                                         frag_c[i][j][1]);
         }
       }
     }
@@ -1433,10 +1312,12 @@ __global__ void Marlin(
 
   #pragma unroll
       for (int i = 0; i < thread_m_blocks; i++) {
-        mma<a_type_id, 32>(frag_a[k2][i], frag_b[0],
-                           (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
-        mma<a_type_id, 32>(frag_a[k2][i], frag_b[1],
-                           (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
+        mma<a_type_id, false, 32>(
+            frag_a[k2][i], frag_b[0],
+            (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][0]);
+        mma<a_type_id, false, 32>(
+            frag_a[k2][i], frag_b[1],
+            (group_blocks == -1 ? frag_c : frag_c_tmp)[i][j][1]);
       }
 
       if constexpr (group_blocks != -1) {
@@ -1956,6 +1837,21 @@ __global__ void Marlin(
     // While this pattern may not be the most readable, other ways of writing
     // the loop seemed to noticeably worse performance after compilation.
     if (slice_iters == 0) {
+      // convert fp16 accum to fp32 for reduction
+      if constexpr (use_fp16_accum) {
+  #pragma unroll
+        for (int i = 0; i < (thread_m_blocks * (is_a_8bit ? 2 : 4) * 2); i++) {
+          float* frag_c_part_float = reinterpret_cast<float*>(frag_c) + i * 4;
+          scalar_t* frag_c_part_half =
+              reinterpret_cast<scalar_t*>(frag_c_part_float);
+
+  #pragma unroll
+          for (int i = 3; i >= 0; i--) {
+            frag_c_part_float[i] = Cdtype::num2float(frag_c_part_half[i]);
+          }
+        }
+      }
+
       if constexpr (is_a_8bit) {
         float frag_a_s[2 * thread_m_blocks];
 
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 3ed15ed7dd422..314848721a80a 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -121,7 +121,7 @@ class AWQMarlinConfig(QuantizationConfig):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def get_config_filenames(cls) -> list[str]:
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index f2b66a2beb6d7..800340ed6043c 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -253,7 +253,7 @@ class Fp8Config(QuantizationConfig):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def get_config_filenames(cls) -> list[str]:
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 6e5dcfe59b2f9..347c7b2008d12 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -181,7 +181,7 @@ class GPTQMarlinConfig(QuantizationConfig):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def get_config_filenames(cls) -> list[str]:
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index d5d7e7bfaae73..aa3937d4c03ff 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -871,7 +871,7 @@ class ModelOptNvFp4Config(ModelOptQuantConfigBase):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 80
+        return 75
 
     @classmethod
     def override_quantization_method(

From b6ec077e058e15e5b853793924e6643ec6c579aa Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Tue, 16 Dec 2025 17:47:53 -0500
Subject: [PATCH 09/49] [CI] Skip ci failure test (#30804)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 tests/compile/distributed/test_fusions_e2e.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index bd326f1157d8f..80086c4e03a9c 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -523,6 +523,8 @@ CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]
     list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
 )
 @pytest.mark.parametrize("inductor_graph_partition", [True, False])
+# TODO: remove skip after we fix the fusion thoroughly
+@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell")
 def test_rms_group_quant(
     model_name: str,
     model_kwargs: dict[str, Any],
@@ -562,7 +564,7 @@ def test_rms_group_quant(
         splitting_ops=splitting_ops,
         # Common
         mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(eliminate_noops=True, enable_fusion=True),
+        pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
         # Inductor caches custom passes by default as well via uuid
         inductor_compile_config={"force_disable_caches": True},
     )

From 0a1ab1e565fce5070bc1c1b1f3374537e437550c Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 16 Dec 2025 17:56:02 -0500
Subject: [PATCH 10/49] [Perf][Kernels] Vectorize `csrc/activations_kernels.cu`
 (#29512)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 benchmarks/kernels/benchmark_activation.py |   4 +-
 csrc/activation_kernels.cu                 | 208 +++++++++++++++++----
 2 files changed, 175 insertions(+), 37 deletions(-)

diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
index 66268b71b3de6..d31e67057d8f6 100644
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -13,8 +13,8 @@ from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+batch_size_range = [1, 16, 128]
+seq_len_range = [1, 16, 64, 1024, 4096]
 intermediate_size = [3072, 9728, 12288]
 configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
 
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index a4a880f13cf7e..8268065ef02c8 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -15,19 +15,61 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
                                             const scalar_t& y) {
   return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
 }
-// Activation and gating kernel template.
 
+// Check if all pointers are 16-byte aligned for int4 vectorized access
+__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
+  return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
+}
+
+// Activation and gating kernel template.
 template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
           bool act_first>
 __global__ void act_and_mul_kernel(
     scalar_t* __restrict__ out,          // [..., d]
     const scalar_t* __restrict__ input,  // [..., 2, d]
     const int d) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
   const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
+  const scalar_t* x_ptr = input + token_idx * 2 * d;
+  const scalar_t* y_ptr = x_ptr + d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access.
+  // All three pointers must be 16-byte aligned for safe int4 operations.
+  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+                       is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
+    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+      auto* xp = reinterpret_cast<scalar_t*>(&x);
+      auto* yp = reinterpret_cast<scalar_t*>(&y);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
+                                                        VLLM_LDG(&y_ptr[i]));
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
+    }
   }
 }
 
@@ -120,50 +162,115 @@ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
 __global__ void act_and_mul_kernel_with_param(
     scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
     const float param) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
   const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x, param) * y;
+  const scalar_t* x_ptr = input + token_idx * 2 * d;
+  const scalar_t* y_ptr = x_ptr + d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access
+  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+                       is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
+    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+      auto* xp = reinterpret_cast<scalar_t*>(&x);
+      auto* yp = reinterpret_cast<scalar_t*>(&y);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = ACT_FN(xp[j], param) * yp[j];
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] = ACT_FN(x, param) * y;
+    }
   }
 }
 
 template <typename T>
 __device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
                                                float alpha, float limit) {
-  // clamp gate: min=None, max=limit
-  const float gate_f = (float)gate;
-  const float clamped_gate = gate_f > limit ? limit : gate_f;
-
-  // clamp up: min=-limit, max=limit
-  const float up_f = (float)up;
-  const float clamped_up =
-      up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
-
-  // glu = gate * sigmoid(gate * alpha)
-  const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
-  const float glu = clamped_gate * sigmoid_val;
-
-  // (up + 1) * glu
-  return (T)((clamped_up + 1.0f) * glu);
+  // Clamp gate to (-inf, limit] and up to [-limit, limit]
+  const float g = fminf((float)gate, limit);
+  const float u = fmaxf(fminf((float)up, limit), -limit);
+  // glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu
+  return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha)));
 }
 
+// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...].
 template <typename scalar_t,
           scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
                              const float)>
 __global__ void swigluoai_and_mul_kernel(
     scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const scalar_t* __restrict__ input,  // [..., 2 * d] (interleaved)
     const int d, const float alpha, const float limit) {
+  // For interleaved data: input has 2*d elements per token (gate/up pairs)
+  // output has d elements per token
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
+  constexpr int PAIRS = VEC_SIZE / 2;  // Number of gate/up pairs per int4 load
   const int64_t token_idx = blockIdx.x;
-  // TODO: Vectorize loads and stores.
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    // gate = x[..., ::2]  (even indices)
-    const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
-    // up = x[..., 1::2]   (odd indices)
-    const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
+  const scalar_t* in_ptr = input + token_idx * 2 * d;
+  scalar_t* out_ptr = out + token_idx * d;
 
-    out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
+  // Check alignment for 128-bit vectorized access on input.
+  // For output we use int2 (64-bit) which has 8-byte alignment requirement.
+  const bool in_aligned = is_16byte_aligned(in_ptr);
+  const bool out_aligned =
+      (reinterpret_cast<uintptr_t>(out_ptr) & 7) == 0;  // 8-byte for int2
+
+  if (in_aligned && out_aligned && d >= PAIRS) {
+    // Fast path: vectorized loop
+    // Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs
+    // Each int2 store writes PAIRS output elements
+    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
+    int2* out_vec = reinterpret_cast<int2*>(out_ptr);
+    const int num_vecs = d / PAIRS;
+    const int vec_end = num_vecs * PAIRS;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 v = VLLM_LDG(&in_vec[i]);
+      int2 r;
+      auto* vp = reinterpret_cast<scalar_t*>(&v);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < PAIRS; j++) {
+        rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]),
+                          VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit);
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      // gate = x[..., ::2]  (even indices)
+      const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]);
+      // up = x[..., 1::2]   (odd indices)
+      const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]);
+      out_ptr[idx] = ACT_FN(gate, up, alpha, limit);
+    }
   }
 }
 
@@ -217,10 +324,41 @@ __global__ void activation_kernel(
     scalar_t* __restrict__ out,          // [..., d]
     const scalar_t* __restrict__ input,  // [..., d]
     const int d) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
   const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x);
+  const scalar_t* in_ptr = input + token_idx * d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access
+  const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 v = VLLM_LDG(&in_vec[i]), r;
+      auto* vp = reinterpret_cast<scalar_t*>(&v);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = ACT_FN(vp[j]);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&in_ptr[idx]);
+      out_ptr[idx] = ACT_FN(x);
+    }
   }
 }
 

From 2410132bb1f9faa5b252fad3f2b83dc926946b08 Mon Sep 17 00:00:00 2001
From: TJian <tunjian.tan@embeddedllm.com>
Date: Wed, 17 Dec 2025 07:32:43 +0800
Subject: [PATCH 11/49] [ROCm] [Bugfix] Fix torch sdpa hallucination (#30789)

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 vllm/attention/ops/vit_attn_wrappers.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py
index 46c7d83dfa5c2..892c4209c01e0 100644
--- a/vllm/attention/ops/vit_attn_wrappers.py
+++ b/vllm/attention/ops/vit_attn_wrappers.py
@@ -16,6 +16,7 @@ import einops
 import torch
 import torch.nn.functional as F
 
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import direct_register_custom_op
 
 
@@ -89,6 +90,13 @@ def torch_sdpa_wrapper(
     v: torch.Tensor,
     cu_seqlens: torch.Tensor,
 ) -> torch.Tensor:
+    # Never remove the contiguous logic for ROCm
+    # Without it, hallucinations occur with the backend
+    if current_platform.is_rocm():
+        q = q.contiguous()
+        k = k.contiguous()
+        v = v.contiguous()
+
     outputs = []
 
     lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()

From e80455ca8b696452b98d91785175210ed7a1bd41 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 16 Dec 2025 18:40:47 -0500
Subject: [PATCH 12/49] Replace deprecated enable_fusion with fuse_norm_quant
 in test_rms_group_quant (#30817)

Signed-off-by: mgoin <mgoin64@gmail.com>

From e087fbc393055fb69e9acf71fa124be0190498ec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Wed, 17 Dec 2025 00:54:45 +0100
Subject: [PATCH 13/49] [MM] Pass FA version in ViT Attn (#30756)

Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 vllm/attention/layers/mm_encoder_attention.py | 6 ++++++
 vllm/attention/ops/vit_attn_wrappers.py       | 9 ++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/attention/layers/mm_encoder_attention.py
index c9107ebcab856..8b3dee1340b9f 100644
--- a/vllm/attention/layers/mm_encoder_attention.py
+++ b/vllm/attention/layers/mm_encoder_attention.py
@@ -10,6 +10,7 @@ from vllm.attention.ops.vit_attn_wrappers import (
     vit_flash_attn_wrapper,
     vit_torch_sdpa_wrapper,
 )
+from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.config import MultiModalConfig
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
@@ -101,6 +102,10 @@ class MMEncoderAttention(CustomOp):
             self.attn_backend,
         )
 
+        if self.is_flash_attn_backend:
+            assert self.flash_attn_varlen_func is not None
+            self._fa_version = get_flash_attn_version()
+
         logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
 
     @classmethod
@@ -204,6 +209,7 @@ class MMEncoderAttention(CustomOp):
             max_seqlen=max_seqlen,
             batch_size=bsz,
             is_rocm_aiter=(self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA),
+            fa_version=self._fa_version,
         )
         return output
 
diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py
index 892c4209c01e0..5a74e1310133d 100644
--- a/vllm/attention/ops/vit_attn_wrappers.py
+++ b/vllm/attention/ops/vit_attn_wrappers.py
@@ -28,11 +28,15 @@ def flash_attn_maxseqlen_wrapper(
     max_seqlen: torch.Tensor,
     batch_size: int,
     is_rocm_aiter: bool,
+    fa_version: int,
 ) -> torch.Tensor:
+    kwargs = {}
     if is_rocm_aiter:
         from aiter import flash_attn_varlen_func
     else:
         from vllm.attention.utils.fa_utils import flash_attn_varlen_func
+
+        kwargs["fa_version"] = fa_version
     q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
     output = flash_attn_varlen_func(
         q,
@@ -44,6 +48,7 @@ def flash_attn_maxseqlen_wrapper(
         max_seqlen_k=max_seqlen.item(),
         dropout_p=0.0,
         causal=False,
+        **kwargs,
     )
     context_layer = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size)
     return context_layer
@@ -57,6 +62,7 @@ def flash_attn_maxseqlen_wrapper_fake(
     max_seqlen: torch.Tensor,
     batch_size: int,
     is_rocm_aiter: bool,
+    fa_version: int,
 ) -> torch.Tensor:
     return torch.empty_like(q)
 
@@ -76,9 +82,10 @@ def vit_flash_attn_wrapper(
     max_seqlen: torch.Tensor,
     batch_size: int,
     is_rocm_aiter: bool,
+    fa_version: int,
 ) -> torch.Tensor:
     return torch.ops.vllm.flash_attn_maxseqlen_wrapper(
-        q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter
+        q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, fa_version
     )
 
 

From c0a88df7f771a48247a934e8821e6e230b3fc5a4 Mon Sep 17 00:00:00 2001
From: Amr Mahdi <amrmahdi@meta.com>
Date: Wed, 17 Dec 2025 02:41:57 +0200
Subject: [PATCH 14/49] [docker] Allow kv_connectors install to fail on arm64
 (#30806)

Signed-off-by: Amr Mahdi <amrmahdi@meta.com>
---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index ae2624ace67b9..e61021b6eeb85 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -621,7 +621,7 @@ ENV UV_HTTP_TIMEOUT=500
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
     if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
-        uv pip install --system -r /tmp/kv_connectors.txt; \
+        uv pip install --system -r /tmp/kv_connectors.txt || true; \
     fi
 
 ENV VLLM_USAGE_SOURCE production-docker-image

From f5db6385a19b04e76b5834618305485753e75544 Mon Sep 17 00:00:00 2001
From: "Grzegorz K. Karch" <grzegorz-k-karch@users.noreply.github.com>
Date: Wed, 17 Dec 2025 02:06:28 +0100
Subject: [PATCH 15/49] Fix nemotron_nas intermediate_size computation (#30795)

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 vllm/model_executor/models/nemotron_nas.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 19a942a5277cc..83ef5e7e1282d 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -169,10 +169,13 @@ class DeciLMDecoderLayer(nn.Module):
             self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         if not self._is_no_op_ffn:
-            ffn_mult = block_config.ffn.ffn_mult
-            intermediate_size = _ffn_mult_to_intermediate_size(
-                ffn_mult, config.hidden_size
-            )
+            if hasattr(block_config.ffn, "ffn_mult"):
+                ffn_mult = block_config.ffn.ffn_mult
+                intermediate_size = _ffn_mult_to_intermediate_size(
+                    ffn_mult, config.hidden_size
+                )
+            else:
+                intermediate_size = block_config.ffn.intermediate_size
 
             self.mlp = LlamaMLP(
                 hidden_size=self.hidden_size,

From 811cdf5197acb4d6ab42250a5b0f822887d1190a Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 16 Dec 2025 20:52:14 -0500
Subject: [PATCH 16/49] Update model-hosting-container-standards to 0.1.10
 (#30815)

Signed-off-by: Michael Goin <mgoin64@gmail.com>
---
 requirements/common.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/common.txt b/requirements/common.txt
index 31c8fb404f63a..426d281c26704 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -50,5 +50,5 @@ ijson # Required for mistral streaming tool parser
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic == 0.71.0
-model-hosting-container-standards >= 0.1.9, < 1.0.0
-mcp
\ No newline at end of file
+model-hosting-container-standards >= 0.1.10, < 1.0.0
+mcp

From bb5ac1fe38c9fcc7bafaee47fd45c8d1696ad176 Mon Sep 17 00:00:00 2001
From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com>
Date: Wed, 17 Dec 2025 04:21:07 +0000
Subject: [PATCH 17/49] [CPU] Add action to automatically label CPU related PRs
 (#30678)

Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
---
 .github/mergify.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 3ad79f93bc7ad..3e4e21efe39df 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -235,6 +235,20 @@ pull_request_rules:
       add:
         - rocm
 
+- name: label-cpu
+  description: Automatically apply cpu label
+  conditions:
+    - label != stale
+    - files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.*
+  actions:
+    label:
+      add:
+        - cpu
+    assign:
+      users:
+        - "fadara01"
+        - "aditew01"
+
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:

From 44d3b1df3d6416b76d84c360d751b8f5220c0b11 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 17 Dec 2025 12:21:19 +0800
Subject: [PATCH 18/49] [CI/Build] Fix compatibility between #30244 and #30396
 (#30787)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/compile/distributed/test_fusions_e2e.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 80086c4e03a9c..960b5b4bd7ad4 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -564,7 +564,9 @@ def test_rms_group_quant(
         splitting_ops=splitting_ops,
         # Common
         mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
+        pass_config=PassConfig(
+            fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
+        ),
         # Inductor caches custom passes by default as well via uuid
         inductor_compile_config={"force_disable_caches": True},
     )

From 009a773828fee13504ee2976ad02abb6020152c8 Mon Sep 17 00:00:00 2001
From: shanjiaz <43143795+shanjiaz@users.noreply.github.com>
Date: Wed, 17 Dec 2025 00:01:04 -0500
Subject: [PATCH 19/49] bump up compressed tensors version to 0.13.0 (#30799)

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
---
 requirements/common.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/common.txt b/requirements/common.txt
index 426d281c26704..7c89385da6ba5 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.12.2 # required for compressed-tensors
+compressed-tensors == 0.13.0 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files

From d4d2751732c3ccae162a5a0160c7d4fe05d2779a Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Wed, 17 Dec 2025 00:29:03 -0500
Subject: [PATCH 20/49] Update note comment for flashinfer attention warmup
 (#30711)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 vllm/model_executor/warmup/kernel_warmup.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index 95f5982bc8c7b..98b28d3e5292f 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -49,13 +49,12 @@ def kernel_warmup(worker: "Worker"):
         except NotImplementedError:
             return False
 
-    # NOTE: we add check for empty attn_groups to avoid errors when
-    # deploying models such as E instances and encoder-only models.
-    # As for those models, worker.model_runner.attn_groups is empty.
-    # This change is made during EPD feature development.
     if (
         not worker.model_runner.is_pooling_model
         and worker.model_runner.attn_groups
+        # NOTE: This should be `any` instead of `all` but other hybrid attention
+        # backends don't support this dummy run. Once we remove
+        # `build_for_cudagraph_capture`, we can change it to `any`.
         and all(
             _is_flashinfer_backend(group.backend)
             for groups in worker.model_runner.attn_groups

From 0cd5353644d3d045ab33c7e8e19c182bfd7db911 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Wed, 17 Dec 2025 15:25:12 +0800
Subject: [PATCH 21/49] [Bugfix][CPU] Fix CPU backend ROPE dispatch for VL
 models (#30829)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Li, Jiang <bigpyj64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/model_executor/layers/rotary_embedding/common.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py
index 3e6584dbc3da0..50660c6ecc223 100644
--- a/vllm/model_executor/layers/rotary_embedding/common.py
+++ b/vllm/model_executor/layers/rotary_embedding/common.py
@@ -264,6 +264,15 @@ class ApplyRotaryEmb(CustomOp):
 
         return output
 
+    def forward_cpu(
+        self,
+        x: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        # TODO (bigPYJ1151): need to enable fused CPU ROPE here
+        return self.forward_native(x, cos, sin)
+
     def extra_repr(self) -> str:
         s = f"is_neox_style={self.is_neox_style}"
         s += f"enable_fp32_compute={self.enable_fp32_compute}"

From 4f735babb7353987137b85ec0465e594e9ed1384 Mon Sep 17 00:00:00 2001
From: Yan Ma <yan.ma@intel.com>
Date: Wed, 17 Dec 2025 16:28:13 +0800
Subject: [PATCH 22/49] [XPU] fix broken fp8 online quantization for XPU
 platform (#30831)

Signed-off-by: Yan Ma <yan.ma@intel.com>
---
 .../layers/quantization/ipex_quant.py         | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 463c74c1c1482..f33ee43727f19 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -27,6 +27,10 @@ from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
 from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    maybe_create_device_identity,
+)
+from vllm.model_executor.parameter import ModelWeightParameter
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 
@@ -305,6 +309,37 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
     def __init__(self, quant_config: Fp8Config):
         super().__init__(quant_config)
 
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        maybe_create_device_identity()
+
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+        layer.weight_block_size = None
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
     def process_weights_after_loading(self, layer: Module) -> None:
         # If checkpoint not serialized fp8, quantize the weights.
         if not self.quant_config.is_checkpoint_fp8_serialized:

From 20fda431515d19a883cc962d3a1fa727f225e82d Mon Sep 17 00:00:00 2001
From: Robin <863579016@qq.com>
Date: Wed, 17 Dec 2025 16:37:57 +0800
Subject: [PATCH 23/49] [Bugfix][Frontend] Prevent IndexError in MiniMax M2
 tool parser during streaming extraction (#30555)

Signed-off-by: WangErXiao <863579016@qq.com>
---
 tests/tool_use/test_minimax_m2_tool_parser.py | 119 ++++++++++++++++++
 vllm/tool_parsers/minimax_m2_tool_parser.py   |  22 +++-
 2 files changed, 137 insertions(+), 4 deletions(-)
 create mode 100644 tests/tool_use/test_minimax_m2_tool_parser.py

diff --git a/tests/tool_use/test_minimax_m2_tool_parser.py b/tests/tool_use/test_minimax_m2_tool_parser.py
new file mode 100644
index 0000000000000..cf1835b1928b4
--- /dev/null
+++ b/tests/tool_use/test_minimax_m2_tool_parser.py
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+
+from vllm.tool_parsers.minimax_m2_tool_parser import (
+    MinimaxM2ToolParser,
+)
+
+pytestmark = pytest.mark.cpu_test
+
+
+class FakeTokenizer:
+    """Minimal fake tokenizer that exposes the attributes used by the
+    parser: a truthy model_tokenizer marker and a vocab mapping for the
+    special tokens.
+    """
+
+    def __init__(self):
+        self.model_tokenizer = True
+        # The parser will look up start/end tokens by their literal strings
+        self.vocab = {
+            "<minimax:tool_call>": 1,
+            "</minimax:tool_call>": 2,
+        }
+
+    def get_vocab(self):
+        return self.vocab
+
+
+@pytest.fixture
+def minimax_m2_tool_parser():
+    return MinimaxM2ToolParser(FakeTokenizer())
+
+
+def test_extract_tool_calls_streaming_incremental(minimax_m2_tool_parser):
+    parser = minimax_m2_tool_parser
+    parser._reset_streaming_state()
+    chunks = [
+        "<minimax:tool_call>",
+        '<invoke name="get_weather">',
+        '<parameter name="city">',
+        "Seattle</parameter>",
+        "</invoke></minimax:tool_call>",
+    ]
+    previous = ""
+    for chunk in chunks:
+        current = previous + chunk
+        delta = chunk
+        parser.extract_tool_calls_streaming(
+            previous_text=previous,
+            current_text=current,
+            delta_text=delta,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=None,
+        )
+        previous = current
+
+    assert len(parser.prev_tool_call_arr) == 1
+    entry = parser.prev_tool_call_arr[0]
+
+    assert entry["name"] == "get_weather"
+    args = entry["arguments"]
+    assert args["city"] == "Seattle"
+
+
+def test_streaming_minimax_m2_multiple_invokes(minimax_m2_tool_parser):
+    parser = minimax_m2_tool_parser
+    parser._reset_streaming_state()
+
+    chunks = [
+        "<minimax:tool_call>",
+        '<invoke name="search_web">',
+        '<parameter name="query_tag">',
+        '["technology", "events"]</parameter>',
+        '<parameter name="query_list">',
+        '["OpenAI", "latest", "release"]</parameter>',
+        "</invoke>",
+        '<invoke name="search_web">',
+        '<parameter name="query_tag">',
+        '["technology", "events"]</parameter>',
+        '<parameter name="query_list">',
+        '["Gemini", "latest", "release"]</parameter>',
+        "</invoke>",
+        "</minimax:tool_call>",
+    ]
+    previous = ""
+    for chunk in chunks:
+        current = previous + chunk
+        delta = chunk
+        parser.extract_tool_calls_streaming(
+            previous_text=previous,
+            current_text=current,
+            delta_text=delta,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=None,
+        )
+        previous = current
+
+    assert len(parser.prev_tool_call_arr) == 2
+
+    for entry, expect_model in zip(parser.prev_tool_call_arr, ["OpenAI", "Gemini"]):
+        assert entry["name"] == "search_web"
+        args = json.dumps(entry["arguments"])
+        assert "technology" in args and "events" in args
+        assert expect_model in args
+
+    # check streamed_args_for_tool for serving_chat.py
+    for index in range(2):
+        expected_call = parser.prev_tool_call_arr[index].get("arguments", {})
+        expected_call = json.dumps(expected_call)
+        actual_call = parser.streamed_args_for_tool[index]
+        assert expected_call == actual_call
diff --git a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py
index dcb2b64f6e73c..a1ab75f548bfc 100644
--- a/vllm/tool_parsers/minimax_m2_tool_parser.py
+++ b/vllm/tool_parsers/minimax_m2_tool_parser.py
@@ -122,6 +122,8 @@ class MinimaxM2ToolParser(ToolParser):
         self.streaming_request = None
         # Clear previous tool call history to avoid state pollution
         self.prev_tool_call_arr.clear()
+        # Reset streamed args tracking
+        self.streamed_args_for_tool.clear()
 
     def _extract_name(self, name_str: str) -> str:
         """Extract name from quoted string."""
@@ -421,9 +423,12 @@ class MinimaxM2ToolParser(ToolParser):
                         self.prev_tool_call_arr.append(
                             {
                                 "name": self.current_function_name,
-                                "arguments": "{}",  # Placeholder, will be updated later
+                                "arguments": {},  # Placeholder, will be updated later
                             }
                         )
+                        # Initialize streamed_args_for_tool for this tool call
+                        if len(self.streamed_args_for_tool) <= self.current_tool_index:
+                            self.streamed_args_for_tool.append("")
 
                     # Send header with function info
                     return DeltaMessage(
@@ -445,6 +450,9 @@ class MinimaxM2ToolParser(ToolParser):
             # Send opening brace if not sent yet
             if self.in_function and not self.json_started:
                 self.json_started = True
+                # Update streamed_args_for_tool for opening brace
+                if self.current_tool_index < len(self.streamed_args_for_tool):
+                    self.streamed_args_for_tool[self.current_tool_index] += "{"
                 return DeltaMessage(
                     tool_calls=[
                         DeltaToolCall(
@@ -493,7 +501,7 @@ class MinimaxM2ToolParser(ToolParser):
                                 args = parsed_tool.function.arguments
                                 self.prev_tool_call_arr[self.current_tool_index][
                                     "arguments"
-                                ] = args
+                                ] = json.loads(args)
                         except Exception:
                             pass  # Ignore parsing errors during streaming
 
@@ -505,7 +513,9 @@ class MinimaxM2ToolParser(ToolParser):
                             )
                         ]
                     )
-
+                    # Update streamed_args_for_tool for closing brace
+                    if self.current_tool_index < len(self.streamed_args_for_tool):
+                        self.streamed_args_for_tool[self.current_tool_index] += "}"
                     # Reset state for next tool
                     self.json_closed = True
                     self.in_function = False
@@ -630,7 +640,11 @@ class MinimaxM2ToolParser(ToolParser):
                             )
 
                         self.param_count += 1
-
+                        # Update streamed_args_for_tool for this tool call
+                        if self.current_tool_index < len(self.streamed_args_for_tool):
+                            self.streamed_args_for_tool[self.current_tool_index] += (
+                                json_fragment
+                            )
                         return DeltaMessage(
                             tool_calls=[
                                 DeltaToolCall(

From a9e15c21efbbc5b4a7a1e69e40378fdfe1acdcb7 Mon Sep 17 00:00:00 2001
From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Date: Wed, 17 Dec 2025 10:48:53 +0200
Subject: [PATCH 24/49] [Mamba] Removed disable cascade attn in
 MambaModelConfig (#30712)

Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
---
 vllm/model_executor/models/config.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 4b08472538db4..a3624b1cfa5f2 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -308,12 +308,6 @@ class MambaModelConfig(VerifyAndUpdateConfig):
         if cache_config.mamba_block_size is None:
             cache_config.mamba_block_size = model_config.max_model_len
 
-        # TODO(tdoublep): remove once cascade attention is supported
-        logger.info(
-            "Disabling cascade attention since it is not supported for hybrid models."
-        )
-        model_config.disable_cascade_attn = True
-
 
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
     @classmethod

From 3b1d440ede42855f031ba72af4817583e5dddba0 Mon Sep 17 00:00:00 2001
From: Xinyu Chen <xinyu1.chen@intel.com>
Date: Wed, 17 Dec 2025 17:43:00 +0800
Subject: [PATCH 25/49] CustomOp: grouped topk (#29575)

Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
---
 tests/kernels/moe/test_grouped_topk.py        | 10 ++--
 .../layers/fused_moe/__init__.py              |  4 +-
 .../layers/fused_moe/fused_moe.py             | 52 +++++++++++++++++++
 vllm/model_executor/layers/fused_moe/layer.py | 23 +++++---
 4 files changed, 75 insertions(+), 14 deletions(-)

diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py
index 662e0723b7583..d26fe50b815b4 100644
--- a/tests/kernels/moe/test_grouped_topk.py
+++ b/tests/kernels/moe/test_grouped_topk.py
@@ -9,8 +9,8 @@ import pytest
 import torch
 
 from vllm.model_executor.layers.fused_moe.fused_moe import (
+    GroupedTopk,
     fused_grouped_topk,
-    grouped_topk,
 )
 from vllm.platforms import current_platform
 
@@ -50,15 +50,17 @@ def test_grouped_topk(
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0")
-        baseline_topk_weights, baseline_topk_ids = grouped_topk(
-            hidden_states=hidden_states,
-            gating_output=gating_output,
+        grouped_topk = GroupedTopk(
             topk=topk,
             renormalize=renormalize,
             num_expert_group=num_expert_group,
             topk_group=topk_group,
             scoring_func=scoring_func,
             routed_scaling_factor=routed_scaling_factor,
+        )
+        baseline_topk_weights, baseline_topk_ids = grouped_topk(
+            hidden_states=hidden_states,
+            gating_output=gating_output,
             e_score_correction_bias=e_score_correction_bias,
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index d71cfc5ad8200..8fee4038b60b8 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -77,11 +77,11 @@ if HAS_TRITON:
         BatchedTritonExperts,
     )
     from vllm.model_executor.layers.fused_moe.fused_moe import (
+        GroupedTopk,
         TritonExperts,
         fused_experts,
         fused_topk,
         get_config_file_name,
-        grouped_topk,
     )
     from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
         TritonOrDeepGemmExperts,
@@ -91,7 +91,7 @@ if HAS_TRITON:
         "fused_topk",
         "fused_experts",
         "get_config_file_name",
-        "grouped_topk",
+        "GroupedTopk",
         "cutlass_moe_fp8",
         "cutlass_moe_fp4",
         "cutlass_moe_w4a8_fp8",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index b286c3bc6fc07..20782e2712f27 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -16,6 +16,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
+from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
@@ -1286,6 +1287,57 @@ def grouped_topk(
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
 
 
+@CustomOp.register("grouped_topk")
+class GroupedTopk(CustomOp):
+    """GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model."""
+
+    def __init__(
+        self,
+        topk: int,
+        renormalize: bool,
+        num_expert_group: int = 0,
+        topk_group: int = 0,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+    ) -> None:
+        super().__init__()
+        self.native_impl = grouped_topk
+        self.topk = topk
+        self.renormalize = renormalize
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.scoring_func = scoring_func
+        self.routed_scaling_factor = routed_scaling_factor
+
+    def forward_native(
+        self,
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        e_score_correction_bias: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.native_impl(
+            hidden_states,
+            gating_output,
+            self.topk,
+            self.renormalize,
+            self.num_expert_group,
+            self.topk_group,
+            self.scoring_func,
+            self.routed_scaling_factor,
+            e_score_correction_bias,
+        )
+
+    def forward_cuda(
+        self,
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        e_score_correction_bias: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.forward_native(
+            hidden_states, gating_output, e_score_correction_bias
+        )
+
+
 @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def eplb_map_to_physical_and_record(
     topk_ids: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index b39ce415a0f83..db97d6eb88ea5 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -67,7 +67,7 @@ else:
         return topk_ids
 
     eplb_map_to_physical_and_record = _eplb_map_to_physical_and_record
-from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
+from vllm.model_executor.layers.fused_moe.fused_moe import GroupedTopk
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
     rocm_aiter_grouped_topk,
 )
@@ -1594,19 +1594,26 @@ class FusedMoE(CustomOp):
                 grouped_topk_impl = partial(
                     rocm_aiter_grouped_topk,
                     num_fused_shared_experts=self.num_fused_shared_experts,
+                    topk=self.top_k,
+                    renormalize=self.renormalize,
+                    num_expert_group=self.num_expert_group,
+                    topk_group=self.topk_group,
+                    scoring_func=self.scoring_func,
+                    routed_scaling_factor=self.routed_scaling_factor,
                 )
             else:
-                grouped_topk_impl = grouped_topk
+                grouped_topk_impl = GroupedTopk(
+                    topk=self.top_k,
+                    renormalize=self.renormalize,
+                    num_expert_group=self.num_expert_group,
+                    topk_group=self.topk_group,
+                    scoring_func=self.scoring_func,
+                    routed_scaling_factor=self.routed_scaling_factor,
+                )
 
             topk_weights, topk_ids = grouped_topk_impl(
                 hidden_states=hidden_states,
                 gating_output=router_logits,
-                topk=self.top_k,
-                renormalize=self.renormalize,
-                num_expert_group=self.num_expert_group,
-                topk_group=self.topk_group,
-                scoring_func=self.scoring_func,
-                routed_scaling_factor=self.routed_scaling_factor,
                 e_score_correction_bias=self.e_score_correction_bias,
             )
         elif self.e_score_correction_bias is not None:

From f4e884f2224a25612eaeaeac2a854c1dd330c144 Mon Sep 17 00:00:00 2001
From: Sheng Lin <linsh0@protonmail.com>
Date: Wed, 17 Dec 2025 17:52:58 +0800
Subject: [PATCH 26/49] [NIXL][Bugfix] Fix NIXL/RDMA registration failure over
 CuMemAllocator (#29569)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Somoku <linsh0@protonmail.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
---
 csrc/cumem_allocator.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index 78dc840a98b67..6c2c18a6602d2 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -107,6 +107,16 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
   prop.location.id = device;
   prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE;
 
+#ifndef USE_ROCM
+  int flag = 0;
+  CUDA_CHECK(cuDeviceGetAttribute(
+      &flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
+      device));
+  if (flag) {  // support GPUDirect RDMA if possible
+    prop.allocFlags.gpuDirectRDMACapable = 1;
+  }
+#endif
+
 #ifndef USE_ROCM
   // Allocate memory using cuMemCreate
   CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));

From 4c054d89aa5972014ba7e13c0accb0ab631b5638 Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Wed, 17 Dec 2025 17:53:02 +0800
Subject: [PATCH 27/49] [Doc][ResponsesAPI] add documentation (#30840)

Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
---
 docs/serving/openai_compatible_server.md | 27 ++++++++++++++++++++++++
 vllm/entrypoints/openai/protocol.py      | 18 ++++++++++++----
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 0e29204f8947c..6a08f872def15 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -47,6 +47,8 @@ We currently support the following OpenAI APIs:
 - [Completions API](#completions-api) (`/v1/completions`)
     - Only applicable to [text generation models](../models/generative_models.md).
     - *Note: `suffix` parameter is not supported.*
+- [Responses API](#responses-api) (`/v1/responses`)
+    - Only applicable to [text generation models](../models/generative_models.md).
 - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
     - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
     - *Note: `user` parameter is ignored.*
@@ -229,6 +231,31 @@ The following extra parameters are supported:
     --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
     ```
 
+### Responses API
+
+Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
+
+Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/online_serving/openai_responses_client_with_tools.py)
+
+#### Extra parameters
+
+The following extra parameters in the request object are supported:
+
+??? code
+
+    ```python
+    --8<-- "vllm/entrypoints/openai/protocol.py:responses-extra-params"
+    ```
+
+The following extra parameters in the response object are supported:
+
+??? code
+
+    ```python
+    --8<-- "vllm/entrypoints/openai/protocol.py:responses-response-extra-params"
+    ```
+
 ### Embeddings API
 
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 94dde4564ea0c..a3c347cb1bd3f 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1654,13 +1654,23 @@ class ResponsesResponse(OpenAIBaseModel):
     usage: ResponseUsage | None = None
     user: str | None = None
 
-    # --8<-- [start:responses-extra-params]
+    # --8<-- [start:responses-response-extra-params]
     # These are populated when enable_response_messages is set to True
     # NOTE: custom serialization is needed
     # see serialize_input_messages and serialize_output_messages
-    input_messages: ResponseInputOutputMessage | None = None
-    output_messages: ResponseInputOutputMessage | None = None
-    # --8<-- [end:responses-extra-params]
+    input_messages: ResponseInputOutputMessage | None = Field(
+        default=None,
+        description=(
+            "If enable_response_messages, we can show raw token input to model."
+        ),
+    )
+    output_messages: ResponseInputOutputMessage | None = Field(
+        default=None,
+        description=(
+            "If enable_response_messages, we can show raw token output of model."
+        ),
+    )
+    # --8<-- [end:responses-response-extra-params]
 
     # NOTE: openAI harmony doesn't serialize TextContent properly,
     # TODO: this fixes for TextContent, but need to verify for tools etc

From a100152288c8ec50336aea842f0b3d8e36624024 Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi" <yeq@meta.com>
Date: Wed, 17 Dec 2025 01:54:21 -0800
Subject: [PATCH 28/49] [Kernels][FI] Skip trtllm attention when num_kv_heads=1
 (#30842)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
---
 .../test_flashinfer_trtllm_attention.py       | 35 +++++++++++++++++++
 vllm/utils/flashinfer.py                      | 22 +++++++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index 06a7085a82ba0..220d827b9d5fa 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -455,3 +455,38 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         torch.testing.assert_close(output, output_trtllm, atol=atol, rtol=rtol),
         f"{torch.max(torch.abs(output - output_trtllm))}",
     )
+
+
+def test_trtllm_attention_rejects_num_kv_heads_1() -> None:
+    """Test that TRTLLM attention correctly rejects num_kv_heads=1.
+
+    When num_kv_heads=1 (MQA), the KV cache strides become degenerate
+    (stride_heads == stride_batch), which causes CUDA's cuTensorMapEncodeTiled
+    to fail because TMA descriptors cannot handle degenerate 4D tensors with
+    singleton dimensions.
+
+    This test verifies that can_use_trtllm_attention returns False for
+    num_kv_heads=1 configurations.
+    """
+    from vllm.utils.flashinfer import can_use_trtllm_attention
+
+    # num_kv_heads=1 should be rejected
+    assert not can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1), (
+        "can_use_trtllm_attention should return False for num_kv_heads=1"
+    )
+    assert not can_use_trtllm_attention(num_qo_heads=32, num_kv_heads=1), (
+        "can_use_trtllm_attention should return False for num_kv_heads=1"
+    )
+
+    # num_kv_heads > 1 should be accepted (if platform supports it)
+    # Note: This may return False on non-Blackwell platforms, which is fine
+    result_kv8 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=8)
+    result_kv1 = can_use_trtllm_attention(num_qo_heads=64, num_kv_heads=1)
+
+    # Even if platform doesn't support TRTLLM, num_kv_heads=1 should never
+    # return True when num_kv_heads > 1 returns True
+    if result_kv8:
+        assert not result_kv1, (
+            "If TRTLLM is supported for num_kv_heads=8, "
+            "it must be rejected for num_kv_heads=1"
+        )
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 1c2710be3173b..6bbe02348eaf1 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -305,7 +305,18 @@ def can_use_trtllm_attention(num_qo_heads: int, num_kv_heads: int) -> bool:
     if force_use_trtllm_attention() is False:
         return False
     has_trtllm = supports_trtllm_attention()
-    return has_trtllm and (num_qo_heads % num_kv_heads == 0)
+    # num_kv_heads=1 is not supported due to TMA descriptor building limitations.
+    # When num_kv_heads=1, the KV cache strides become degenerate (stride_heads ==
+    # stride_batch), which causes CUDA's cuTensorMapEncodeTiled to fail because
+    # TMA descriptors cannot handle degenerate 4D tensors with singleton dimensions.
+    # See: https://fburl.com/352mrydz
+    if has_trtllm and num_kv_heads == 1:
+        logger.warning_once(
+            "TRTLLM attention does not support num_kv_heads=1. "
+            "This configuration causes TMA descriptor building to fail due to "
+            "degenerate tensor strides. Falling back to FlashInfer attention."
+        )
+    return has_trtllm and (num_qo_heads % num_kv_heads == 0) and (num_kv_heads != 1)
 
 
 def use_trtllm_attention(
@@ -355,6 +366,15 @@ def use_trtllm_attention(
             )
         return False
 
+    # num_kv_heads=1 is not supported
+    if num_kv_heads == 1:
+        if force_use_trtllm:
+            logger.warning_once(
+                "TRTLLM attention does not support num_kv_heads=1, "
+                "but --attention-config.use_trtllm_attention is set to 1"
+            )
+        return False
+
     if has_spec and not is_prefill:
         # Speculative decoding requires TRTLLM attention for decodes
         logger.info_once("Using TRTLLM attention (enabled for speculative decoding).")

From 519ef9a91111d2d6f8545c8a6b2c1a28d87309fa Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Wed, 17 Dec 2025 04:55:30 -0500
Subject: [PATCH 29/49] [UX] Make `vllm bench serve` discover model by default
 and use --input-len (#30816)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 tests/benchmarks/test_serve_cli.py |  9 ++--
 vllm/benchmarks/serve.py           | 83 +++++++++++++++++++++++++++---
 2 files changed, 79 insertions(+), 13 deletions(-)

diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index 90d685c966d3e..c579b38069864 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -19,21 +19,18 @@ def server():
 
 @pytest.mark.benchmark
 def test_bench_serve(server):
+    # Test default model detection and input/output len
     command = [
         "vllm",
         "bench",
         "serve",
-        "--model",
-        MODEL_NAME,
         "--host",
         server.host,
         "--port",
         str(server.port),
-        "--dataset-name",
-        "random",
-        "--random-input-len",
+        "--input-len",
         "32",
-        "--random-output-len",
+        "--output-len",
         "4",
         "--num-prompts",
         "5",
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index f5d8ea5a975a9..12756d1700c9f 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -10,8 +10,10 @@ On the client side, run:
     vllm bench serve \
         --backend <backend or endpoint type. Default 'openai'> \
         --label <benchmark result label. Default using backend> \
-        --model <your_model> \
+        --model <your_model. Optional, defaults to first model from server> \
         --dataset-name <dataset_name. Default 'random'> \
+        --input-len <general input length. Optional, maps to dataset-specific args> \
+        --output-len <general output length. Optional, maps to dataset-specific args> \
         --request-rate <request_rate. Default inf> \
         --num-prompts <num_prompts. Default 1000>
 """
@@ -57,6 +59,33 @@ TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) a
 )
 
 
+async def get_first_model_from_server(
+    base_url: str, headers: dict | None = None
+) -> str:
+    """Fetch the first model from the server's /v1/models endpoint."""
+    models_url = f"{base_url}/v1/models"
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.get(models_url, headers=headers) as response:
+                response.raise_for_status()
+                data = await response.json()
+                if "data" in data and len(data["data"]) > 0:
+                    return data["data"][0]["id"]
+                else:
+                    raise ValueError(
+                        f"No models found on the server at {base_url}. "
+                        "Make sure the server is running and has models loaded."
+                    )
+        except (aiohttp.ClientError, json.JSONDecodeError) as e:
+            raise RuntimeError(
+                f"Failed to fetch models from server at {models_url}. "
+                "Check that:\n"
+                "1. The server is running\n"
+                "2. The server URL is correct\n"
+                f"Error: {e}"
+            ) from e
+
+
 class TaskType(Enum):
     GENERATION = "generation"
     POOLING = "pooling"
@@ -1025,8 +1054,26 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--model",
         type=str,
-        required=True,
-        help="Name of the model.",
+        required=False,
+        default=None,
+        help="Name of the model. If not specified, will fetch the first model "
+        "from the server's /v1/models endpoint.",
+    )
+    parser.add_argument(
+        "--input-len",
+        type=int,
+        default=None,
+        help="General input length for datasets. Maps to dataset-specific "
+        "input length arguments (e.g., --random-input-len, --sonnet-input-len). "
+        "If not specified, uses dataset defaults.",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=None,
+        help="General output length for datasets. Maps to dataset-specific "
+        "output length arguments (e.g., --random-output-len, --sonnet-output-len). "
+        "If not specified, uses dataset defaults.",
     )
     parser.add_argument(
         "--tokenizer",
@@ -1332,10 +1379,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
 
     label = args.label
-    model_id = args.model
-    model_name = args.served_model_name
-    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
-    tokenizer_mode = args.tokenizer_mode
 
     if args.base_url is not None:
         api_url = f"{args.base_url}{args.endpoint}"
@@ -1356,6 +1399,18 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             else:
                 raise ValueError("Invalid header format. Please use KEY=VALUE format.")
 
+    # Fetch model from server if not specified
+    if args.model is None:
+        print("Model not specified, fetching first model from server...")
+        model_id = await get_first_model_from_server(base_url, headers)
+        print(f"Using model: {model_id}")
+    else:
+        model_id = args.model
+
+    model_name = args.served_model_name
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id
+    tokenizer_mode = args.tokenizer_mode
+
     tokenizer = get_tokenizer(
         tokenizer_id,
         tokenizer_mode=tokenizer_mode,
@@ -1368,6 +1423,20 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             "'--dataset-path' if required."
         )
 
+    # Map general --input-len and --output-len to all dataset-specific arguments
+    if args.input_len is not None:
+        args.random_input_len = args.input_len
+        args.sonnet_input_len = args.input_len
+
+    if args.output_len is not None:
+        args.random_output_len = args.output_len
+        args.sonnet_output_len = args.output_len
+        args.sharegpt_output_len = args.output_len
+        args.custom_output_len = args.output_len
+        args.hf_output_len = args.output_len
+        args.spec_bench_output_len = args.output_len
+        args.prefix_repetition_output_len = args.output_len
+
     # when using random datasets, default to ignoring EOS
     # so generation runs to the requested length
     if (

From 177c391db2ad8dfc05906473525d4ae0a55549e0 Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@fb.com>
Date: Wed, 17 Dec 2025 04:55:56 -0500
Subject: [PATCH 30/49] [compile] Disable aot when eager backend is used.
 (#30810)

Signed-off-by: zhxchen17 <zhxchen17@fb.com>
---
 vllm/compilation/decorators.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index d1ee995ee8959..40bde97ac61d8 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -435,7 +435,10 @@ def _support_torch_compile(
                 return self.aot_compiled_fn(self, *args, **kwargs)
 
         if self.compiled:
-            assert not envs.VLLM_USE_AOT_COMPILE
+            assert (
+                not envs.VLLM_USE_AOT_COMPILE
+                or self.vllm_config.compilation_config.backend == "eager"
+            )
             return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
 
         # This is the path for the first compilation.
@@ -508,7 +511,11 @@ def _support_torch_compile(
             _torch27_patch_tensor_subclasses(),
             torch._inductor.config.patch(**inductor_config_patches),
         ):
-            if envs.VLLM_USE_AOT_COMPILE:
+            use_aot_compile = envs.VLLM_USE_AOT_COMPILE
+            if self.vllm_config.compilation_config.backend == "eager":
+                logger.warning("Detected eager backend, disabling AOT compile.")
+                use_aot_compile = False
+            if use_aot_compile:
                 self.aot_compiled_fn = self.aot_compile(*args, **kwargs)
                 output = self.aot_compiled_fn(self, *args, **kwargs)
                 assert aot_compilation_path is not None

From 9db1db5949f7abd4b03cd0231450f81bfeeaba0f Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@fb.com>
Date: Wed, 17 Dec 2025 04:56:24 -0500
Subject: [PATCH 31/49] [compile] Ignore VLLM_FORCE_AOT_LOAD from cache factors
 (#30809)

Signed-off-by: zhxchen17 <zhxchen17@fb.com>
---
 vllm/envs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/envs.py b/vllm/envs.py
index 7e072a588591c..2f8158d88d6c5 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1654,6 +1654,7 @@ def compile_factors() -> dict[str, object]:
         "VLLM_CI_USE_S3",
         "VLLM_MODEL_REDIRECT_PATH",
         "VLLM_HOST_IP",
+        "VLLM_FORCE_AOT_LOAD",
         "S3_ACCESS_KEY_ID",
         "S3_SECRET_ACCESS_KEY",
         "S3_ENDPOINT_URL",

From 7b966ae2ba73b5391937907bfd8aaf63af033ff1 Mon Sep 17 00:00:00 2001
From: danielafrimi <45691845+danielafrimi@users.noreply.github.com>
Date: Wed, 17 Dec 2025 11:56:38 +0200
Subject: [PATCH 32/49] [Fix]Load kv-cache dtype from hf_quant_config.json
 automatically (fix for reverted PR) (#30785)

Signed-off-by: <>
Co-authored-by: root <root@gpu-937.slurm-workers-slurm.slurm.svc.cluster.local>
---
 vllm/engine/arg_utils.py  |  9 ++++-
 vllm/utils/torch_utils.py | 75 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index ca19e468914c7..03720bd2516d4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -93,6 +93,7 @@ from vllm.transformers_utils.utils import is_cloud_storage
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.network_utils import get_ip
+from vllm.utils.torch_utils import resolve_kv_cache_dtype_string
 from vllm.v1.sample.logits_processor import LogitsProcessor
 
 if TYPE_CHECKING:
@@ -106,6 +107,7 @@ else:
     LoadFormats = Any
     UsageContext = Any
 
+
 logger = init_logger(__name__)
 
 # object is used to allow for special typing forms
@@ -1361,12 +1363,17 @@ class EngineArgs:
             f"dcp_size={self.decode_context_parallel_size}."
         )
 
+        # Resolve "auto" kv_cache_dtype to actual value from model config
+        resolved_cache_dtype = resolve_kv_cache_dtype_string(
+            self.kv_cache_dtype, model_config
+        )
+
         cache_config = CacheConfig(
             block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
             kv_cache_memory_bytes=self.kv_cache_memory_bytes,
             swap_space=self.swap_space,
-            cache_dtype=self.kv_cache_dtype,
+            cache_dtype=resolved_cache_dtype,
             is_attention_free=model_config.is_attention_free,
             num_gpu_blocks_override=self.num_gpu_blocks_override,
             sliding_window=sliding_window,
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index c97efce312b56..b82e0171b7f7f 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -24,6 +24,10 @@ else:
     ModelConfig = object
     IntermediateTensors = object
 
+import logging
+
+logger = logging.getLogger(__name__)
+
 
 STR_DTYPE_TO_TORCH_DTYPE = {
     "float32": torch.float32,
@@ -49,6 +53,13 @@ TORCH_DTYPE_TO_NUMPY_DTYPE = {
 }
 
 
+MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP = {
+    # TODO: Add more modelopt kv cache dtype
+    # mappings here when it supported by some attention backend
+    # (for example supports nvfp4).
+    "fp8": "fp8_e4m3",
+}
+
 T = TypeVar("T")
 
 
@@ -194,6 +205,70 @@ def get_kv_cache_torch_dtype(
     return torch_dtype
 
 
+def get_kv_cache_quant_algo_string(quant_cfg: dict[str, Any]) -> str | None:
+    """Get the KV cache quantization algorithm string from the quantization config.
+
+    Maps various FP8 format names to vLLM's standard cache dtype strings.
+    Returns None if no kv_cache_quant_algo is specified.
+    Returns "auto" if the value is not recognized/supported.
+    """
+    # Mapping from model config values to vLLM cache_dtype strings
+
+    quant_method = quant_cfg.get("quant_method", "")
+    if quant_method.startswith("modelopt"):
+        quantization_inner = quant_cfg.get("quantization", quant_cfg)
+        # Check if quant config is specified and use kv cache quant algo
+        kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get(
+            "kv_cache_quant_algo"
+        )
+        if isinstance(kv_algo, str):
+            kv_algo_lower = kv_algo.lower()
+
+            # Try to map to vLLM's standard format
+            if kv_algo_lower in MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP:
+                return MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP[kv_algo_lower]
+            else:
+                # Unknown/unsupported format - return "auto" as safe fallback
+                logger.warning(
+                    "WARNING: Unknown kv_cache_quant_algo '%s' in model "
+                    "config. Supported values: %s. Falling back to 'auto'.",
+                    kv_algo,
+                    list(MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAP.keys()),
+                )
+                return "auto"
+    return None
+
+
+def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None:
+    """Get the KV cache quantization algorithm dtype from the quantization config."""
+    kv_algo_str = get_kv_cache_quant_algo_string(quant_cfg)
+    if kv_algo_str is not None and kv_algo_str != "auto":
+        # Only convert if we have a valid dtype string (not "auto" fallback)
+        return STR_DTYPE_TO_TORCH_DTYPE[kv_algo_str]
+    return None
+
+
+def resolve_kv_cache_dtype_string(
+    kv_cache_dtype: str, model_config: ModelConfig
+) -> str:
+    """Resolve 'auto' kv_cache_dtype to the actual string value from model config.
+    Returns the resolved cache_dtype string.
+    """
+    if kv_cache_dtype != "auto":
+        return kv_cache_dtype
+
+    hf_cfg = getattr(model_config, "hf_config", None)
+    if hf_cfg is not None:
+        quant_cfg = getattr(hf_cfg, "quantization_config", None)
+        if quant_cfg is not None:
+            kv_algo_str = get_kv_cache_quant_algo_string(quant_cfg)
+            if kv_algo_str is not None:
+                return kv_algo_str
+
+    # Default to auto (will be handled by downstream code)
+    return "auto"
+
+
 def kv_cache_dtype_str_to_dtype(
     kv_cache_dtype: str, model_config: ModelConfig
 ) -> torch.dtype:

From 53cd7f868b3632cbbe982cffaee8e16fb49dd694 Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@fb.com>
Date: Wed, 17 Dec 2025 05:00:12 -0500
Subject: [PATCH 33/49] [compile] Recompile graph module during Dynamo cache
 loading. (#30743)

Signed-off-by: Zhengxu Chen <zhxchen17@fb.com>
---
 vllm/compilation/caching.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py
index ce482572b401b..fc02a08f74265 100644
--- a/vllm/compilation/caching.py
+++ b/vllm/compilation/caching.py
@@ -104,6 +104,7 @@ class VllmSerializableFunction(SerializableCallable):
         state = pickle.loads(data)
         fake_mode = FakeTensorMode(shape_env=ShapeEnv())
         state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode)
+        state["graph_module"].recompile()
         state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode)
         vllm_backend = VllmBackend(get_current_vllm_config(), state["prefix"])
 

From f284d7bd0c55f929fa7912936b1d247089679191 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 17 Dec 2025 05:00:35 -0500
Subject: [PATCH 34/49] [Bug] Fix AttributeError: 'ColumnParallelLinear' object
 has no attribute `weight_scale_inv` (#30823)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 vllm/model_executor/layers/quantization/utils/fp8_utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index ea68745585160..bdc3d1fc7232d 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -1437,14 +1437,17 @@ def maybe_post_process_fp8_weight_block(layer: torch.nn.Module):
         layer.orig_dtype, layer.weight
     )
     if should_use_deepgemm:
+        scale_attr = (
+            "weight_scale_inv" if hasattr(layer, "weight_scale_inv") else "weight_scale"
+        )
         dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block(
             wq=layer.weight.data,
-            ws=layer.weight_scale_inv.data,
+            ws=getattr(layer, scale_attr).data,
             quant_block_shape=tuple(layer.weight_block_size),
             use_e8m0=is_deep_gemm_e8m0_used(),
         )
         replace_parameter(layer, "weight", dg_weight)
-        replace_parameter(layer, "weight_scale_inv", dg_weight_scale)
+        replace_parameter(layer, scale_attr, dg_weight_scale)
 
 
 def expert_weight_is_col_major(x: torch.Tensor) -> bool:

From 9ad5b2171002522772de0a0cc71b747068ec8862 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Wed, 17 Dec 2025 18:27:30 +0800
Subject: [PATCH 35/49] [Refactor] [4/N] Move VLLM_SERVER_DEV endpoints into
 the serve directory (#30749)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 .../scripts/hardware_ci/run-amd-test.sh       |  1 -
 .buildkite/test-amd.yaml                      | 37 ++++---
 .buildkite/test-pipeline.yaml                 | 34 ++++---
 .buildkite/test_areas/entrypoints.yaml        | 23 ++++-
 .buildkite/test_areas/tool_use.yaml           | 13 ---
 tests/entrypoints/instrumentator/__init__.py  |  0
 .../test_metrics.py                           |  5 +-
 tests/entrypoints/rpc/__init__.py             |  0
 .../{openai => rpc}/test_collective_rpc.py    |  2 +-
 tests/entrypoints/sleep/__init__.py           |  0
 .../{openai => sleep}/test_sleep.py           |  2 +-
 vllm/entrypoints/openai/api_server.py         | 98 +------------------
 vllm/entrypoints/serve/__init__.py            | 29 ++++++
 vllm/entrypoints/serve/cache/__init__.py      |  0
 vllm/entrypoints/serve/cache/api_router.py    | 61 ++++++++++++
 .../serve/instrumentator/server_info.py       | 40 ++++++++
 vllm/entrypoints/serve/rpc/__init__.py        |  0
 vllm/entrypoints/serve/rpc/api_router.py      | 61 ++++++++++++
 vllm/entrypoints/serve/sleep/api_router.py    |  4 -
 19 files changed, 259 insertions(+), 151 deletions(-)
 delete mode 100644 .buildkite/test_areas/tool_use.yaml
 create mode 100644 tests/entrypoints/instrumentator/__init__.py
 rename tests/entrypoints/{openai => instrumentator}/test_metrics.py (99%)
 create mode 100644 tests/entrypoints/rpc/__init__.py
 rename tests/entrypoints/{openai => rpc}/test_collective_rpc.py (96%)
 create mode 100644 tests/entrypoints/sleep/__init__.py
 rename tests/entrypoints/{openai => sleep}/test_sleep.py (98%)
 create mode 100644 vllm/entrypoints/serve/cache/__init__.py
 create mode 100644 vllm/entrypoints/serve/cache/api_router.py
 create mode 100644 vllm/entrypoints/serve/instrumentator/server_info.py
 create mode 100644 vllm/entrypoints/serve/rpc/__init__.py
 create mode 100644 vllm/entrypoints/serve/rpc/api_router.py

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 864eb470bb0a7..08da34d81d117 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
   --ignore=entrypoints/openai/test_audio.py \
   --ignore=entrypoints/openai/test_shutdown.py \
   --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
   --ignore=entrypoints/openai/test_models.py \
   --ignore=entrypoints/openai/test_lora_adapters.py \
   --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 3c9b8cbedcf06..e8f99100a8de0 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -128,7 +128,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration Test (LLM) # 30min
   timeout_in_minutes: 40
@@ -148,7 +148,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
   timeout_in_minutes: 130
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
@@ -162,10 +162,28 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
   - pytest -v -s entrypoints/test_chat_utils.py
 
+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s tool_use
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc
+
 - label: Entrypoints Integration Test (Pooling)
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
@@ -751,17 +769,6 @@ steps:
   # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
   - pytest -s entrypoints/openai/correctness/
 
-- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
 
 #####  models test  #####
 
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8e6d32f71f220..b4de630b09417 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -114,7 +114,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration Test (LLM) # 30min
   timeout_in_minutes: 40
@@ -132,7 +132,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
   timeout_in_minutes: 130
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
@@ -144,10 +144,26 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/
   - pytest -v -s entrypoints/test_chat_utils.py
 
+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc
+  - pytest -v -s tool_use
+
 - label: Entrypoints Integration Test (Pooling)
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
@@ -666,16 +682,6 @@ steps:
   commands: # LMEval+Transcription WER check
   - pytest -s entrypoints/openai/correctness/
 
-- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
-
 #####  models test  #####
 
 - label: Basic Models Tests (Initialization)
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
index 0a789be943f37..5b16ea9c1ad07 100644
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -10,7 +10,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration (LLM)
   timeout_in_minutes: 40
@@ -25,7 +25,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-- label: Entrypoints Integration (API Server)
+- label: Entrypoints Integration (API Server 1)
   timeout_in_minutes: 130
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
@@ -34,11 +34,26 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/
   - pytest -v -s entrypoints/test_chat_utils.py
 
 
+- label: Entrypoints Integration (API Server 2)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/tool_use
+  - tests/entrypoints/sleep
+  - tests/entrypoints/instrumentator
+  - tests/entrypoints/rpc
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s tool_use
+
 - label: Entrypoints Integration (Pooling)
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/tests"
diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml
deleted file mode 100644
index 69527a1214229..0000000000000
--- a/.buildkite/test_areas/tool_use.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-group: Tool use
-depends_on: 
-  - image-build
-steps:
-- label: OpenAI-Compatible Tool Use
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
diff --git a/tests/entrypoints/instrumentator/__init__.py b/tests/entrypoints/instrumentator/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/instrumentator/test_metrics.py
similarity index 99%
rename from tests/entrypoints/openai/test_metrics.py
rename to tests/entrypoints/instrumentator/test_metrics.py
index 65a6fd20bd0d1..9f2ad105a380b 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/instrumentator/test_metrics.py
@@ -14,11 +14,10 @@ import requests
 from prometheus_client.parser import text_string_to_metric_families
 from transformers import AutoTokenizer
 
+from tests.conftest import LocalAssetServer
+from tests.utils import RemoteOpenAIServer
 from vllm import version
 
-from ...conftest import LocalAssetServer
-from ...utils import RemoteOpenAIServer
-
 MODELS = {
     "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
diff --git a/tests/entrypoints/rpc/__init__.py b/tests/entrypoints/rpc/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/entrypoints/openai/test_collective_rpc.py b/tests/entrypoints/rpc/test_collective_rpc.py
similarity index 96%
rename from tests/entrypoints/openai/test_collective_rpc.py
rename to tests/entrypoints/rpc/test_collective_rpc.py
index cbd6b02f05dce..56d93a427315f 100644
--- a/tests/entrypoints/openai/test_collective_rpc.py
+++ b/tests/entrypoints/rpc/test_collective_rpc.py
@@ -37,7 +37,7 @@ def server():
         "--max-num-seqs",
         "128",
         "--worker-extension-cls",
-        "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
+        "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension",
     ]
     with RemoteOpenAIServer(
         MODEL_NAME,
diff --git a/tests/entrypoints/sleep/__init__.py b/tests/entrypoints/sleep/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/sleep/test_sleep.py
similarity index 98%
rename from tests/entrypoints/openai/test_sleep.py
rename to tests/entrypoints/sleep/test_sleep.py
index 5f94ac6da2c25..260dcd00bae91 100644
--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/sleep/test_sleep.py
@@ -4,7 +4,7 @@
 import requests
 from prometheus_client.parser import text_string_to_metric_families
 
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B"
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 5d0eacae34dd7..bca9571e39344 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -17,21 +17,20 @@ from argparse import Namespace
 from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import Annotated, Any, Literal
+from typing import Annotated, Any
 
 import model_hosting_container_standards.sagemaker as sagemaker_standards
 import pydantic
 import uvloop
-from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request
+from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, Response, StreamingResponse
+from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.concurrency import iterate_in_threadpool
 from starlette.datastructures import URL, Headers, MutableHeaders, State
 from starlette.types import ASGIApp, Message, Receive, Scope, Send
 
 import vllm.envs as envs
-from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.anthropic.protocol import (
@@ -639,97 +638,6 @@ async def create_translations(
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-if envs.VLLM_SERVER_DEV_MODE:
-    logger.warning(
-        "SECURITY WARNING: Development endpoints are enabled! "
-        "This should NOT be used in production!"
-    )
-
-    PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
-
-    @router.get("/server_info")
-    async def show_server_info(
-        raw_request: Request,
-        config_format: Annotated[Literal["text", "json"], Query()] = "text",
-    ):
-        vllm_config: VllmConfig = raw_request.app.state.vllm_config
-        server_info = {
-            "vllm_config": str(vllm_config)
-            if config_format == "text"
-            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
-            # fallback=str is needed to handle e.g. torch.dtype
-        }
-        return JSONResponse(content=server_info)
-
-    @router.post("/reset_prefix_cache")
-    async def reset_prefix_cache(
-        raw_request: Request,
-        reset_running_requests: bool = Query(default=False),
-        reset_external: bool = Query(default=False),
-    ):
-        """
-        Reset the local prefix cache.
-
-        Optionally, if the query parameter `reset_external=true`
-        also resets the external (connector-managed) prefix cache.
-
-        Note that we currently do not check if the prefix cache
-        is successfully reset in the API server.
-
-        Example:
-            POST /reset_prefix_cache?reset_external=true
-        """
-        logger.info("Resetting prefix cache...")
-
-        await engine_client(raw_request).reset_prefix_cache(
-            reset_running_requests, reset_external
-        )
-        return Response(status_code=200)
-
-    @router.post("/reset_mm_cache")
-    async def reset_mm_cache(raw_request: Request):
-        """
-        Reset the multi-modal cache. Note that we currently do not check if the
-        multi-modal cache is successfully reset in the API server.
-        """
-        logger.info("Resetting multi-modal cache...")
-        await engine_client(raw_request).reset_mm_cache()
-        return Response(status_code=200)
-
-    @router.post("/collective_rpc")
-    async def collective_rpc(raw_request: Request):
-        try:
-            body = await raw_request.json()
-        except json.JSONDecodeError as e:
-            raise HTTPException(
-                status_code=HTTPStatus.BAD_REQUEST.value,
-                detail=f"JSON decode error: {e}",
-            ) from e
-        method = body.get("method")
-        if method is None:
-            raise HTTPException(
-                status_code=HTTPStatus.BAD_REQUEST.value,
-                detail="Missing 'method' in request body",
-            )
-        # For security reason, only serialized string args/kwargs are passed.
-        # User-defined `method` is responsible for deserialization if needed.
-        args: list[str] = body.get("args", [])
-        kwargs: dict[str, str] = body.get("kwargs", {})
-        timeout: float | None = body.get("timeout")
-        results = await engine_client(raw_request).collective_rpc(
-            method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
-        )
-        if results is None:
-            return Response(status_code=200)
-        response: list[Any] = []
-        for result in results:
-            if result is None or isinstance(result, dict | list):
-                response.append(result)
-            else:
-                response.append(str(result))
-        return JSONResponse(content={"results": response})
-
-
 def load_log_config(log_config_file: str | None) -> dict | None:
     if not log_config_file:
         return None
diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py
index c4fcc92db931f..260fd44a02ccb 100644
--- a/vllm/entrypoints/serve/__init__.py
+++ b/vllm/entrypoints/serve/__init__.py
@@ -4,8 +4,19 @@
 
 from fastapi import FastAPI
 
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 
 def register_vllm_serve_api_routers(app: FastAPI):
+    if envs.VLLM_SERVER_DEV_MODE:
+        logger.warning(
+            "SECURITY WARNING: Development endpoints are enabled! "
+            "This should NOT be used in production!"
+        )
+
     from vllm.entrypoints.serve.lora.api_router import (
         attach_router as attach_lora_router,
     )
@@ -29,6 +40,18 @@ def register_vllm_serve_api_routers(app: FastAPI):
 
     attach_sleep_router(app)
 
+    from vllm.entrypoints.serve.rpc.api_router import (
+        attach_router as attach_rpc_router,
+    )
+
+    attach_rpc_router(app)
+
+    from vllm.entrypoints.serve.cache.api_router import (
+        attach_router as attach_cache_router,
+    )
+
+    attach_cache_router(app)
+
     from vllm.entrypoints.serve.tokenize.api_router import (
         attach_router as attach_tokenize_router,
     )
@@ -58,3 +81,9 @@ def register_vllm_serve_api_routers(app: FastAPI):
     )
 
     attach_health_router(app)
+
+    from vllm.entrypoints.serve.instrumentator.server_info import (
+        attach_router as attach_server_info_router,
+    )
+
+    attach_server_info_router(app)
diff --git a/vllm/entrypoints/serve/cache/__init__.py b/vllm/entrypoints/serve/cache/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/entrypoints/serve/cache/api_router.py b/vllm/entrypoints/serve/cache/api_router.py
new file mode 100644
index 0000000000000..d659895463273
--- /dev/null
+++ b/vllm/entrypoints/serve/cache/api_router.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from fastapi import APIRouter, FastAPI, Query, Request
+from fastapi.responses import Response
+
+import vllm.envs as envs
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+@router.post("/reset_prefix_cache")
+async def reset_prefix_cache(
+    raw_request: Request,
+    reset_running_requests: bool = Query(default=False),
+    reset_external: bool = Query(default=False),
+):
+    """
+    Reset the local prefix cache.
+
+    Optionally, if the query parameter `reset_external=true`
+    also resets the external (connector-managed) prefix cache.
+
+    Note that we currently do not check if the prefix cache
+    is successfully reset in the API server.
+
+    Example:
+       POST /reset_prefix_cache?reset_external=true
+    """
+    logger.info("Resetting prefix cache...")
+
+    await engine_client(raw_request).reset_prefix_cache(
+        reset_running_requests, reset_external
+    )
+    return Response(status_code=200)
+
+
+@router.post("/reset_mm_cache")
+async def reset_mm_cache(raw_request: Request):
+    """
+    Reset the multi-modal cache. Note that we currently do not check if the
+    multi-modal cache is successfully reset in the API server.
+    """
+    logger.info("Resetting multi-modal cache...")
+    await engine_client(raw_request).reset_mm_cache()
+    return Response(status_code=200)
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/instrumentator/server_info.py b/vllm/entrypoints/serve/instrumentator/server_info.py
new file mode 100644
index 0000000000000..1a69dfacae1c2
--- /dev/null
+++ b/vllm/entrypoints/serve/instrumentator/server_info.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from typing import Annotated, Literal
+
+import pydantic
+from fastapi import APIRouter, FastAPI, Query, Request
+from fastapi.responses import JSONResponse
+
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+router = APIRouter()
+PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
+
+
+@router.get("/server_info")
+async def show_server_info(
+    raw_request: Request,
+    config_format: Annotated[Literal["text", "json"], Query()] = "text",
+):
+    vllm_config: VllmConfig = raw_request.app.state.vllm_config
+    server_info = {
+        "vllm_config": str(vllm_config)
+        if config_format == "text"
+        else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
+        # fallback=str is needed to handle e.g. torch.dtype
+    }
+    return JSONResponse(content=server_info)
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/rpc/__init__.py b/vllm/entrypoints/serve/rpc/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/entrypoints/serve/rpc/api_router.py b/vllm/entrypoints/serve/rpc/api_router.py
new file mode 100644
index 0000000000000..54f582c408d54
--- /dev/null
+++ b/vllm/entrypoints/serve/rpc/api_router.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from http import HTTPStatus
+from typing import Any
+
+from fastapi import APIRouter, FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, Response
+
+import vllm.envs as envs
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+@router.post("/collective_rpc")
+async def collective_rpc(raw_request: Request):
+    try:
+        body = await raw_request.json()
+    except json.JSONDecodeError as e:
+        raise HTTPException(
+            status_code=HTTPStatus.BAD_REQUEST.value,
+            detail=f"JSON decode error: {e}",
+        ) from e
+    method = body.get("method")
+    if method is None:
+        raise HTTPException(
+            status_code=HTTPStatus.BAD_REQUEST.value,
+            detail="Missing 'method' in request body",
+        )
+    # For security reason, only serialized string args/kwargs are passed.
+    # User-defined `method` is responsible for deserialization if needed.
+    args: list[str] = body.get("args", [])
+    kwargs: dict[str, str] = body.get("kwargs", {})
+    timeout: float | None = body.get("timeout")
+    results = await engine_client(raw_request).collective_rpc(
+        method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
+    )
+    if results is None:
+        return Response(status_code=200)
+    response: list[Any] = []
+    for result in results:
+        if result is None or isinstance(result, dict | list):
+            response.append(result)
+        else:
+            response.append(str(result))
+    return JSONResponse(content={"results": response})
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/sleep/api_router.py b/vllm/entrypoints/serve/sleep/api_router.py
index bc01e185315c8..c0e4c3028b2ea 100644
--- a/vllm/entrypoints/serve/sleep/api_router.py
+++ b/vllm/entrypoints/serve/sleep/api_router.py
@@ -52,9 +52,5 @@ async def is_sleeping(raw_request: Request):
 def attach_router(app: FastAPI):
     if not envs.VLLM_SERVER_DEV_MODE:
         return
-    logger.warning(
-        "SECURITY WARNING: Development endpoints are enabled! "
-        "This should NOT be used in production!"
-    )
 
     app.include_router(router)

From 4bf6c2366818a1eeae257e06ec337039e6895f13 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <khluu000@gmail.com>
Date: Wed, 17 Dec 2025 02:30:56 -0800
Subject: [PATCH 36/49] [ci] Sync test areas yaml file with test-pipeline
 (#30862)

Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
---
 .buildkite/test_areas/e2e_integration.yaml | 19 +------------------
 .buildkite/test_areas/lm_eval.yaml         |  4 ++--
 .buildkite/test_areas/lora.yaml            |  2 ++
 .buildkite/test_areas/models_basic.yaml    |  2 ++
 .buildkite/test_areas/pytorch.yaml         |  4 +++-
 5 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml
index 93d389815edac..2e0857986c3fa 100644
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -32,6 +32,7 @@ steps:
 - label: Prime-RL Integration (2 GPUs)
   timeout_in_minutes: 30
   optional: true
+  soft_fail: true
   num_gpus: 2
   working_dir: "/vllm-workspace"
   source_file_dependencies:
@@ -39,21 +40,3 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
     - bash .buildkite/scripts/run-prime-rl-test.sh
-
-- label: DeepSeek V2-Lite Async EPLB Accuracy
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
-
-- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
-  timeout_in_minutes: 60
-  gpu: h100
-  optional: true
-  num_gpus: 4
-  working_dir: "/vllm-workspace"
-  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
index 9af43e0c375a8..e2498512bdef7 100644
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -9,7 +9,7 @@ steps:
   - vllm/model_executor/layers/quantization
   autorun_on_main: true
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 
 - label: LM Eval Large Models (4 GPUs)(A100)
   gpu: a100
@@ -43,4 +43,4 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
index 809b4138f44ba..59ade40cc8f52 100644
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -22,6 +22,8 @@ steps:
     # FIXIT: find out which code initialize cuda before running the test
     # before the fix, we need to use spawn to test it
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # Alot of these tests are on the edge of OOMing
+    - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
     # There is some Tensor Parallelism related processing logic in LoRA that
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
index 39a5d51c48833..2a86596a6d603 100644
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -9,6 +9,7 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/models/test_initialization.py
+  - tests/models/registry.py
   commands:
     # Run a subset of model initialization tests
     - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
@@ -20,6 +21,7 @@ steps:
   source_file_dependencies:
   - vllm/model_executor/models/
   - tests/models/test_initialization.py
+  - tests/models/registry.py
   commands:
     # Only when vLLM model source is modified - test initialization of a large
     # subset of supported models (the complement of the small subset in the above
diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml
index 703c82eb1a91b..332d5202d8338 100644
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -13,7 +13,9 @@ steps:
   # tests covered elsewhere.
   # Use `find` to launch multiple instances of pytest so that
   # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+  # However, find does not normally propagate error codes, so we combine it with xargs
+  # (using -0 for proper path handling)
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
 
 - label: PyTorch Fullgraph Smoke Test
   timeout_in_minutes: 30

From 84896fda22d3de74398a88b5769c98eef14258f1 Mon Sep 17 00:00:00 2001
From: baoqian426 <1354987947@qq.com>
Date: Wed, 17 Dec 2025 19:32:34 +0800
Subject: [PATCH 37/49] [Bugfix] deepseek-V3.2 self.weights_proj has no bias
 (#30841)

Signed-off-by: baoqian <1354987947@qq.com>
Signed-off-by: baoqian426 <1354987947@qq.com>
---
 vllm/model_executor/models/deepseek_v2.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 146124153c79d..6670143cda250 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -835,7 +835,11 @@ class Indexer(nn.Module):
         )
         self.k_norm = LayerNorm(self.head_dim, eps=1e-6)
         self.weights_proj = ReplicatedLinear(
-            hidden_size, self.n_head, quant_config=None, prefix=f"{prefix}.weights_proj"
+            hidden_size,
+            self.n_head,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.weights_proj",
         )
         self.softmax_scale = self.head_dim**-0.5
 

From fb980eb2fdd15f81d4c5695347bdea308bb5515e Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 17 Dec 2025 11:33:50 +0000
Subject: [PATCH 38/49] Fix lazy import (#30858)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/v1/structured_output/utils.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index cb5ad99cfbdf7..74df0fa067670 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -20,9 +20,9 @@ from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 
 if TYPE_CHECKING:
     import outlines_core as oc
+    import transformers.convert_slow_tokenizer as convert_slow_tokenizer
     import transformers.file_utils as file_utils
     import xgrammar as xgr
-    from transformers.convert_slow_tokenizer import bytes_to_unicode
 
     from vllm.tokenizers import TokenizerLike
     from vllm.v1.worker.gpu_input_batch import InputBatch
@@ -30,8 +30,8 @@ else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")
     oc = LazyLoader("oc", globals(), "outlines_core")
     file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils")
-    bytes_to_unicode = LazyLoader(
-        "bytes_to_unicode", globals(), "transformers.convert_slow_tokenizer"
+    convert_slow_tokenizer = LazyLoader(
+        "convert_slow_tokenizer", globals(), "transformers.convert_slow_tokenizer"
     )
 
     TokenizerLike = object
@@ -202,7 +202,9 @@ def _reduced_vocabulary(
         A Dict of token string -> equivalent token ids
     """
 
-    unicode_to_bytes = {v: k for k, v in bytes_to_unicode().items()}
+    unicode_to_bytes = {
+        v: k for k, v in convert_slow_tokenizer.bytes_to_unicode().items()
+    }
 
     def convert_token_to_string(token: str) -> str:
         string = tokenizer.convert_tokens_to_string([token])

From 6482e3895baa483fb30227648aa4721f09699cba Mon Sep 17 00:00:00 2001
From: Hank_ <37239608+ILikeIneine@users.noreply.github.com>
Date: Wed, 17 Dec 2025 19:58:16 +0800
Subject: [PATCH 39/49] chores: adjust the attn register param order (#30688)

Signed-off-by: Hank <hcc.mayday@gmail.com>
---
 vllm/attention/backends/registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py
index eaa0fa1d5db39..ed0021db204ac 100644
--- a/vllm/attention/backends/registry.py
+++ b/vllm/attention/backends/registry.py
@@ -201,8 +201,8 @@ _MAMBA_ATTN_OVERRIDES: dict[MambaAttentionBackendEnum, str] = {}
 
 def register_backend(
     backend: AttentionBackendEnum | MambaAttentionBackendEnum,
-    is_mamba: bool = False,
     class_path: str | None = None,
+    is_mamba: bool = False,
 ) -> Callable[[type], type]:
     """Register or override a backend implementation.
 

From 6e9dbcc50e35af75ec76bf033ee6402697c02609 Mon Sep 17 00:00:00 2001
From: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Date: Wed, 17 Dec 2025 03:58:43 -0800
Subject: [PATCH 40/49] [Fix] uniform decode batch check (#30747)

Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
---
 tests/v1/worker/test_gpu_model_runner.py | 84 ++++++++++++++++++++++++
 vllm/v1/worker/gpu_model_runner.py       | 45 ++++++++++---
 2 files changed, 121 insertions(+), 8 deletions(-)

diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 7b8c4268a5237..59f1ac705829f 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -1110,3 +1110,87 @@ def test_hybrid_cache_integration(model_runner, dist_init):
     runner._update_states(scheduler_output)
     assert _is_req_scheduled(runner, req_id)
     assert _is_req_state_block_table_match(runner, req_id)
+
+
+def test_is_uniform_decode() -> None:
+    # Normal
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=2,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=15,
+    )
+    # Spec decoding
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=5,
+        uniform_decode_query_len=5,
+        num_tokens=30,
+        num_reqs=6,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=5,
+        uniform_decode_query_len=4,
+        num_tokens=30,
+        num_reqs=6,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=5,
+        uniform_decode_query_len=5,
+        num_tokens=30,
+        num_reqs=7,
+    )
+    # Force uniform decode
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+        force_uniform_decode=True,
+    )
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=2,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+        force_uniform_decode=True,
+    )
+    assert GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=15,
+        force_uniform_decode=True,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+        force_uniform_decode=False,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=2,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=16,
+        force_uniform_decode=False,
+    )
+    assert not GPUModelRunner._is_uniform_decode(
+        max_num_scheduled_tokens=1,
+        uniform_decode_query_len=1,
+        num_tokens=16,
+        num_reqs=15,
+        force_uniform_decode=False,
+    )
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 1db5bc99fff6c..a44150432434b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2777,6 +2777,27 @@ class GPUModelRunner(
             **model_kwargs,
         )
 
+    @staticmethod
+    def _is_uniform_decode(
+        max_num_scheduled_tokens: int,
+        uniform_decode_query_len: int,
+        num_tokens: int,
+        num_reqs: int,
+        force_uniform_decode: bool | None = None,
+    ) -> bool:
+        """
+        Checks if it's a decode batch with same amount scheduled tokens
+        across all requests.
+        """
+        return (
+            (
+                (max_num_scheduled_tokens == uniform_decode_query_len)
+                and (num_tokens == max_num_scheduled_tokens * num_reqs)
+            )
+            if force_uniform_decode is None
+            else force_uniform_decode
+        )
+
     def _determine_batch_execution_and_padding(
         self,
         num_tokens: int,
@@ -2798,14 +2819,12 @@ class GPUModelRunner(
         torch.Tensor | None,
         CUDAGraphStat | None,
     ]:
-        num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
-        uniform_decode = (
-            (
-                (max_num_scheduled_tokens == self.uniform_decode_query_len)
-                and (num_tokens_padded == max_num_scheduled_tokens * num_reqs)
-            )
-            if force_uniform_decode is None
-            else force_uniform_decode
+        uniform_decode = self._is_uniform_decode(
+            max_num_scheduled_tokens=max_num_scheduled_tokens,
+            uniform_decode_query_len=self.uniform_decode_query_len,
+            num_tokens=num_tokens,
+            num_reqs=num_reqs,
+            force_uniform_decode=force_uniform_decode,
         )
         # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
         # is present). Also, chunked-prefill is disabled, so batch are uniform.
@@ -2819,6 +2838,7 @@ class GPUModelRunner(
             else force_has_lora
         )
 
+        num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
         dispatch_cudagraph = (
             lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch(
                 num_tokens=num_tokens,
@@ -2834,6 +2854,15 @@ class GPUModelRunner(
             num_tokens_padded, use_cascade_attn or has_encoder_output
         )
         num_tokens_padded = batch_descriptor.num_tokens
+        if self.compilation_config.pass_config.enable_sp:
+            assert (
+                batch_descriptor.num_tokens
+                % self.vllm_config.parallel_config.tensor_parallel_size
+                == 0
+            ), (
+                "Sequence parallelism requires num_tokens to be "
+                "a multiple of tensor parallel size"
+            )
 
         # Extra coordination when running data-parallel since we need to coordinate
         # across ranks

From 9e67c4ce985b0b8852603cfe3fcaf8f37de137ed Mon Sep 17 00:00:00 2001
From: "rongfu.leng" <rongfu.leng@daocloud.io>
Date: Wed, 17 Dec 2025 20:14:45 +0800
Subject: [PATCH 41/49] [Docs] fix function name (#30748)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
---
 docs/design/plugin_system.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
index b0ca2dad23d5b..0fd448c2153c3 100644
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -109,7 +109,7 @@ Every plugin has three parts:
     - `init_device`: This function is called to set up the device for the worker.
     - `initialize_cache`: This function is called to set cache config for the worker.
     - `load_model`: This function is called to load the model weights to device.
-    - `get_kv_cache_spaces`: This function is called to generate the kv cache spaces for the model.
+    - `get_kv_cache_spec`: This function is called to generate the kv cache spec for the model.
     - `determine_available_memory`: This function is called to profiles the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs.
     - `initialize_from_config`: This function is called to allocate device KV cache with the specified kv_cache_config
     - `execute_model`: This function is called every step to inference the model.

From b7b6a60aca0405b2d6b2ed6fd13853635f000b5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=91=AB=E5=B4=A7?=
 <50285788+SongDI911@users.noreply.github.com>
Date: Wed, 17 Dec 2025 23:10:59 +0800
Subject: [PATCH 42/49] Adapt the old parameter enable_thinking in
 chat_template_kwargs (#30852)

Signed-off-by: xinsong.gao <1418762819@qq.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
---
 vllm/reasoning/deepseek_v3_reasoning_parser.py | 2 ++
 vllm/tokenizers/deepseek_v32.py                | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index 6604f70badbcf..4e6758586bf42 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -26,6 +26,8 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
 
         chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}
         thinking = bool(chat_kwargs.pop("thinking", False))
+        enable_thinking = bool(chat_kwargs.pop("enable_thinking", False))
+        thinking = thinking or enable_thinking
 
         if thinking:
             self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
diff --git a/vllm/tokenizers/deepseek_v32.py b/vllm/tokenizers/deepseek_v32.py
index bf279a5cf67c5..d519b61ddb76d 100644
--- a/vllm/tokenizers/deepseek_v32.py
+++ b/vllm/tokenizers/deepseek_v32.py
@@ -50,6 +50,8 @@ class DeepseekV32Tokenizer(CachedHfTokenizer):
         **kwargs,
     ) -> str | list[int]:
         thinking = kwargs.get("thinking", False)
+        enable_thinking = kwargs.get("enable_thinking", False)
+        thinking = thinking or enable_thinking
         thinking_mode = "thinking"
         if not thinking:
             thinking_mode = "chat"

From 196cdc3224112df7f68c901fe4c5314875a65be8 Mon Sep 17 00:00:00 2001
From: KimHyemin <102578109+www-spam@users.noreply.github.com>
Date: Thu, 18 Dec 2025 00:11:18 +0900
Subject: [PATCH 43/49] [Model] Gemma3: Support untied word embeddings (#30827)

Signed-off-by: www-spam <panmahm@naver.com>
---
 vllm/model_executor/models/gemma3.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 40f6d100c767e..70f72b5cb9beb 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -39,7 +39,10 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader,
     maybe_remap_kv_scale_name,
@@ -532,12 +535,20 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
 
         super().__init__()
         self.config = config
-        # currently all existing Gemma models have `tie_word_embeddings` enabled
-        assert config.tie_word_embeddings
         self.quant_config = quant_config
         self.model = Gemma3Model(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
+
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
+
         self.logits_processor = LogitsProcessor(
             config.vocab_size, soft_cap=config.final_logit_softcapping
         )
@@ -565,7 +576,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor | None:
-        logits = self.logits_processor(self.model.embed_tokens, hidden_states)
+        logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:

From 2497228ad4427310bc55427f6db404a00de4fd78 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 17 Dec 2025 23:32:17 +0800
Subject: [PATCH 44/49] [Chore] Factor out logic for requesting initial memory
 (#30868)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/utils/mem_utils.py      | 31 +++++++++++++++++++++++++++----
 vllm/v1/worker/gpu_worker.py | 20 ++++----------------
 vllm/v1/worker/utils.py      | 26 +++++++++++++++++++++++++-
 3 files changed, 56 insertions(+), 21 deletions(-)

diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index e2517b935bf28..bf6d7846573b9 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -66,27 +66,43 @@ class MemorySnapshot:
     torch_memory: int = 0
     non_torch_memory: int = 0
     timestamp: float = 0.0
+
+    device: torch.types.Device = None
     auto_measure: bool = True
 
     def __post_init__(self) -> None:
+        if self.device is None:
+            from vllm.platforms import current_platform
+
+            device_fn = current_platform.current_device
+            assert device_fn is not None
+            self.device_ = torch.device(device_fn())
+        else:
+            self.device_ = torch.device(self.device)
+
         if self.auto_measure:
             self.measure()
 
     def measure(self) -> None:
         from vllm.platforms import current_platform
 
+        device = self.device_
+
         # we measure the torch peak memory usage via allocated_bytes,
         # rather than `torch.cuda.memory_reserved()` .
         # After `torch.cuda.reset_peak_memory_stats()`,
         # `torch.cuda.memory_reserved()` will keep growing, and only shrink
         # when we call `torch.cuda.empty_cache()` or OOM happens.
-        self.torch_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)
+        self.torch_peak = torch.cuda.memory_stats(device).get(
+            "allocated_bytes.all.peak", 0
+        )
 
-        self.free_memory, self.total_memory = torch.cuda.mem_get_info()
+        self.free_memory, self.total_memory = torch.cuda.mem_get_info(device)
         shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
         if (
             current_platform.is_cuda()
-            and current_platform.get_device_capability() in shared_sysmem_device_mem_sms
+            and current_platform.get_device_capability(device.index)
+            in shared_sysmem_device_mem_sms
         ):
             # On UMA (Orin, Thor and Spark) platform,
             # where both CPU and GPU rely on system memory,
@@ -106,12 +122,18 @@ class MemorySnapshot:
         # torch.cuda.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
         # this is used to measure the non-torch memory usage
-        self.torch_memory = torch.cuda.memory_reserved()
+        self.torch_memory = torch.cuda.memory_reserved(device)
 
         self.non_torch_memory = self.cuda_memory - self.torch_memory
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
+        if self.device_ != other.device_:
+            raise ValueError(
+                "The two snapshots should be from the same device! "
+                f"Found: {self.device_} vs. {other.device_}"
+            )
+
         return MemorySnapshot(
             torch_peak=self.torch_peak - other.torch_peak,
             free_memory=self.free_memory - other.free_memory,
@@ -120,6 +142,7 @@ class MemorySnapshot:
             torch_memory=self.torch_memory - other.torch_memory,
             non_torch_memory=self.non_torch_memory - other.non_torch_memory,
             timestamp=self.timestamp - other.timestamp,
+            device=self.device_,
             auto_measure=False,
         )
 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 1e13650cd083e..bc71351d2cc55 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -56,6 +56,8 @@ from vllm.v1.worker.utils import is_residual_scattered_for_sp
 from vllm.v1.worker.worker_base import WorkerBase
 from vllm.v1.worker.workspace import init_workspace_manager
 
+from .utils import request_memory
+
 logger = init_logger(__name__)
 
 if TYPE_CHECKING:
@@ -237,22 +239,8 @@ class Worker(WorkerBase):
             torch.cuda.empty_cache()
 
             # take current memory snapshot
-            self.init_snapshot = MemorySnapshot()
-            self.requested_memory = (
-                self.init_snapshot.total_memory
-                * self.cache_config.gpu_memory_utilization
-            )
-            if self.init_snapshot.free_memory < self.requested_memory:
-                GiB = lambda b: round(b / GiB_bytes, 2)
-                raise ValueError(
-                    f"Free memory on device "
-                    f"({GiB(self.init_snapshot.free_memory)}/"
-                    f"{GiB(self.init_snapshot.total_memory)} GiB) on startup "
-                    f"is less than desired GPU memory utilization "
-                    f"({self.cache_config.gpu_memory_utilization}, "
-                    f"{GiB(self.requested_memory)} GiB). Decrease GPU memory "
-                    f"utilization or reduce GPU memory used by other processes."
-                )
+            self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
+            self.requested_memory = request_memory(init_snapshot, self.cache_config)
         else:
             raise RuntimeError(f"Not support device type: {self.device_config.device}")
 
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 2e8afec024ce9..31ccf7f157468 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -8,13 +8,15 @@ from typing_extensions import deprecated
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import MultiModalEmbeddings
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.multimodal.cache import processor_only_cache_from_config
 from vllm.multimodal.registry import MultiModalRegistry
 from vllm.platforms import current_platform
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import MemorySnapshot
 from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
 from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
@@ -248,6 +250,28 @@ def gather_mm_placeholders(
     return placeholders[is_embed]
 
 
+def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float:
+    """
+    Calculate the amount of memory required by vLLM, then validate
+    that the current amount of free memory is sufficient for that.
+    """
+    requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization
+
+    if init_snapshot.free_memory < requested_memory:
+        GiB = lambda b: round(b / GiB_bytes, 2)
+        raise ValueError(
+            f"Free memory on device {init_snapshot.device_} "
+            f"({GiB(init_snapshot.free_memory)}/"
+            f"{GiB(init_snapshot.total_memory)} GiB) on startup "
+            f"is less than desired GPU memory utilization "
+            f"({cache_config.gpu_memory_utilization}, "
+            f"{GiB(requested_memory)} GiB). Decrease GPU memory "
+            f"utilization or reduce GPU memory used by other processes."
+        )
+
+    return requested_memory
+
+
 def add_kv_sharing_layers_to_kv_cache_groups(
     shared_kv_cache_layers: dict[str, str],
     kv_cache_groups: list[KVCacheGroupSpec],

From 9ca8cb38fd68142627c9649756f1ddc5432c8b19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Wed, 17 Dec 2025 18:49:56 +0100
Subject: [PATCH 45/49] [CI][Bugfix] Fix flaky
 `tests/entrypoints/openai/test_audio.py::test_chat_streaming_audio` (#30878)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 tests/entrypoints/openai/test_audio.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index a2d8993441fcd..4cf864bdb2de9 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -254,7 +254,9 @@ async def test_single_chat_session_input_audio(
 async def test_chat_streaming_audio(
     client: openai.AsyncOpenAI, model_name: str, audio_url: str
 ):
-    messages = dummy_messages_from_audio_url(audio_url)
+    messages = dummy_messages_from_audio_url(
+        audio_url, "What's a short title for this audio?"
+    )
 
     # test single completion
     chat_completion = await client.chat.completions.create(

From 7eb6cb6c18a948fb49824154cb3ece1e32d12cf8 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni <mbonanni@redhat.com>
Date: Wed, 17 Dec 2025 12:49:59 -0500
Subject: [PATCH 46/49] [Attention] Update tests to remove deprecated env vars
 (#30563)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
---
 .../scripts/hardware_ci/run-xpu-test.sh       |   2 +-
 .../test_basic_correctness.py                 |  85 +++++------
 tests/compile/distributed/test_fusions_e2e.py |   9 +-
 .../fullgraph/test_basic_correctness.py       |  82 ++++++-----
 .../compile/fullgraph/test_full_cudagraph.py  |  13 +-
 tests/compile/fullgraph/test_full_graph.py    |   7 +-
 tests/distributed/test_context_parallel.py    |   4 +-
 tests/distributed/test_pp_cudagraph.py        |  26 ++--
 tests/engine/test_arg_utils.py                | 135 +++++++++++++++++-
 tests/entrypoints/openai/test_serving_chat.py |  13 +-
 .../attention/test_attention_selector.py      |  52 +++----
 .../attention/test_rocm_attention_selector.py |  60 +++++---
 tests/kernels/test_flex_attention.py          |  95 ++++++------
 .../generation/test_granite_speech.py         |  12 +-
 tests/models/multimodal/pooling/conftest.py   |  24 ++--
 .../models/multimodal/pooling/test_siglip.py  |   8 ++
 tests/models/quantization/test_fp8.py         |   3 +-
 tests/models/test_initialization.py           |  12 +-
 .../test_rocm_attention_backends_selection.py |  12 +-
 tests/v1/attention/utils.py                   |  47 +++---
 tests/v1/cudagraph/test_cudagraph_mode.py     |  33 +----
 tests/v1/determinism/test_batch_invariance.py |  25 ++--
 .../test_online_batch_invariance.py           |   5 +-
 tests/v1/e2e/test_async_scheduling.py         |  22 +--
 tests/v1/e2e/test_cascade_attention.py        |  29 ++--
 tests/v1/e2e/test_spec_decode.py              |  43 +++---
 .../nixl_integration/run_accuracy_test.sh     |  22 ++-
 .../tp_config_sweep_accuracy_test.sh          |  12 +-
 .../kv_connector/unit/test_nixl_connector.py  |   6 +-
 tests/v1/kv_connector/unit/utils.py           |   4 +
 tests/v1/kv_offload/test_cpu_offloading.py    |  15 +-
 tests/v1/spec_decode/test_eagle.py            |  19 ++-
 tests/v1/spec_decode/test_max_len.py          |  89 ++++++------
 vllm/v1/attention/backends/rocm_attn.py       |   2 +-
 34 files changed, 580 insertions(+), 447 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index dfc9db512d1e9..85b554e5e8646 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -39,7 +39,7 @@ docker run \
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
     python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
     cd tests
     pytest -v -s v1/core
     pytest -v -s v1/engine
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 9e1cc309edd1d..68b5cd5101d5d 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -67,7 +67,6 @@ def _fix_prompt_embed_outputs(
 @pytest.mark.parametrize("model_executor", ["uni", "mp"])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models(
-    monkeypatch: pytest.MonkeyPatch,
     hf_runner,
     model: str,
     backend: str,
@@ -77,48 +76,46 @@ def test_models(
     model_executor: str,
     enable_prompt_embeds: bool,
 ) -> None:
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", backend)
+    # 5042 tokens for gemma2
+    # gemma2 has alternating sliding window size of 4096
+    # we need a prompt with more than 4096 tokens to test the sliding window
+    prompt = (
+        "The following numbers of the sequence "
+        + ", ".join(str(i) for i in range(1024))
+        + " are:"
+    )
+    example_prompts = [prompt]
 
-        # 5042 tokens for gemma2
-        # gemma2 has alternating sliding window size of 4096
-        # we need a prompt with more than 4096 tokens to test the sliding window
-        prompt = (
-            "The following numbers of the sequence "
-            + ", ".join(str(i) for i in range(1024))
-            + " are:"
-        )
-        example_prompts = [prompt]
+    with hf_runner(model) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        if enable_prompt_embeds:
+            with torch.no_grad():
+                prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
 
-        with hf_runner(model) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-            if enable_prompt_embeds:
-                with torch.no_grad():
-                    prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
+    with VllmRunner(
+        model,
+        max_model_len=8192,
+        enforce_eager=enforce_eager,
+        enable_prompt_embeds=enable_prompt_embeds,
+        gpu_memory_utilization=0.7,
+        async_scheduling=async_scheduling,
+        distributed_executor_backend=model_executor,
+        attention_config={"backend": backend},
+    ) as vllm_model:
+        if enable_prompt_embeds:
+            vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
+            vllm_outputs = _fix_prompt_embed_outputs(
+                vllm_outputs, hf_model, example_prompts
+            )
+        else:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
-        with VllmRunner(
-            model,
-            max_model_len=8192,
-            enforce_eager=enforce_eager,
-            enable_prompt_embeds=enable_prompt_embeds,
-            gpu_memory_utilization=0.7,
-            async_scheduling=async_scheduling,
-            distributed_executor_backend=model_executor,
-        ) as vllm_model:
-            if enable_prompt_embeds:
-                vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
-                vllm_outputs = _fix_prompt_embed_outputs(
-                    vllm_outputs, hf_model, example_prompts
-                )
-            else:
-                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 @multi_gpu_test(num_gpus=2)
@@ -161,12 +158,6 @@ def test_models_distributed(
         ):  # noqa
             pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
 
-        if attention_backend:
-            monkeypatch_context.setenv(
-                "VLLM_ATTENTION_BACKEND",
-                attention_backend,
-            )
-
         for k, v in extra_env.items():
             monkeypatch_context.setenv(k, v)
 
@@ -178,6 +169,7 @@ def test_models_distributed(
         # if we run HF first, the cuda initialization will be done and it
         # will hurt multiprocessing backend with fork method
         # (the default method).
+        attention_config = {"backend": attention_backend} if attention_backend else None
         with vllm_runner(
             model,
             dtype=dtype,
@@ -185,6 +177,7 @@ def test_models_distributed(
             distributed_executor_backend=distributed_executor_backend,
             enable_prompt_embeds=enable_prompt_embeds,
             gpu_memory_utilization=0.7,
+            attention_config=attention_config,
         ) as vllm_model:
             if enable_prompt_embeds:
                 with hf_runner(model, dtype=dtype) as hf_model:
diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 960b5b4bd7ad4..28ab2cee71a6a 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -208,7 +208,8 @@ def test_attn_quant(
     # To capture subprocess logs, we need to know whether spawn or fork is used.
     # Force spawn as it is more general.
     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    model_kwargs["attention_config"] = {"backend": backend.name}
 
     compilation_config = CompilationConfig(
         # Testing properties
@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
     # To capture subprocess logs, we need to know whether spawn or fork is used.
     # Force spawn as it is more general.
     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    model_kwargs["attention_config"] = {"backend": backend.name}
 
     compilation_config = CompilationConfig(
         # Testing properties
@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
     # To capture subprocess logs, we need to know whether spawn or fork is used.
     # Force spawn as it is more general.
     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
+    model_kwargs["attention_config"] = {"backend": backend.name}
 
     compilation_config = CompilationConfig(
         # Testing properties
diff --git a/tests/compile/fullgraph/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py
index f2e58b5cc423e..d062ed221ff59 100644
--- a/tests/compile/fullgraph/test_basic_correctness.py
+++ b/tests/compile/fullgraph/test_basic_correctness.py
@@ -89,7 +89,6 @@ class TestSetting:
     ],
 )
 def test_compile_correctness(
-    monkeypatch: pytest.MonkeyPatch,
     test_setting: TestSetting,
 ):
     # this test is run under multiple suits, with different GPUs.
@@ -107,49 +106,48 @@ def test_compile_correctness(
             f"{cuda_device_count_stateless()}"
         )
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-        final_args = [
-            *model_args,
-            "-pp",
-            str(pp_size),
-            "-tp",
-            str(tp_size),
-            "-cc.cudagraph_mode=none",
-        ]
+    final_args = [
+        *model_args,
+        "-pp",
+        str(pp_size),
+        "-tp",
+        str(tp_size),
+        "-cc.cudagraph_mode=none",
+        f"--attention-backend={attn_backend}",
+    ]
 
-        all_args: list[list[str]] = []
-        all_envs: list[dict[str, str] | None] = []
+    all_args: list[list[str]] = []
+    all_envs: list[dict[str, str] | None] = []
 
-        for comp_mode in [
-            CompilationMode.STOCK_TORCH_COMPILE,
-            CompilationMode.DYNAMO_TRACE_ONCE,
-            CompilationMode.VLLM_COMPILE,
-        ]:
-            for mode in [CompilationMode.NONE, comp_mode]:
-                all_args.append(
-                    final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
-                )
-
-            # inductor will change the output, so we only compare if the output
-            # is close, not exactly the same.
-            compare_all_settings(
-                model,
-                all_args,
-                all_envs,
-                method=method if method != "generate" else "generate_close",
+    for comp_mode in [
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
+    ]:
+        for mode in [CompilationMode.NONE, comp_mode]:
+            all_args.append(
+                final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
             )
-            all_envs.clear()
-            all_args.clear()
 
-        for mode in [
-            CompilationMode.NONE,
-            CompilationMode.STOCK_TORCH_COMPILE,
-            CompilationMode.DYNAMO_TRACE_ONCE,
-            CompilationMode.VLLM_COMPILE,
-        ]:
-            all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
-            all_envs.append({})
-            all_envs.append({})
+        # inductor will change the output, so we only compare if the output
+        # is close, not exactly the same.
+        compare_all_settings(
+            model,
+            all_args,
+            all_envs,
+            method=method if method != "generate" else "generate_close",
+        )
+        all_envs.clear()
+        all_args.clear()
 
-        compare_all_settings(model, all_args * 3, all_envs, method=method)
+    for mode in [
+        CompilationMode.NONE,
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
+    ]:
+        all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
+        all_envs.append({})
+        all_envs.append({})
+
+    compare_all_settings(model, all_args * 3, all_envs, method=method)
diff --git a/tests/compile/fullgraph/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py
index c6d4b5272dbcf..4ce6abfe3e46d 100644
--- a/tests/compile/fullgraph/test_full_cudagraph.py
+++ b/tests/compile/fullgraph/test_full_cudagraph.py
@@ -74,7 +74,6 @@ def llm_pair(request):
         # Force native sampler to avoid potential nondeterminism in FlashInfer
         # when per-request generators are not used in V1.
         "VLLM_USE_FLASHINFER_SAMPLER": "0",
-        **backend_config.env_vars,
     }
     with temporary_environ(env_vars):
         full = LLM(
@@ -170,16 +169,10 @@ class TestFullCUDAGraph:
 
 @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 def test_full_cudagraph_with_invalid_backend():
-    with (
-        temporary_environ(
-            {
-                "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
-                # Flex_Attention is not supported with full cuda graph
-            }
-        ),
-        pytest.raises(RuntimeError),
-    ):
+    # Flex_Attention is not supported with full cuda graph
+    with pytest.raises(RuntimeError):
         LLM(
             model="Qwen/Qwen2-1.5B-Instruct",
             compilation_config=CompilationConfig(cudagraph_mode="FULL"),
+            attention_config={"backend": "FLEX_ATTENTION"},
         )
diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
index 3cd1d4be2ebdc..22af2d57f4f3d 100644
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -197,20 +197,19 @@ def test_custom_compile_config(
     ],
 )
 def test_fp8_kv_scale_compile(
-    monkeypatch: pytest.MonkeyPatch,
     compilation_mode: int,
     model: str,
     backend: AttentionBackendEnum | None,
 ):
-    if backend:
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
-
     model_kwargs = {
         "quantization": "fp8",
         "kv_cache_dtype": "fp8_e4m3",
         "calculate_kv_scales": True,
         "max_model_len": 512,
     }
+    if backend:
+        model_kwargs["attention_config"] = {"backend": backend.name}
+
     run_model(compilation_mode, model, **model_kwargs)
 
 
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index aa47f28a34dd5..a286309217719 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -219,14 +219,12 @@ def _test_cp_gsm8k(
         ]
     )
 
-    server_env = {}
     if attn_backend:
-        server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
+        server_args.append(f"--attention-backend={attn_backend}")
 
     with RemoteOpenAIServer(
         model_id,
         server_args,
-        env_dict=server_env,
         max_wait_seconds=720,
     ) as remote_server:
         host = f"http://{remote_server.host}"
diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py
index 2f2b43cb4cc2b..34ae305c2d2c1 100644
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -20,23 +20,21 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
 )
 @create_new_process_for_each_test()
 def test_pp_cudagraph(
-    monkeypatch: pytest.MonkeyPatch,
     PP_SIZE: int,
     MODEL_NAME: str,
     ATTN_BACKEND: LiteralString,
 ):
-    with monkeypatch.context() as m:
-        cudagraph_args = [
-            # use half precision for speed and memory savings in CI environment
-            "--dtype",
-            "float16",
-            "--pipeline-parallel-size",
-            str(PP_SIZE),
-            "--distributed-executor-backend",
-            "mp",
-        ]
-        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
+    cudagraph_args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--pipeline-parallel-size",
+        str(PP_SIZE),
+        "--distributed-executor-backend",
+        "mp",
+        f"--attention-backend={ATTN_BACKEND}",
+    ]
 
-        eager_args = cudagraph_args + ["--enforce-eager"]
+    eager_args = cudagraph_args + ["--enforce-eager"]
 
-        compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
+    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index c2cf77ffa12b6..25a5e00cc0e16 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -9,7 +9,7 @@ from typing import Annotated, Literal
 
 import pytest
 
-from vllm.config import CompilationConfig, config
+from vllm.config import AttentionConfig, CompilationConfig, config
 from vllm.engine.arg_utils import (
     EngineArgs,
     contains_type,
@@ -298,6 +298,139 @@ def test_compilation_config():
     )
 
 
+def test_attention_config():
+    from vllm.attention.backends.registry import AttentionBackendEnum
+
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+
+    # default value
+    args = parser.parse_args([])
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_config == AttentionConfig()
+
+    # set backend via dot notation
+    args = parser.parse_args(["--attention-config.backend", "FLASH_ATTN"])
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_config.backend is not None
+    assert engine_args.attention_config.backend.name == "FLASH_ATTN"
+
+    # set backend via --attention-backend shorthand
+    args = parser.parse_args(["--attention-backend", "FLASHINFER"])
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_backend is not None
+    assert engine_args.attention_backend == "FLASHINFER"
+
+    # set all fields via dot notation
+    args = parser.parse_args(
+        [
+            "--attention-config.backend",
+            "FLASH_ATTN",
+            "--attention-config.flash_attn_version",
+            "3",
+            "--attention-config.use_prefill_decode_attention",
+            "true",
+            "--attention-config.flash_attn_max_num_splits_for_cuda_graph",
+            "16",
+            "--attention-config.use_cudnn_prefill",
+            "true",
+            "--attention-config.use_trtllm_ragged_deepseek_prefill",
+            "true",
+            "--attention-config.use_trtllm_attention",
+            "true",
+            "--attention-config.disable_flashinfer_prefill",
+            "true",
+            "--attention-config.disable_flashinfer_q_quantization",
+            "true",
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_config.backend is not None
+    assert engine_args.attention_config.backend.name == "FLASH_ATTN"
+    assert engine_args.attention_config.flash_attn_version == 3
+    assert engine_args.attention_config.use_prefill_decode_attention is True
+    assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 16
+    assert engine_args.attention_config.use_cudnn_prefill is True
+    assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is True
+    assert engine_args.attention_config.use_trtllm_attention is True
+    assert engine_args.attention_config.disable_flashinfer_prefill is True
+    assert engine_args.attention_config.disable_flashinfer_q_quantization is True
+
+    # set to string form of a dict with all fields
+    args = parser.parse_args(
+        [
+            "--attention-config="
+            '{"backend": "FLASHINFER", "flash_attn_version": 2, '
+            '"use_prefill_decode_attention": false, '
+            '"flash_attn_max_num_splits_for_cuda_graph": 8, '
+            '"use_cudnn_prefill": false, '
+            '"use_trtllm_ragged_deepseek_prefill": false, '
+            '"use_trtllm_attention": false, '
+            '"disable_flashinfer_prefill": false, '
+            '"disable_flashinfer_q_quantization": false}',
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    assert engine_args.attention_config.backend is not None
+    assert engine_args.attention_config.backend.name == "FLASHINFER"
+    assert engine_args.attention_config.flash_attn_version == 2
+    assert engine_args.attention_config.use_prefill_decode_attention is False
+    assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 8
+    assert engine_args.attention_config.use_cudnn_prefill is False
+    assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is False
+    assert engine_args.attention_config.use_trtllm_attention is False
+    assert engine_args.attention_config.disable_flashinfer_prefill is False
+    assert engine_args.attention_config.disable_flashinfer_q_quantization is False
+
+    # test --attention-backend flows into VllmConfig.attention_config
+    args = parser.parse_args(
+        [
+            "--model",
+            "facebook/opt-125m",
+            "--attention-backend",
+            "FLASH_ATTN",
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    vllm_config = engine_args.create_engine_config()
+    assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASH_ATTN
+
+    # test --attention-config.backend flows into VllmConfig.attention_config
+    args = parser.parse_args(
+        [
+            "--model",
+            "facebook/opt-125m",
+            "--attention-config.backend",
+            "FLASHINFER",
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    vllm_config = engine_args.create_engine_config()
+    assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASHINFER
+
+    # test --attention-backend and --attention-config.backend are mutually exclusive
+    args = parser.parse_args(
+        [
+            "--model",
+            "facebook/opt-125m",
+            "--attention-backend",
+            "FLASH_ATTN",
+            "--attention-config.backend",
+            "FLASHINFER",
+        ]
+    )
+    assert args is not None
+    engine_args = EngineArgs.from_cli_args(args)
+    with pytest.raises(ValueError, match="mutually exclusive"):
+        engine_args.create_engine_config()
+
+
 def test_prefix_cache_default():
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
     args = parser.parse_args([])
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 444275e061c61..2befa40d636da 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -76,15 +76,10 @@ def default_server_args(with_tool_parser: bool):
 
 
 @pytest.fixture(scope="module")
-def gptoss_server(
-    monkeypatch_module: pytest.MonkeyPatch, default_server_args: list[str]
-):
-    with monkeypatch_module.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
-        with RemoteOpenAIServer(
-            GPT_OSS_MODEL_NAME, default_server_args
-        ) as remote_server:
-            yield remote_server
+def gptoss_server(default_server_args: list[str]):
+    server_args = default_server_args + ["--attention-backend=TRITON_ATTN"]
+    with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, server_args) as remote_server:
+        yield remote_server
 
 
 @pytest_asyncio.fixture
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index c959b2f4bb03c..d62acc2022d10 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -6,7 +6,9 @@ from unittest.mock import patch
 import pytest
 import torch
 
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
+from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
@@ -73,18 +75,18 @@ def generate_params():
 
 
 @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
-def test_env(
+def test_backend_selection(
     device: str,
     name: str,
     use_mla: bool,
     block_size: int,
-    monkeypatch: pytest.MonkeyPatch,
 ):
     """Test attention backend selection with valid device-backend pairs."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", name)
-        m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
+    # Create AttentionConfig with the specified backend
+    attention_config = AttentionConfig(backend=AttentionBackendEnum[name])
+    vllm_config = VllmConfig(attention_config=attention_config)
 
+    with set_current_vllm_config(vllm_config):
         if device == "cpu":
             with patch("vllm.platforms.current_platform", CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, None, block_size)
@@ -217,27 +219,32 @@ def test_env(
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
 def test_fp32_fallback(device: str):
     """Test attention backend selection with fp32."""
-    if device == "cpu":
-        with patch("vllm.platforms.current_platform", CpuPlatform()):
-            backend = get_attn_backend(16, torch.float32, None, 16)
-        assert backend.get_name() == "CPU_ATTN"
+    # Use default config (no backend specified)
+    vllm_config = VllmConfig()
 
-    elif device == "cuda":
-        with patch("vllm.platforms.current_platform", CudaPlatform()):
-            backend = get_attn_backend(16, torch.float32, None, 16)
-        assert backend.get_name() == "FLEX_ATTENTION"
+    with set_current_vllm_config(vllm_config):
+        if device == "cpu":
+            with patch("vllm.platforms.current_platform", CpuPlatform()):
+                backend = get_attn_backend(16, torch.float32, None, 16)
+            assert backend.get_name() == "CPU_ATTN"
+
+        elif device == "cuda":
+            with patch("vllm.platforms.current_platform", CudaPlatform()):
+                backend = get_attn_backend(16, torch.float32, None, 16)
+            assert backend.get_name() == "FLEX_ATTENTION"
 
 
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     pytest.skip(
         "Skipping as current backend selector does not "
-        "handle fallbacks when a backend is set via env var."
+        "handle fallbacks when a backend is explicitly set."
     )
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN)
+    vllm_config = VllmConfig(attention_config=attention_config)
 
+    with set_current_vllm_config(vllm_config):
         # Unsupported CUDA arch
         monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
         backend = get_attn_backend(16, torch.float16, None, 16)
@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
         assert backend.get_name() != "FLASH_ATTN"
 
 
-def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
+def test_invalid_backend():
     """Test that invalid attention backend names raise ValueError."""
     with (
-        monkeypatch.context() as m,
-        patch("vllm.platforms.current_platform", CudaPlatform()),
+        pytest.raises(ValueError),
     ):
-        m.setenv("VLLM_ATTENTION_BACKEND", "INVALID")
-
-        # Should raise ValueError for invalid backend
-        with pytest.raises(ValueError) as exc_info:
-            get_attn_backend(32, torch.float16, None, 16)
-        assert "Invalid value 'INVALID'" in str(exc_info.value)
+        # Invalid backend name should raise ValueError when creating enum
+        AttentionConfig(backend=AttentionBackendEnum["INVALID"])
diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py
index b61058081c0b2..f97d475eb47d7 100644
--- a/tests/kernels/attention/test_rocm_attention_selector.py
+++ b/tests/kernels/attention/test_rocm_attention_selector.py
@@ -4,7 +4,9 @@
 import pytest
 import torch
 
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
+from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms.rocm import RocmPlatform
 
 
@@ -16,40 +18,56 @@ def clear_cache():
 
 @pytest.mark.skip(reason="Skipped for now. Should be revisited.")
 def test_selector(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_ATTN")
+    # Set the current platform to ROCm using monkeypatch
+    monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
 
-        # Set the current platform to ROCm using monkeypatch
-        monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
+    # Test standard ROCm attention
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN)
+    vllm_config = VllmConfig(attention_config=attention_config)
 
-        # Test standard ROCm attention
+    with set_current_vllm_config(vllm_config):
         backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
         assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN"
 
-        # MLA test for deepseek related
+    # MLA test for deepseek related
+    # Change the attention backend to triton MLA
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.TRITON_MLA)
+    vllm_config = VllmConfig(attention_config=attention_config)
 
-        # change the attention backend to triton MLA
-        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_MLA")
+    with set_current_vllm_config(vllm_config):
         backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
         assert backend.get_name() == "TRITON_MLA"
 
-        # If attention backend is None
-        # If use_mla is true
-        # The selected backend is triton MLA
-        m.setenv("VLLM_ATTENTION_BACKEND", "")
+    # If attention backend is None
+    # If use_mla is true
+    # The selected backend is triton MLA
+    attention_config = AttentionConfig(backend=None)
+    vllm_config = VllmConfig(attention_config=attention_config)
+
+    with set_current_vllm_config(vllm_config):
         backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
         assert backend.get_name() == "TRITON_MLA"
 
-        # change the attention backend to AITER MLA
-        m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_MLA")
+    # Change the attention backend to AITER MLA
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_AITER_MLA)
+    vllm_config = VllmConfig(attention_config=attention_config)
+
+    with set_current_vllm_config(vllm_config):
         backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
         assert backend.get_name() == "ROCM_AITER_MLA"
 
-        # If attention backend is None
-        # If use_mla is true
-        # If VLLM_ROCM_USE_AITER is enabled
-        # The selected backend is ROCM_AITER_MLA
-        m.setenv("VLLM_ATTENTION_BACKEND", "")
+    # If attention backend is None
+    # If use_mla is true
+    # If VLLM_ROCM_USE_AITER is enabled
+    # The selected backend is ROCM_AITER_MLA
+    with monkeypatch.context() as m:
         m.setenv("VLLM_ROCM_USE_AITER", "1")
-        backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
-        assert backend.get_name() == "ROCM_AITER_MLA"
+
+        attention_config = AttentionConfig(backend=None)
+        vllm_config = VllmConfig(attention_config=attention_config)
+
+        with set_current_vllm_config(vllm_config):
+            backend = get_attn_backend(
+                576, torch.bfloat16, "auto", 1, False, use_mla=True
+            )
+            assert backend.get_name() == "ROCM_AITER_MLA"
diff --git a/tests/kernels/test_flex_attention.py b/tests/kernels/test_flex_attention.py
index ae33f422d3732..f6987d54399d2 100644
--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@@ -37,7 +37,7 @@ def set_seed(seed):
     not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
     reason="CUDA not available or PyTorch version < 2.7",
 )
-def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
+def test_flex_attention_vs_default_backend(vllm_runner):
     """Test that FlexAttention produces the same outputs as the default backend.
 
     This test compares the outputs from the FlexAttention backend with
@@ -54,35 +54,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     ]
 
     # Run with flex attention
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
-
-        set_seed(seed)
-        with vllm_runner(
-            model_name,
-            runner="generate",
-            tensor_parallel_size=1,
-            num_gpu_blocks_override=128,
-            enforce_eager=True,
-        ) as llm_flex:
-            output_flex = llm_flex.generate_greedy_logprobs(
-                prompts, max_tokens, num_logprobs
-            )
+    set_seed(seed)
+    with vllm_runner(
+        model_name,
+        runner="generate",
+        tensor_parallel_size=1,
+        num_gpu_blocks_override=128,
+        enforce_eager=True,
+        attention_config={"backend": "FLEX_ATTENTION"},
+    ) as llm_flex:
+        output_flex = llm_flex.generate_greedy_logprobs(
+            prompts, max_tokens, num_logprobs
+        )
 
     # Run with default backend
-    with monkeypatch.context() as m:
-        set_seed(seed)
-        with vllm_runner(
-            model_name,
-            runner="generate",
-            tensor_parallel_size=1,
-            num_gpu_blocks_override=128,
-            enforce_eager=True,
-            gpu_memory_utilization=0.85,
-        ) as llm_default:
-            output_default = llm_default.generate_greedy_logprobs(
-                prompts, max_tokens, num_logprobs
-            )
+    set_seed(seed)
+    with vllm_runner(
+        model_name,
+        runner="generate",
+        tensor_parallel_size=1,
+        num_gpu_blocks_override=128,
+        enforce_eager=True,
+        gpu_memory_utilization=0.85,
+    ) as llm_default:
+        output_default = llm_default.generate_greedy_logprobs(
+            prompts, max_tokens, num_logprobs
+        )
 
     check_logprobs_close(
         outputs_0_lst=output_flex,
@@ -96,7 +93,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
     reason="CUDA not available or PyTorch version < 2.7",
 )
-def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
+def test_encoder_flex_attention_vs_default_backend(vllm_runner):
     """Test that FlexAttention produces the same outputs as the default backend.
 
     This test compares the outputs from the FlexAttention backend with
@@ -110,30 +107,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     ]
 
     # Run with flex attention
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
-        with vllm_runner(
-            model_name,
-            runner="pooling",
-            dtype=torch.bfloat16,
-            tensor_parallel_size=1,
-            max_model_len=100,
-            enforce_eager=True,
-        ) as llm_flex:
-            flex_outputs = llm_flex.embed(prompts)
+    with vllm_runner(
+        model_name,
+        runner="pooling",
+        dtype=torch.bfloat16,
+        tensor_parallel_size=1,
+        max_model_len=100,
+        enforce_eager=True,
+        attention_config={"backend": "FLEX_ATTENTION"},
+    ) as llm_flex:
+        flex_outputs = llm_flex.embed(prompts)
 
     # Run with default backend
-    with (
-        monkeypatch.context() as m,
-        vllm_runner(
-            model_name,
-            runner="pooling",
-            dtype=torch.bfloat16,
-            tensor_parallel_size=1,
-            max_model_len=100,
-            enforce_eager=True,
-        ) as llm_default,
-    ):
+    with vllm_runner(
+        model_name,
+        runner="pooling",
+        dtype=torch.bfloat16,
+        tensor_parallel_size=1,
+        max_model_len=100,
+        enforce_eager=True,
+    ) as llm_default:
         default_outputs = llm_default.embed(prompts)
 
     check_embeddings_close(
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index f528a993f8551..489743c5a29b3 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
 models = [MODEL_NAME]
 
 
-@pytest.fixture(autouse=True)
-def set_attention_backend_for_rocm(monkeypatch):
+@pytest.fixture
+def granite_speech_attention_config():
+    """Return attention config for Granite Speech tests on ROCm."""
     if current_platform.is_rocm():
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
+        return {"backend": "TRITON_ATTN"}
+    return None
 
 
 def run_test(
@@ -53,6 +55,7 @@ def run_test(
     num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: str | None = None,
+    attention_config: dict | None = None,
 ):
     """Inference result should be the same between hf and vllm.
 
@@ -80,6 +83,7 @@ def run_test(
         enable_lora=True,
         max_lora_rank=64,
         enforce_eager=True,
+        attention_config=attention_config,
     ) as vllm_model:
         lora_request = LoRARequest("audio", 1, audio_lora_path)
         vllm_outputs_per_case = [
@@ -131,6 +135,7 @@ def test_models(
     vllm_runner,
     model: str,
     audio_assets: AudioTestAssets,
+    granite_speech_attention_config,
     dtype: str,
     max_model_len: int,
     max_tokens: int,
@@ -157,4 +162,5 @@ def test_models(
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
         tensor_parallel_size=1,
+        attention_config=granite_speech_attention_config,
     )
diff --git a/tests/models/multimodal/pooling/conftest.py b/tests/models/multimodal/pooling/conftest.py
index c5f40cb42ca2a..401bc39b4b109 100644
--- a/tests/models/multimodal/pooling/conftest.py
+++ b/tests/models/multimodal/pooling/conftest.py
@@ -2,23 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling tests."""
 
-import os
-import warnings
+import pytest
 
 from vllm.platforms import current_platform
 
 
-def pytest_collection_modifyitems(config, items):
-    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
-    if not current_platform.is_rocm():
-        return
+@pytest.fixture
+def siglip_attention_config():
+    """Return attention config for SigLIP tests on ROCm.
 
-    siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
-
-    if siglip_tests:
-        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
-        warnings.warn(
-            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
-            UserWarning,
-            stacklevel=1,
-        )
+    On ROCm, SigLIP tests require FLEX_ATTENTION backend.
+    """
+    if current_platform.is_rocm():
+        return {"backend": "FLEX_ATTENTION"}
+    return None
diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index 72886cbf7f323..0b8cd33ccfb9d 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -38,6 +38,7 @@ def _run_test(
     *,
     dtype: str,
     tokenization_kwargs: dict[str, Any] | None = None,
+    attention_config: dict[str, Any] | None = None,
 ) -> None:
     if tokenization_kwargs is None:
         tokenization_kwargs = {}
@@ -49,6 +50,7 @@ def _run_test(
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=attention_config,
     ) as vllm_model:
         vllm_outputs = vllm_model.embed(
             input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
@@ -90,6 +92,7 @@ def test_models_text(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -108,6 +111,7 @@ def test_models_text(
             "padding": "max_length",
             "max_length": 64,
         },  # siglip2 was trained with this padding setting.
+        attention_config=siglip_attention_config,
     )
 
 
@@ -117,6 +121,7 @@ def test_models_image(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -133,6 +138,7 @@ def test_models_image(
         input_images,
         model,
         dtype=dtype,
+        attention_config=siglip_attention_config,
     )
 
 
@@ -141,6 +147,7 @@ def test_models_image(
 def test_models_text_image_no_crash(
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=siglip_attention_config,
     ) as vllm_model:
         with pytest.raises(ValueError, match="not both"):
             vllm_model.embed(texts, images=images)
diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py
index 7dfedaf2799d4..f3b85ba0ee394 100644
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -75,7 +75,6 @@ def test_models(
 
     with monkeypatch.context() as m:
         m.setenv("TOKENIZERS_PARALLELISM", "true")
-        m.setenv("VLLM_ATTENTION_BACKEND", backend)
 
         MAX_MODEL_LEN = 1024
         NUM_LOG_PROBS = 8
@@ -86,6 +85,7 @@ def test_models(
             tensor_parallel_size=tensor_parallel_size,
             enforce_eager=enforce_eager,
             kv_cache_dtype="auto",
+            attention_config={"backend": backend},
         ) as vllm_model:
             baseline_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, NUM_LOG_PROBS
@@ -97,6 +97,7 @@ def test_models(
             tensor_parallel_size=tensor_parallel_size,
             enforce_eager=enforce_eager,
             kv_cache_dtype=kv_cache_dtype,
+            attention_config={"backend": backend},
         ) as vllm_model:
             test_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, NUM_LOG_PROBS
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 8c4bd6eaa2dd8..0a573847bf913 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -108,11 +108,12 @@ def can_initialize(
         patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
         monkeypatch.context() as m,
     ):
-        if model_arch == "GptOssForCausalLM":
-            # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
-            # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
-            # L4 supports FA3.
-            m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
+        # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
+        # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
+        # L4 supports FA3.
+        attention_config = (
+            {"backend": "TRITON_ATTN"} if model_arch == "GptOssForCausalLM" else None
+        )
         if model_arch == "WhisperForConditionalGeneration":
             m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
 
@@ -143,6 +144,7 @@ def can_initialize(
             else "vllm",
             hf_overrides=hf_overrides_fn,
             max_num_seqs=model_info.max_num_seqs,
+            attention_config=attention_config,
         )
 
 
diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py
index 77790be6f892b..d8c747056faf6 100644
--- a/tests/v1/attention/test_rocm_attention_backends_selection.py
+++ b/tests/v1/attention/test_rocm_attention_backends_selection.py
@@ -94,26 +94,20 @@ def mock_on_gfx9():
             None,
             AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
         ),
-        # Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
-        (
-            {"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
-            None,
-            AttentionBackendEnum.ROCM_ATTN.get_path(),
-        ),
-        # Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
+        # Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
         (
             {"VLLM_ROCM_USE_AITER": "1"},
             "TRITON_ATTN",
             AttentionBackendEnum.TRITON_ATTN.get_path(),
         ),
-        # Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
+        # Test Case 10: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
         # (explicitly disabled)
         (
             {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"},
             None,
             AttentionBackendEnum.TRITON_ATTN.get_path(),
         ),
-        # Test Case 12: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
+        # Test Case 11: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
         (
             {"VLLM_ROCM_USE_AITER": "1"},
             "ROCM_ATTN",
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 4dcaf9d908690..031436a030908 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -249,8 +249,8 @@ def create_dummy_kv_cache(
 @dataclass
 class BackendConfig:
     name: str
-    env_vars: dict
-    comp_config: dict  # compilation config
+    attention_config: dict
+    comp_config: dict
     specific_gpu_arch: tuple | None = None
 
 
@@ -259,10 +259,10 @@ full_cg_backend_configs = {
     # FA3 on Hopper
     "FA3": BackendConfig(
         name="FA3",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "3",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 3,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL",
@@ -272,9 +272,7 @@ full_cg_backend_configs = {
     # FlashMLA on Hopper
     "FlashMLA": BackendConfig(
         name="FlashMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-        },
+        attention_config={"backend": "FLASHMLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -283,9 +281,7 @@ full_cg_backend_configs = {
     # Cutlass MLA on Blackwell
     "CutlassMLA": BackendConfig(
         name="CutlassMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
-        },
+        attention_config={"backend": "CUTLASS_MLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -294,9 +290,7 @@ full_cg_backend_configs = {
     # FlashInfer MLA on Blackwell
     "FlashInferMLA": BackendConfig(
         name="FlashInferMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
-        },
+        attention_config={"backend": "FLASHINFER_MLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -305,9 +299,9 @@ full_cg_backend_configs = {
     # FlashAttention MLA on Hopper
     "FlashAttentionMLA": BackendConfig(
         name="FlashAttentionMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN_MLA",
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL_DECODE_ONLY",
@@ -317,10 +311,10 @@ full_cg_backend_configs = {
     # FA2
     "FA2": BackendConfig(
         name="FA2",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "2",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 2,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
@@ -329,7 +323,7 @@ full_cg_backend_configs = {
     # Triton Attention
     "TritonAttn": BackendConfig(
         name="TritonAttn",
-        env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
+        attention_config={"backend": "TRITON_ATTN"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -337,14 +331,17 @@ full_cg_backend_configs = {
     # FlashInfer
     "FlashInfer": BackendConfig(
         name="FlashInfer",
-        env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+        attention_config={"backend": "FLASHINFER"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
     ),
     "RocmAttn": BackendConfig(
         name="RocmAttn",
-        env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
+        attention_config={
+            "backend": "ROCM_ATTN",
+            "use_prefill_decode_attention": True,
+        },
         comp_config={
             "cudagraph_mode": "FULL",
         },
diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py
index b1895e83b8b37..f4f74d16c7019 100644
--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import contextlib
-import os
 import weakref
 from contextlib import ExitStack
 
@@ -13,26 +11,6 @@ from vllm import LLM
 from vllm.config import CompilationConfig, CompilationMode
 from vllm.platforms import current_platform
 
-
-@contextlib.contextmanager
-def temporary_environ(env_vars):
-    """
-    Temporarily set environment variables and restore them afterward.
-    We have to do this vs monkeypatch because monkeypatch doesn't work
-    with "module" scoped fixtures.
-    """
-    original_env = {k: os.environ.get(k) for k in env_vars}
-    try:
-        os.environ.update(env_vars)
-        yield
-    finally:
-        for k, v in original_env.items():
-            if v is None:
-                os.environ.pop(k, None)
-            else:
-                os.environ[k] = v
-
-
 # test attention backend and cudagraph_mode combo
 # (backend_name, cudagraph_mode, supported)
 if current_platform.is_rocm():
@@ -68,9 +46,9 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
     ):
         pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
 
-    env_vars = backend_configs[backend_name].env_vars
+    attention_config = backend_config.attention_config
 
-    with temporary_environ(env_vars), ExitStack() as stack:
+    with ExitStack() as stack:
         if not supported:
             stack.enter_context(pytest.raises(Exception))
 
@@ -80,6 +58,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
             trust_remote_code=True,
             gpu_memory_utilization=0.45,
             max_model_len=1024,
+            attention_config=attention_config,
             compilation_config=CompilationConfig(
                 mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
             ),
@@ -122,9 +101,10 @@ combo_cases_2 = [
 def test_cudagraph_compilation_combo(
     backend_name, cudagraph_mode, compilation_mode, supported
 ):
-    env_vars = backend_configs[backend_name].env_vars
+    backend_config = backend_configs[backend_name]
+    attention_config = backend_config.attention_config
 
-    with temporary_environ(env_vars), ExitStack() as stack:
+    with ExitStack() as stack:
         if not supported:
             stack.enter_context(pytest.raises(Exception))
 
@@ -134,6 +114,7 @@ def test_cudagraph_compilation_combo(
             trust_remote_code=True,
             gpu_memory_utilization=0.45,
             max_model_len=1024,
+            attention_config=attention_config,
             compilation_config=CompilationConfig(
                 mode=compilation_mode, cudagraph_mode=cudagraph_mode
             ),
diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py
index 7a58e1c9bad03..61fb5f07303b4 100644
--- a/tests/v1/determinism/test_batch_invariance.py
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
     BACKENDS,
 )
 def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
     """
     Ensures that the same request (the 'needle' prompt) yields identical output
@@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
     seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
     random.seed(seed)
 
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
+    attention_config = {"backend": backend}
     # Allow overrides from environment (useful for CI tuning)
     # "facebook/opt-125m" is too small, doesn't reliably test determinism
     model = resolve_model_name(backend)
@@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
             max_num_seqs=max_batch_size,
             gpu_memory_utilization=gpu_mem_util,
             max_model_len=max_model_len,
+            attention_config=attention_config,
         )
 
         # Baseline generation for the needle prompt alone.
@@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
             max_num_seqs=max_batch_size,
             gpu_memory_utilization=gpu_mem_util,
             max_model_len=max_model_len,
+            attention_config=attention_config,
         )
 
         mismatches = 0
@@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
     BACKENDS,
 )
 def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
     seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
     random.seed(seed)
     model_name = resolve_model_name(backend)
@@ -193,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
         dtype="bfloat16",  # not everything is supported
         gpu_memory_utilization=0.9,
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
     )
 
     # Use more realistic prompts for better token generation
@@ -381,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
     "backend",
     BACKENDS,
 )
-def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
+def test_simple_generation(backend):
     """
     Simple test that runs the model with a basic prompt and prints the output.
     Useful for quick smoke testing and debugging.
     """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
     model = resolve_model_name(backend)
 
     llm = LLM(
@@ -398,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
         dtype="bfloat16",
         enable_prefix_caching=False,
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
     )
 
     prompt = "the capital of france is"
@@ -444,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail(
     The test will PASS if we detect differences (proving batch invariance matters).
     The test will FAIL if everything matches (suggesting batch invariance isn't needed).
     """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
     # CRITICAL: Disable batch invariance for this test
     monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0")
     monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False)
@@ -465,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail(
         max_model_len=8192,
         dtype="bfloat16",
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
     )
 
     # build ragged prompts to change shapes significantly across BS=1 vs BS=N
@@ -649,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail(
 @skip_unsupported
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 def test_decode_logprobs_match_prefill_logprobs(
-    backend, monkeypatch: pytest.MonkeyPatch
+    backend,
 ):
     """
     Test that verifies decode logprobs match prefill logprobs.
@@ -664,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs(
     This ensures that the logprobs from decode are consistent with what
     we would get if we ran prefill on each prefix.
     """
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
-
     seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
     random.seed(seed)
     model_name = resolve_model_name(backend)
@@ -689,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs(
         max_model_len=8192,
         dtype="bfloat16",
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config={"backend": backend},
     )
 
     # Use a few test prompts
@@ -920,6 +919,7 @@ def LLM_with_max_seqs(
     max_num_seqs: int,
     gpu_memory_utilization: float,
     max_model_len: int,
+    attention_config: dict | None = None,
 ) -> LLM:
     """
     Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
@@ -934,6 +934,7 @@ def LLM_with_max_seqs(
         tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
         enable_prefix_caching=False,
         enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
+        attention_config=attention_config,
         # Enable for MOE models
         # enable_expert_parallel=True,
     )
diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py
index 5e3b997364949..52c8103b2f1ce 100644
--- a/tests/v1/determinism/test_online_batch_invariance.py
+++ b/tests/v1/determinism/test_online_batch_invariance.py
@@ -136,11 +136,9 @@ def _compare_bs1_vs_bsn_single_process(
 @skip_unsupported
 @pytest.mark.parametrize("backend", BACKENDS)
 def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
-    backend: str, monkeypatch: pytest.MonkeyPatch
+    backend: str,
 ) -> None:
     random.seed(int(os.getenv("VLLM_TEST_SEED", "12345")))
-    # Override backend for this test (and the RemoteOpenAIServer child process).
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
     model_name = resolve_model_name(backend)
     prompts_all = [_random_prompt(10, 50) for _ in range(32)]
 
@@ -156,6 +154,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
     server_args: list[str] = [
         "--max-model-len=8192",
         "--max-num-seqs=32",
+        f"--attention-backend={backend}",
     ]
     if tp_size:
         server_args += ["-tp", tp_size]
diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py
index 5cef9b33c9984..61e56c079a3b5 100644
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -142,16 +142,17 @@ def run_tests(
     """Test consistency of combos of async scheduling, preemption,
     uni/multiproc executor with spec decoding."""
 
-    with monkeypatch.context() as m:
-        # avoid precision errors
-        if current_platform.is_rocm():
-            if is_testing_with_spec_decoding:
-                # Use TRITON_ATTN for spec decoding test for consistency
-                m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
-            else:
-                m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
+    # Determine attention config based on platform
+    if current_platform.is_rocm():
+        if is_testing_with_spec_decoding:
+            # Use TRITON_ATTN for spec decoding test for consistency
+            attention_config = {"backend": "TRITON_ATTN"}
         else:
-            m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+            attention_config = {"backend": "ROCM_AITER_FA"}
+    else:
+        attention_config = {"backend": "FLEX_ATTENTION"}
+
+    with monkeypatch.context() as m:
         # lock matmul precision to full FP32 (IEEE)
         m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
         # m.setenv("VLLM_BATCH_INVARIANT", "1")
@@ -174,6 +175,7 @@ def run_tests(
                 spec_config,
                 test_prefill_chunking=test_prefill_chunking,
                 is_testing_with_spec_decoding=is_testing_with_spec_decoding,
+                attention_config=attention_config,
             )
             outputs.append(test_results)
 
@@ -262,6 +264,7 @@ def run_test(
     spec_config: dict[str, Any] | None,
     test_prefill_chunking: bool,
     is_testing_with_spec_decoding: bool = False,
+    attention_config: dict[str, Any] | None = None,
 ):
     spec_decoding = spec_config is not None
     cache_arg: dict[str, Any] = (
@@ -301,6 +304,7 @@ def run_test(
         dtype=dtype,
         speculative_config=spec_config,
         disable_log_stats=False,
+        attention_config=attention_config,
         **cache_arg,
     ) as vllm_model:
         results = []
diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py
index 0fcb97fe63055..a7be981805c0d 100644
--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
@@ -10,7 +10,7 @@ from ...utils import create_new_process_for_each_test
 
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
-def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
+def test_cascade_attention(example_system_message, attn_backend):
     prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
 
     if attn_backend == "FLASHINFER":
@@ -19,19 +19,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
             "needs investigation. See issue #25679."
         )
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+    llm = LLM(
+        model="Qwen/Qwen2-1.5B-Instruct", attention_config={"backend": attn_backend}
+    )
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
 
-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
-        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+    # No cascade attention.
+    single_prompt = [example_system_message + prompt]
+    responses = llm.generate(single_prompt, sampling_params)
+    ref_output = responses[0].outputs[0].text
 
-        # No cascade attention.
-        single_prompt = [example_system_message + prompt]
-        responses = llm.generate(single_prompt, sampling_params)
-        ref_output = responses[0].outputs[0].text
-
-        # (Probably) Use cascade attention.
-        prompts = [example_system_message + prompt] * 64
-        responses = llm.generate(prompts, sampling_params)
-        for response in responses:
-            assert response.outputs[0].text == ref_output
+    # (Probably) Use cascade attention.
+    prompts = [example_system_message + prompt] * 64
+    responses = llm.generate(prompts, sampling_params)
+    for response in responses:
+        assert response.outputs[0].text == ref_output
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index fcfc8bdce12e9..a25114a4d96cb 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -438,25 +438,26 @@ def test_eagle_correctness(
     should be the same when using eagle speculative decoding.
     model_setup: (method, model_name, eagle_model_name, tp_size)
     """
+    # Determine attention config
+    # Scout requires default backend selection because vision encoder has
+    # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
+    # to Flex Attn
+    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+        if current_platform.is_rocm():
+            # TODO: Enable Flex Attn for spec_decode on ROCm
+            pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
+        attention_config = None  # Let it fall back to default
+    else:
+        attention_config = {"backend": attn_backend}
+
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
+        )
+
     with monkeypatch.context() as m:
-        if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
-            # Scout requires default backend selection
-            # because vision encoder has head_dim 88 being incompatible
-            #  with FLASH_ATTN and needs to fall back to Flex Attn
-
-            # pass if not ROCm
-            if current_platform.is_rocm():
-                # TODO: Enable Flex Attn for spec_decode on ROCm
-                pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
-        else:
-            m.setenv("VLLM_MLA_DISABLE", "1")
-            m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
-        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
-            pytest.skip(
-                "TRITON_ATTN does not support "
-                "multi-token eagle spec decode on current platform"
-            )
+        m.setenv("VLLM_MLA_DISABLE", "1")
 
         if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
             if "deepseek" in model_setup[1].lower():
@@ -471,7 +472,10 @@ def test_eagle_correctness(
         max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
 
         ref_llm = LLM(
-            model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
+            model=model_name,
+            max_model_len=max_model_len,
+            tensor_parallel_size=tp_size,
+            attention_config=attention_config,
         )
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
@@ -492,6 +496,7 @@ def test_eagle_correctness(
             max_num_batched_tokens=max_num_batched_tokens,
             enable_chunked_prefill=enable_chunked_prefill,
             model_impl=model_impl,
+            attention_config=attention_config,
         )
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index 453ccc81eb14a..c2c38f51c5003 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -3,21 +3,29 @@ set -xe
 
 # Parse command line arguments
 KV_BUFFER_DEVICE="cuda"  # Default to cuda
+ATTENTION_BACKEND=""  # Default to empty (use vllm default)
 while [[ $# -gt 0 ]]; do
   case $1 in
     --kv_buffer_device)
       KV_BUFFER_DEVICE="$2"
       shift 2
       ;;
+    --attention-backend)
+      ATTENTION_BACKEND="$2"
+      shift 2
+      ;;
     *)
       echo "Unknown option $1"
-      echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
+      echo "Usage: $0 [--kv_buffer_device <cuda|cpu>] [--attention-backend <backend>]"
       exit 1
       ;;
   esac
 done
 
 echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
+if [[ -n "$ATTENTION_BACKEND" ]]; then
+  echo "Using attention backend: $ATTENTION_BACKEND"
+fi
 
 DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
 if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
@@ -148,6 +156,11 @@ run_tests_for_model() {
     --tensor-parallel-size $PREFILLER_TP_SIZE \
     --kv-transfer-config '$KV_CONFIG'"
 
+    # Add attention backend config if specified
+    if [[ -n "$ATTENTION_BACKEND" ]]; then
+      BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
+    fi
+
     if [ -n "$model_args" ]; then
     FULL_CMD="$BASE_CMD $model_args"
     else
@@ -188,7 +201,12 @@ run_tests_for_model() {
     --block-size ${DECODE_BLOCK_SIZE} \
     --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
     --kv-transfer-config '$KV_CONFIG'"
-  
+
+    # Add attention backend config if specified
+    if [[ -n "$ATTENTION_BACKEND" ]]; then
+      BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
+    fi
+
   # DP-EP attention mode
   if [[ -z "$DP_EP" ]]; then
     BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE"
diff --git a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
index 9308c81da0635..8199fd516cd43 100755
--- a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
@@ -15,14 +15,14 @@ configs=(
 
 run_tests() {
   local label=$1
-  local extra_env=$2
+  local extra_args=$2
 
   echo "=== Running tests (${label}) ==="
   for cfg in "${configs[@]}"; do
-    echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}"
+    echo "-> Running with ${cfg} ${extra_args:+and ${extra_args}}"
     # Use 'env' to safely set variables without eval
-    if ! env ${extra_env} ${cfg} bash "${SCRIPT}"; then
-      echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}"
+    if ! env ${cfg} bash "${SCRIPT}" ${extra_args}; then
+      echo "❌ Test failed for config: ${cfg} ${extra_args:+(${extra_args})}"
       exit 1
     fi
   done
@@ -34,8 +34,8 @@ run_tests "default backend" ""
 
 # Check if FLASHINFER is set (non-empty)
 if [[ -n "${FLASHINFER:-}" ]]; then
-  echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER"
-  run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER"
+  echo "FLASHINFER is set, rerunning with --attention-backend FLASHINFER"
+  run_tests "FLASHINFER backend" "--attention-backend FLASHINFER"
 else
   echo "FLASHINFER not set, skipping FLASHINFER runs."
 fi
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index 66804fa671c7c..25f4308079595 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -1132,7 +1132,7 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
         "TRITON_ATTN",
     ],
 )
-def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
+def test_register_kv_caches(dist_init, attn_backend):
     """
     Test that register_kv_caches() properly calls nixl_wrapper methods with
     correct data.
@@ -1144,9 +1144,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
        block layout info
     """
 
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
-    vllm_config = create_vllm_config()
+    vllm_config = create_vllm_config(attention_backend=attn_backend)
 
     # Import the appropriate backend based on the parameter
     if attn_backend == "FLASH_ATTN":
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 5cdb1f84b30d4..3a0dbb8e43b52 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -11,6 +11,7 @@ import torch
 
 from vllm import SamplingParams
 from vllm.config import (
+    AttentionConfig,
     CacheConfig,
     DeviceConfig,
     KVTransferConfig,
@@ -94,6 +95,7 @@ def create_vllm_config(
     dtype: str = "float16",
     cache_dtype: str = "auto",
     hf_overrides: dict[str, Any] | None = None,
+    attention_backend: str | None = None,
 ) -> VllmConfig:
     """Initialize VllmConfig For Testing."""
     model_config = ModelConfig(
@@ -124,12 +126,14 @@ def create_vllm_config(
         enable_permute_local_kv=enable_permute_local_kv,
         kv_connector_extra_config=kv_connector_extra_config or {},
     )
+    attention_config = AttentionConfig(backend=attention_backend)
     return VllmConfig(
         scheduler_config=scheduler_config,
         model_config=model_config,
         cache_config=cache_config,
         kv_transfer_config=kv_transfer_config,
         device_config=DeviceConfig("cpu"),
+        attention_config=attention_config,
     )
 
 
diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py
index 57474a3dc01e7..1ac5e5b8cdc57 100644
--- a/tests/v1/kv_offload/test_cpu_offloading.py
+++ b/tests/v1/kv_offload/test_cpu_offloading.py
@@ -13,7 +13,6 @@ from vllm import LLM, SamplingParams, TokensPrompt
 from vllm.config import KVEventsConfig, KVTransferConfig
 from vllm.distributed.kv_events import BlockStored, KVEventBatch
 from vllm.platforms import current_platform
-from vllm.utils.system_utils import set_env_var
 
 CPU_BLOCK_SIZES = [48]
 ATTN_BACKENDS = ["FLASH_ATTN"]
@@ -180,13 +179,13 @@ def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
         topic="test",
     )
 
-    with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend):
-        llm = LLM(
-            model="meta-llama/Llama-3.2-1B-Instruct",
-            gpu_memory_utilization=0.5,
-            kv_events_config=kv_events_config,
-            kv_transfer_config=kv_transfer_config,
-        )
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        gpu_memory_utilization=0.5,
+        kv_events_config=kv_events_config,
+        kv_transfer_config=kv_transfer_config,
+        attention_config={"backend": attn_backend},
+    )
 
     events_endpoint = events_endpoint.replace("*", "127.0.0.1")
     subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 55e9b4d0660f5..f63cd3a6e42aa 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -15,6 +15,7 @@ from tests.v1.attention.utils import (
 )
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
+    AttentionConfig,
     CacheConfig,
     DeviceConfig,
     ModelConfig,
@@ -38,6 +39,7 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
 def _create_proposer(
     method: str,
     num_speculative_tokens: int,
+    attention_backend: str | None = None,
     speculative_token_tree: list[tuple[int, ...]] | None = None,
 ) -> EagleProposer:
     model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100)
@@ -70,6 +72,7 @@ def _create_proposer(
             max_model_len=model_config.max_model_len,
             is_encoder_decoder=model_config.is_encoder_decoder,
         ),
+        attention_config=AttentionConfig(backend=attention_backend),
     )
 
     return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
@@ -331,8 +334,6 @@ def test_load_model(
     use_distinct_lm_head,
     monkeypatch,
 ):
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
     if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
         pytest.skip(
             "TRITON_ATTN does not support "
@@ -394,7 +395,9 @@ def test_load_model(
     assert not isinstance(target_model, SupportsMultiModal)
 
     # Create proposer using the helper function
-    proposer = _create_proposer(method, num_speculative_tokens=8)
+    proposer = _create_proposer(
+        method, num_speculative_tokens=8, attention_backend=attn_backend
+    )
 
     # Call the method under test
     proposer.load_model(target_model)
@@ -420,8 +423,6 @@ def test_load_model(
 @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
 @pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
 def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
     if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
         pytest.skip(
             "TRITON_ATTN does not support "
@@ -449,7 +450,9 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
     seq_lens = [seq_len_1, seq_len_2]
 
     # Create proposer first so we can use its actual hidden_size
-    proposer = _create_proposer("eagle", num_speculative_tokens)
+    proposer = _create_proposer(
+        "eagle", num_speculative_tokens, attention_backend=attn_backend
+    )
     # Get the hidden_size from the proposer to ensure consistency
     hidden_size = proposer.hidden_size
 
@@ -622,7 +625,9 @@ def test_propose_tree(spec_token_tree):
 
     # Create proposer first so we can use its actual hidden_size.
     proposer = _create_proposer(
-        "eagle", num_speculative_tokens, speculative_token_tree=spec_token_tree
+        "eagle",
+        num_speculative_tokens,
+        speculative_token_tree=spec_token_tree,
     )
     # Get the hidden_size from the proposer to ensure consistency.
     hidden_size = proposer.hidden_size
diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py
index 15a6bd2659ea9..42991f9f1ae03 100644
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@@ -38,53 +38,48 @@ def test_ngram_max_len(num_speculative_tokens: int):
 def test_eagle_max_len(
     monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
 ):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
-        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
-            pytest.skip(
-                "TRITON_ATTN does not support "
-                "multi-token eagle spec decode on current platform"
-            )
-
-        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
-            m.setenv("VLLM_ROCM_USE_AITER", "1")
-
-        llm = LLM(
-            model="meta-llama/Meta-Llama-3-8B-Instruct",
-            enforce_eager=True,  # For faster initialization.
-            speculative_config={
-                "method": "eagle",
-                "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
-                "num_speculative_tokens": num_speculative_tokens,
-                "max_model_len": 80,
-            },
-            max_model_len=200,
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
         )
-        sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
-        outputs = llm.generate(_PROMPTS, sampling_params)
-        for o in outputs:
-            assert o.outputs[0].finish_reason == "length", (
-                "This test is only meaningful if the output "
-                "is truncated due to max length"
-            )
 
-        sampling_params = SamplingParams(
-            max_tokens=200,
-            structured_outputs=StructuredOutputsParams(
-                regex="^" + "a b c d e " * 15 + "$"
-            ),
+    if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
+
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3-8B-Instruct",
+        enforce_eager=True,  # For faster initialization.
+        speculative_config={
+            "method": "eagle",
+            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+            "num_speculative_tokens": num_speculative_tokens,
+            "max_model_len": 80,
+        },
+        max_model_len=200,
+        attention_config={"backend": attn_backend},
+    )
+    sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
+    outputs = llm.generate(_PROMPTS, sampling_params)
+    for o in outputs:
+        assert o.outputs[0].finish_reason == "length", (
+            "This test is only meaningful if the output is truncated due to max length"
         )
-        output = llm.generate(_PROMPTS, sampling_params)
-        for o in output:
-            assert o.prompt_token_ids is not None
-            assert (
-                len(o.prompt_token_ids)
-                < 80
-                < len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
-                <= 200
-            ), (
-                "This test is only meaningful if the output "
-                "is longer than the eagle max length"
-            )
-            assert o.outputs[0].text == "a b c d e " * 15
+
+    sampling_params = SamplingParams(
+        max_tokens=200,
+        structured_outputs=StructuredOutputsParams(regex="^" + "a b c d e " * 15 + "$"),
+    )
+    output = llm.generate(_PROMPTS, sampling_params)
+    for o in output:
+        assert o.prompt_token_ids is not None
+        assert (
+            len(o.prompt_token_ids)
+            < 80
+            < len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
+            <= 200
+        ), (
+            "This test is only meaningful if the output "
+            "is longer than the eagle max length"
+        )
+        assert o.outputs[0].text == "a b c d e " * 15
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index e2410a70b1a63..e231c600cba7a 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -165,7 +165,7 @@ class RocmAttentionBackend(AttentionBackend):
             raise ValueError(
                 f"Head size {head_size} is not supported by {attn_type}. "
                 f"Supported head sizes are: {cls.get_supported_head_sizes()}. "
-                "Set --attention-config.backend=FLEX_ATTENTION to use "
+                "Set --attention-backend=FLEX_ATTENTION to use "
                 "FlexAttention backend which supports all head sizes."
             )
 

From e3a0f21e6ce78268865cafcdc3dc58c7a80dbc57 Mon Sep 17 00:00:00 2001
From: Xunzhuo <xunzhuo@vllm-semantic-router.ai>
Date: Thu, 18 Dec 2025 02:45:56 +0800
Subject: [PATCH 47/49] [docs]: add ecosystem projects sr in docs/governance
 (#30844)

Signed-off-by: bitliu <bitliu@tencent.com>
---
 docs/governance/committers.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/governance/committers.md b/docs/governance/committers.md
index c9428027da953..2f0780a08978b 100644
--- a/docs/governance/committers.md
+++ b/docs/governance/committers.md
@@ -181,3 +181,4 @@ If you have PRs touching the area, please feel free to ping the area owner for r
 
 - Ascend NPU: [@wangxiyuan](https://github.com/wangxiyuan) and [see more details](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html#maintainers)
 - Intel Gaudi HPU [@xuechendi](https://github.com/xuechendi) and [@kzawora-intel](https://github.com/kzawora-intel)
+- Semantic Router: [@xunzhuo](https://github.com/xunzhuo), [@rootfs](https://github.com/rootfs) and [see more details](https://vllm-semantic-router.com/community/team)

From e06d0bf0aa2af11220b5c3aa5ccc8f999d0e3161 Mon Sep 17 00:00:00 2001
From: Andrey Talman <atalman@fb.com>
Date: Wed, 17 Dec 2025 15:20:22 -0500
Subject: [PATCH 48/49] 2.9.1 PyTorch release update (#28495)

---
 .buildkite/test-amd.yaml           | 2 +-
 .buildkite/test-pipeline.yaml      | 2 +-
 CMakeLists.txt                     | 4 ++--
 pyproject.toml                     | 2 +-
 requirements/build.txt             | 2 +-
 requirements/cuda.txt              | 6 +++---
 requirements/rocm-build.txt        | 8 ++++----
 requirements/test.in               | 6 +++---
 requirements/test.txt              | 8 ++++----
 vllm/model_executor/layers/conv.py | 2 +-
 10 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index e8f99100a8de0..6df373632d730 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -740,7 +740,7 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0
+  - uv pip install --system torchao==0.14.1
   - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b4de630b09417..8e3bcfe4a36bc 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -658,7 +658,7 @@ steps:
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
-  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129
   - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ca71f6ba4df0..a14496e035d9a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
 
 #
 # Try to find python package with an executable that exactly matches
diff --git a/pyproject.toml b/pyproject.toml
index a250ab6567f12..c03f96dd7acd5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "packaging>=24.2",
     "setuptools>=77.0.3,<81.0.0",
     "setuptools-scm>=8.0",
-    "torch == 2.9.0",
+    "torch == 2.9.1",
     "wheel",
     "jinja2",
 ]
diff --git a/requirements/build.txt b/requirements/build.txt
index 23ff8d4fdc1c0..3756371638bad 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,7 +4,7 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.9.0
+torch==2.9.1
 wheel
 jinja2>=3.1.6
 regex
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 462f18ef7159b..1417fb99120bc 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
 
 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.9.0
-torchaudio==2.9.0
+torch==2.9.1
+torchaudio==2.9.1
 # These must be updated alongside torch
-torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.5.3
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index b977e80be067f..54af9d995c4a2 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -2,11 +2,11 @@
 -r common.txt
 
 --extra-index-url https://download.pytorch.org/whl/rocm6.4
-torch==2.9.0
-torchvision==0.24.0
-torchaudio==2.9.0
+torch==2.9.1
+torchvision==0.24.1
+torchaudio==2.9.1
 
-triton==3.5.0
+triton==3.5.1
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
diff --git a/requirements/test.in b/requirements/test.in
index dfae5b75821f8..55452ce83f232 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -24,9 +24,9 @@ soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.9.0
-torchaudio==2.9.0
-torchvision==0.24.0
+torch==2.9.1
+torchaudio==2.9.1
+torchvision==0.24.1
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.5 # required for voxtral test
diff --git a/requirements/test.txt b/requirements/test.txt
index 571194e05c1ba..ea2093e4347fe 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1123,7 +1123,7 @@ tomli==2.2.1
     # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
-torch==2.9.0+cu129
+torch==2.9.1+cu129
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -1152,7 +1152,7 @@ torch==2.9.0+cu129
     #   torchvision
     #   vector-quantize-pytorch
     #   vocos
-torchaudio==2.9.0+cu129
+torchaudio==2.9.1+cu129
     # via
     #   -r requirements/test.in
     #   encodec
@@ -1165,7 +1165,7 @@ torchmetrics==1.7.4
     #   pytorch-lightning
     #   terratorch
     #   torchgeo
-torchvision==0.24.0+cu129
+torchvision==0.24.1+cu129
     # via
     #   -r requirements/test.in
     #   lightly
@@ -1206,7 +1206,7 @@ transformers==4.57.3
     #   transformers-stream-generator
 transformers-stream-generator==0.0.5
     # via -r requirements/test.in
-triton==3.5.0
+triton==3.5.1
     # via torch
 tritonclient==2.51.0
     # via
diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py
index 8d51e5bd9920a..1cd02698b3863 100644
--- a/vllm/model_executor/layers/conv.py
+++ b/vllm/model_executor/layers/conv.py
@@ -251,6 +251,6 @@ class Conv3dLayer(ConvLayerBase):
         # See: https://github.com/vllm-project/vllm/issues/27406
         # and https://github.com/pytorch/pytorch/issues/166122
         # By default, we use CUDNN's convolution ops with optimization.
-        if self.enable_linear and is_torch_equal("2.9.0"):
+        if self.enable_linear and (is_torch_equal("2.9.0") or is_torch_equal("2.9.1")):
             return self._forward_mulmat(x)
         return self._forward_conv(x)

From e3fc374a9a69dddb16885d810f1e28d3fdd39ebd Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Wed, 17 Dec 2025 18:00:59 -0500
Subject: [PATCH 49/49] [BugFix] Workspace allocation during profile run :
 DeepEPHighThroughput + DeepGEMM  (#30899)

---
 vllm/model_executor/layers/fused_moe/modular_kernel.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 484314091cb15..b0834e861338f 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -795,7 +795,10 @@ class FusedMoEModularKernel(torch.nn.Module):
                     top_k,
                     global_num_experts,
                     local_num_experts,
-                    expert_tokens_meta,
+                    # expert_tokens_meta help in allocating optimal/minimal
+                    # amount of workspace. Mark it None, so we allocate for
+                    # the worst-case scenario.
+                    expert_tokens_meta=None,
                 )
             )