Merge branch 'main' into fix-oom-test-entrypoints

commit 276ec32ba9
Aziz, 2025-09-06 22:21:57 +02:00, committed by GitHub
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
76 changed files with 2898 additions and 735 deletions

View File

@ -1,6 +1,6 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@ -32,7 +32,7 @@
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@ -64,7 +64,7 @@
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@ -96,7 +96,7 @@
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@ -131,7 +131,7 @@
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@ -166,7 +166,7 @@
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@ -198,5 +198,413 @@
"random-output-len": 128,
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]
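
For quick inspection, here is a minimal sketch (not part of the change) that loads one of these benchmark config files and lists the defined tests; the filename is an assumption, but the keys match the entries above:

```python
import json

# Filename is illustrative; point this at the CPU serving-tests config shown above.
with open("serving-tests-cpu.json") as f:
    tests = json.load(f)

for t in tests:
    tp = t["server_parameters"].get("tensor_parallel_size", 1)
    dataset = t["client_parameters"]["dataset_name"]
    print(f'{t["test_name"]}: tp={tp}, dataset={dataset}')
```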

View File

@ -1,6 +1,6 @@
[
{
"test_name": "serving_llama8B_pp1_sharegpt",
"test_name": "serving_llama8B_bf16_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@ -32,7 +32,39 @@
}
},
{
"test_name": "serving_llama8B_pp3_sharegpt",
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@ -64,7 +96,7 @@
}
},
{
"test_name": "serving_llama8B_tp2pp3_sharegpt",
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
@ -97,7 +129,7 @@
}
},
{
"test_name": "serving_llama8B_pp1_random_128_128",
"test_name": "serving_llama8B_bf16_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@ -132,7 +164,42 @@
}
},
{
"test_name": "serving_llama8B_pp3_random_128_128",
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@ -167,7 +234,7 @@
}
},
{
"test_name": "serving_llama8B_tp2pp3_random_128_128",
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
@ -201,5 +268,553 @@
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@ -41,7 +41,8 @@ steps:
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh
- label: Async Engine, Inputs, Utils, Worker Test # 24min
- label: Async Engine, Inputs, Utils, Worker Test # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -63,7 +64,8 @@ steps:
- pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
- label: Python-only Installation Test
- label: Python-only Installation Test # 10min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
@ -71,7 +73,8 @@ steps:
commands:
- bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 30min
- label: Basic Correctness Test # 20min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
fast_check: true
torch_nightly: true
@ -88,7 +91,8 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 10min
- label: Core Test # 22min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: true
source_file_dependencies:
@ -98,7 +102,8 @@ steps:
commands:
- pytest -v -s core
- label: Entrypoints Test (LLM) # 40min
- label: Entrypoints Test (LLM) # 30min
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@ -114,7 +119,8 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Test (API Server) # 40min
- label: Entrypoints Test (API Server) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@ -129,7 +135,8 @@ steps:
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
- pytest -v -s entrypoints/test_chat_utils.py
- label: Distributed Tests (4 GPUs) # 10min
- label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@ -172,7 +179,8 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
- label: EPLB Algorithm Test
- label: EPLB Algorithm Test # 5min
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
@ -181,6 +189,7 @@ steps:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@ -189,7 +198,8 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_execute.py
- label: Metrics, Tracing Test # 10min
- label: Metrics, Tracing Test # 12min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
num_gpus: 2
source_file_dependencies:
@ -208,7 +218,8 @@ steps:
##### fast check tests #####
##### 1 GPU test #####
- label: Regression Test # 5min
- label: Regression Test # 7min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -218,7 +229,8 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min
- label: Engine Test # 25min
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -233,7 +245,8 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: V1 Test e2e + engine
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -244,7 +257,8 @@ steps:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
- label: V1 Test entrypoints
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -252,7 +266,8 @@ steps:
commands:
- pytest -v -s v1/entrypoints
- label: V1 Test others
- label: V1 Test others # 42min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -276,7 +291,8 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 25min
- label: Examples Test # 30min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
@ -301,7 +317,8 @@ steps:
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Platform Tests (CUDA)
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -309,7 +326,8 @@ steps:
commands:
- pytest -v -s cuda/test_cuda_context.py
- label: Samplers Test # 36min
- label: Samplers Test # 56min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers
@ -320,15 +338,23 @@ steps:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LoRA Test %N # 15min each
- label: LoRA Test %N # 20min each
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
commands:
- pytest -v -s lora \
--shard-id=$$BUILDKITE_PARALLEL_JOB \
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--ignore=lora/test_chatglm3_tp.py \
--ignore=lora/test_llama_tp.py \
--ignore=lora/test_llm_with_multi_loras.py
parallelism: 4
- label: PyTorch Compilation Unit Tests
- label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -344,7 +370,8 @@ steps:
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- label: PyTorch Fullgraph Smoke Test # 9min
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -358,7 +385,8 @@ steps:
- pytest -v -s compile/piecewise/test_full_cudagraph.py
- pytest -v -s compile/piecewise/test_multiple_graphs.py
- label: PyTorch Fullgraph Test # 18min
- label: PyTorch Fullgraph Test # 20min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -367,7 +395,8 @@ steps:
commands:
- pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@ -375,7 +404,8 @@ steps:
commands:
- pytest -v -s kernels/core
- label: Kernels Attention Test %N
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/attention/
@ -386,7 +416,8 @@ steps:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Quantization Test %N
- label: Kernels Quantization Test %N # 64min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/
@ -396,7 +427,8 @@ steps:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test %N
- label: Kernels MoE Test %N # 40min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
@ -408,7 +440,8 @@ steps:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Mamba Test
- label: Kernels Mamba Test # 31min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/mamba/
@ -416,7 +449,8 @@ steps:
commands:
- pytest -v -s kernels/mamba
- label: Tensorizer Test # 11min
- label: Tensorizer Test # 14min
timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/model_loader
@ -428,7 +462,8 @@ steps:
- pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test
- label: Model Executor Test # 7min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor
@ -438,7 +473,8 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- label: Benchmarks # 9min
- label: Benchmarks # 11min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
@ -446,7 +482,8 @@ steps:
commands:
- bash scripts/run-benchmarks.sh
- label: Benchmarks CLI Test # 10min
- label: Benchmarks CLI Test # 7min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -454,7 +491,8 @@ steps:
commands:
- pytest -v -s benchmarks/
- label: Quantization Test
- label: Quantization Test # 70min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@ -467,6 +505,7 @@ steps:
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@ -474,7 +513,8 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: OpenAI API correctness
- label: OpenAI API correctness # 22min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@ -483,7 +523,8 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 5min
- label: Encoder Decoder tests # 12min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@ -491,7 +532,8 @@ steps:
commands:
- pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
@ -504,7 +546,8 @@ steps:
##### models test #####
- label: Basic Models Test # 24min
- label: Basic Models Test # 57min
timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -517,7 +560,8 @@ steps:
- pytest -v -s models/test_vision.py
- pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard)
- label: Language Models Test (Standard) # 35min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -528,6 +572,7 @@ steps:
- pytest -v -s models/language -m core_model
- label: Language Models Test (Hybrid) # 35 min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -540,7 +585,8 @@ steps:
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m hybrid_model
- label: Language Models Test (Extended Generation) # 1hr20min
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@ -552,6 +598,7 @@ steps:
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@ -560,7 +607,8 @@ steps:
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
- label: Multi-Modal Processor Test
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
@ -568,7 +616,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing
- label: Multi-Modal Models Test (Standard)
- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -610,7 +659,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
- label: Quantized Models Test
- label: Quantized Models Test # 45 min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers/quantization
@ -640,7 +690,8 @@ steps:
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
- label: Blackwell Test
- label: Blackwell Test # 38 min
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@ -682,6 +733,7 @@ steps:
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@ -693,6 +745,7 @@ steps:
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@ -716,7 +769,8 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 40min
- label: Distributed Tests (2 GPUs) # 110min
timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@ -757,6 +811,7 @@ steps:
- pytest -v -s models/multimodal/generation/test_maverick.py
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@ -782,7 +837,8 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Pipeline Parallelism Test # 45min
- label: Pipeline + Context Parallelism Test # 45min
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@ -795,8 +851,10 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
# - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
- label: LoRA TP Test (Distributed)
- label: LoRA TP Test (Distributed) # 17 min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
num_gpus: 4
source_file_dependencies:
@ -814,6 +872,7 @@ steps:
- label: Weight Loading Multiple GPU Test # 33min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2

.github/CODEOWNERS
View File

@ -5,13 +5,15 @@
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
@ -25,7 +27,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/spec_decode @benchislett
/vllm/v1/attention/backends/triton_attn.py @tdoublep
# Test ownership
@ -85,4 +88,3 @@ mkdocs.yaml @hmellor
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras

View File

@ -18,16 +18,17 @@ Easy, fast, and cheap LLM serving for everyone
*Latest News* 🔥
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
<details>
<summary>Previous News</summary>
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).

View File

@ -36,13 +36,6 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
const std::string& kv_cache_dtype,
torch::Tensor& scale);
void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
torch::Tensor& cp_local_token_select_indices,
torch::Tensor& kv_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype,
torch::Tensor& scale);
// Just for unittest
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);

View File

@ -396,51 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void cp_fused_concat_and_cache_mla_kernel(
const scalar_t* __restrict__ kv_c, // [num_full_tokens, kv_lora_rank]
const scalar_t* __restrict__ k_pe, // [num_full_tokens, pe_dim]
const int64_t* __restrict__ cp_local_token_select_indices, // [num_tokens]
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
const int pe_dim, //
const int block_size, //
const float* scale //
) {
const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
const int64_t slot_idx = slot_mapping[blockIdx.x];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
return;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
int src_stride, int dst_stride, int size, int offset) {
for (int i = threadIdx.x; i < size; i += blockDim.x) {
const int64_t src_idx = token_idx * src_stride + i;
const int64_t dst_idx =
block_idx * block_stride + block_offset * entry_stride + i + offset;
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst[dst_idx] = src[src_idx];
} else {
dst[dst_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
}
}
};
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
} // namespace vllm
// KV_T is the data type of key and value tensors.
@ -554,20 +509,6 @@ void reshape_and_cache_flash(
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast<const float*>(scale.data_ptr()));
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
cp_local_token_select_indices.data_ptr<int64_t>(), \
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast<const float*>(scale.data_ptr()));
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
@ -606,50 +547,6 @@ void concat_and_cache_mla(
CALL_CONCAT_AND_CACHE_MLA);
}
// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
// calls into one:
// k_c_normed.index_select(0, cp_local_token_select_indices) + \
// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
// concat_and_cache_mla.
void cp_fused_concat_and_cache_mla(
torch::Tensor& kv_c, // [num_total_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_total_tokens, pe_dim]
torch::Tensor& cp_local_token_select_indices, // [num_tokens]
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// pe_dim)]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& scale) {
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
// slot_mapping.size(0) because of padding for CUDA graphs.
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
// both include padding.
// In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
// since key includes padding for CUDA graphs, while slot_mapping does not.
// In this case, slot_mapping.size(0) represents the actual number of tokens
// before padding.
// For compatibility with both cases, we use slot_mapping.size(0) as the
// number of tokens.
int num_tokens = slot_mapping.size(0);
int kv_lora_rank = kv_c.size(1);
int pe_dim = k_pe.size(1);
int block_size = kv_cache.size(1);
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1);
dim3 grid(num_tokens);
dim3 block(std::min(kv_lora_rank, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
}
namespace vllm {
template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>

View File

@ -145,7 +145,8 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
}
}
float scale_val, azp_val;
float scale_val;
float azp_val = 0.0f;
if constexpr (AZP) {
float max_scalar = max_value.reduce_max();
float min_scalar = min_value.reduce_min();

View File

@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
))
def prepacked_type_key(prepack_type: PrepackTypeConfig):
# For now we we can just use the first accumulator type seen since
# For now, we can just use the first accumulator type seen since
# the tensor core shapes/layouts don't vary based on accumulator
# type so we can generate less code this way
return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)

View File

@ -693,16 +693,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" Tensor scale) -> ()");
cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);
cache_ops.def(
"cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
" Tensor cp_local_token_select_indices,"
" Tensor! kv_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
" Tensor scale) -> ()");
cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA,
&cp_fused_concat_and_cache_mla);
// Convert the key and value cache to fp8 data type.
cache_ops.def(
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "

View File

@ -2,6 +2,7 @@
We host regular meetups in the San Francisco Bay Area every 2 months. We will share project updates from the vLLM team and have guest speakers from the industry share their experience and insights. Please find the materials of our previous meetups below:
- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).

View File

@ -11,9 +11,39 @@ vLLM contains two sets of benchmarks:
The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
### Manually Trigger the benchmark
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with the vLLM benchmark suite.
For a CPU environment, please use the image with the "-cpu" postfix.
Here is an example docker run command for CPU:
```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
```
Then, run the command below inside the docker container.
```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```
When run, the benchmark script generates results under the **benchmark/results** folder, along with benchmark_results.md and benchmark_results.json.
#### Runtime environment variables
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
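For example, here is a minimal sketch of combining these variables when launching the suite from Python (the serving JSON file name is a placeholder, not a file shipped with vLLM):

```python
import os
import subprocess

# Run the CPU performance benchmarks with a custom serving config.
env = dict(
    os.environ,
    ON_CPU="1",  # running on Intel Xeon processors
    SERVING_JSON="serving-tests-cpu.json",  # placeholder custom config
    LATENCY_JSON="",  # empty string -> use the default file
    THROUGHPUT_JSON="",
)
subprocess.run(
    ["bash", ".buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh"],
    env=env,
    check=True,
)
```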
For more on visualizing the results, see [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
[](){ #nightly-benchmarks }

View File

@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
- Offline Inference: `256 * world_size`
- Online Serving: `128 * world_size`
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
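As a minimal sketch of the offline batch-size guideline above (assuming the batch size maps to `max_num_seqs`; the model name and world size are placeholders, not recommendations):

```python
from vllm import LLM

world_size = 2  # e.g. TP=2 across two CPU sockets
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tensor_parallel_size=world_size,
    max_num_seqs=256 * world_size,  # offline-inference batch-size guideline
)
```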
### Which quantization configs does vLLM CPU support?

View File

@ -440,6 +440,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ |
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ |
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ |

View File

@ -118,7 +118,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
fusion_pass = ActivationQuantFusionPass(config)
backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
model = model_class(hidden_size, cuda_force_torch)
model = model_class(hidden_size=hidden_size,
cuda_force_torch=cuda_force_torch)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size * 2)

View File

@ -0,0 +1,263 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import json
import os
from dataclasses import dataclass
from typing import Literal, NamedTuple, Optional
import pytest
from vllm.config import RunnerOption
from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test
logger = init_logger("test_context_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
dcp_size: int
eager_mode: bool
chunked_prefill: bool
class CPTestOptions(NamedTuple):
multi_node_only: bool
load_format: Optional[str] = None
@dataclass
class CPTestSettings:
parallel_setups: list[ParallelSetup]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str]
runner: RunnerOption
test_options: CPTestOptions
def __post_init__(self):
if len(self.distributed_backends) != len(self.vllm_major_versions):
raise ValueError(
f"Length mismatch: distributed_backends "
f"({len(self.distributed_backends)}) != "
f"vllm_major_versions ({len(self.vllm_major_versions)})")
@staticmethod
def detailed(
*,
tp_base: int = 4,
pp_base: int = 1,
dcp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: Optional[str] = None,
):
parallel_setups = []
for eager_mode_val in [False]:
for pp_multiplier in [1]:
for dcp_multiplier in [2, 4]:
for chunked_prefill_val in [True]:
parallel_setups.append(
ParallelSetup(tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
dcp_size=dcp_multiplier * dcp_base,
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val))
return CPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp"],
vllm_major_versions=["1"],
runner=runner,
test_options=CPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
)
def iter_params(self, model_id: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions):
yield (model_id, parallel_setup, backend, vllm_major_version,
self.runner, opts)
def _compare_cp_with_tp(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available: int,
*,
method: Literal["generate"],
is_multimodal: bool,
):
(
tp_size,
pp_size,
dcp_size,
eager_mode,
chunked_prefill,
) = parallel_setup
multi_node_only, load_format = test_options
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
if load_format == "dummy":
# Avoid OOM
text_overrides = {
"num_hidden_layers": 4,
"hidden_size": 512,
"intermediate_size": 800,
"num_attention_heads": 4,
"num_key_value_heads": 1,
}
if is_multimodal:
hf_overrides.update({"text_config": text_overrides})
else:
hf_overrides.update(text_overrides)
else:
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if runner != "auto":
common_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if load_format:
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
cp_env = tp_env = {
"VLLM_USE_V1":
vllm_major_version,  # Note(hc): DCP only supports the V1 engine
}
cp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--decode-context-parallel-size",
str(dcp_size),
"--distributed-executor-backend",
distributed_backend,
]
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--distributed-executor-backend",
distributed_backend,
]
try:
compare_two_settings(model_id,
cp_args,
tp_args,
cp_env,
tp_env,
method=method,
max_wait_seconds=720)
except Exception:
testing_ray_compiled_graph = cp_env is not None
if testing_ray_compiled_graph and vllm_major_version == "0":
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
else:
raise
CP_TEXT_GENERATION_MODELS = {
# [MLA attention only]
"deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
}
CP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"deepseek-ai/DeepSeek-V2-Lite-Chat",
]
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"runner", "test_options"),
[
params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id)
if model_id in CP_TEST_MODELS
],
)
@create_new_process_for_each_test()
def test_cp_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available,
):
_compare_cp_with_tp(model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
runner,
test_options,
num_gpus_available,
method="generate",
is_multimodal=False)

View File

@ -36,21 +36,41 @@ def monkeypatch_module():
mpatch.undo()
@pytest.fixture(scope="module",
params=[True, False],
ids=["with_tool_parser", "without_tool_parser"])
def with_tool_parser(request) -> bool:
return request.param
@pytest.fixture(scope="module")
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch):
with monkeypatch_module.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
args = [
"--enforce-eager",
"--max-model-len",
"8192",
def default_server_args(with_tool_parser: bool):
args = [
# use half precision for speed and memory savings in CI environment
"--enforce-eager",
"--max-model-len",
"4096",
"--reasoning-parser",
"openai_gptoss",
"--gpu-memory-utilization",
"0.8",
]
if with_tool_parser:
args.extend([
"--tool-call-parser",
"openai",
"--reasoning-parser",
"openai_gptoss",
"--enable-auto-tool-choice",
]
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, args) as remote_server:
])
return args
@pytest.fixture(scope="module")
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
default_server_args: list[str]):
with monkeypatch_module.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
default_server_args) as remote_server:
yield remote_server
@ -61,7 +81,8 @@ async def gptoss_client(gptoss_server):
@pytest.mark.asyncio
async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI):
async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI,
with_tool_parser: bool):
tools = [{
"type": "function",
"function": {
@ -94,10 +115,14 @@ async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI):
]
stream = await gptoss_client.chat.completions.create(
model=GPT_OSS_MODEL_NAME, messages=messages, tools=tools, stream=True)
model=GPT_OSS_MODEL_NAME,
messages=messages,
tools=tools if with_tool_parser else None,
stream=True)
name = None
args_buf = ""
content_buf = ""
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.tool_calls:
@ -106,13 +131,22 @@ async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI):
name = tc.function.name
if tc.function and tc.function.arguments:
args_buf += tc.function.arguments
assert name is not None
assert len(args_buf) > 0
if getattr(delta, "content", None):
content_buf += delta.content
if with_tool_parser:
assert name is not None
assert len(args_buf) > 0
else:
assert name is None
assert len(args_buf) == 0
assert len(content_buf) > 0
@pytest.mark.asyncio
async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI):
async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
with_tool_parser: bool):
if not with_tool_parser:
pytest.skip("skip non-tool for multi-turn tests")
tools = [{
"type": "function",
"function": {
@ -175,7 +209,7 @@ async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI):
)
second_msg = second.choices[0].message
assert (second_msg.content is not None and len(second_msg.content) > 0) or \
(second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0) # noqa: E501
(second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0)
MODEL_NAME = "openai-community/gpt2"

View File

@ -371,8 +371,8 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
@torch.inference_mode()
def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
monkeypatch):
def test_mixtral_moe(dist_init, dtype: torch.dtype, padding: bool,
use_rocm_aiter: bool, monkeypatch):
"""Make sure our Mixtral MoE implementation agrees with the one from
huggingface."""

View File

@ -10,7 +10,8 @@ import numpy as np
import pytest
import requests
from tests.models.utils import EmbedModelInfo, RerankModelInfo
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
check_embeddings_close)
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
@ -163,12 +164,14 @@ def mteb_test_embed_models(hf_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_RERANK_TOL):
atol=MTEB_EMBED_TOL):
if not model_info.enable_test:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest.skip("Skipping test.")
example_prompts = ["The chef prepared a delicious meal."]
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype
@ -191,6 +194,7 @@ def mteb_test_embed_models(hf_runner,
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
vllm_outputs = vllm_model.embed(example_prompts)
if model_info.mteb_score is None:
with hf_runner(model_info.name,
@ -202,6 +206,16 @@ def mteb_test_embed_models(hf_runner,
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
# Test embedding dims and whether normalization is applied
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"

View File

@ -2,7 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo)
from .mteb_utils import mteb_test_embed_models
# ST models with projector (Dense) layers
@ -13,6 +14,10 @@ ST_PROJECTOR_MODELS = [
mteb_score=0.688611955,
enable_test=True,
),
LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
enable_test=True)
]

View File

@ -42,7 +42,7 @@ def run_test(
tensor_parallel_size: int = 1,
vllm_embeddings: Optional[torch.Tensor] = None,
):
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs

View File

@ -352,6 +352,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
# [Text-only]
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), # noqa: E501
"Gemma3TextModel": _HfExamplesInfo("google/embeddinggemma-300m"),
"GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
"GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
trust_remote_code=True),

View File

@ -41,8 +41,9 @@ EAGLE_SPEC_CONFIG = {
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None),
("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None),
("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto",
None),
#FIXME: This test is flaky on CI thus disabled
#("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto",
# None),
("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None),
("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None),
("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None),

View File

@ -1625,20 +1625,6 @@ def concat_and_cache_mla(
scale)
def cp_fused_concat_and_cache_mla(
kv_c: torch.Tensor,
k_pe: torch.Tensor,
cp_local_token_select_indices: torch.Tensor,
kv_cache: torch.Tensor,
slot_mapping: torch.Tensor,
kv_cache_dtype: str,
scale: torch.Tensor,
) -> None:
torch.ops._C_cache_ops.cp_fused_concat_and_cache_mla(
kv_c, k_pe, cp_local_token_select_indices, kv_cache, slot_mapping,
kv_cache_dtype, scale)
def copy_blocks(key_caches: list[torch.Tensor],
value_caches: list[torch.Tensor],
block_mapping: torch.Tensor) -> None:

View File

@ -0,0 +1,139 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.distributed.parallel_state import GroupCoordinator
from vllm.triton_utils import tl, triton
@triton.jit
def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
vlse_ptr, outputs_stride_B, outputs_stride_H,
outputs_stride_D, lses_stride_N, lses_stride_B,
lses_stride_H, lse_idx, HEAD_DIM: tl.constexpr,
N_ROUNDED: tl.constexpr):
"""
Apply the all-gathered lses to correct each local rank's attention
output. We still need to perform a cross-rank reduction to obtain the
final attention output.
Args:
output: [ B, H, D ]
lses : [ N, B, H ]
cp, batch, q_heads, v_head_dim
Return:
output: [ B, H, D ]
lse : [ B, H ]
"""
batch_idx = tl.program_id(axis=0).to(tl.int64)
head_idx = tl.program_id(axis=1).to(tl.int64)
d_offsets = tl.arange(0, HEAD_DIM)
num_n_offsets = tl.arange(0, N_ROUNDED)
# shape = [N]
lse_offsets = num_n_offsets * lses_stride_N + batch_idx * \
lses_stride_B + head_idx * lses_stride_H
# calc final lse
lse = tl.load(lses_ptr + lse_offsets)
lse = tl.where((lse != lse) | (lse == float('inf')), -float('inf'), lse)
lse_max = tl.max(lse, axis=0)
lse -= lse_max
lse_exp = tl.exp(lse)
lse_acc = tl.sum(lse_exp, axis=0)
lse = tl.log(lse_acc)
lse += lse_max
lse_offsets = batch_idx * lses_stride_B + head_idx * lses_stride_H
tl.store(vlse_ptr + lse_offsets, lse)
# shape = [D]
output_offsets = batch_idx * outputs_stride_B + \
head_idx * outputs_stride_H + \
d_offsets * outputs_stride_D
# correct output
lse_offset = lse_idx * lses_stride_N + batch_idx * \
lses_stride_B + head_idx * lses_stride_H
lse_tmp = tl.load(lses_ptr + lse_offset)
lse_finally = lse_tmp - lse
lse_finally = tl.where(
(lse_finally != lse_finally) | (lse_finally == float('inf')),
-float('inf'), lse_finally)
factor = tl.exp(lse_finally)
output = tl.load(outputs_ptr + output_offsets)
output = output * factor
tl.store(new_output_ptr + output_offsets, output)
class CPTritonContext:
""" The CPTritonContext is used to avoid recompilation of the Triton JIT.
"""
def __init__(self):
self.inner_kernel = None
def call_kernel(self, kernel, grid, *regular_args, **const_args):
if self.inner_kernel is None:
self.inner_kernel = kernel[grid](*regular_args, **const_args)
else:
self.inner_kernel[grid](*regular_args)
def correct_attn_out(out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
ctx: CPTritonContext):
"""
Apply the all-gathered lses to correct each local rank's attention
output. We still need to perform a cross-rank reduction to obtain the
final attention output.
Args:
output: [ B, H, D ]
lses : [ N, B, H ]
Return:
output: [ B, H, D ]
lse : [ B, H ]
"""
if ctx is None:
ctx = CPTritonContext()
lse = torch.empty_like(lses[0])
grid = (out.shape[0], out.shape[1], 1)
regular_args = (out, out, lses, lse, *out.stride(), *lses.stride(),
cp_rank)
const_args = {
"HEAD_DIM": out.shape[-1],
"N_ROUNDED": lses.shape[0],
}
ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args,
**const_args)
return out, lse
def cp_lse_ag_out_rs(cp_attn_out: torch.Tensor,
cp_attn_lse: torch.Tensor,
cp_group: GroupCoordinator,
ctx: CPTritonContext = None):
"""
cp_attn_out: [ B, H, D ]
cp_attn_lse: [ B, H ]
"""
if cp_group.world_size == 1:
return cp_attn_out
if ctx is None:
ctx = CPTritonContext()
lses = torch.empty((cp_group.world_size, ) + cp_attn_lse.shape,
dtype=cp_attn_lse.dtype,
device=cp_attn_lse.device)
cp_attn_lse = cp_attn_lse.contiguous()
lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
assert out.is_contiguous()
out = cp_group.reduce_scatter(out, dim=1)
return out
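# For reference, the correction above weights each rank's partial attention
# output by exp(lse_rank - logsumexp(lses)); a minimal pure-PyTorch sketch of
# the same math (an illustration only, not the Triton kernel's implementation):
def correct_attn_out_ref(out: torch.Tensor, lses: torch.Tensor, rank: int):
    # out:  [B, H, D] local attention output; lses: [N, B, H] all-gathered lses
    # Treat NaN/+inf lse entries as missing, matching the kernel's masking.
    lses = torch.nan_to_num(lses, nan=float("-inf"), posinf=float("-inf"))
    lse = torch.logsumexp(lses, dim=0)  # [B, H] global lse
    factor = torch.exp(lses[rank] - lse).unsqueeze(-1)  # [B, H, 1] local weight
    return out * factor, lse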

View File

@ -105,7 +105,9 @@ def flash_mla_with_kvcache(
descale_q,
descale_k,
)
return out, softmax_lse
# Note(hc): need to revisit when we support DCP with decode query_len > 1.
return out.squeeze(1), softmax_lse.squeeze(-1)
#

View File

@ -2458,7 +2458,6 @@ class LoRAConfig:
LoRA adapter. Will be removed in v0.12.0."""
lora_vocab_padding_size: ClassVar[int] = current_platform\
.get_lora_vocab_padding_size()
default_mm_loras: Optional[dict[str, str]] = None
"""Dictionary mapping specific modalities to LoRA model paths; this field
is only applicable to multimodal models and should be leveraged when a
@ -2470,7 +2469,8 @@ class LoRAConfig:
will be automatically assigned to 1-n with the names of the modalities
in alphabetic order."""
bias_enabled: bool = False
"""Enable bias for LoRA adapters."""
"""[DEPRECATED] Enable bias for LoRA adapters. This option will be
removed in v0.12.0."""
def compute_hash(self) -> str:
"""
@ -2503,6 +2503,11 @@ class LoRAConfig:
"in v0.12.0. Additional vocabulary support for "
"LoRA adapters is being phased out.")
# Deprecation warning for enable_lora_bias
if self.bias_enabled:
logger.warning("`enable_lora_bias` is deprecated "
"and will be removed in v0.12.0.")
# Setting the maximum rank to 512 should be able to satisfy the vast
# majority of applications.
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
@ -2750,6 +2755,8 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
_FLOAT16_NOT_SUPPORTED_MODELS = {
"gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",
"gemma3": "Numerical instability. Please use bfloat16 or float32 instead.",
"gemma3_text":
"Numerical instability. Please use bfloat16 or float32 instead.",
"plamo2": "Numerical instability. Please use bfloat16 or float32 instead.",
"glm4": "Numerical instability. Please use bfloat16 or float32 instead.",
}
@ -3665,6 +3672,24 @@ class VllmConfig:
" Disabling `torch.compile`.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION
if self.cache_config.kv_sharing_fast_prefill:
if not envs.VLLM_USE_V1:
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not supported "
"in V0 currently.")
if self.speculative_config is not None and \
self.speculative_config.use_eagle():
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not "
"compatible with EAGLE as EAGLE requires correct logits "
"for all tokens while fast prefill gives incorrect logits "
"for prompt tokens.")
logger.warning_once(
"--kv-sharing-fast-prefill requires changes on model side for "
"correctness and to realize prefill savings. ")
if ((not envs.VLLM_USE_V1) and self.lora_config is not None
and self.compilation_config.level
!= CompilationLevel.NO_COMPILATION):

View File

@ -145,19 +145,12 @@ class CacheConfig:
self._verify_cache_dtype()
self._verify_prefix_caching()
self._verify_kv_sharing_fast_prefill()
def metrics_info(self):
# convert cache_config to dict(key: str, value: str) for prometheus
# metrics info
return {key: str(value) for key, value in self.__dict__.items()}
def _verify_kv_sharing_fast_prefill(self) -> None:
if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not supported "
"in V0 currently.")
@model_validator(mode='after')
def _verify_args(self) -> Self:
if self.cpu_offload_gb < 0:

View File

@ -170,6 +170,11 @@ class ParallelConfig:
Set to be private as it's not intended to be configured by users.
"""
decode_context_parallel_size: int = 1
"""Number of decode context parallel groups, because the world size does
not change with DCP (it simply reuses the GPUs of the TP group), tp_size
needs to be divisible by dcp_size."""
@property
def world_size_across_dp(self) -> int:
"""world_size_across_dp is TPxPPxDP, it is the size of the world

View File

@ -60,7 +60,7 @@ class CustomAllreduce:
group: the process group to work on. If None, it will use the
default process group.
device: the device to bind the CustomAllreduce to. If None,
it will be bind to f"cuda:{local_rank}".
it will be bound to f"cuda:{local_rank}".
It is the caller's responsibility to make sure each communicator
is bound to a unique device, and all communicators in this group
are in the same node.
@ -158,7 +158,7 @@ class CustomAllreduce:
self.disabled = False
# Buffers memory are owned by this Python class and passed to C++.
# Meta data composes of two parts: meta data for synchronization and a
# Metadata consists of two parts: metadata for synchronization and a
# temporary buffer for storing intermediate allreduce results.
self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
group=group,

View File

@ -904,6 +904,18 @@ def get_tensor_model_parallel_group():
return get_tp_group()
_DCP: Optional[GroupCoordinator] = None
def get_dcp_group() -> GroupCoordinator:
assert _DCP is not None, (
"decode context model parallel group is not initialized")
return _DCP
# kept for backward compatibility
get_context_model_parallel_group = get_dcp_group
_PP: Optional[GroupCoordinator] = None
_DP: Optional[GroupCoordinator] = None
@ -1034,6 +1046,7 @@ def init_distributed_environment(
def initialize_model_parallel(
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
decode_context_model_parallel_size: Optional[int] = 1,
backend: Optional[str] = None,
) -> None:
"""
@ -1098,6 +1111,23 @@ def initialize_model_parallel(
use_message_queue_broadcaster=True,
group_name="tp")
# Build the DCP model-parallel groups.
global _DCP
assert _DCP is None, (
"decode context model parallel group is already initialized")
# Note(hc): In the current implementation of decode context parallel,
# dcp_size must not exceed tp_size, because the world size does not
# change with DCP; it simply reuses the GPUs of the TP group and splits one
# TP group into tp_size//dcp_size DCP groups.
group_ranks = all_ranks.reshape(
-1, decode_context_model_parallel_size).unbind(0)
group_ranks = [x.tolist() for x in group_ranks]
_DCP = init_model_parallel_group(group_ranks,
get_world_group().local_rank,
backend,
use_message_queue_broadcaster=True,
group_name="dcp")
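# Illustration (hypothetical sizes, not part of this change): with 8 ranks,
# tp_size=4 and decode_context_model_parallel_size=2, the reshape above maps
# torch.arange(8) to DCP groups (0, 1), (2, 3), (4, 5), (6, 7), so each TP
# group {0..3} and {4..7} is split into tp_size // dcp_size = 2 DCP groups.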
# Build the pipeline model-parallel groups.
global _PP
assert _PP is None, (
@ -1141,6 +1171,7 @@ def initialize_model_parallel(
def ensure_model_parallel_initialized(
tensor_model_parallel_size: int,
pipeline_model_parallel_size: int,
decode_context_model_parallel_size: Optional[int] = 1,
backend: Optional[str] = None,
) -> None:
"""Helper to initialize model parallel groups if they are not initialized,
@ -1151,7 +1182,8 @@ def ensure_model_parallel_initialized(
get_world_group().device_group)
if not model_parallel_is_initialized():
initialize_model_parallel(tensor_model_parallel_size,
pipeline_model_parallel_size, backend)
pipeline_model_parallel_size,
decode_context_model_parallel_size, backend)
return
assert (
@ -1226,6 +1258,16 @@ def get_tensor_model_parallel_rank():
return get_tp_group().rank_in_group
def get_decode_context_model_parallel_world_size():
"""Return world size for the decode context model parallel group."""
return get_dcp_group().world_size
def get_decode_context_model_parallel_rank():
"""Return my rank for the decode context model parallel group."""
return get_dcp_group().rank_in_group
def get_node_count() -> int:
"""Return the total number of nodes in the distributed environment. """
assert _NODE_COUNT is not None, (
@ -1246,6 +1288,11 @@ def destroy_model_parallel():
_PP.destroy()
_PP = None
global _DCP
if _DCP:
_DCP.destroy()
_DCP = None
global _DP
if _DP:
_DP.destroy()

View File

@ -306,6 +306,8 @@ class EngineArgs:
# number of P/D disaggregation (or other disaggregation) workers
pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
decode_context_parallel_size: int = \
ParallelConfig.decode_context_parallel_size
data_parallel_size: int = ParallelConfig.data_parallel_size
data_parallel_rank: Optional[int] = None
data_parallel_start_rank: Optional[int] = None
@ -636,6 +638,9 @@ class EngineArgs:
**parallel_kwargs["pipeline_parallel_size"])
parallel_group.add_argument("--tensor-parallel-size", "-tp",
**parallel_kwargs["tensor_parallel_size"])
parallel_group.add_argument(
"--decode-context-parallel-size", "-dcp",
**parallel_kwargs["decode_context_parallel_size"])
parallel_group.add_argument("--data-parallel-size", "-dp",
**parallel_kwargs["data_parallel_size"])
parallel_group.add_argument(
@ -1156,6 +1161,17 @@ class EngineArgs:
# global layers in interleaved sliding window models.
sliding_window = model_config.get_sliding_window()
# Note(hc): In the current implementation of decode context
# parallel (DCP), tp_size needs to be divisible by dcp_size,
# because the world size does not change with DCP; it simply
# reuses the GPUs of the TP group and splits one TP group into
# tp_size//dcp_size DCP groups.
assert self.tensor_parallel_size % self.decode_context_parallel_size \
== 0, (
f"tp_size={self.tensor_parallel_size} must be divisible by "
f"dcp_size={self.decode_context_parallel_size}."
)
cache_config = CacheConfig(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
@ -1306,6 +1322,7 @@ class EngineArgs:
distributed_executor_backend=self.distributed_executor_backend,
worker_cls=self.worker_cls,
worker_extension_cls=self.worker_extension_cls,
decode_context_parallel_size=self.decode_context_parallel_size,
)
speculative_config = self.create_speculative_config(

View File

@ -238,11 +238,11 @@ class StreamingHarmonyContext(HarmonyContext):
# (finished=True), then the next token processed will mark the
# beginning of a new message
self.first_tok_of_message = output.finished
tok = output.outputs[0].token_ids[0]
self.parser.process(tok)
for tok in output.outputs[0].token_ids:
self.parser.process(tok)
self._update_num_output_tokens(output.outputs[0].token_ids)
# Check if the current token is part of reasoning content
self._update_num_reasoning_tokens([tok])
self._update_num_reasoning_tokens(output.outputs[0].token_ids)
self.last_tok = tok
else:
# Handle the case of tool output in direct message format

View File

@ -6,7 +6,7 @@ import json
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import TYPE_CHECKING, Callable, Final, Optional, Union
from typing import Callable, Final, Optional, Union
import jinja2
import partial_json_parser
@ -1174,6 +1174,7 @@ class OpenAIServingChat(OpenAIServing):
for output in final_res.outputs:
token_ids = output.token_ids
out_logprobs = output.logprobs
tool_call_info = None
if request.logprobs and request.top_logprobs is not None:
assert out_logprobs is not None, "Did not output logprobs"
@ -1188,32 +1189,42 @@ class OpenAIServingChat(OpenAIServing):
logprobs = None
if self.use_harmony:
if TYPE_CHECKING:
assert self.tool_parser is not None
tool_parser = self.tool_parser(tokenizer)
# NOTE: We use token_ids for openai tool parser
tool_call_info = tool_parser.extract_tool_calls(
"",
request=request,
token_ids=token_ids, # type: ignore
)
reasoning_content, content = None, tool_call_info.content
if request.include_reasoning:
if self.tool_parser is not None:
tool_parser = self.tool_parser(tokenizer)
# NOTE: We use token_ids for openai tool parser
tool_call_info = tool_parser.extract_tool_calls(
"",
request=request,
token_ids=token_ids, # type: ignore
)
reasoning_content, content = None, tool_call_info.content
if request.include_reasoning:
reasoning_content, content, _ = parse_chat_output(
token_ids)
message = ChatMessage(
role=role,
reasoning_content=reasoning_content,
content=content,
tool_calls=tool_call_info.tool_calls,
)
else:
reasoning_content, content, _ = parse_chat_output(
token_ids)
message = ChatMessage(
role=role,
reasoning_content=reasoning_content,
content=content,
tool_calls=tool_call_info.tool_calls,
)
if not request.include_reasoning:
reasoning_content = None
message = ChatMessage(
role=role,
reasoning_content=reasoning_content,
content=content,
)
choice_data = ChatCompletionResponseChoice(
index=output.index,
message=message,
logprobs=logprobs,
finish_reason="tool_calls"
if tool_call_info.tools_called else
finish_reason="tool_calls" if
(tool_call_info is not None
and tool_call_info.tools_called) else
output.finish_reason if output.finish_reason else "stop",
stop_reason=output.stop_reason,
)

View File

@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser):
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
if request.tools and request.tool_choice != 'none':
# do not skip special tokens because internlm use the special
# tokens to indicated the start and end of the tool calls
# tokens to indicate the start and end of the tool calls
# information.
request.skip_special_tokens = False
return request
@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser):
if '<|action_start|>' not in current_text:
self.position = len(current_text)
return DeltaMessage(content=delta_text)
# if the tool call is sended, return a empty delta message
# to make sure the finish_reason will be send correctly.
# if the tool call has been sent, return an empty delta message
# to make sure the finish_reason will be sent correctly.
if self.current_tool_id > 0:
return DeltaMessage(content='')

View File

@ -1064,7 +1064,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm should use flashinfer fused allreduce. The variable should be a
# JSON with the following format:
# { <world size>: <max size in mb> }
# Unspecified world sizes will fallback to
# Unspecified world sizes will fall back to
# { 2: 64, 4: 1, <everything else>: 0.5 }
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
lambda: json.loads(os.getenv(

View File

@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
EM = sorted_token_ids.size(0)
if A.size(0) < config["BLOCK_SIZE_M"]:
# optimize for small batch_size.
# We assume that top_ids of each token is unique, so
# We assume that top_ids of each token is unique,
# so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
# and we can skip some invalid blocks.
EM = min(sorted_token_ids.size(0),

View File

@ -710,7 +710,7 @@ def determine_expert_map(
# Create a tensor of size num_experts filled with -1
expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
# Create a expert map for the local experts
# Create an expert map for the local experts
start_idx = ep_rank * base_experts + min(ep_rank, remainder)
expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
0, local_num_experts, dtype=torch.int32)
@ -806,7 +806,7 @@ class FusedMoE(CustomOp):
self.global_num_experts = num_experts + num_redundant_experts
# we padding globally so EP buffer allocation works
# we are padding globally so EP buffer allocation works
if quant_config and quant_config.get_name() == "mxfp4":
from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501
should_use_flashinfer_mxfp4)

View File

@ -223,6 +223,7 @@ class LinearBase(CustomOp):
quant_config: Quantization configure.
prefix: Prefix for parameter names.
return_bias: If true, return bias together with outputs in forward pass.
disable_tp: If true, tensor parallelism will be disabled for this layer.
"""
def __init__(
@ -235,6 +236,7 @@ class LinearBase(CustomOp):
prefix: str = "",
*,
return_bias: bool = True,
disable_tp: bool = False,
):
super().__init__()
@ -254,6 +256,17 @@ class LinearBase(CustomOp):
self.quant_method = quant_config.get_quant_method(self,
prefix=prefix)
self.return_bias = return_bias
self.disable_tp = disable_tp
self.tp_rank = (get_tensor_model_parallel_rank()
if not disable_tp else 0)
self.tp_size = (get_tensor_model_parallel_world_size()
if not disable_tp else 1)
def update_param_tp_status(self):
for param in self.parameters():
if isinstance(param, BasevLLMParameter):
param.tp_rank = self.tp_rank
param.tp_size = self.tp_size
@CustomOp.register("replicated_linear")
@ -270,6 +283,7 @@ class ReplicatedLinear(LinearBase):
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
disable_tp: Has no effect for replicated linear layers.
"""
def __init__(
@ -283,26 +297,21 @@ class ReplicatedLinear(LinearBase):
prefix: str = "",
*,
return_bias: bool = True,
disable_tp: bool = False,
):
# If MergedReplicatedLinear, use output size of each partition.
if hasattr(self, "output_sizes"):
self.output_partition_sizes = self.output_sizes
else:
self.output_partition_sizes = [output_size]
super().__init__(input_size,
output_size,
skip_bias_add,
params_dtype,
quant_config,
prefix=prefix,
return_bias=return_bias)
return_bias=return_bias,
disable_tp=disable_tp)
# All the linear layer supports quant method.
assert self.quant_method is not None
self.quant_method.create_weights(self,
self.input_size,
self.output_partition_sizes,
self.input_size, [self.output_size],
self.input_size,
self.output_size,
self.params_dtype,
@ -358,74 +367,6 @@ class ReplicatedLinear(LinearBase):
return s
class MergedReplicatedLinear(ReplicatedLinear):
"""Replicated linear layer.
Args:
input_size: input dimension of the linear layer.
output_sizes: list of output dimensions of the linear layer.
bias: If true, add bias.
skip_bias_add: If true, skip adding bias but instead return it.
params_dtype: Data type for the parameters.
quant_config: Quantization configure.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
"""
def __init__(
self,
input_size: int,
output_sizes: list[int],
bias: bool = True,
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
*,
return_bias: bool = True,
):
self.output_sizes = output_sizes
super().__init__(input_size,
sum(output_sizes),
bias,
skip_bias_add,
params_dtype,
quant_config,
prefix=prefix,
return_bias=return_bias)
def weight_loader(self,
param: Union[Parameter, BasevLLMParameter],
loaded_weight: torch.Tensor,
loaded_shard_id: Optional[int] = None):
assert loaded_shard_id is not None
assert loaded_shard_id < len(self.output_sizes)
if isinstance(param, BlockQuantScaleParameter):
from vllm.model_executor.layers.quantization.fp8 import (
Fp8LinearMethod, Fp8MoEMethod)
assert self.quant_method is not None
assert isinstance(self.quant_method,
(Fp8LinearMethod, Fp8MoEMethod))
weight_block_size = self.quant_method.quant_config.weight_block_size
assert weight_block_size is not None
block_n, _ = weight_block_size[0], weight_block_size[1]
shard_offset = (
(sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) //
block_n)
shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) //
block_n)
elif isinstance(param, PerTensorScaleParameter):
shard_offset = loaded_shard_id
shard_size = 1
else:
shard_offset = sum(self.output_sizes[:loaded_shard_id])
shard_size = self.output_sizes[loaded_shard_id]
param.data[shard_offset:shard_offset + shard_size] = loaded_weight
@CustomOp.register("column_parallel_linear")
class ColumnParallelLinear(LinearBase):
"""Linear layer with column parallelism.
@ -448,7 +389,9 @@ class ColumnParallelLinear(LinearBase):
output_sizes: list of output sizes packed into one output, like for QKV
the list would be size 3.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
disable_tp: If true, the weight matrix won't be sharded across TP ranks.
"""
def __init__(
@ -464,9 +407,13 @@ class ColumnParallelLinear(LinearBase):
prefix: str = "",
*,
return_bias: bool = True,
disable_tp: bool = False,
):
# Divide the weight matrix along the last dimension.
self.tp_size = get_tensor_model_parallel_world_size()
self.tp_rank = (get_tensor_model_parallel_rank()
if not disable_tp else 0)
self.tp_size = (get_tensor_model_parallel_world_size()
if not disable_tp else 1)
self.input_size_per_partition = input_size
self.output_size_per_partition = divide(output_size, self.tp_size)
self.output_partition_sizes = [self.output_size_per_partition]
@ -483,7 +430,8 @@ class ColumnParallelLinear(LinearBase):
params_dtype,
quant_config,
prefix,
return_bias=return_bias)
return_bias=return_bias,
disable_tp=disable_tp)
self.gather_output = gather_output
@ -511,8 +459,7 @@ class ColumnParallelLinear(LinearBase):
})
else:
self.register_parameter("bias", None)
self.tp_rank = get_tensor_model_parallel_rank()
self.update_param_tp_status()
def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
@ -554,7 +501,8 @@ class ColumnParallelLinear(LinearBase):
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor):
def weight_loader_v2(self, param: BasevLLMParameter,
loaded_weight: torch.Tensor):
# Special case for loading scales off disk, which often do not
# have a shape (such as in the case of AutoFP8).
if len(loaded_weight.shape) == 0:
@ -570,7 +518,7 @@ class ColumnParallelLinear(LinearBase):
# Matrix multiply.
assert self.quant_method is not None
output_parallel = self.quant_method.apply(self, input_, bias)
if self.gather_output:
if self.gather_output and self.tp_size > 1:
# All-gather across the partitions.
output = tensor_model_parallel_all_gather(output_parallel)
else:
@ -584,7 +532,7 @@ class ColumnParallelLinear(LinearBase):
s = f"in_features={self.input_size}"
s += f", output_features={self.output_size_per_partition}"
s += f", bias={self.bias is not None}"
s += f", tp_size={get_tensor_model_parallel_world_size()}"
s += f", tp_size={self.tp_size}"
s += f", gather_output={self.gather_output}"
return s
@ -611,6 +559,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
disable_tp: If true, the weight matrices won't be sharded and this layer
will be treated as a "Replicated" MergedLinear.
"""
def __init__(
@ -625,10 +575,13 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
prefix: str = "",
*,
return_bias: bool = True,
disable_tp: bool = False,
):
self.output_sizes = output_sizes
self.tp_size = get_tensor_model_parallel_world_size()
self.tp_rank = get_tensor_model_parallel_rank()
self.tp_size = (get_tensor_model_parallel_world_size()
if not disable_tp else 1)
self.tp_rank = (get_tensor_model_parallel_rank()
if not disable_tp else 0)
assert all(output_size % self.tp_size == 0
for output_size in output_sizes)
@ -640,7 +593,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
params_dtype=params_dtype,
quant_config=quant_config,
prefix=prefix,
return_bias=return_bias)
return_bias=return_bias,
disable_tp=disable_tp)
def weight_loader(self,
param: Parameter,
@ -832,8 +786,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
assert loaded_shard_id < len(self.output_sizes)
tp_size = get_tensor_model_parallel_world_size()
if isinstance(param, BlockQuantScaleParameter):
from vllm.model_executor.layers.quantization.fp8 import (
Fp8LinearMethod, Fp8MoEMethod)
@ -845,17 +797,19 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
block_n, _ = weight_block_size[0], weight_block_size[1]
shard_offset = (
(sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) //
block_n) // tp_size
block_n) // self.tp_size
shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) //
block_n // tp_size)
block_n // self.tp_size)
else:
shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
shard_size = self.output_sizes[loaded_shard_id] // tp_size
shard_offset = sum(
self.output_sizes[:loaded_shard_id]) // self.tp_size
shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
param.load_merged_column_weight(loaded_weight=loaded_weight,
shard_id=loaded_shard_id,
shard_offset=shard_offset,
shard_size=shard_size)
shard_size=shard_size,
tp_rank=self.tp_rank)
class QKVParallelLinear(ColumnParallelLinear):
@ -883,6 +837,7 @@ class QKVParallelLinear(ColumnParallelLinear):
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
disable_tp: If true, the weight matrix won't be sharded across TP ranks.
"""
def __init__(
@ -898,6 +853,7 @@ class QKVParallelLinear(ColumnParallelLinear):
prefix: str = "",
*,
return_bias: bool = True,
disable_tp: bool = False,
):
self.hidden_size = hidden_size
self.head_size = head_size
@ -906,7 +862,8 @@ class QKVParallelLinear(ColumnParallelLinear):
total_num_kv_heads = total_num_heads
self.total_num_kv_heads = total_num_kv_heads
# Divide the weight matrix along the last dimension.
tp_size = get_tensor_model_parallel_world_size()
tp_size = (get_tensor_model_parallel_world_size()
if not disable_tp else 1)
self.num_heads = divide(self.total_num_heads, tp_size)
if tp_size >= self.total_num_kv_heads:
self.num_kv_heads = 1
@ -932,7 +889,8 @@ class QKVParallelLinear(ColumnParallelLinear):
params_dtype=params_dtype,
quant_config=quant_config,
prefix=prefix,
return_bias=return_bias)
return_bias=return_bias,
disable_tp=disable_tp)
def _get_shard_offset_mapping(self, loaded_shard_id: str):
shard_offset_mapping = {
@ -993,10 +951,13 @@ class QKVParallelLinear(ColumnParallelLinear):
loaded_shard_id: Optional[str] = None):
if loaded_shard_id is None: # special case for certain models
if isinstance(param, PerTensorScaleParameter):
param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
param.load_qkv_weight(loaded_weight=loaded_weight,
shard_id=0,
tp_rank=self.tp_rank)
return
elif type(param) in (RowvLLMParameter, BasevLLMParameter):
param.load_qkv_weight(loaded_weight=loaded_weight)
param.load_qkv_weight(loaded_weight=loaded_weight,
tp_rank=self.tp_rank)
return
# TODO: @dsikka - move to parameter.py
self._load_fused_module_from_checkpoint(param, loaded_weight)
@ -1020,7 +981,8 @@ class QKVParallelLinear(ColumnParallelLinear):
num_heads=self.num_kv_head_replicas,
shard_id=loaded_shard_id,
shard_offset=shard_offset,
shard_size=shard_size)
shard_size=shard_size,
tp_rank=self.tp_rank)
def weight_loader(self,
param: Parameter,
@ -1226,6 +1188,7 @@ class RowParallelLinear(LinearBase):
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.down_proj)
return_bias: If true, return bias together with outputs in forward pass.
disable_tp: If true, the weight matrix won't be sharded across TP ranks.
"""
def __init__(
@ -1241,10 +1204,13 @@ class RowParallelLinear(LinearBase):
prefix: str = "",
*,
return_bias: bool = True,
disable_tp: bool = False,
):
# Divide the weight matrix along the first dimension.
self.tp_rank = get_tensor_model_parallel_rank()
self.tp_size = get_tensor_model_parallel_world_size()
self.tp_rank = (get_tensor_model_parallel_rank()
if not disable_tp else 0)
self.tp_size = (get_tensor_model_parallel_world_size()
if not disable_tp else 1)
self.input_size_per_partition = divide(input_size, self.tp_size)
self.output_size_per_partition = output_size
self.output_partition_sizes = [output_size]
@ -1255,7 +1221,8 @@ class RowParallelLinear(LinearBase):
params_dtype,
quant_config,
prefix,
return_bias=return_bias)
return_bias=return_bias,
disable_tp=disable_tp)
self.input_is_parallel = input_is_parallel
self.reduce_results = reduce_results
@ -1284,6 +1251,7 @@ class RowParallelLinear(LinearBase):
})
else:
self.register_parameter("bias", None)
self.update_param_tp_status()
def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
input_dim = getattr(param, "input_dim", None)
@ -1339,10 +1307,9 @@ class RowParallelLinear(LinearBase):
if self.input_is_parallel:
input_parallel = input_
else:
tp_rank = get_tensor_model_parallel_rank()
splitted_input = split_tensor_along_last_dim(
input_, num_partitions=self.tp_size)
input_parallel = splitted_input[tp_rank].contiguous()
input_parallel = splitted_input[self.tp_rank].contiguous()
# Matrix multiply.
assert self.quant_method is not None

View File

@ -270,7 +270,8 @@ class Fp8LinearMethod(LinearMethodBase):
layer.weight_block_size = None
if self.block_quant:
tp_size = get_tensor_model_parallel_world_size()
tp_size = getattr(layer, "tp_size",
get_tensor_model_parallel_world_size())
assert self.quant_config.weight_block_size is not None
layer.weight_block_size = self.quant_config.weight_block_size
block_n, block_k = (

View File

@ -469,7 +469,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
)
layer.register_parameter("w2_scales", w2_scales)
set_weight_attrs(w2_scales, extra_weight_attrs)
# dont shard the w2 scales when running act order
# don't shard the w2 scales when running act order
set_weight_attrs(w2_scales,
{"load_full_w2": self.quant_config.desc_act})
# up_proj scales
@ -493,7 +493,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
)
layer.register_parameter("w2_qzeros", w2_qzeros)
set_weight_attrs(w2_qzeros, extra_weight_attrs)
# dont shard the w2 scales when running act order
# don't shard the w2 scales when running act order
set_weight_attrs(w2_qzeros,
{"load_full_w2": self.quant_config.desc_act})
w13_g_idx = torch.nn.Parameter(

View File

@ -69,6 +69,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# Store all module names (from transformers) that support
# BNB quantization.
self.target_modules: list[str] = []
self.tp_disabled_modules: list[str] = []
# Store the mapping of expert parameters for MoE models.
self.expert_params_mapping: list[tuple[str, str, int, str]] = []
# mapping weight names from transformers to vllm.
@ -322,14 +323,24 @@ class BitsAndBytesModelLoader(BaseModelLoader):
quant_state_dict) -> Generator:
from bitsandbytes.functional import quantize_4bit
tp_size = get_tensor_model_parallel_world_size()
tp_rank = get_tensor_model_parallel_rank()
global_tp_size = get_tensor_model_parallel_world_size()
global_tp_rank = get_tensor_model_parallel_rank()
for (
org_weight_name,
mapped_weight_name,
weight_tensor,
) in self._hf_weight_iter(hf_weights_files, use_safetensors):
# override tp_size and tp_rank if the module has disabled TP
if any(tp_disabled_module in mapped_weight_name
for tp_disabled_module in self.tp_disabled_modules):
tp_size = 1
tp_rank = 0
else:
tp_size = global_tp_size
tp_rank = global_tp_rank
if any(target_module in mapped_weight_name
for target_module in self.target_modules
) and mapped_weight_name.endswith(".weight"):
@ -418,12 +429,16 @@ class BitsAndBytesModelLoader(BaseModelLoader):
# Map vllm's names to transformers's names.
rep_name, sub_modules = modules_info
for sub_name in sub_modules:
self.target_modules.append(
name.replace(rep_name, sub_name))
new_name = name.replace(rep_name, sub_name)
self.target_modules.append(new_name)
if module.disable_tp:
self.tp_disabled_modules.append(new_name)
# Add original module name even if the module has stacked map,
# in case model has a mixture of disk-merged and disk-split
# weights with same last name.
self.target_modules.append(name)
if module.disable_tp:
self.tp_disabled_modules.append(name)
elif isinstance(module, FusedMoE) and hasattr(
module.quant_method, "quant_config"):
# TODO: support FusedMoE with prequant and 8bit.

View File

@ -49,26 +49,28 @@ def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Module]:
if not dense_modules:
return None
module = dense_modules[0]
folder = module.get("path", "")
layers = []
for module in dense_modules:
folder = module.get("path", "")
config_path = f"{folder}/config.json" if folder else "config.json"
layer_config = get_hf_file_to_dict(config_path, model_config.model,
model_config.revision)
if not layer_config:
return None
config_path = f"{folder}/config.json" if folder else "config.json"
layer_config = get_hf_file_to_dict(config_path, model_config.model,
model_config.revision)
if not layer_config:
continue
linear = nn.Linear(layer_config.get("in_features", 768),
layer_config.get("out_features", 768),
bias=layer_config.get("bias", True),
dtype=torch.float32)
linear = nn.Linear(layer_config.get("in_features", 768),
layer_config.get("out_features", 768),
bias=layer_config.get("bias", True),
dtype=torch.float32)
if _load_dense_weights(linear, folder, model_config):
layers = [linear]
if not _load_dense_weights(linear, folder, model_config):
continue
layers.append(linear)
if act_name := layer_config.get("activation_function"):
layers.append(get_act_fn(act_name))
return nn.Sequential(*layers).to(dtype=torch.float32)
return nn.Sequential(*layers).to(dtype=torch.float32)
except Exception:
logger.exception("ST projector loading failed")

View File

@ -24,6 +24,14 @@ class VerifyAndUpdateConfig:
raise NotImplementedError
class Gemma3TextModelConfig:
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
hf_config = vllm_config.model_config.hf_config
hf_config.is_causal = not hf_config.use_bidirectional_attention
class GteNewModelConfig(VerifyAndUpdateConfig):
@staticmethod
@ -409,6 +417,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"GteModel": SnowflakeGteNewModelConfig,
"GteNewModel": GteNewModelConfig,
"GteNewForSequenceClassification": GteNewModelConfig,
"Gemma3TextModel": Gemma3TextModelConfig,
"NomicBertModel": NomicBertModelConfig,
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
"Qwen2ForRewardModel": Qwen2ForRewardModelConfig,

View File

@ -43,7 +43,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
MergedReplicatedLinear,
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
@ -435,12 +434,13 @@ class DeepseekV2MLAAttention(nn.Module):
self.max_position_embeddings = max_position_embeddings
if self.q_lora_rank is not None:
self.fused_qkv_a_proj = MergedReplicatedLinear(
self.fused_qkv_a_proj = MergedColumnParallelLinear(
self.hidden_size,
[self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
bias=False,
quant_config=quant_config,
prefix=f"{prefix}.fused_qkv_a_proj")
prefix=f"{prefix}.fused_qkv_a_proj",
disable_tp=True)
else:
self.kv_a_proj_with_mqa = ReplicatedLinear(
self.hidden_size,

View File

@ -24,7 +24,7 @@ import torch.nn.functional as F
from torch import nn
from transformers import Gemma3TextConfig
from vllm.attention import Attention
from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@ -44,6 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import (
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
from ...attention.layers.encoder_only_attention import EncoderOnlyAttention
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, extract_layer_index,
is_pp_missing_parameter,
@ -169,16 +170,24 @@ class Gemma3Attention(nn.Module):
rope_scaling=self.rope_scaling,
)
# Initialize the attention.
self.attn = Attention(self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
cache_config=cache_config,
quant_config=quant_config,
logits_soft_cap=attn_logits_soft_cap,
per_layer_sliding_window=sliding_window,
prefix=f"{prefix}.attn")
if getattr(config, "is_causal", True):
attn_type = AttentionType.DECODER
else:
attn_type = AttentionType.ENCODER_ONLY
attn_cls = (EncoderOnlyAttention
if attn_type == AttentionType.ENCODER_ONLY else Attention)
self.attn = attn_cls(self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
cache_config=cache_config,
quant_config=quant_config,
attn_type=attn_type,
logits_soft_cap=attn_logits_soft_cap,
per_layer_sliding_window=sliding_window,
prefix=f"{prefix}.attn")
def forward(
self,

View File

@ -51,14 +51,10 @@ from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.layernorm import RMSNorm
# yapf: disable
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
MergedReplicatedLinear,
QKVParallelLinear,
ReplicatedLinear,
RowParallelLinear)
# yapf: enable
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys
@ -174,20 +170,22 @@ class Glm4vVisionMLP(nn.Module):
use_data_parallel: bool = False,
):
super().__init__()
cls_gate_up = (MergedReplicatedLinear
if use_data_parallel else MergedColumnParallelLinear)
self.gate_up_proj = cls_gate_up(input_size=in_features,
output_sizes=[hidden_features] * 2,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.gate_up_proj")
cls_down = (ReplicatedLinear
if use_data_parallel else RowParallelLinear)
self.down_proj = cls_down(hidden_features,
in_features,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.down_proj")
self.gate_up_proj = MergedColumnParallelLinear(
input_size=in_features,
output_sizes=[hidden_features] * 2,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.gate_up_proj",
disable_tp=use_data_parallel,
)
self.down_proj = RowParallelLinear(
hidden_features,
in_features,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.down_proj",
disable_tp=use_data_parallel,
)
self.act_fn = SiluAndMul()
def forward(self, x: torch.Tensor):
@ -234,48 +232,32 @@ class Glm4vVisionAttention(nn.Module):
# Per attention head and per partition values.
self.tp_size = (1 if use_data_parallel else
get_tensor_model_parallel_world_size())
self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
self.tp_rank = (0 if use_data_parallel else
parallel_state.get_tensor_model_parallel_rank())
self.hidden_size_per_attention_head = dist_utils.divide(
projection_size, num_heads)
self.num_attention_heads_per_partition = dist_utils.divide(
num_heads, self.tp_size)
if use_data_parallel:
self.qkv = ReplicatedLinear(
input_size=embed_dim,
output_size=3 * projection_size,
bias=False,
quant_config=quant_config,
# Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg
prefix=f"{prefix}.qkv_proj"
if quant_config else f"{prefix}.qkv",
)
self.proj = ReplicatedLinear(
input_size=projection_size,
output_size=embed_dim,
quant_config=quant_config,
prefix=f"{prefix}.proj",
bias=False,
)
else:
self.qkv = QKVParallelLinear(
hidden_size=embed_dim,
head_size=self.hidden_size_per_attention_head,
total_num_heads=num_heads,
total_num_kv_heads=num_heads,
bias=False,
quant_config=quant_config,
# Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg
prefix=f"{prefix}.qkv_proj"
if quant_config else f"{prefix}.qkv",
)
self.proj = RowParallelLinear(
input_size=projection_size,
output_size=embed_dim,
quant_config=quant_config,
prefix=f"{prefix}.proj",
bias=False,
)
self.qkv = QKVParallelLinear(
hidden_size=embed_dim,
head_size=self.hidden_size_per_attention_head,
total_num_heads=num_heads,
total_num_kv_heads=num_heads,
bias=False,
quant_config=quant_config,
# Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg
prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
disable_tp=use_data_parallel,
)
self.proj = RowParallelLinear(
input_size=projection_size,
output_size=embed_dim,
quant_config=quant_config,
prefix=f"{prefix}.proj",
bias=False,
disable_tp=use_data_parallel,
)
# Detect attention implementation.
self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True)
@ -494,41 +476,31 @@ class Glm4vPatchMerger(nn.Module):
) -> None:
super().__init__()
self.hidden_size = d_model
if use_data_parallel:
self.proj = ReplicatedLinear(
input_size=self.hidden_size,
output_size=self.hidden_size,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.proj",
)
else:
self.proj = ColumnParallelLinear(
self.hidden_size,
self.hidden_size,
bias=bias,
gather_output=True,
quant_config=quant_config,
prefix=f"{prefix}.proj",
)
self.proj = ColumnParallelLinear(
self.hidden_size,
self.hidden_size,
bias=bias,
gather_output=True,
quant_config=quant_config,
prefix=f"{prefix}.proj",
disable_tp=use_data_parallel,
)
self.post_projection_norm = nn.LayerNorm(self.hidden_size)
cls_gate_up = (MergedReplicatedLinear
if use_data_parallel else MergedColumnParallelLinear)
self.gate_up_proj = cls_gate_up(
self.gate_up_proj = MergedColumnParallelLinear(
input_size=self.hidden_size,
output_sizes=[context_dim] * 2,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.gate_up_proj",
disable_tp=use_data_parallel,
)
cls_down = (ReplicatedLinear
if use_data_parallel else RowParallelLinear)
self.down_proj = cls_down(
self.down_proj = RowParallelLinear(
context_dim,
self.hidden_size,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.down_proj",
disable_tp=use_data_parallel,
)
self.act_fn = SiluAndMul()
self.extra_activation_func = nn.GELU()

View File

@ -216,12 +216,9 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
max_images = mm_counts.get("image", 0)
max_videos = mm_counts.get("video", 0)
max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = self._get_max_video_frames(seq_len -
max_image_tokens)
max_total_frames = self._get_max_video_frames(seq_len)
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
_MAX_FRAMES_PER_VIDEO)

View File

@ -48,7 +48,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm
# yapf: disable
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
MergedReplicatedLinear,
QKVParallelLinear,
ReplicatedLinear,
RowParallelLinear)
@ -178,22 +177,20 @@ class Qwen2_5_VisionMLP(nn.Module):
prefix: str = "",
use_data_parallel: bool = False):
super().__init__()
cls_gate_up_proj = (MergedReplicatedLinear if use_data_parallel else
MergedColumnParallelLinear)
self.gate_up_proj = cls_gate_up_proj(
self.gate_up_proj = MergedColumnParallelLinear(
input_size=in_features,
output_sizes=[hidden_features] * 2, # [gate_proj, up_proj]
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.gate_up_proj")
prefix=f"{prefix}.gate_up_proj",
disable_tp=use_data_parallel)
cls_down_proj = (ReplicatedLinear
if use_data_parallel else RowParallelLinear)
self.down_proj = cls_down_proj(hidden_features,
in_features,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.down_proj")
self.down_proj = RowParallelLinear(hidden_features,
in_features,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.down_proj",
disable_tp=use_data_parallel)
self.act_fn = act_fn
def forward(self, x: torch.Tensor):
@ -243,30 +240,21 @@ class Qwen2_5_VisionAttention(nn.Module):
self.num_attention_heads_per_partition = dist_utils.divide(
num_heads, self.tp_size)
if use_data_parallel:
self.qkv = ReplicatedLinear(embed_dim,
self.hidden_size_per_attention_head *
3 * num_heads,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.qkv")
self.qkv = QKVParallelLinear(
hidden_size=embed_dim,
head_size=self.hidden_size_per_attention_head,
total_num_heads=num_heads,
total_num_kv_heads=num_heads,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.qkv",
disable_tp=use_data_parallel)
else:
self.qkv = QKVParallelLinear(
hidden_size=embed_dim,
head_size=self.hidden_size_per_attention_head,
total_num_heads=num_heads,
total_num_kv_heads=num_heads,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.qkv")
cls_proj = (ReplicatedLinear
if use_data_parallel else RowParallelLinear)
self.proj = cls_proj(input_size=projection_size,
output_size=embed_dim,
quant_config=quant_config,
prefix=f"{prefix}.proj")
self.proj = RowParallelLinear(input_size=projection_size,
output_size=embed_dim,
quant_config=quant_config,
prefix=f"{prefix}.proj",
disable_tp=use_data_parallel)
# Detect attention implementation.
self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True)

View File

@ -915,12 +915,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
seq_len: int,
mm_counts: Mapping[str, int],
) -> int:
max_images = mm_counts.get("image", 0)
max_videos = mm_counts.get("video", 0)
max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = self._get_max_video_frames(seq_len -
max_image_tokens)
max_total_frames = self._get_max_video_frames(seq_len)
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
_MAX_FRAMES_PER_VIDEO)

View File

@ -155,6 +155,7 @@ _EMBEDDING_MODELS = {
"BertModel": ("bert", "BertEmbeddingModel"),
"DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
"Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
"Gemma3TextModel": ("gemma3", "Gemma3Model"),
"GlmForCausalLM": ("glm", "GlmForCausalLM"),
"GPT2ForSequenceClassification": ("gpt2", "GPT2ForSequenceClassification"),
"GritLM": ("gritlm", "GritLM"),

View File

@ -21,7 +21,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear,
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@ -667,35 +666,21 @@ class Step3VisionAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
if use_data_parallel:
self.qkv_proj = ReplicatedLinear(
self.embed_dim,
3 * self.q_size,
bias=True,
quant_config=quant_config,
prefix=prefix,
)
self.out_proj = ReplicatedLinear(
self.total_num_heads * self.head_dim,
self.embed_dim,
bias=True,
quant_config=quant_config,
prefix=prefix,
)
else:
self.qkv_proj = QKVParallelLinear(
self.embed_dim,
self.head_dim,
self.total_num_heads,
bias=True,
quant_config=quant_config,
prefix=prefix,
)
self.out_proj = RowParallelLinear(self.embed_dim,
self.embed_dim,
bias=True,
quant_config=quant_config,
prefix=prefix)
self.qkv_proj = QKVParallelLinear(
self.embed_dim,
self.head_dim,
self.total_num_heads,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.qkv_proj",
disable_tp=use_data_parallel,
)
self.out_proj = RowParallelLinear(self.embed_dim,
self.embed_dim,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.out_proj",
disable_tp=use_data_parallel)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads,
@ -740,20 +725,18 @@ class Step3VisionMLP(nn.Module):
super().__init__()
self.config = config
self.activation_fn = get_act_fn(config.hidden_act)
cls_fc1 = (ReplicatedLinear
if use_data_parallel else ColumnParallelLinear)
self.fc1 = cls_fc1(config.hidden_size,
config.intermediate_size,
bias=True,
quant_config=quant_config,
prefix=prefix)
cls_fc2 = (ReplicatedLinear
if use_data_parallel else RowParallelLinear)
self.fc2 = cls_fc2(config.intermediate_size,
config.hidden_size,
bias=True,
quant_config=quant_config,
prefix=prefix)
self.fc1 = ColumnParallelLinear(config.hidden_size,
config.intermediate_size,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.fc1",
disable_tp=use_data_parallel)
self.fc2 = RowParallelLinear(config.intermediate_size,
config.hidden_size,
bias=True,
quant_config=quant_config,
prefix=f"{prefix}.fc2",
disable_tp=use_data_parallel)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states, _ = self.fc1(hidden_states)

View File

@ -57,6 +57,8 @@ class BasevLLMParameter(Parameter):
weight_loader = _make_synced_weight_loader(weight_loader)
self._weight_loader = weight_loader
self.tp_rank = get_tensor_model_parallel_rank()
self.tp_size = get_tensor_model_parallel_world_size()
@property
def weight_loader(self):
@ -116,10 +118,10 @@ class _ColumnvLLMParameter(BasevLLMParameter):
return self._output_dim
def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
tp_rank = get_tensor_model_parallel_rank()
shard_size = self.data.shape[self.output_dim]
loaded_weight = loaded_weight.narrow(self.output_dim,
tp_rank * shard_size, shard_size)
self.tp_rank * shard_size,
shard_size)
assert self.data.shape == loaded_weight.shape
self.data.copy_(loaded_weight)
@ -127,6 +129,7 @@ class _ColumnvLLMParameter(BasevLLMParameter):
shard_offset = kwargs.get("shard_offset")
shard_size = kwargs.get("shard_size")
# TODO: move these to PackedColumnParameter and PackedvLLMParameter
if isinstance(
self,
@ -137,11 +140,11 @@ class _ColumnvLLMParameter(BasevLLMParameter):
param_data = self.data
tp_rank = get_tensor_model_parallel_rank()
param_data = param_data.narrow(self.output_dim, shard_offset,
shard_size)
loaded_weight = loaded_weight.narrow(self.output_dim,
tp_rank * shard_size, shard_size)
self.tp_rank * shard_size,
shard_size)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
@ -161,8 +164,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
shard_offset=shard_offset, shard_size=shard_size)
param_data = self.data
tp_rank = get_tensor_model_parallel_rank()
shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
shard_id = (self.tp_rank if shard_id == "q" else self.tp_rank //
num_heads)
param_data = param_data.narrow(self.output_dim, shard_offset,
shard_size)
loaded_weight = loaded_weight.narrow(self.output_dim,
@ -189,10 +192,10 @@ class RowvLLMParameter(BasevLLMParameter):
return self._input_dim
def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
tp_rank = get_tensor_model_parallel_rank()
shard_size = self.data.shape[self.input_dim]
loaded_weight = loaded_weight.narrow(self.input_dim,
tp_rank * shard_size, shard_size)
self.tp_rank * shard_size,
shard_size)
if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)
@ -414,9 +417,6 @@ class SharedWeightParameter(BasevLLMParameter):
"weight_loader": self._fake_weight_loader
}
self.tp_rank = get_tensor_model_parallel_rank()
self.tp_size = get_tensor_model_parallel_world_size()
if self.tp_size > 1:
raise NotImplementedError(f"{self.__class__.__name__} does not "
"currently support tensor parallelism")

View File

@ -687,7 +687,7 @@ class FlashInferImpl(AttentionImpl):
else:
raise ValueError(f"Unsupported output dtype: {output.dtype}")
# TRTLLM attn kernel requires o scale to pass as a host scalar,
# TRTLLM attn kernel requires o scale to be passed as a host scalar,
# store the o scale as a host scalar in warmup run with cuda graph
# not enabled
if layer._o_scale_float is None:

View File

@ -201,10 +201,11 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
AttentionMetadata,
MLAAttentionImpl)
from vllm.attention.backends.utils import get_mla_dims
from vllm.attention.ops.common import cp_lse_ag_out_rs
from vllm.attention.ops.merge_attn_states import merge_attn_states
from vllm.attention.utils.fa_utils import get_flash_attn_version
from vllm.config import VllmConfig
from vllm.distributed.parallel_state import is_global_first_rank
from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
LinearBase,
@ -323,6 +324,13 @@ class MLACommonPrefillMetadata:
seq_lens: torch.Tensor
workspace: torch.Tensor
# for mla DCP
cp_chunk_seq_lens: Optional[list[list[int]]] = None
origin_context_lens: Optional[list[int]] = None
cp_cu_seq_lens: Optional[torch.Tensor] = None
chunk_size: Optional[int] = None
cu_seq_lens_lst: Optional[list[list[int]]] = None
block_table: torch.Tensor
query_start_loc: torch.Tensor
max_query_len: int
@ -444,6 +452,13 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
parallel_config)
self.mla_dims = get_mla_dims(self.model_config)
self.aot_schedule = current_platform.is_cuda()
try:
self.dcp_world_size = get_dcp_group().world_size
self.dcp_rank = get_dcp_group().rank_in_group
except AssertionError:
# DCP might not be initialized in testing
self.dcp_world_size = 1
self.dcp_rank = 0
# Don't try to access the runner on AMD
if self.aot_schedule:
@ -465,12 +480,27 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
128 * 1024)
assert self.chunked_prefill_workspace_size >= \
scheduler_config.max_num_seqs * cache_config.block_size
self.chunked_prefill_workspace = torch.empty(
(self.chunked_prefill_workspace_size,
self.model_config.get_head_size()),
dtype=self.model_config.dtype,
device=device,
)
if self.dcp_world_size > 1:
# Note(hc): The local kvcache is incomplete when DCP is enabled,
# so an additional kvcache allgather across the DCP group is
# required and the workspace has to be enlarged by 1/DCP relative
# to the original TP allocation.
assert self.chunked_prefill_workspace_size % \
self.dcp_world_size == 0
self.chunked_prefill_workspace = torch.empty(
(self.chunked_prefill_workspace_size +
self.chunked_prefill_workspace_size // self.dcp_world_size,
self.model_config.get_head_size()),
dtype=self.model_config.dtype,
device=device,
)
else:
self.chunked_prefill_workspace = torch.empty(
(self.chunked_prefill_workspace_size,
self.model_config.get_head_size()),
dtype=self.model_config.dtype,
device=device,
)
self._use_cudnn_prefill = use_cudnn_prefill()
self._use_fi_prefill = use_flashinfer_prefill()
@ -631,6 +661,12 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
split_decodes_and_prefills(common_attn_metadata,
decode_threshold=self.reorder_batch_threshold)
# Note(hc): update seq_lens of decode reqs under DCP.
if self.dcp_world_size > 1:
seq_lens[:num_decodes] = seq_lens[:num_decodes] \
// self.dcp_world_size + (self.dcp_rank <= \
(seq_lens[:num_decodes] - 1) % self.dcp_world_size)
assert num_decodes + num_prefills == num_reqs
assert num_decode_tokens + num_prefill_tokens == num_tokens
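The seq_lens update above follows from the interleaved token placement under DCP (token i is stored on dcp_rank i % dcp_world_size). A standalone sketch of the same arithmetic, for illustration only (the function name is hypothetical):

import torch

def local_decode_seq_lens(seq_lens: torch.Tensor, dcp_world_size: int,
                          dcp_rank: int) -> torch.Tensor:
    # Base share plus one extra token for ranks whose index does not
    # exceed (seq_len - 1) % dcp_world_size, mirroring the update above.
    return (seq_lens // dcp_world_size +
            (dcp_rank <= (seq_lens - 1) % dcp_world_size).int())

# Example: seq_len = 10, dcp_world_size = 4 -> per-rank lens [3, 3, 2, 2],
# i.e. token sets {0,4,8}, {1,5,9}, {2,6}, {3,7}.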
@ -639,6 +675,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
reqs_start = num_decodes # prefill_start
context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs]
# Note(hc): The context lengths from the perspective of dcp rank 0.
cp_context_lens_cpu = torch.ceil(context_lens_cpu.float() /
self.dcp_world_size).int()
origin_context_lens = context_lens_cpu.tolist()
max_context_len_cpu = context_lens_cpu.max().item()
num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item()
prefill_query_start_loc = query_start_loc[
@ -691,20 +731,66 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
out=cu_seq_lens_cpu[:, 1:],
dtype=torch.int32)
if self.dcp_world_size > 1:
# Note(hc): The above max_context_chunk already enforces
# block_size alignment; DCP only needs block_size to be divisible
# by dcp_world_size, because DCP uses cp_gather_cache, which does
# not require `cp_chunk_starts` to be aligned to page_size.
assert max_context_chunk % self.dcp_world_size == 0
cp_max_context_chunk = max_context_chunk // \
self.dcp_world_size
cp_chunk_starts = \
torch.arange(num_chunks, dtype=torch.int32) \
.unsqueeze(1).expand(-1, num_prefills) \
* cp_max_context_chunk
cp_chunk_ends = torch.min(
cp_context_lens_cpu.unsqueeze(0),
cp_chunk_starts + cp_max_context_chunk)
cp_chunk_seq_lens = (cp_chunk_ends -
cp_chunk_starts).clamp(min=0)
cp_cu_seq_lens_cpu = torch.zeros(num_chunks,
num_prefills + 1,
dtype=torch.int32,
pin_memory=True)
torch.cumsum(cp_chunk_seq_lens,
dim=1,
out=cp_cu_seq_lens_cpu[:, 1:],
dtype=torch.int32)
chunked_context_metadata_cls = \
CudnnPrefillMetadata.ChunkedContextMetadata \
if self._use_cudnn_prefill else \
MLACommonPrefillMetadata.ChunkedContextMetadata
chunked_context_metadata = \
chunked_context_metadata_cls(
cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
starts=chunk_starts.to(device, non_blocking=True),
seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
seq_lens=chunk_seq_lens,
workspace=self.chunked_prefill_workspace,
)
if self.dcp_world_size > 1:
chunked_context_metadata = \
chunked_context_metadata_cls(
cu_seq_lens=cu_seq_lens_cpu \
.to(device, non_blocking=True),
starts=cp_chunk_starts.to(device, non_blocking=True),
seq_tot=cp_chunk_seq_lens.sum(dim=1).tolist(),
max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
seq_lens=chunk_seq_lens,
workspace=self.chunked_prefill_workspace,
cp_chunk_seq_lens=cp_chunk_seq_lens.tolist(),
origin_context_lens=origin_context_lens,
cp_cu_seq_lens=cp_cu_seq_lens_cpu \
.to(device, non_blocking=True),
chunk_size=max_context_chunk,
cu_seq_lens_lst=cu_seq_lens_cpu.tolist(),
)
else:
chunked_context_metadata = \
chunked_context_metadata_cls(
cu_seq_lens=cu_seq_lens_cpu \
.to(device, non_blocking=True),
starts=chunk_starts.to(device, non_blocking=True),
seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
seq_lens=chunk_seq_lens,
workspace=self.chunked_prefill_workspace,
)
if self._use_cudnn_prefill:
chunked_context_metadata.seq_lens = chunk_seq_lens
@ -757,6 +843,71 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
return attn_metadata
def reorg_kvcache(
allgatered_kv_c_normed: torch.Tensor,
allgatered_k_pe: torch.Tensor,
cp_chunk_seq_lens_lst: list[int],
origin_context_lens: list[int],
cp_world_size: int,
sum_seq_len: int,
max_seq_len: int,
chunk_size: int,
chunk_idx: int,
toks: int,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Reorganize the kvcache after the CP local gather into the TP layout
expected by the attn kernel.
Args:
cp_chunk_seq_lens_lst: chunk context lengths under CP.
origin_context_lens: original full context lengths under CP.
cp_world_size: CP size.
sum_seq_len: the sum of the per-request full chunk context lengths.
max_seq_len: the max per-request full chunk context length.
chunk_size: equal to max_context_chunk from
chunked_context_metadata building.
chunk_idx: chunk idx of chunked_prefill.
toks: the number of tokens for local gather cache.
"""
kv_c_segments = []
k_pe_segments = []
src_token_idx = 0
max_seq_len_check = 0
for cp_chunk_seq_len, origin_context_len in zip(cp_chunk_seq_lens_lst,
origin_context_lens):
chunk_context_len = chunk_size
if cp_chunk_seq_len != 0:
chunk_context_len = min(
chunk_context_len, origin_context_len - chunk_size * chunk_idx)
cp_target_rank = (chunk_context_len - 1) % cp_world_size
cur_seq_len = 0
for rank in range(cp_world_size):
if rank > cp_target_rank and cp_chunk_seq_len:
real_cp_chunk_seq_len = cp_chunk_seq_len - 1
else:
real_cp_chunk_seq_len = cp_chunk_seq_len
if real_cp_chunk_seq_len:
kv_c_segment = allgatered_kv_c_normed[rank * toks +
src_token_idx:rank *
toks + src_token_idx +
real_cp_chunk_seq_len]
k_pe_segment = allgatered_k_pe[rank * toks +
src_token_idx:rank * toks +
src_token_idx +
real_cp_chunk_seq_len]
kv_c_segments.append(kv_c_segment)
k_pe_segments.append(k_pe_segment)
cur_seq_len += real_cp_chunk_seq_len
max_seq_len_check = max(max_seq_len_check, cur_seq_len)
src_token_idx += cp_chunk_seq_len
reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0)
reorganized_k_pe = torch.cat(k_pe_segments, dim=0)
assert reorganized_kv_c_normed.shape[0] == sum_seq_len
assert reorganized_k_pe.shape[0] == sum_seq_len
assert max_seq_len_check == max_seq_len
return reorganized_kv_c_normed, reorganized_k_pe
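As a worked example (illustrative numbers only): with cp_world_size = 2, a chunk_size large enough to cover the whole context, and a single request with origin_context_len = 5, cp_chunk_seq_len is ceil(5/2) = 3 and cp_target_rank is (5 - 1) % 2 = 0; rank 0 then contributes 3 rows and rank 1 contributes 3 - 1 = 2 rows, reconstructing all 5 context rows, so sum_seq_len = max_seq_len = 5.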
class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
"""
NOTE: Please read the comment at the top of the file before trying to
@ -836,6 +987,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
self.vllm_flash_attn_version == 3
and current_platform.get_device_capability()[0] == 9)
self.dcp_world_size: Optional[int] = None
def _flash_attn_varlen_diff_headdims(self,
q,
k,
@ -1152,6 +1305,108 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
return output, output_lse
def _context_parallel_compute_prefill_context(
self,
q: torch.Tensor,
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: MLACommonMetadata,
k_scale: torch.Tensor,
dcp_world_size: int,
):
assert k_scale is None, "DCP does not support scaled kvcache yet."
assert attn_metadata.prefill is not None
prefill_metadata = attn_metadata.prefill
assert prefill_metadata.chunked_context is not None
assert prefill_metadata.chunked_context.cp_chunk_seq_lens is not None
assert prefill_metadata.chunked_context.origin_context_lens is not None
assert prefill_metadata.chunked_context.cp_cu_seq_lens is not None
assert prefill_metadata.chunked_context.chunk_size is not None
assert prefill_metadata.chunked_context.cu_seq_lens_lst is not None
output = None
iters = len(prefill_metadata.chunked_context.seq_tot)
workspace = prefill_metadata.chunked_context.workspace
for i in range(iters):
toks = prefill_metadata.chunked_context.seq_tot[i]
ops.cp_gather_cache(
src_cache=kv_c_and_k_pe_cache,
dst=workspace,
block_table=prefill_metadata.block_table,
cu_seq_lens=prefill_metadata.chunked_context.cp_cu_seq_lens[i],
batch_size=attn_metadata.num_prefills,
seq_starts=prefill_metadata.chunked_context.starts[i],
)
# workspace
# |------- N tokens --------|--------- N*dcp_size tokens ----------|
# |<-use for local_gather ->|<--------- use for allgather -------->|
allgather_offset = workspace.shape[0] // (dcp_world_size + 1)
assert allgather_offset * (dcp_world_size +
1) == workspace.shape[0]
assert toks <= allgather_offset
local_gathered_kvcache = workspace[:toks]
cur_allgather_workspace = workspace[
allgather_offset:allgather_offset * (1 + dcp_world_size)]
assert toks * dcp_world_size <= cur_allgather_workspace.shape[0]
cur_allgather_kvcache = cur_allgather_workspace[:toks *
dcp_world_size]
cur_allgather_kvcache.copy_(get_dcp_group().all_gather(
local_gathered_kvcache, dim=0))
assert cur_allgather_kvcache.shape[
-1] == self.kv_lora_rank + self.qk_rope_head_dim
allgatered_kv_c_normed, allgatered_k_pe = \
cur_allgather_kvcache.unsqueeze(
1).split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
kv_c_normed, k_pe = reorg_kvcache(
allgatered_kv_c_normed,
allgatered_k_pe,
cp_chunk_seq_lens_lst=prefill_metadata.chunked_context.
cp_chunk_seq_lens[i],
origin_context_lens=prefill_metadata.chunked_context.
origin_context_lens,
cp_world_size=dcp_world_size,
sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i]
[-1],
max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i],
chunk_size=prefill_metadata.chunked_context.chunk_size,
chunk_idx=i,
toks=toks)
kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
-1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
k_nope, v = kv_nope\
.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
dim=-1)
attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
prefill=prefill_metadata,
chunk_idx=i,
q=q,
k=k,
v=v,
)
if output is None:
output = attn_output
output_lse = attn_softmax_lse
else:
output_tmp = torch.empty_like(output)
output_lse_tmp = torch.empty_like(output_lse)
merge_attn_states(
output=output_tmp,
output_lse=output_lse_tmp,
prefix_output=output,
prefix_lse=output_lse,
suffix_output=attn_output,
suffix_lse=attn_softmax_lse,
)
output = output_tmp
output_lse = output_lse_tmp
return output, output_lse
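To make the workspace split in the loop above concrete, here is a small arithmetic sketch with assumed sizes (not values taken from the source):

# Illustrative only: dcp_world_size = 2 and an original
# chunked_prefill_workspace_size of 8 rows, so the builder allocates
# 8 + 8 // 2 = 12 rows (see the enlarged allocation earlier in this diff).
rows, dcp_world_size = 12, 2
allgather_offset = rows // (dcp_world_size + 1)  # 4 rows for the local gather
assert allgather_offset * (dcp_world_size + 1) == rows
local_region = (0, allgather_offset)                          # rows [0, 4)
allgather_region = (allgather_offset,
                    allgather_offset * (1 + dcp_world_size))  # rows [4, 12)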
def _forward_prefill(
self,
q: torch.Tensor,
@ -1162,6 +1417,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
k_scale: torch.Tensor,
) -> torch.Tensor:
assert attn_metadata.prefill is not None
assert self.dcp_world_size is not None
has_context = attn_metadata.prefill.chunked_context is not None
kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
@ -1181,8 +1437,15 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
if has_context:
suffix_output, suffix_lse = output
context_output, context_lse = self._compute_prefill_context( \
q, kv_c_and_k_pe_cache, attn_metadata, k_scale)
if self.dcp_world_size > 1:
context_output, context_lse = \
self._context_parallel_compute_prefill_context(
q, kv_c_and_k_pe_cache, attn_metadata,
k_scale=None, dcp_world_size=self.dcp_world_size)
else:
context_output, context_lse = \
self._compute_prefill_context(
q, kv_c_and_k_pe_cache, attn_metadata, k_scale)
output = torch.empty_like(suffix_output)
merge_attn_states(
@ -1202,12 +1465,11 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
@abstractmethod
def _forward_decode(
self,
ql_nope: torch.Tensor,
q_pe: torch.Tensor,
q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: M,
layer: AttentionLayer,
) -> torch.Tensor:
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
raise NotImplementedError
def forward(
@ -1235,6 +1497,9 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
# same expert outputs.
return output.fill_(0)
if self.dcp_world_size is None:
self.dcp_world_size = get_dcp_group().world_size
fp8_attention = self.kv_cache_dtype.startswith("fp8")
num_actual_toks = attn_metadata.num_actual_tokens
@ -1313,7 +1578,26 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
layer._q_scale)
decode_q_pe = decode_q_pe.reshape(q_pe_shape)
output[:num_decode_tokens] = self._forward_decode(
decode_ql_nope, decode_q_pe, kv_cache, attn_metadata, layer)
decode_q = (decode_ql_nope, decode_q_pe)
if self.dcp_world_size > 1:
assert not fp8_attention, "DCP does not support fp8 kvcache yet."
# concatenate decode_ql_nope and decode_q_pe -> (B, N, L + P)
decode_q = torch.cat(decode_q, dim=-1)
# Allgather decode_q along the head dim.
decode_q = get_dcp_group().all_gather(decode_q, dim=1)
# call decode attn
attn_out, lse = self._forward_decode(decode_q, kv_cache,
attn_metadata, layer)
# Correct the DCP attn_out with the lse.
if self.dcp_world_size > 1:
assert lse is not None, (
"For an MLA backend to enable DCP, the corresponding decode "
"attn kernel must return the softmax lse.")
attn_out = cp_lse_ag_out_rs(attn_out, lse, get_dcp_group())
# v_up projection
output[:num_decode_tokens] = self._v_up_proj(attn_out)
return output_padded
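The lse requirement above exists because partial attention outputs computed over disjoint KV shards can be merged exactly with log-sum-exp weights; a minimal, self-contained sketch of that merge (the same math merge_attn_states implements; the helper name is hypothetical):

import torch

def merge_partial_attn(o_a: torch.Tensor, lse_a: torch.Tensor,
                       o_b: torch.Tensor, lse_b: torch.Tensor):
    # o_*: [tokens, heads, head_dim], lse_*: [tokens, heads]
    m = torch.maximum(lse_a, lse_b)
    w_a = torch.exp(lse_a - m).unsqueeze(-1)
    w_b = torch.exp(lse_b - m).unsqueeze(-1)
    merged = (w_a * o_a + w_b * o_b) / (w_a + w_b)
    merged_lse = m + torch.log((w_a + w_b).squeeze(-1))
    return merged, merged_lse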

View File

@ -232,7 +232,7 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
self._workspace.get_buf(),
self.scale, self._num_kv_splits)
return self._v_up_proj(o)
return o
# TODO: Currently we leave it here only for backup in case something is
# wrong with the new SM100 CUTLASS MLA kernel
@ -265,21 +265,25 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
attn_metadata.decode.seq_lens,
attn_metadata.decode.block_table, self.scale)
return self._v_up_proj(o)
return o
def _forward_decode(
self,
q_nope: torch.Tensor,
q_pe: torch.Tensor,
q: torch.Tensor,
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: MLACommonMetadata,
layer: AttentionLayer,
) -> torch.Tensor:
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
if type(q) is tuple:
q_nope, q_pe = q
else:
q_nope, q_pe = torch.split(
q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
if self._use_old_cutlass_mla:
# TODO: Remove the old cutlass MLA kernel after more extensive
# testing
return self._old_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache,
attn_metadata)
attn_metadata), None
return self._sm100_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache,
attn_metadata)
attn_metadata), None

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import ClassVar, Optional
from typing import ClassVar, Optional, Union
import torch
@ -154,15 +154,20 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
def _forward_decode(
self,
q_nope: torch.Tensor,
q_pe: torch.Tensor,
q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: FlashAttnMLAMetadata,
layer: AttentionLayer,
) -> torch.Tensor:
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
assert kv_c_and_k_pe_cache.numel() > 0
assert attn_metadata.decode is not None
if type(q) is tuple:
q_nope, q_pe = q
else:
q_nope, q_pe = torch.split(
q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
if self.kv_cache_dtype.startswith("fp8"):
raise NotImplementedError(
"FP8 FlashAttention MLA not yet supported")

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import ClassVar, Optional
from typing import ClassVar, Optional, Union
import torch
@ -169,20 +169,20 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
def _forward_decode(
self,
q_nope: torch.Tensor,
q_pe: torch.Tensor,
q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: FlashMLAMetadata,
layer: AttentionLayer,
) -> torch.Tensor:
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
assert kv_c_and_k_pe_cache.numel() > 0
assert attn_metadata.decode is not None
q = torch.cat([q_nope, q_pe], dim=-1)\
.unsqueeze(1) # Add seqlen dim of 1 (decode)
if type(q) is tuple:
q = torch.cat(q, dim=-1)
o, _ = flash_mla_with_kvcache(
q=q,
assert isinstance(q, torch.Tensor)
o, lse = flash_mla_with_kvcache(
q=q.unsqueeze(1), # Add seqlen dim of 1 (decode)
k_cache=kv_c_and_k_pe_cache.unsqueeze(-2), # Add head dim of 1
block_table=attn_metadata.decode.block_table,
cache_seqlens=attn_metadata.decode.seq_lens,
@ -196,4 +196,4 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
descale_k=layer._k_scale.reshape(1),
)
return self._v_up_proj(o)
return o, lse

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
from typing import ClassVar, Optional
from typing import ClassVar, Optional, Union
import torch
@ -220,18 +220,19 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
def _forward_decode(
self,
q_nope: torch.Tensor,
q_pe: torch.Tensor,
q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: AiterMLAMetadata,
layer: AttentionLayer,
) -> torch.Tensor:
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
assert kv_c_and_k_pe_cache.numel() > 0
assert attn_metadata.decode is not None
B = q_nope.shape[0]
if type(q) is tuple:
q = torch.cat(q, dim=-1)
q = torch.cat([q_nope, q_pe], dim=-1)
assert isinstance(q, torch.Tensor)
B = q.shape[0]
o = torch.zeros(B,
self.num_heads,
self.kv_lora_rank,
@ -249,4 +250,4 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
attn_metadata.decode.paged_kv_indices,
attn_metadata.decode.paged_kv_last_page_len)
return self._v_up_proj(o)
return o, None

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
from typing import Optional, Union
import torch
@ -123,21 +123,22 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
def _forward_decode(
self,
q_nope: torch.Tensor,
q_pe: torch.Tensor,
q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: MLACommonMetadata,
layer: AttentionLayer,
) -> torch.Tensor:
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
assert kv_c_and_k_pe_cache.numel() > 0
assert attn_metadata.decode is not None
if self.kv_cache_dtype.startswith("fp8"):
raise NotImplementedError("FP8 Triton MLA not yet supported")
B = q_nope.shape[0]
if type(q) is tuple:
q = torch.cat(q, dim=-1)
q = torch.cat([q_nope, q_pe], dim=-1)
assert isinstance(q, torch.Tensor)
B = q.shape[0]
o = torch.zeros(B,
self.num_heads,
self.kv_lora_rank,
@ -171,4 +172,4 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
attn_metadata.decode.seq_lens, attn_logits,
num_kv_splits, self.scale, PAGE_SIZE)
return self._v_up_proj(o)
return o, None

View File

@ -24,6 +24,7 @@ class KVCacheCoordinator(ABC):
use_eagle: bool,
enable_caching: bool,
enable_kv_cache_events: bool,
dcp_world_size: int,
):
self.kv_cache_config = kv_cache_config
self.max_model_len = max_model_len
@ -39,6 +40,7 @@ class KVCacheCoordinator(ABC):
kv_cache_spec=kv_cache_group.kv_cache_spec,
block_pool=self.block_pool,
kv_cache_group_id=i,
dcp_world_size=dcp_world_size,
) for i, kv_cache_group in enumerate(
self.kv_cache_config.kv_cache_groups))
@ -197,9 +199,14 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
"""
def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
use_eagle: bool, enable_kv_cache_events: bool):
super().__init__(kv_cache_config, max_model_len, use_eagle, False,
enable_kv_cache_events)
use_eagle: bool, enable_kv_cache_events: bool,
dcp_world_size: int):
super().__init__(kv_cache_config,
max_model_len,
use_eagle,
False,
enable_kv_cache_events,
dcp_world_size=dcp_world_size)
self.num_single_type_manager = len(self.single_type_managers)
def get_num_common_prefix_blocks(self, request_id: str,
@ -225,12 +232,19 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
use_eagle: bool, enable_caching: bool,
enable_kv_cache_events: bool):
super().__init__(kv_cache_config, max_model_len, use_eagle,
enable_caching, enable_kv_cache_events)
enable_kv_cache_events: bool, dcp_world_size: int):
super().__init__(kv_cache_config,
max_model_len,
use_eagle,
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size)
self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[
0].kv_cache_spec
self.block_size = self.kv_cache_spec.block_size
self.dcp_world_size = dcp_world_size
if dcp_world_size > 1:
self.block_size *= dcp_world_size
assert len(self.kv_cache_config.kv_cache_groups) == 1, (
"UnitaryKVCacheCoordinator assumes only one kv cache group")
@ -246,6 +260,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
block_pool=self.block_pool,
kv_cache_spec=self.kv_cache_spec,
use_eagle=self.use_eagle,
dcp_world_size=self.dcp_world_size,
)
return hit_blocks, len(hit_blocks[0]) * self.block_size
@ -261,9 +276,14 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
use_eagle: bool, enable_caching: bool,
enable_kv_cache_events: bool):
super().__init__(kv_cache_config, max_model_len, use_eagle,
enable_caching, enable_kv_cache_events)
enable_kv_cache_events: bool, dcp_world_size: int):
super().__init__(kv_cache_config,
max_model_len,
use_eagle,
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size)
assert dcp_world_size == 1, "DCP does not support hybrid attention yet."
self.verify_and_split_kv_cache_groups()
def verify_and_split_kv_cache_groups(self) -> None:
@ -394,17 +414,27 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
return hit_blocks, hit_length
def get_kv_cache_coordinator(
kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
enable_caching: bool,
enable_kv_cache_events: bool) -> KVCacheCoordinator:
def get_kv_cache_coordinator(kv_cache_config: KVCacheConfig,
max_model_len: int, use_eagle: bool,
enable_caching: bool,
enable_kv_cache_events: bool,
dcp_world_size: int) -> KVCacheCoordinator:
if not enable_caching:
return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
return KVCacheCoordinatorNoPrefixCache(kv_cache_config,
max_model_len,
use_eagle,
enable_kv_cache_events)
enable_kv_cache_events,
dcp_world_size=dcp_world_size)
if len(kv_cache_config.kv_cache_groups) == 1:
return UnitaryKVCacheCoordinator(kv_cache_config, max_model_len,
use_eagle, enable_caching,
enable_kv_cache_events)
return HybridKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle,
enable_caching, enable_kv_cache_events)
return UnitaryKVCacheCoordinator(kv_cache_config,
max_model_len,
use_eagle,
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size)
return HybridKVCacheCoordinator(kv_cache_config,
max_model_len,
use_eagle,
enable_caching,
enable_kv_cache_events,
dcp_world_size=dcp_world_size)

View File

@ -91,6 +91,7 @@ class KVCacheManager:
use_eagle: bool = False,
log_stats: bool = False,
enable_kv_cache_events: bool = False,
dcp_world_size: int = 1,
) -> None:
self.max_model_len = max_model_len
@ -109,12 +110,20 @@ class KVCacheManager:
self.block_size = kv_cache_config.kv_cache_groups[
0].kv_cache_spec.block_size
if dcp_world_size > 1:
assert len(kv_cache_config.kv_cache_groups) == 1
# Note(hc): needs revisiting. When both DCP and any future
# PCP are enabled, the block_size may need to be scaled
# by a factor of dcp_size × pcp_size?
self.block_size *= dcp_world_size
self.coordinator = get_kv_cache_coordinator(
kv_cache_config=kv_cache_config,
max_model_len=self.max_model_len,
use_eagle=self.use_eagle,
enable_caching=self.enable_caching,
enable_kv_cache_events=enable_kv_cache_events,
dcp_world_size=dcp_world_size,
)
self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
self.block_pool = self.coordinator.block_pool

View File

@ -846,6 +846,12 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
)
num_tokens = num_blocks * vllm_config.cache_config.block_size
if vllm_config.parallel_config.decode_context_parallel_size > 1:
num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
logger.info(
"Multiplying the GPU KV cache size by the dcp_world_size %d.",
vllm_config.parallel_config.decode_context_parallel_size)
num_tokens_str = f"{num_tokens:,}"
logger.info("GPU KV cache size: %s tokens", num_tokens_str)
max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"

View File

@ -100,6 +100,15 @@ class Scheduler(SchedulerInterface):
self.block_size = self.cache_config.block_size
self.dcp_world_size = \
vllm_config.parallel_config.decode_context_parallel_size
# Note(hc): The scheduler's block_size must be multiplied
# by dcp_world_size, since block hashes are computed on the
# original full token sequence at a granularity of
# original_block_size × dcp_world_size.
if self.dcp_world_size > 1:
self.block_size *= self.dcp_world_size
# req_id -> Request
self.requests: dict[str, Request] = {}
# Scheduling policy
@ -161,6 +170,7 @@ class Scheduler(SchedulerInterface):
use_eagle=self.use_eagle,
log_stats=self.log_stats,
enable_kv_cache_events=self.enable_kv_cache_events,
dcp_world_size=self.dcp_world_size,
)
self.use_pp = self.parallel_config.pipeline_parallel_size > 1

View File

@ -25,6 +25,7 @@ class SingleTypeKVCacheManager(ABC):
kv_cache_spec: KVCacheSpec,
block_pool: BlockPool,
kv_cache_group_id: int,
dcp_world_size: int = 1,
) -> None:
"""
Initializes the SingleTypeKVCacheManager.
@ -33,8 +34,10 @@ class SingleTypeKVCacheManager(ABC):
block_pool: The block pool.
kv_cache_group_id: The id of the kv cache group of this manager.
"""
self.block_size = kv_cache_spec.block_size
self.dcp_world_size = dcp_world_size
if self.dcp_world_size > 1:
self.block_size *= dcp_world_size
self.kv_cache_spec = kv_cache_spec
self.block_pool = block_pool
@ -196,6 +199,7 @@ class SingleTypeKVCacheManager(ABC):
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
"""
Get the longest cache hit prefix of the blocks that is not longer than
@ -253,6 +257,7 @@ class FullAttentionManager(SingleTypeKVCacheManager):
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(
kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec)
@ -260,7 +265,10 @@ class FullAttentionManager(SingleTypeKVCacheManager):
"and chunked local attention groups"
computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
[] for _ in range(len(kv_cache_group_ids)))
max_num_blocks = max_length // kv_cache_spec.block_size
block_size = kv_cache_spec.block_size
if dcp_world_size > 1:
block_size *= dcp_world_size
max_num_blocks = max_length // block_size
for block_hash in itertools.islice(block_hashes, max_num_blocks):
# block_hashes is a chain of block hashes. If a block hash is not
# in the cached_block_hash_to_id, the following block hashes are
@ -310,9 +318,11 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(kv_cache_spec, SlidingWindowSpec), (
"SlidingWindowManager can only be used for sliding window groups")
assert dcp_world_size == 1, "DCP does not support sliding window attention yet."
# The number of contiguous blocks needed for prefix cache hit.
# -1 since the input token itself is also included in the window
@ -408,6 +418,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
"""
For chunked local attention, we need to find the longest cache hit
@ -445,6 +456,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
"chunked local attention groups")
assert use_eagle is False, ("Hybrid KV cache is not supported for " +
"eagle + chunked local attention.")
assert dcp_world_size == 1, "DCP does not support chunked local attention yet."
max_num_blocks = max_length // kv_cache_spec.block_size
if max_length > 0:
local_attention_start_idx = (max_length //
@ -525,10 +537,12 @@ class MambaManager(SingleTypeKVCacheManager):
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(
kv_cache_spec,
MambaSpec), ("MambaManager can only be used for mamba groups")
assert dcp_world_size == 1, "DCP does not support Mamba yet."
# Prefix caching is not supported for mamba now. Always return empty
# list.
computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
@ -583,6 +597,7 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
block_pool: BlockPool,
kv_cache_spec: KVCacheSpec,
use_eagle: bool,
dcp_world_size: int = 1,
) -> tuple[list[KVCacheBlock], ...]:
assert isinstance(kv_cache_spec, CrossAttentionSpec), (
"CrossAttentionManager can only be used for cross-attention groups"

View File

@ -439,7 +439,7 @@ class EngineCore:
"""
# Note on thread safety: no race condition.
# `mm_receiver_cache` is reset at the end of LLMEngine init,
# and will only accessed in the input processing thread afterwards.
# and will only be accessed in the input processing thread afterwards.
if self.mm_receiver_cache is not None and request.mm_features:
request.mm_features = (
self.mm_receiver_cache.get_and_update_features(

View File

@ -233,6 +233,11 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
def _protected_step(self, next_token_id: int) -> Optional[str]:
try:
token = self.stream.step(self.tokenizer, next_token_id)
except OverflowError:
# Handle rare observed overflow, still to be diagnosed.
# See https://github.com/vllm-project/vllm/issues/21951.
logger.exception("Encountered invalid token id: %d", next_token_id)
token = None
except Exception as e:
if not str(e).startswith(INVALID_PREFIX_ERR_MSG):
raise e

View File

@ -3,6 +3,7 @@
import multiprocessing
import os
import pickle
import queue
import signal
import threading
import time
@ -33,7 +34,8 @@ from vllm.utils import (decorate_logs, get_distributed_init_method,
get_loopback_ip, get_mp_context, get_open_port,
set_process_title)
from vllm.v1.executor.abstract import Executor, FailureCallback
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
ModelRunnerOutput)
from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
@ -253,7 +255,8 @@ class MultiprocExecutor(Executor):
if not non_block:
result = result.result()
elif not non_block:
result = get_response(w, dequeue_timeout)
result = get_response(w, dequeue_timeout,
self.shutdown_event)
else:
raise RuntimeError("non_block can only be used when"
" max_concurrent_batches > 1")
@ -295,12 +298,8 @@ class MultiprocExecutor(Executor):
"""Properly shut down the executor and its workers"""
if not getattr(self, 'shutting_down', False):
self.shutting_down = True
self.shutdown_event.set()
if self.io_thread_pool is not None:
self.io_thread_pool.shutdown(wait=False, cancel_futures=True)
self.io_thread_pool = None
# Make sure all the worker processes are terminated first.
if workers := getattr(self, 'workers', None):
for w in workers:
# Close death_writer to signal child processes to exit
@ -310,6 +309,11 @@ class MultiprocExecutor(Executor):
w.worker_response_mq = None
self._ensure_worker_termination([w.proc for w in workers])
self.shutdown_event.set()
if self.io_thread_pool is not None:
self.io_thread_pool.shutdown(wait=False, cancel_futures=True)
del self.io_thread_pool
self.rpc_broadcast_mq = None
def check_health(self) -> None:
@ -412,6 +416,16 @@ class WorkerProc:
# Initializes a message queue for sending the model output
self.worker_response_mq = MessageQueue(1, 1)
scheduler_config = vllm_config.scheduler_config
self.use_async_scheduling = scheduler_config.async_scheduling
if self.use_async_scheduling:
self.async_output_queue: queue.Queue = queue.Queue()
self.async_output_copy_thread = Thread(
target=self.async_output_busy_loop,
daemon=True,
name="WorkerAsyncOutputCopy")
self.async_output_copy_thread.start()
# Initialize device and loads weights
self.worker.init_device()
self.worker.load_model()
@ -593,6 +607,36 @@ class WorkerProc:
SUCCESS = auto()
FAILURE = auto()
def enqueue_output(self, output: Any):
"""Prepares output from the worker and enqueues it to the
worker_response_mq. If the output is an Exception, it is
converted to a FAILURE response.
"""
if isinstance(output, AsyncModelRunnerOutput):
output = output.get_output()
if isinstance(output, Exception):
result = (WorkerProc.ResponseStatus.FAILURE, str(output))
else:
result = (WorkerProc.ResponseStatus.SUCCESS, output)
self.worker_response_mq.enqueue(result)
def handle_output(self, output: Any):
"""Handles output from the worker. If async scheduling is enabled,
it is passed to the async_output_busy_loop thread. Otherwise, it is
enqueued directly to the worker_response_mq.
"""
if self.use_async_scheduling:
self.async_output_queue.put(output)
else:
self.enqueue_output(output)
def async_output_busy_loop(self):
"""Entrypoint for the thread which handles outputs asynchronously."""
while True:
output = self.async_output_queue.get()
self.enqueue_output(output)
def worker_busy_loop(self):
"""Main busy loop for Multiprocessing Workers"""
while True:
@ -612,10 +656,8 @@ class WorkerProc:
# exception might not be serializable, so we convert it to
# string, only for logging purpose.
if output_rank is None or self.rank == output_rank:
self.worker_response_mq.enqueue(
(WorkerProc.ResponseStatus.FAILURE, str(e)))
self.handle_output(e)
continue
if output_rank is None or self.rank == output_rank:
self.worker_response_mq.enqueue(
(WorkerProc.ResponseStatus.SUCCESS, output))
self.handle_output(output)

View File

@ -86,6 +86,12 @@ class FullAttentionSpec(AttentionSpec):
def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
max_model_len = vllm_config.model_config.max_model_len
dcp_world_size = \
vllm_config.parallel_config.decode_context_parallel_size
# Note(hc): each dcp rank only needs to save
# (max_model_len//dcp_world_size) tokens locally.
if dcp_world_size > 1:
max_model_len = cdiv(max_model_len, dcp_world_size)
return cdiv(max_model_len, self.block_size) * self.page_size_bytes
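For example (illustrative numbers only): with max_model_len = 8192, dcp_world_size = 4, and block_size = 16, each rank budgets cdiv(8192, 4) = 2048 tokens locally, i.e. cdiv(2048, 16) = 128 blocks times page_size_bytes, a quarter of the non-DCP figure.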
@classmethod
@ -162,6 +168,8 @@ class SlidingWindowSpec(AttentionSpec):
assert not self.use_mla, "MLA is not supported for sliding window"
def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
assert vllm_config.parallel_config.decode_context_parallel_size == 1, \
"DCP does not support sliding window yet."
max_model_len = vllm_config.model_config.max_model_len
max_num_batched_tokens = (
vllm_config.scheduler_config.max_num_batched_tokens)

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import NamedTuple, Optional
@ -114,6 +115,20 @@ class ModelRunnerOutput:
num_nans_in_logits: Optional[dict[str, int]] = None
# ModelRunnerOutput wrapper for async scheduling.
class AsyncModelRunnerOutput(ABC):
@abstractmethod
def get_output(self) -> ModelRunnerOutput:
"""Get the ModelRunnerOutput for this async output.
This is a blocking call that waits until the results are ready, which
might involve copying device tensors to the host.
This method should only be called once per AsyncModelRunnerOutput.
"""
pass
@dataclass
class DraftTokenIds:

View File

@ -19,6 +19,8 @@ from vllm.utils import (get_open_port, get_open_zmq_ipc_path, get_tcp_uri,
kill_process_tree)
if TYPE_CHECKING:
import numpy as np
from vllm.v1.engine.coordinator import DPCoordinator
from vllm.v1.engine.utils import (CoreEngineActorManager,
CoreEngineProcManager)
@ -97,20 +99,31 @@ class ConstantList(Generic[T], Sequence):
class CpuGpuBuffer:
"""Buffer to easily copy tensors between CPU and GPU."""
def __init__(
self,
*args,
*size: Union[int, torch.SymInt],
dtype: torch.dtype,
device: torch.device,
pin_memory: bool,
):
self.cpu = torch.zeros(*args,
with_numpy: bool = True,
) -> None:
self.cpu = torch.zeros(*size,
dtype=dtype,
device="cpu",
pin_memory=pin_memory)
self.np = self.cpu.numpy()
self.gpu = self.cpu.to(device)
self.np: np.ndarray
# To keep type hints simple (avoiding generics and subclasses), we
# only conditionally create the numpy array attribute. This can cause
# AttributeError if `self.np` is accessed when `with_numpy=False`.
if with_numpy:
if dtype == torch.bfloat16:
raise ValueError(
"Bfloat16 torch tensors cannot be directly cast to a "
"numpy array, so call CpuGpuBuffer with with_numpy=False")
self.np = self.cpu.numpy()
def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor:
if n is None:
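The with_numpy guard above exists because torch bfloat16 tensors have no numpy dtype; the tiny standalone check below (not part of vLLM) reproduces the failure the guard avoids.

import torch

cpu = torch.zeros(4, dtype=torch.float32)
np_view = cpu.numpy()            # fine: float32 maps to a numpy dtype
bf16 = torch.zeros(4, dtype=torch.bfloat16)
try:
    bf16.numpy()                 # raises: bfloat16 has no numpy equivalent
except TypeError as exc:
    print("expected failure:", exc)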

View File

@ -4,6 +4,7 @@
import numpy as np
import torch
from vllm.distributed import get_dcp_group
from vllm.logger import init_logger
from vllm.utils import cdiv
@ -50,6 +51,13 @@ class BlockTable:
self.slot_mapping = torch.zeros(self.max_num_batched_tokens,
dtype=torch.int64,
device=self.device)
try:
self.dcp_world_size = get_dcp_group().world_size
self.dcp_rank = get_dcp_group().rank_in_group
except AssertionError:
# DCP might not be initialized in testing
self.dcp_world_size = 1
self.dcp_rank = 0
def append_row(
self,
@ -89,13 +97,36 @@ class BlockTable:
# NOTE(woosuk): We can't simply use `token_indices // block_size`
# here because M (max_model_len) is not necessarily divisible by
# block_size.
block_table_indices = (req_indices * self.max_num_blocks_per_req +
positions // self.block_size)
block_numbers = self.block_table_np.ravel()[block_table_indices]
block_offsets = positions % self.block_size
np.add(block_numbers * self.block_size,
block_offsets,
out=self.slot_mapping_np[:req_indices.shape[0]])
if self.dcp_world_size > 1:
# Note(hc): DCP stores the KV cache in an interleaved style: the
# KV cache for the token whose token_idx is i is always stored on
# the GPU whose dcp_rank equals i % dcp_world_size.
# Use a "virtual block" whose size is dcp_world_size * block_size
# for the block_table_indices calculation.
virtual_block_size = self.block_size * self.dcp_world_size
block_table_indices = (req_indices * self.max_num_blocks_per_req +
positions // virtual_block_size)
block_numbers = self.block_table_np.ravel()[block_table_indices]
# Use virtual_block_size for the mask calculation; the mask marks
# tokens that are local to this rank.
virtual_block_offsets = positions % virtual_block_size
mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank
# Calculate local block_offsets
block_offsets = virtual_block_offsets // self.dcp_world_size
# Calculate slot_mapping
slot_mapping = block_numbers * self.block_size + block_offsets
# Write final slots; use -1 for non-local tokens
self.slot_mapping_np[:req_indices.shape[0]] = np.where(
mask, slot_mapping, -1)
else:
block_table_indices = (req_indices * self.max_num_blocks_per_req +
positions // self.block_size)
block_numbers = self.block_table_np.ravel()[block_table_indices]
block_offsets = positions % self.block_size
np.add(block_numbers * self.block_size,
block_offsets,
out=self.slot_mapping_np[:req_indices.shape[0]])
def commit_block_table(self, num_reqs: int) -> None:
self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
@ -128,9 +159,19 @@ class MultiGroupBlockTable:
def __init__(self, max_num_reqs: int, max_model_len: int,
max_num_batched_tokens: int, pin_memory: bool,
device: torch.device, block_sizes: list[int]) -> None:
# Note(hc): each DCP rank only stores
# (max_model_len // dcp_world_size) tokens in the KV cache,
# so the block_size used to compute max_num_blocks_per_req
# must be multiplied by dcp_world_size.
try:
dcp_world_size = get_dcp_group().world_size
except AssertionError:
# DCP might not be initialized in testing
dcp_world_size = 1
self.block_tables = [
BlockTable(block_size, max_num_reqs, cdiv(max_model_len,
block_size),
BlockTable(block_size, max_num_reqs,
cdiv(max_model_len, block_size * dcp_world_size),
max_num_batched_tokens, pin_memory, device)
for block_size in block_sizes
]
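A worked toy example of the interleaved DCP slot mapping helps: with dcp_world_size=2, token i belongs to rank i % 2, local offsets are compacted by dividing the virtual offset by the world size, and non-local tokens are marked with -1. The numbers below are illustrative only.

import numpy as np

block_size, dcp_world_size, dcp_rank = 4, 2, 0
virtual_block_size = block_size * dcp_world_size

positions = np.arange(8)                      # token positions of one request
block_table_row = np.array([3])               # the request owns physical block 3
block_numbers = block_table_row[positions // virtual_block_size]

virtual_block_offsets = positions % virtual_block_size
mask = virtual_block_offsets % dcp_world_size == dcp_rank  # local tokens
block_offsets = virtual_block_offsets // dcp_world_size
slot_mapping = np.where(mask, block_numbers * block_size + block_offsets, -1)
print(slot_mapping)  # [12 -1 13 -1 14 -1 15 -1]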

View File

@ -250,6 +250,11 @@ class InputBatch:
self.pooling_params: dict[str, PoolingParams] = {}
# Cached reference to the GPU tensor of previously sampled tokens
self.prev_sampled_token_ids: Optional[torch.Tensor] = None
self.prev_sampled_token_ids_invalid_indices: Optional[set[int]] = None
self.prev_req_id_to_index: Optional[dict[str, int]] = None
@property
def req_ids(self) -> list[str]:
# None elements should only be present transiently

View File

@ -56,6 +56,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
GiB_bytes, LazyLoader, cdiv, check_use_alibi,
get_dtype_size, is_pin_memory_available, round_up,
supports_dynamo)
from vllm.v1.attention.backends.mla.flashmla import FlashMLABackend
from vllm.v1.attention.backends.utils import (
AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
create_fast_prefill_custom_backend,
@ -67,8 +68,8 @@ from vllm.v1.kv_cache_interface import (AttentionSpec,
FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec, KVCacheSpec,
MambaSpec, SlidingWindowSpec)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
LogprobsTensors, ModelRunnerOutput)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
DraftTokenIds, LogprobsTensors, ModelRunnerOutput)
from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs
from vllm.v1.sample.metadata import SamplingMetadata
@ -100,6 +101,53 @@ else:
logger = init_logger(__name__)
# Wrapper for ModelRunnerOutput to support overlapped execution.
class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
def __init__(
self,
model_runner_output: ModelRunnerOutput,
sampled_token_ids: torch.Tensor,
invalid_req_indices: list[int],
async_output_copy_stream: torch.cuda.Stream,
):
self._model_runner_output = model_runner_output
self._invalid_req_indices = invalid_req_indices
# Event on the copy stream so we can synchronize the non-blocking copy.
self._async_copy_ready_event = torch.cuda.Event()
# Keep a reference to the device tensor to avoid it being
# deallocated until we finish copying it to the host.
self._sampled_token_ids = sampled_token_ids
# Initiate the copy on a separate stream, but do not synchronize it.
default_stream = torch.cuda.current_stream()
with torch.cuda.stream(async_output_copy_stream):
async_output_copy_stream.wait_stream(default_stream)
self._sampled_token_ids_cpu = self._sampled_token_ids.to(
'cpu', non_blocking=True)
self._async_copy_ready_event.record()
def get_output(self) -> ModelRunnerOutput:
"""Copy the device tensors to the host and return a ModelRunnerOutput.
This function blocks until the copy is finished.
"""
self._async_copy_ready_event.synchronize()
# Release the device tensor once the copy has completed
del self._sampled_token_ids
valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
for i in self._invalid_req_indices:
valid_sampled_token_ids[i].clear()
output = self._model_runner_output
output.sampled_token_ids = valid_sampled_token_ids
return output
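The class above initiates the device-to-host copy on a dedicated stream and only synchronizes when get_output() is called. The standalone sketch below shows the same record-event-then-synchronize-later pattern in isolation; it assumes a CUDA device is available and is not the vLLM class itself.

import torch

assert torch.cuda.is_available()  # this sketch needs a GPU
copy_stream = torch.cuda.Stream()
sampled = torch.randint(0, 100, (8, 1), device="cuda")  # pretend sampler output

default_stream = torch.cuda.current_stream()
ready = torch.cuda.Event()
with torch.cuda.stream(copy_stream):
    copy_stream.wait_stream(default_stream)  # order the copy after producer work
    sampled_cpu = sampled.to("cpu", non_blocking=True)
    ready.record()

# ... later, only when the result is actually needed:
ready.synchronize()
print(sampled_cpu.tolist())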
class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
def __init__(
@ -140,6 +188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
model_config.is_multimodal_raw_input_only_model)
self.max_model_len = model_config.max_model_len
self.dcp_world_size = self.parallel_config.decode_context_parallel_size
self.max_num_tokens = scheduler_config.max_num_batched_tokens
self.max_num_reqs = scheduler_config.max_num_seqs
@ -230,6 +279,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
is_pooling_model=self.is_pooling_model,
)
self.use_async_scheduling = self.scheduler_config.async_scheduling
self.async_output_copy_stream = torch.cuda.Stream() if \
self.use_async_scheduling else None
# TODO(woosuk): Provide an option to tune the max cudagraph batch size.
# The convention is different.
# self.cudagraph_batch_sizes sorts in ascending order.
@ -250,10 +303,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
dtype=torch.int32)
self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
self.inputs_embeds = torch.zeros(
(self.max_num_tokens, self.hidden_size),
dtype=self.dtype,
device=self.device)
# Because inputs_embeds may be bfloat16 and we don't need a numpy
# version of this tensor, avoid a RuntimeError by not creating a
# numpy buffer.
self.inputs_embeds = self._make_buffer(self.max_num_tokens,
self.hidden_size,
dtype=self.dtype,
numpy=False)
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
if self.uses_mrope:
@ -321,11 +377,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
device="cpu",
pin_memory=self.pin_memory)
def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer:
return CpuGpuBuffer(*args,
def _make_buffer(self,
*size: Union[int, torch.SymInt],
dtype: torch.dtype,
numpy: bool = True) -> CpuGpuBuffer:
# Bfloat16 torch tensors cannot be directly cast to a numpy array, so
# if a bfloat16 buffer is needed without a corresponding numpy array,
# don't bother instantiating the numpy array.
return CpuGpuBuffer(*size,
dtype=dtype,
device=self.device,
pin_memory=self.pin_memory)
pin_memory=self.pin_memory,
with_numpy=numpy)
def _init_model_kwargs(self, num_tokens: int):
model_kwargs = dict[str, Any]()
@ -377,6 +440,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return
if self.reorder_batch_threshold is not None:
if self.dcp_world_size > 1:
assert self.reorder_batch_threshold == 1, \
"DCP not support reorder_batch_threshold > 1 now."
reorder_batch_to_split_decodes_and_prefills(
self.input_batch,
scheduler_output,
@ -654,6 +720,73 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return cu_num_tokens, arange
def _prepare_input_ids(self, total_num_scheduled_tokens: int,
cu_num_tokens: np.ndarray) -> None:
"""Prepare the input IDs for the current batch.
Carefully handles the `prev_sampled_token_ids` which can be cached
from the previous engine iteration, in which case those tokens on the
GPU need to be copied into the corresponding slots of input_ids."""
if self.input_batch.prev_sampled_token_ids is None:
# Normal scheduling case
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
return
# Async scheduling case, where some decode requests from the previous
# iteration won't have entries in input_ids_cpu and need to be copied
# on the GPU from prev_sampled_token_ids.
prev_req_id_to_index = self.input_batch.prev_req_id_to_index
assert prev_req_id_to_index is not None
flattened_indices = []
prev_common_req_indices = []
indices_match = True
max_flattened_index = -1
for req_id, cur_index in self.input_batch.req_id_to_index.items():
if (prev_index := prev_req_id_to_index.get(req_id)) is not None:
prev_common_req_indices.append(prev_index)
# We need to compute the flattened input_ids index of the
# last token in each common request.
flattened_index = cu_num_tokens[cur_index].item() - 1
flattened_indices.append(flattened_index)
indices_match &= (prev_index == flattened_index)
max_flattened_index = max(max_flattened_index, flattened_index)
num_common_tokens = len(flattened_indices)
if num_common_tokens < total_num_scheduled_tokens:
# If not all requests are decodes from the last iteration,
# we need to copy input_ids_cpu to the GPU first.
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
if num_common_tokens == 0:
# No requests in common with the previous iteration,
# so input_ids_cpu will have all the input ids.
return
if indices_match and max_flattened_index == (num_common_tokens - 1):
# Common-case optimization: the batch is unchanged
# and no reordering happened.
# The indices are both the same permutation of 0..N-1 so
# we can copy directly using a single slice.
self.input_ids.gpu[:num_common_tokens].copy_(
self.input_batch.prev_sampled_token_ids[:num_common_tokens,
0],
non_blocking=True)
return
# Upload the index tensors asynchronously
# so the scatter can be non-blocking.
input_ids_index_tensor = torch.tensor(flattened_indices,
dtype=torch.int64,
pin_memory=self.pin_memory).to(
self.device,
non_blocking=True)
prev_common_req_indices_tensor = torch.tensor(
prev_common_req_indices,
dtype=torch.int64,
pin_memory=self.pin_memory).to(self.device, non_blocking=True)
self.input_ids.gpu.scatter_(
dim=0,
index=input_ids_index_tensor,
src=self.input_batch.prev_sampled_token_ids[
prev_common_req_indices_tensor, 0])
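The scatter at the end writes each previously sampled token into the flattened input_ids slot of its request's last scheduled token. A toy example with made-up shapes (not vLLM data) makes the index bookkeeping concrete:

import torch

input_ids = torch.zeros(6, dtype=torch.int64)              # flattened token buffer
prev_sampled_token_ids = torch.tensor([[11], [22], [33]])  # one token per previous request
flattened_indices = [1, 3, 5]        # last-token slot of each common request
prev_common_req_indices = [0, 1, 2]  # their rows in prev_sampled_token_ids

index = torch.tensor(flattened_indices, dtype=torch.int64)
src = prev_sampled_token_ids[torch.tensor(prev_common_req_indices), 0]
input_ids.scatter_(dim=0, index=index, src=src)
print(input_ids)  # tensor([ 0, 11,  0, 22,  0, 33])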
def _prepare_inputs(
self,
scheduler_output: "SchedulerOutput",
@ -740,7 +873,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
max_seq_len = self.seq_lens.np[:num_reqs].max().item()
# Copy the tensors to the GPU.
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
if self.uses_mrope:
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
@ -1458,7 +1592,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self,
scheduler_output: "SchedulerOutput",
intermediate_tensors: Optional[IntermediateTensors] = None,
) -> Union[ModelRunnerOutput, IntermediateTensors]:
) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
self._update_states(scheduler_output)
if not scheduler_output.total_num_scheduled_tokens:
if not has_kv_transfer_group():
@ -1521,11 +1655,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
)
# TODO(woosuk): Avoid the copy. Optimize.
self.inputs_embeds[:num_scheduled_tokens].copy_(
self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(
inputs_embeds_scheduled)
input_ids = None
inputs_embeds = self.inputs_embeds[:num_input_tokens]
inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
model_kwargs = {
**self._init_model_kwargs(num_scheduled_tokens),
**self._extract_mm_kwargs(scheduler_output),
@ -1673,6 +1807,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# so that we could clear the sampled tokens before returning.
discard_sampled_tokens_req_indices.append(i)
# Copy some objects so they don't get modified after returning.
# This is important when using async scheduling.
req_ids_output_copy = self.input_batch.req_ids.copy()
req_id_to_index_output_copy = \
self.input_batch.req_id_to_index.copy()
# NOTE: GPU -> CPU Sync happens here.
# Move as many CPU operations as possible before this sync point.
logprobs_tensors = sampler_output.logprobs_tensors
@ -1685,21 +1825,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
scheduler_output.num_scheduled_tokens,
)
# Get the valid generated tokens.
num_sampled_tokens = sampler_output.sampled_token_ids.shape[0]
sampled_token_ids = sampler_output.sampled_token_ids
max_gen_len = sampled_token_ids.shape[-1]
if max_gen_len == 1:
# No spec decode tokens.
valid_sampled_token_ids = self._to_list(sampled_token_ids)
if not self.use_async_scheduling:
# Get the valid generated tokens.
max_gen_len = sampled_token_ids.shape[-1]
if max_gen_len == 1:
# No spec decode tokens.
valid_sampled_token_ids = self._to_list(sampled_token_ids)
else:
# Includes spec decode tokens.
valid_sampled_token_ids = self.rejection_sampler.parse_output(
sampled_token_ids,
self.input_batch.vocab_size,
)
# Mask out the sampled tokens that should not be sampled.
for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[i].clear()
else:
# Includes spec decode tokens.
valid_sampled_token_ids = self.rejection_sampler.parse_output(
sampled_token_ids,
self.input_batch.vocab_size,
)
# Mask out the sampled tokens that should not be sampled.
for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[i].clear()
valid_sampled_token_ids = []
invalid_req_indices = list(discard_sampled_tokens_req_indices)
invalid_req_indices_set = set(invalid_req_indices)
assert sampled_token_ids.shape[-1] == 1
# Cache the sampled tokens on the GPU and avoid CPU sync.
# These will be copied into input_ids in the next step
# when preparing inputs.
self.input_batch.prev_sampled_token_ids = \
sampled_token_ids
self.input_batch.prev_sampled_token_ids_invalid_indices = \
invalid_req_indices_set
self.input_batch.prev_req_id_to_index = {
req_id: i
for i, req_id in enumerate(self.input_batch.req_ids)
if i not in invalid_req_indices_set
}
# Cache the sampled tokens in the model runner, so that the scheduler
# doesn't need to send them back.
@ -1707,7 +1867,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# the sampled tokens back, because there's no direct communication
# between the first-stage worker and the last-stage worker.
req_ids = self.input_batch.req_ids
for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
for req_idx in range(num_sampled_tokens):
if self.use_async_scheduling:
sampled_ids = [-1] if \
req_idx not in invalid_req_indices_set else None
else:
sampled_ids = valid_sampled_token_ids[req_idx]
if not sampled_ids:
continue
@ -1722,6 +1887,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
start_idx:end_idx] = sampled_ids
self.input_batch.num_tokens_no_spec[req_idx] = end_idx
self.input_batch.num_tokens[req_idx] = end_idx
req_id = req_ids[req_idx]
req_state = self.requests[req_id]
req_state.output_token_ids.extend(sampled_ids)
@ -1741,9 +1907,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.eplb_step()
return ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index,
output = ModelRunnerOutput(
req_ids=req_ids_output_copy,
req_id_to_index=req_id_to_index_output_copy,
sampled_token_ids=valid_sampled_token_ids,
logprobs=logprobs_lists,
prompt_logprobs_dict=prompt_logprobs_dict,
@ -1752,6 +1918,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
num_nans_in_logits=num_nans_in_logits,
)
if not self.use_async_scheduling:
return output
return AsyncGPUModelRunnerOutput(
model_runner_output=output,
sampled_token_ids=sampled_token_ids,
invalid_req_indices=invalid_req_indices,
async_output_copy_stream=self.async_output_copy_stream,
)
def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
if self._draft_token_ids is None:
return None
@ -2318,7 +2494,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
num_scheduled_tokens, remove_lora):
if self.supports_mm_inputs:
input_ids = None
inputs_embeds = self.inputs_embeds[:num_tokens]
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
model_kwargs = {
**self._init_model_kwargs(num_tokens),
**self._dummy_mm_kwargs(num_reqs),
@ -2665,7 +2841,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Disable cudagraph capturing globally, so any unexpected cudagraph
# capturing will be detected and raise an error after here.
# Note: We don't put it into graph_capture context manager because
# we may doing lazy capturing in future that still allows capturing
# we may do lazy capturing in future that still allows capturing
# after here.
set_cudagraph_capturing_enabled(False)
@ -3144,6 +3320,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
get_kv_transfer_group().set_host_xfer_buffer_ops(
copy_kv_blocks)
if self.dcp_world_size > 1:
assert self.attn_groups[0][0].backend is FlashMLABackend, (
"DCP only support flashmla now."
"For a mla backend want to enable DCP, it is mandatory that the"
"corresponding decode attn kernel return the softmax lse.")
def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
"""
Add encoder-only layers to the KV cache config.

View File

@ -5,7 +5,7 @@ import copy
import gc
import os
from contextlib import AbstractContextManager, nullcontext
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any, Optional, Union
import torch
import torch.distributed
@ -28,8 +28,8 @@ from vllm.tasks import SupportedTask
from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
ModelRunnerOutput)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
DraftTokenIds, ModelRunnerOutput)
from vllm.v1.utils import report_usage_stats
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from vllm.v1.worker.worker_base import WorkerBase
@ -355,7 +355,7 @@ class Worker(WorkerBase):
def execute_model(
self,
scheduler_output: "SchedulerOutput",
) -> Optional[ModelRunnerOutput]:
) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
intermediate_tensors = None
forward_pass = scheduler_output.total_num_scheduled_tokens > 0
if forward_pass and not get_pp_group().is_first_rank:
@ -365,7 +365,7 @@ class Worker(WorkerBase):
output = self.model_runner.execute_model(scheduler_output,
intermediate_tensors)
if isinstance(output, ModelRunnerOutput):
if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
return output
assert isinstance(output, IntermediateTensors)
@ -616,7 +616,9 @@ def init_worker_distributed_environment(
init_distributed_environment(parallel_config.world_size, rank,
distributed_init_method, local_rank, backend)
ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
ensure_model_parallel_initialized(
parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size,
parallel_config.decode_context_parallel_size)
ensure_kv_transfer_initialized(vllm_config)

View File

@ -539,8 +539,10 @@ def init_worker_distributed_environment(
init_distributed_environment(parallel_config.world_size, rank,
distributed_init_method, local_rank,
current_platform.dist_backend)
ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
ensure_model_parallel_initialized(
parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size,
parallel_config.decode_context_parallel_size)
ensure_kv_transfer_initialized(vllm_config)