Merge branch 'main' into elvischenv/update-flashinfer

Pavani Majety 2025-12-22 10:55:04 -08:00 committed by GitHub
commit 20d87ee2f0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
129 changed files with 4552 additions and 1686 deletions

View File

@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
**Benchmarking Duration**: about 1hr.
@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
Runtime environment variables:
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
@ -34,8 +34,9 @@ Runtime environment variables:
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
>
> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
### Latency test
Here is an example of one test inside `latency-tests.json`:

View File

@ -49,7 +49,11 @@ check_cpus() {
echo "Need at least 1 NUMA to run benchmarking."
exit 1
fi
declare -g gpu_type="cpu"
if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
declare -g gpu_type="arm64-cpu"
else
declare -g gpu_type="cpu"
fi
echo "GPU type is $gpu_type"
}
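For illustration, a minimal Python equivalent of the new architecture check (a sketch with a hypothetical helper name, not part of the script):

import platform

def benchmark_suffix() -> str:
    # mirrors the script's `uname -m` test: aarch64/arm64 hosts get
    # the "arm64-cpu" suffix, all other CPU hosts keep "cpu"
    machine = platform.machine().lower()
    return "arm64-cpu" if machine in ("aarch64", "arm64") else "cpu"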
@ -207,8 +211,8 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@ -276,8 +280,8 @@ run_throughput_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
@ -393,8 +397,8 @@ run_serving_tests() {
# check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
if [[ "$ON_CPU" == "1" ]]; then
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
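The `// 1` added to the jq filters above is jq's alternative operator: it substitutes 1 when `pipeline_parallel_size` is absent, so older test JSONs without the key still work. A minimal Python sketch of the same fallback-and-check logic (function name hypothetical):

import json

def required_world_size(params_json: str) -> int:
    params = json.loads(params_json)
    tp = params["tensor_parallel_size"]
    # mirrors jq's '.pipeline_parallel_size // 1' fallback for a missing key
    pp = params.get("pipeline_parallel_size", 1)
    return tp * pp

assert required_world_size('{"tensor_parallel_size": 2}') == 2
assert required_world_size('{"tensor_parallel_size": 2, "pipeline_parallel_size": 2}') == 4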
@ -496,9 +500,9 @@ run_serving_tests() {
main() {
local ARCH
ARCH=''
if [ "$ON_CPU" == "1" ];then
check_cpus
ARCH='-cpu'
if [[ "$ON_CPU" == "1" ]]; then
check_cpus
ARCH="-$gpu_type"
else
check_gpus
ARCH="$arch_suffix"

View File

@ -0,0 +1,26 @@
[
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"num_iters_warmup": 5,
"num_iters": 15
}
}
]

View File

@ -0,0 +1,130 @@
{
"defaults": {
"qps_list": [
"inf"
],
"max_concurrency_list": [
12,
16,
24,
32,
64,
128,
200
],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"ignore-eos": "",
"num_prompts": 200
}
},
"tests": [
{
"test_name": "serving_llama8B_tp1_sharegpt",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 1
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"server_parameters": {
"tensor_parallel_size": 2
},
"client_parameters": {
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128
}
}
]
}

View File

@ -0,0 +1,27 @@
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@ -291,6 +291,7 @@ if __name__ == "__main__":
"""
Arguments:
--version <version> : version string for the current build (e.g., commit hash)
--wheel-dir <wheel_directory> : directory containing wheel files (defaults to `version`)
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
@ -318,6 +319,12 @@ if __name__ == "__main__":
required=True,
help="Directory to store generated index files",
)
parser.add_argument(
"--wheel-dir",
type=str,
default=None,
help="Directory containing wheel files (defaults to `version`)",
)
parser.add_argument(
"--alias-to-default",
type=str,
@ -372,7 +379,7 @@ if __name__ == "__main__":
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
# keep only "official" files for a non-nightly version (specifed by cli args)
# keep only "official" files for a non-nightly version (specified by cli args)
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
if PY_VERSION_RE.match(version):
# upload-wheels.sh ensures no "dev" is in args.version
@ -384,9 +391,10 @@ if __name__ == "__main__":
print("Nightly version detected, keeping all wheel files.")
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/<wheel files>
# s3://vllm-wheels/{wheel_dir}/<wheel files>
# s3://vllm-wheels/<anything>/<index files>
wheel_base_dir = Path(output_dir).parent / version
wheel_dir = args.wheel_dir or version
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
index_base_dir = Path(output_dir)
generate_index_and_metadata(
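To make the override concrete, a small sketch of the resulting path composition (the commit subpath below is hypothetical):

from pathlib import Path

output_dir = "/tmp/indices"
version = "0.8.0"
wheel_dir = "abc123"  # hypothetical --wheel-dir value (a commit subpath)
wheel_base_dir = Path(output_dir).parent / (wheel_dir or version).strip().rstrip("/")
print(wheel_base_dir)  # /tmp/abc123 -- indices now reference the commit directory's wheels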

View File

@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@ -349,7 +349,9 @@ steps:
- label: V1 Test e2e + engine # 65min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
- vllm/
@ -964,7 +966,7 @@ steps:
- pytest -v -s models/multimodal/processing
- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80
timeout_in_minutes: 100
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@ -973,13 +975,15 @@ steps:
- vllm/
- tests/models/multimodal
commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
timeout_in_minutes: 180
- label: Multi-Modal Accuracy Eval (Small Models) # 5min
timeout_in_minutes: 10
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@ -989,7 +993,9 @@ steps:
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt
- label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120
@ -1001,10 +1007,13 @@ steps:
- vllm/
- tests/models/multimodal
commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
- label: Multi-Modal Models Test (Extended) 2
- label: Multi-Modal Models Test (Extended) 2 #60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@ -1013,6 +1022,8 @@ steps:
- vllm/
- tests/models/multimodal
commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
@ -1026,6 +1037,8 @@ steps:
- vllm/
- tests/models/multimodal
commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
@ -1243,13 +1256,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
@ -1497,7 +1510,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####

View File

@ -319,7 +319,10 @@ steps:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
# Run this test standalone for now;
# need to untangle the (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
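Background on the isolation (an editor's sketch, not part of the change): fork inherits the parent's process state, including any already-initialized CUDA context, while spawn starts a fresh interpreter, which is why mixing the two across tests is fragile:

import multiprocessing as mp

# fork copies parent state (problematic once CUDA is initialized);
# spawn launches a clean interpreter and avoids that entanglement
ctx = mp.get_context("spawn")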
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
@ -1106,13 +1109,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
@ -1331,7 +1334,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
@ -1356,6 +1359,7 @@ steps:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy

View File

@ -145,7 +145,7 @@ steps:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
@ -171,7 +171,7 @@ steps:
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30

.github/CODEOWNERS
View File

@ -15,6 +15,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson

View File

@ -799,24 +799,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
"if you intend on running FP8 quantized MoE models on Blackwell.")
else()
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# Machete kernels

View File

@ -0,0 +1,177 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.triton_utils import triton
from vllm.utils.flashinfer import flashinfer_fp4_quantize
if not current_platform.has_device_capability(100):
raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
PROVIDER_CFGS = {
"vllm": dict(backend="vllm", enabled=True),
"flashinfer": dict(backend="flashinfer", enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
"""Compute global scale for FP4 quantization."""
amax = torch.abs(tensor).max().to(torch.float32)
return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="us (lower is better)",
plot_name="NVFP4 Input Quantization Latency (us)",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
# Create input tensor
a = torch.randn((M, K), device=device, dtype=dtype)
# Compute global scale for activation
a_global_scale = compute_global_scale(a)
quantiles = [0.5, 0.2, 0.8]
cfg = PROVIDER_CFGS[provider]
if cfg["backend"] == "vllm":
# vLLM's FP4 quantization
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: ops.scaled_fp4_quant(a, a_global_scale),
quantiles=quantiles,
)
elif cfg["backend"] == "flashinfer":
# FlashInfer's FP4 quantization
# Use is_sf_swizzled_layout=True to match vLLM's output format
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: flashinfer_fp4_quantize(
a, a_global_scale, is_sf_swizzled_layout=True
),
quantiles=quantiles,
)
# Convert ms to us for better readability at small batch sizes
to_us = lambda t_ms: t_ms * 1000
return to_us(ms), to_us(max_ms), to_us(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
"""Test accuracy between vLLM and FlashInfer FP4 quantization."""
# Create input tensor
a = torch.randn((M, K), device=device, dtype=dtype)
# Compute global scale
a_global_scale = compute_global_scale(a)
# vLLM quantization
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
# FlashInfer quantization (with swizzled layout to match vLLM's output)
flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
a, a_global_scale, is_sf_swizzled_layout=True
)
flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
# Compare outputs
torch.testing.assert_close(
vllm_fp4,
flashinfer_fp4,
)
print(f"M={M}, K={K}, dtype={dtype}: PASSED")
def test_accuracy():
"""Run accuracy tests across various shapes."""
print("\n" + "=" * 60)
print("Running accuracy tests: vLLM vs FlashInfer")
print("=" * 60)
device = "cuda"
dtype = torch.bfloat16
# Test various batch sizes and hidden dimensions
Ms = [1, 1024]
Ks = [4096]
for M in Ms:
for K in Ks:
_test_accuracy_once(M, K, dtype, device)
print("\nAll accuracy tests passed!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark NVFP4 quantization: vLLM vs FlashInfer"
)
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
parser.add_argument(
"--save-path",
type=str,
default=None,
help="Path to save benchmark results",
)
parser.add_argument(
"--accuracy",
action="store_true",
help="Run accuracy tests",
)
args = parser.parse_args()
if args.accuracy:
test_accuracy()
for K, N, model in prepare_shapes(args):
print(f"\n{model}, N={N} K={K}")
benchmark.run(
print_data=True,
save_path=args.save_path,
N=N,
K=K,
)
print("\nBenchmark finished!")
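As a sanity check on compute_global_scale above: FLOAT8_E4M3_MAX is 448 (the float8_e4m3fn maximum) and FLOAT4_E2M1_MAX is 6, so the scale is 2688 / amax. For example:

# a tensor whose amax is 2.0 gets a global scale of 448 * 6 / 2 = 1344.0
assert 448.0 * 6.0 / 2.0 == 1344.0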

View File

@ -37,10 +37,12 @@ struct VecTypeTrait<c10::BFloat16> {
};
#endif
#if !defined(__powerpc__)
template <>
struct VecTypeTrait<c10::Half> {
using vec_t = vec_op::FP16Vec16;
};
#endif
struct Counter {
std::atomic<int64_t> counter;

View File

@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel(
void const* k_weight_void, // RMSNorm weights for key
void const* cos_sin_cache_void, // Pre-computed cos/sin cache
int64_t const* position_ids, // Position IDs for RoPE
int const num_tokens // Number of tokens
int const num_tokens, // Number of tokens
int const rotary_dim // Dimension for RoPE
) {
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel(
// Calculate cache pointer for this position - similar to
// pos_encoding_kernels.cu
T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim;
int const embed_dim = head_dim / 2;
T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim;
int const embed_dim = rotary_dim / 2;
T_cache const* cos_ptr = cache_ptr;
T_cache const* sin_ptr = cache_ptr + embed_dim;
if constexpr (interleave) {
// Perform interleaving. Use pre-computed cos/sin values.
int const rotary_lanes = rotary_dim / numElemsPerThread; // rotary range
if (laneId < rotary_lanes) {
if constexpr (interleave) {
// Perform interleaving. Use pre-computed cos/sin values.
#pragma unroll
for (int i = 0; i < numElemsPerThread / 2; ++i) {
int const idx0 = 2 * i;
int const idx1 = 2 * i + 1;
for (int i = 0; i < numElemsPerThread / 2; ++i) {
int const idx0 = 2 * i;
int const idx1 = 2 * i + 1;
// Global dimension index in the head
int const dim_idx = laneId * numElemsPerThread + idx0;
float const val0 = elements[idx0];
float const val1 = elements[idx1];
float const val0 = elements[idx0];
float const val1 = elements[idx1];
int const dim_idx = laneId * numElemsPerThread + idx0;
int const half_dim = dim_idx / 2;
float const cos_val =
CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float const sin_val =
CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
int const half_dim = dim_idx / 2;
float const cos_val =
CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float const sin_val =
CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
elements[idx0] = val0 * cos_val - val1 * sin_val;
elements[idx1] = val0 * sin_val + val1 * cos_val;
}
} else {
// Before data exchange within the warp, we need to sync.
__syncwarp();
// Get the data from the other half of the warp. Use pre-computed cos/sin
// values.
#pragma unroll
for (int i = 0; i < numElemsPerThread; i++) {
elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
if (laneId < 16) {
elements2[i] = -elements2[i];
elements[idx0] = val0 * cos_val - val1 * sin_val;
elements[idx1] = val0 * sin_val + val1 * cos_val;
}
} else {
// Before data exchange within the warp, we need to sync.
__syncwarp();
int pairOffset = (rotary_dim / 2) / numElemsPerThread;
// Get the data from the other half of the warp. Use pre-computed
// cos/sin values.
#pragma unroll
for (int i = 0; i < numElemsPerThread; i++) {
elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset);
int dim_idx = laneId * numElemsPerThread + i;
dim_idx = (dim_idx * 2) % head_dim;
int half_dim = dim_idx / 2;
// Use pre-computed cos/sin from cache
float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
if (laneId < pairOffset) {
elements2[i] = -elements2[i];
}
int dim_idx = laneId * numElemsPerThread + i;
elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
dim_idx = (dim_idx * 2) % rotary_dim;
int half_dim = dim_idx / 2;
float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
}
// __shfl_xor_sync does not provide memfence. Need to sync again.
__syncwarp();
}
// __shfl_xor_sync does not provide memfence. Need to sync again.
__syncwarp();
}
// Store.
{
vec_T vec;
@ -312,10 +316,10 @@ template <typename scalar_t_in, typename scalar_t_cache>
void launchFusedQKNormRope(void* qkv, int const num_tokens,
int const num_heads_q, int const num_heads_k,
int const num_heads_v, int const head_dim,
float const eps, void const* q_weight,
void const* k_weight, void const* cos_sin_cache,
bool const interleave, int64_t const* position_ids,
cudaStream_t stream) {
int const rotary_dim, float const eps,
void const* q_weight, void const* k_weight,
void const* cos_sin_cache, bool const interleave,
int64_t const* position_ids, cudaStream_t stream) {
constexpr int blockSize = 256;
int const warpsPerBlock = blockSize / 32;
@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens);
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
});
break;
case 128:
@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens);
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
});
break;
case 256:
@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens);
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
});
break;
default:
@ -392,8 +396,11 @@ void fused_qk_norm_rope(
"Query weights size must match head dimension");
TORCH_CHECK(k_weight.size(0) == head_dim,
"Key weights size must match head dimension");
TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
"Cos/sin cache dimension must match head_dim");
TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even");
TORCH_CHECK(cos_sin_cache.size(1) <= head_dim,
"rotary_dim must be less than or equal to head_dim");
TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
qkv.scalar_type() == k_weight.scalar_type(),
"qkv, q_weight and k_weight must have the same dtype");
@ -419,7 +426,8 @@ void fused_qk_norm_rope(
qkv.data_ptr(), static_cast<int>(num_tokens),
static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
static_cast<int>(num_heads_v), static_cast<int>(head_dim),
static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(),
static_cast<int>(cos_sin_cache.size(1)), static_cast<float>(eps),
q_weight.data_ptr(), k_weight.data_ptr(),
cos_sin_cache.data_ptr(), !is_neox,
reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
stream);
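For reference, a minimal NumPy sketch of the neox-style (non-interleaved) partial rotary embedding the kernel now supports, consistent with the cache layout above (cos in the first half of each rotary_dim-wide cache row, sin in the second half); names are illustrative, not the kernel's API:

import numpy as np

def rope_neox_partial(x: np.ndarray, cos_sin_row: np.ndarray, rotary_dim: int) -> np.ndarray:
    # x: (head_dim,) vector for one head; cos_sin_row: (rotary_dim,) for one position
    half = rotary_dim // 2
    cos, sin = cos_sin_row[:half], cos_sin_row[half:rotary_dim]
    x1, x2 = x[:half].copy(), x[half:rotary_dim].copy()
    out = x.copy()
    out[:half] = x1 * cos - x2 * sin
    out[half:rotary_dim] = x2 * cos + x1 * sin
    return out  # dims at or beyond rotary_dim pass through unchanged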

View File

@ -74,6 +74,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
// Get the global scaling factor, which will be applied to the SF.
// Note SFScale is the same as next GEMM's alpha, which is
// (448.f / (Alpha_A / 6.f)).
@ -101,7 +104,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numCols, SFout);
rowIdx, colIdx, numKTiles, SFout);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
sf_out);

View File

@ -25,6 +25,7 @@
#include <cuda_fp8.h>
#include "dispatch_utils.h"
#include "cuda_utils.h"
#include "nvfp4_utils.cuh"
#include "launch_bounds_utils.h"
@ -44,6 +45,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
@ -112,17 +116,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
// (448.f / (Alpha_A / 6.f)).
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
int factor = CVT_FP4_SF_VEC_SIZE * 4;
// The actual output_scales dim is computed from the padded numCols.
int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
uint32_t* SFout_in_expert =
SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
}
@ -140,6 +140,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
extern __shared__ uint32_t shared_input_offsets[];
// Load input offsets into shared memory.
@ -202,16 +206,13 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
int factor = CVT_FP4_SF_VEC_SIZE * 4;
int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
uint32_t* SFout_in_expert =
SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
}
@ -222,12 +223,8 @@ void quant_impl(void* output, void* output_scale, void* input,
void* input_global_scale, void* input_offset_by_experts,
void* output_scale_offset_by_experts, int m_topk, int k,
int n_experts, cudaStream_t stream) {
// TODO: this multiProcessorCount should be cached.
int device;
cudaGetDevice(&device);
int multiProcessorCount;
cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount,
device);
int multiProcessorCount =
get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
// Grid, Block size.
// Each thread converts 8 values.

View File

@ -38,6 +38,12 @@ __host__ __device__ inline Int round_up(Int x, Int y) {
return (x + y - 1) / y * y;
}
// Compute effective rows for grid configuration with swizzled SF layouts.
inline int computeEffectiveRows(int m) {
constexpr int ROW_TILE = 128;
return round_up(m, ROW_TILE);
}
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
@ -49,6 +55,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
int sf_m = round_up<int>(numRows, 128);
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
@ -79,7 +88,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numCols, SFout);
rowIdx, colIdx, numKTiles, SFout);
out_pos =
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
@ -87,43 +96,6 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
}
}
template <typename T>
void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
int64_t* output, int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount, cudaStream_t stream) {
// Grid, Block size.
// Each thread converts 8 values.
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
// Get number of blocks per SM
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
// Launch the cvt kernel.
if (useUE8M0) {
cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
reinterpret_cast<uint32_t*>(SFOuput));
} else {
cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
reinterpret_cast<uint32_t*>(SFOuput));
}
}
// Instantiate the function.
template void invokeFP4Quantization(int m, int n, half const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount,
cudaStream_t stream);
template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount,
cudaStream_t stream);
} // namespace vllm
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
@ -147,13 +119,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
// We don't support e8m0 scales at this moment.
bool useUE8M0 = false;
// Grid, Block size. Each thread converts 8 values.
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
int effectiveRows = vllm::computeEffectiveRows(m);
dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr,
sf_out, useUE8M0, multiProcessorCount, stream);
// NOTE: We don't support e8m0 scales at this moment.
vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
reinterpret_cast<uint32_t*>(sf_out));
});
}
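A small sketch of the grid-row logic above: the swizzled SF output is written in 128-row M tiles (note sf_m = round_up(numRows, 128) in the kernel), so the launch grid is sized for the padded row count rather than the raw m:

def round_up(x: int, y: int) -> int:
    return (x + y - 1) // y * y

def compute_effective_rows(m: int, row_tile: int = 128) -> int:
    # swizzled SF layout pads M to 128-row tiles; size the grid accordingly
    return round_up(m, row_tile)

assert compute_effective_rows(300) == 384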

View File

@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
return b;
}
// Compute SF output offset for swizzled tensor core layout.
// SF layout: [numMTiles, numKTiles, 32, 4, 4]
// Caller must precompute: numKTiles = (numCols + 63) / 64
template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
int numCols,
SFType* SFout) {
__device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(
int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) {
static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
CVT_FP4_NUM_THREADS_PER_SF == 2);
// One pair of threads write one SF to global memory.
// TODO: stage through smem for packed STG.32
// is it better than STG.8 from 4 threads ?
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
// SF vector index (16 elements share one SF in the K dimension).
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
int32_t mIdx = rowIdx;
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
// --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
int32_t mTileIdx = mIdx / (32 * 4);
// SF vector size 16.
int factor = CVT_FP4_SF_VEC_SIZE * 4;
int32_t numKTiles = (numCols + factor - 1) / factor;
int64_t mTileStride = numKTiles * 32 * 4 * 4;
int32_t kTileIdx = (kIdx / 4);
int64_t kTileStride = 32 * 4 * 4;
// M tile layout [32, 4] is column-major.
int32_t outerMIdx = (mIdx % 32);
int64_t outerMStride = 4 * 4;
int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
int64_t innerMStride = 4;
int32_t innerKIdx = (kIdx % 4);
int64_t innerKStride = 1;
// Compute the global offset.
int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
outerMIdx * outerMStride + innerMIdx * innerMStride +
innerKIdx * innerKStride;
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) {
return nullptr;
}
return nullptr;
// SF vector index (16 elements share one SF in the K dimension).
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
int32_t mIdx = rowIdx;
// Decompose indices using bitwise ops (all divisors are powers of 2).
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
int32_t mTileIdx = mIdx >> 7; // mIdx / 128
int32_t outerMIdx = mIdx & 31; // mIdx % 32
int32_t innerMIdx = (mIdx >> 5) & 3; // (mIdx / 32) % 4
int32_t kTileIdx = kIdx >> 2; // kIdx / 4
int32_t innerKIdx = kIdx & 3; // kIdx % 4
// Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 +
// outerMIdx * 16 + innerMIdx * 4 + innerKIdx
// Use bitwise OR for non-overlapping lower bits.
int64_t SFOffset = (static_cast<int64_t>(mTileIdx) * numKTiles + kTileIdx)
<< 9 |
(outerMIdx << 4) | (innerMIdx << 2) | innerKIdx;
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
}
// Quantizes the provided PackedVec into the uint32_t output
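A quick cross-check, in Python, that the new shift/mask decomposition matches the old explicit-stride arithmetic (512 = 32 * 4 * 4 = 2^9, and the OR-ed fields occupy disjoint bits):

def sf_offset_strides(m_idx: int, k_idx: int, num_k_tiles: int) -> int:
    # old formulation: explicit strides over [numMTiles, numKTiles, 32, 4, 4]
    return (((m_idx // 128) * num_k_tiles + (k_idx // 4)) * 512
            + (m_idx % 32) * 16 + ((m_idx // 32) % 4) * 4 + (k_idx % 4))

def sf_offset_bitwise(m_idx: int, k_idx: int, num_k_tiles: int) -> int:
    # new formulation: shifts and masks, OR over non-overlapping low bits
    return (((m_idx >> 7) * num_k_tiles + (k_idx >> 2)) << 9
            | ((m_idx & 31) << 4) | (((m_idx >> 5) & 3) << 2) | (k_idx & 3))

num_k_tiles = 4
for m in range(512):
    for k in range(num_k_tiles * 4):
        assert sf_offset_strides(m, k, num_k_tiles) == sf_offset_bitwise(m, k, num_k_tiles)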

View File

@ -1,373 +0,0 @@
#include "core/registration.h"
#include <torch/all.h>
#include <cutlass/arch/arch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/group_array_problem_shape.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include <cassert>
using namespace cute;
template <typename ElementAB, typename ElementC, typename ElementAccumulator,
typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
__global__ void get_ggemm_starts(
int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
ElementAB* b_base_as_int, ElementC* out_base_as_int,
ElementAccumulator* a_scale_base_as_int,
ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
int expert_id = threadIdx.x;
if (expert_id >= gridDim.x * blockDim.x) {
return;
}
int m = problem_sizes[expert_id * 3];
int n = problem_sizes[expert_id * 3 + 1];
int k = problem_sizes[expert_id * 3 + 2];
int32_t expert_offset = expert_offsets[expert_id];
int a_stride = expert_offset * k;
int b_stride = expert_id * k * n;
int a_scale_stride = expert_offset * k / 128;
int b_scale_stride = expert_id * k * n / 128 / 128;
a_offsets[expert_id] = a_base_as_int + a_stride;
b_offsets[expert_id] = b_base_as_int + b_stride;
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
*layout_sfa_ptr =
ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
*layout_sfb_ptr =
ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
}
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
ScaleConfig) \
else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA, \
LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>( \
static_cast<int32_t*>(expert_offsets.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()), \
static_cast<C_TYPE**>(out_ptrs.data_ptr()), \
static_cast<float**>(a_scales_ptrs.data_ptr()), \
static_cast<float**>(b_scales_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()), \
static_cast<C_TYPE*>(out_tensors.data_ptr()), \
static_cast<float*>(a_scales.data_ptr()), \
static_cast<float*>(b_scales.data_ptr()), \
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()), \
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()), \
static_cast<int*>(problem_sizes.data_ptr())); \
}
template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
void run_get_ggemm_starts(
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
torch::Tensor out_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
int num_experts = (int)expert_offsets.size(0);
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
if (false) {
}
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
LayoutSFB, ScaleConfig)
__CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
LayoutSFB, ScaleConfig)
else {
TORCH_CHECK(false, "Unsupported output tensor type");
}
}
template <typename OutType, typename ScheduleConfig, typename LayoutD>
void run_blockwise_scaled_group_mm(
torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
const torch::Tensor& stride_b, const torch::Tensor& stride_c,
const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
// Types
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = OutType;
using ElementD = ElementC;
using ElementAccumulator = float;
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = LayoutD;
// Alignments
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
using ArchTag = cutlass::arch::Sm100;
using OperatorClass = cutlass::arch::OpClassTensorOp;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
typename ScheduleConfig::ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass, ElementA,
cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
AlignmentA, ElementB,
cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
typename ScheduleConfig::ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
typename ScheduleConfig::KernelSchedule>::CollectiveOp;
using GemmKernel =
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
CollectiveEpilogue, void>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
int num_experts = (int)expert_offsets.size(0);
Gemm gemm_op;
// Mainloop Arguments
typename GemmKernel::MainloopArguments mainloop_args{
static_cast<const ElementA**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(stride_a.data_ptr()),
static_cast<const ElementB**>(b_ptrs.data_ptr()),
static_cast<StrideB*>(stride_b.data_ptr()),
static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
layout_sfa.data_ptr()),
static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
layout_sfb.data_ptr())};
int device_id = a_ptrs.device().index();
static const cutlass::KernelHardwareInfo hw_info{
device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
device_id)};
// Epilogue Arguments
typename GemmKernel::EpilogueArguments epilogue_args{
{}, // epilogue.thread
nullptr,
static_cast<StrideC*>(stride_c.data_ptr()),
static_cast<ElementD**>(out_ptrs.data_ptr()),
static_cast<StrideC*>(stride_c.data_ptr())};
UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
// Gemm Arguments
typename GemmKernel::Arguments args{
cutlass::gemm::GemmUniversalMode::kGrouped,
{num_experts, problem_sizes_as_shapes, nullptr},
mainloop_args,
epilogue_args,
hw_info};
at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM");
size_t workspace_size = gemm_op.get_workspace_size(args);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
auto workspace = torch::empty(workspace_size, workspace_options);
auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
status = gemm_op.run(stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void blockwise_scaled_group_mm_dispatch_shape(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
struct MmaConfig {
using ElementA = cutlass::float_e4m3_t;
using KernelSchedule =
cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
using LayoutC = cutlass::layout::RowMajor;
using MmaTileShape = Shape<_128, _128, _128>;
using ClusterShape = Shape<_1, _1, _1>;
};
int num_experts = (int)expert_offsets.size(0);
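  // Per-expert pointer tables (int64 entries hold raw device pointers),
  // filled on-device by run_get_ggemm_starts below.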
auto a_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto b_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto out_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto a_scales_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto b_scales_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto layout_sfa = torch::empty(
{num_experts, 5},
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
auto layout_sfb = torch::empty(
{num_experts, 5},
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
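  // Leading strides: K (= a.size(1)) for the K-major A and B operands,
  // N (= output.size(1)) for the row-major output.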
auto stride_a = torch::full(
{num_experts}, a.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto stride_b = torch::full(
{num_experts}, a.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto stride_c = torch::full(
{num_experts}, output.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
torch::TensorOptions options_int =
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
typename MmaConfig::LayoutSFB,
typename MmaConfig::ScaleConfig>(
expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
run_blockwise_scaled_group_mm<OutType, MmaConfig,
typename MmaConfig::LayoutC>(
out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
expert_offsets);
}
void cutlass_blockwise_scaled_grouped_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
TORCH_CHECK(problem_sizes.size(1) == 3,
"problem_sizes must have shape (num_experts, 3)");
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
"Number of experts in problem_sizes must match expert_offsets");
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
"problem_sizes must be int32");
TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
"a must be kFloat8_e4m3fn");
TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
"b must be kFloat8_e4m3fn");
TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
output.scalar_type() == torch::kHalf,
"output must be bfloat16 or half");
TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
"scales_a must be float32");
TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
"scales_b must be float32");
TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
"expert_offsets must be int32");
TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
if (output.scalar_type() == torch::kBFloat16) {
blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
} else if (output.scalar_type() == torch::kFloat16) {
blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
} else {
TORCH_CHECK(false, "Unsupported output tensor type");
}
#endif
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_blockwise_scaled_grouped_mm",
&cutlass_blockwise_scaled_grouped_mm);
}

View File

@ -416,13 +416,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor alpha) -> ()");
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
// cutlass blockwise scaled group GEMM
ops.def(
"cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
"Tensor scales_a, Tensor scales_b, "
"Tensor problem_sizes, Tensor expert_offsets) -> ()");
// conditionally compiled so impl registration is in source file
// cutlass nvfp4 block scaled group GEMM
ops.def(
"cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"

View File

@ -130,6 +130,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& uv pip install --system *.whl
ARG COMMON_WORKDIR
ARG BASE_IMAGE
# Copy over the benchmark scripts as well
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
@ -144,4 +145,9 @@ ENV SAFETENSORS_FAST_GPU=1
# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1
# Workaround for ROCm profiler limits
RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
CMD ["/bin/bash"]

View File

@ -1,15 +1,15 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_BRANCH="89075173"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_BRANCH="v0.24.1"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="59bd8ff2"
ARG AITER_BRANCH="6af8b687"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
@ -162,4 +162,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar
## Manually Trigger the benchmark
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
For CPU environment, please use the image with "-cpu" postfix.
For x86 CPU environment, please use the image with "-cpu" postfix. For AArch64 CPU environment, please use the image with "-arm64-cpu" postfix.
Here is an example for docker run command for CPU.
Here is an example docker run command for CPU. For GPUs, skip setting the `ON_CPU` env var.
```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch
export HF_TOKEN=<valid Hugging Face token>
if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then
IMG_SUFFIX="arm64-cpu"
else
IMG_SUFFIX="cpu"
fi
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX}
```
Then, run the command below inside the docker instance.
@ -26,7 +33,7 @@ When run, benchmark script generates results under **benchmark/results** folder,
### Runtime environment variables
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).

View File

@ -77,25 +77,20 @@ This complicates the process as we cannot use the out-of-the-box
- `.buildkite/release-pipeline.yaml`
- `.buildkite/scripts/upload-wheels.sh`
## Address long vLLM build time
## Manually running vLLM builds on Buildkite CI
When building vLLM with a new PyTorch/CUDA version, no cache will exist
in the vLLM sccache S3 bucket, causing the build job on CI to potentially take more than 5 hours
and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mode,
it doesn't populate the cache, so re-running it to warm up the cache
is ineffective.
When building vLLM with a new PyTorch/CUDA version, the vLLM sccache S3 bucket
will not have any cached artifacts, which can cause CI build jobs to exceed 5 hours.
Furthermore, vLLM's fastcheck pipeline operates in read-only mode and does not
populate the cache, making it ineffective for cache warm-up purposes.
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
when manually triggering a build on Buildkite. This branch accomplishes two things:
To address this, manually trigger a build on Buildkite to accomplish two objectives:
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket
to warm it up so that future builds are faster.
1. Run the complete test suite against the PyTorch RC build by setting the environment variables: `RUN_ALL=1` and `NIGHTLY=1`
2. Populate the vLLM sccache S3 bucket with compiled artifacts, enabling faster subsequent builds
<p align="center" width="100%">
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/3b07f71b-bb18-4ca3-aeaf-da0fe79d315f" />
</p>
## Update all the different vLLM platforms

View File

@ -139,18 +139,18 @@ token data.
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
```
<figure markdown="span">
![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/query.png" alt="query" width="70%" />
</p>
Each thread defines its own `q_ptr` which points to the assigned
query token data on global memory. For example, if `VEC_SIZE` is 4
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains a
total of 128 elements divided into 128 / 4 = 32 vecs.
<figure markdown="span">
![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/q_vecs.png" alt="q_vecs" width="70%" />
</p>
```cpp
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@ -187,9 +187,9 @@ key token at different iterations. As shown above, that `k_ptr`
points to key token data based on `k_cache` at assigned block,
assigned head and assigned token.
<figure markdown="span">
![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/key.png" alt="key" width="70%" />
</p>
The diagram above illustrates the memory layout for key data. It
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@ -202,9 +202,9 @@ iterations. Inside each rectangle, there are a total 32 vecs (128
elements for one token) that will be processed by 2 threads (one
thread group) separately.
<figure markdown="span">
![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/k_vecs.png" alt="k_vecs" width="70%" />
</p>
```cpp
K_vec k_vecs[NUM_VECS_PER_THREAD]
@ -361,17 +361,17 @@ later steps. Now, it should store the normalized softmax result of
## Value
<figure markdown="span">
![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/value.png" alt="value" width="70%" />
</p>
<figure markdown="span">
![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/logits_vec.png" alt="logits_vec" width="50%" />
</p>
<figure markdown="span">
![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/v_vec.png" alt="v_vec" width="70%" />
</p>
Now we need to retrieve the value data and perform dot multiplication
with `logits`. Unlike query and key, there is no thread group

View File

@ -8,6 +8,16 @@ We recommend installing the library with:
pip install nvidia-modelopt
```
## Supported ModelOpt checkpoint formats
vLLM detects ModelOpt checkpoints via `hf_quant_config.json` and supports the
following `quantization.quant_algo` values:
- `FP8`: per-tensor weight scale (+ optional static activation scale).
- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization.
- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks).
- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`).
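For a quick sanity check, you can inspect the detected algorithm directly. A minimal sketch, assuming a local checkpoint directory (the exact field layout of `hf_quant_config.json` varies across ModelOpt versions, so treat the keys below as illustrative):
```python
import json

# Hypothetical local checkpoint path; hf_quant_config.json sits next to the weights.
with open("/path/to/checkpoint/hf_quant_config.json") as f:
    hf_quant_config = json.load(f)

# ModelOpt stores the algorithm under the "quantization" key,
# e.g. "FP8", "FP8_PER_CHANNEL_PER_TOKEN", "fp8_pb_wo", or "NVFP4".
print(hf_quant_config["quantization"]["quant_algo"])
```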
## Quantizing HuggingFace Models with PTQ
You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
@ -80,3 +90,24 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
if __name__ == "__main__":
main()
```
## Running the OpenAI-compatible server
To serve a local ModelOpt checkpoint via the OpenAI-compatible API:
```bash
vllm serve <path_to_exported_checkpoint> \
--quantization modelopt \
--host 0.0.0.0 --port 8000
```
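Once the server is up, any OpenAI-compatible client can talk to it. A minimal sketch using the `openai` Python package (the `model` field must match the name the server registered, here the checkpoint path; the API key is a dummy value since local serving does not check it by default):
```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="<path_to_exported_checkpoint>",
    prompt="San Francisco is a",
    max_tokens=32,
)
print(completion.choices[0].text)
```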
## Testing (local checkpoints)
vLLM's ModelOpt unit tests are gated by local checkpoint paths and are skipped
by default in CI. To run the tests locally:
```bash
export VLLM_TEST_MODELOPT_FP8_PC_PT_MODEL_PATH=<path_to_fp8_pc_pt_checkpoint>
export VLLM_TEST_MODELOPT_FP8_PB_WO_MODEL_PATH=<path_to_fp8_pb_wo_checkpoint>
pytest -q tests/quantization/test_modelopt.py
```

View File

@ -17,6 +17,16 @@ The E4M3 format offers higher precision compared to E5M2. However, due to its sm
For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel).
### How FP8 KV Cache Works
The FP8 KV cache implementation follows this workflow:
1. **Storage**: Key and Value tensors are quantized to FP8 format using scaling factors before being stored in the KV cache
2. **Retrieval**: When needed for attention computation, cached KV tensors are dequantized back to higher precision (FP16/BF16)
3. **Attention**: The attention-value multiplication (softmax output × V) is performed using the dequantized higher-precision V tensor
This means the final attention computation operates on dequantized values, not FP8 tensors. The quantization reduces memory usage during storage but maintains computation accuracy by using higher precision during the actual attention operations.
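For illustration, here is a minimal sketch of enabling the FP8 KV cache for offline inference (the model name is a placeholder; `kv_cache_dtype="fp8"` typically resolves to the E4M3 variant on CUDA devices):
```python
from vllm import LLM, SamplingParams

# Placeholder model; any supported checkpoint works the same way.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", kv_cache_dtype="fp8")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```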
### Performance Impact
The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either:

View File

@ -352,10 +352,17 @@ Supported models:
* `zai-org/GLM-4.5`
* `zai-org/GLM-4.5-Air`
* `zai-org/GLM-4.6`
* `zai-org/GLM-4.6-Air`
Flags: `--tool-call-parser glm45`
### GLM-4.7 Models (`glm47`)
Supported models:
* `zai-org/GLM-4.7`
Flags: `--tool-call-parser glm47`
### Qwen3-Coder Models (`qwen3_xml`)
Supported models:

View File

@ -19,12 +19,12 @@ Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels c
```bash
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl
```
??? console "pip"
```bash
pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl
```
!!! warning "set `LD_PRELOAD`"

View File

@ -23,12 +23,12 @@ Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
# use uv
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index --torch-backend cpu
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl --torch-backend cpu
```
??? console "pip"
```bash
# use pip
pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --extra-index-url https://download.pytorch.org/whl/cpu
pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cpu
```
!!! warning "set `LD_PRELOAD`"
Before using the vLLM CPU wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`:

View File

@ -387,7 +387,7 @@ th {
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
@ -415,9 +415,10 @@ th {
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ |
| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | | ✅︎ |
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |

View File

@ -5,130 +5,91 @@ Usage:
Single node:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2
-dp=2 \
-tp=2
Multi-node:
Node 0 (assume the node has ip of 10.99.48.128):
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
--node-rank=0 \
--master-addr=10.99.48.128 \
--master-port=13345
-dp=2 \
-tp=2 \
--dp-num-nodes=2 \
--dp-node-rank=0 \
--dp-master-addr=10.99.48.128 \
--dp-master-port=13345
Node 1:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
--node-rank=1 \
--master-addr=10.99.48.128 \
--master-port=13345
-dp=2 \
-tp=2 \
--dp-num-nodes=2 \
--dp-node-rank=1 \
--dp-master-addr=10.99.48.128 \
--dp-master-port=13345
"""
import os
from time import sleep
from vllm import LLM, SamplingParams
from vllm import LLM, EngineArgs, SamplingParams
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import get_open_port
def parse_args():
import argparse
def create_parser():
parser = FlexibleArgumentParser(description="Data Parallel Inference")
parser = argparse.ArgumentParser(description="Data Parallel Inference")
# Add all engine args
EngineArgs.add_cli_args(parser)
parser.set_defaults(
model="ibm-research/PowerMoE-3b",
enable_expert_parallel=True,
)
# Add DP-specific args (separate from engine args to avoid conflicts)
parser.add_argument(
"--model",
"--dp-num-nodes",
type=int,
default=1,
help="Total number of nodes for data parallel.",
)
parser.add_argument(
"--dp-node-rank",
type=int,
default=0,
help="Rank of the current node for data parallel.",
)
parser.add_argument(
"--dp-master-addr",
type=str,
default="ibm-research/PowerMoE-3b",
help="Model name or path",
)
parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
parser.add_argument(
"--node-size", type=int, default=1, help="Total number of nodes"
default="",
help="Master node IP address for DP coordination.",
)
parser.add_argument(
"--node-rank", type=int, default=0, help="Rank of the current node"
)
parser.add_argument(
"--master-addr", type=str, default="", help="Master node IP address"
)
parser.add_argument("--master-port", type=int, default=0, help="Master node port")
parser.add_argument(
"--enforce-eager", action="store_true", help="Enforce eager mode execution."
)
parser.add_argument(
"--trust-remote-code", action="store_true", help="Trust remote code."
)
parser.add_argument(
"--max-num-seqs",
"--dp-master-port",
type=int,
default=64,
help=("Maximum number of sequences to be processed in a single iteration."),
)
parser.add_argument(
"--max-model-len",
type=int,
help=("Maximum number of tokens to be processed in a single iteration."),
default=0,
help="Master node port for DP coordination.",
)
parser.add_argument(
"--timeout",
type=int,
default=300,
help=("Number of seconds before unresponsive process is killed."),
help="Number of seconds before unresponsive process is killed.",
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
parser.add_argument(
"--enable-dbo",
action="store_true",
help=("Enable microbatched execution"),
)
parser.add_argument(
"--compilation-config",
type=int,
help=("Compilation optimization (O) mode 0-3."),
)
parser.add_argument(
"--quantization",
type=str,
)
parser.add_argument(
"--disable-expert-parallel",
dest="enable_expert_parallel",
action="store_false",
help="Disable expert parallel (default: enabled).",
)
parser.set_defaults(enable_expert_parallel=True)
return parser.parse_args()
return parser
def main(
model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
GPUs_per_dp_rank,
enforce_eager,
enable_expert_parallel,
trust_remote_code,
max_num_seqs,
max_model_len,
compilation_config,
gpu_memory_utilization,
enable_dbo,
quantization,
engine_args,
):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@ -173,19 +134,7 @@ def main(
)
# Create an LLM.
llm = LLM(
model=model,
tensor_parallel_size=GPUs_per_dp_rank,
enforce_eager=enforce_eager,
enable_expert_parallel=enable_expert_parallel,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enable_dbo=enable_dbo,
quantization=quantization,
compilation_config=compilation_config,
)
llm = LLM(**engine_args)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for i, output in enumerate(outputs):
@ -204,22 +153,29 @@ def main(
if __name__ == "__main__":
args = parse_args()
parser = create_parser()
args = vars(parser.parse_args())
dp_size = args.dp_size
tp_size = args.tp_size
node_size = args.node_size
node_rank = args.node_rank
# Extract DP-specific args (pop to remove from engine_args)
dp_size = args.pop("data_parallel_size")
dp_num_nodes = args.pop("dp_num_nodes")
dp_node_rank = args.pop("dp_node_rank")
dp_master_addr = args.pop("dp_master_addr")
dp_master_port = args.pop("dp_master_port")
timeout = args.pop("timeout")
if node_size == 1:
# Remaining args are engine args
engine_args = args
if dp_num_nodes == 1:
dp_master_ip = "127.0.0.1"
dp_master_port = get_open_port()
dp_master_port_val = get_open_port()
else:
dp_master_ip = args.master_addr
dp_master_port = args.master_port
dp_master_ip = dp_master_addr
dp_master_port_val = dp_master_port
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
dp_per_node = dp_size // node_size
assert dp_size % dp_num_nodes == 0, "dp_size should be divisible by dp_num_nodes"
dp_per_node = dp_size // dp_num_nodes
from multiprocessing import Process
@ -230,34 +186,24 @@ if __name__ == "__main__":
procs = []
for local_dp_rank, global_dp_rank in enumerate(
range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
range(dp_node_rank * dp_per_node, (dp_node_rank + 1) * dp_per_node)
):
proc = Process(
target=main,
args=(
args.model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
tp_size,
args.enforce_eager,
args.enable_expert_parallel,
args.trust_remote_code,
args.max_num_seqs,
args.max_model_len,
args.compilation_config,
args.gpu_memory_utilization,
args.enable_dbo,
args.quantization,
dp_master_port_val,
engine_args,
),
)
proc.start()
procs.append(proc)
exit_code = 0
for proc in procs:
proc.join(timeout=args.timeout)
proc.join(timeout=timeout)
if proc.exitcode is None:
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
proc.kill()

View File

@ -38,6 +38,8 @@ Encoder engines should be launched with the following flags:
- `--max-num-batched-tokens=<large value>` **(default: 2048)** This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.
- `--convert "mm_encoder_only"` **(Optional)** - The language model is skipped during initialization to reduce device memory usage. **Models using this option must implement the `get_language_model_spec` interface.**
## Local media inputs
To support local image inputs (from your `MEDIA_PATH` directory), add the following flag to the encoder instance:

View File

@ -313,7 +313,7 @@ async def test_chat_streaming_input_audio(
"format": "wav",
},
},
{"type": "text", "text": "What's happening in this audio?"},
{"type": "text", "text": "What's a short title for this audio?"},
],
}
]

View File

@ -0,0 +1,223 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Embedding shape validation in multimodal APIs.
Tests verify that embeddings with correct ndim but incorrect hidden_size
are rejected before they can cause crashes during model inference.
Validation is performed by the parser (MultiModalDataParser) and EmbeddingItems
classes, not by CompletionRenderer or MediaIO classes.
"""
import pytest
import torch
from vllm.multimodal.parse import (
AudioEmbeddingItems,
ImageEmbeddingItems,
MultiModalDataParser,
VideoEmbeddingItems,
)
class TestMultiModalParserShapeValidation:
"""Test hidden_size validation in MultiModalDataParser."""
def test_image_embeddings_correct_hidden_size_accepted(self):
"""Baseline: Image embeddings with correct hidden_size should work."""
expected_hidden_size = 768
parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
valid_embeds = torch.randn(2, 100, expected_hidden_size)
result = parser.parse_mm_data({"image": valid_embeds})
assert "image" in result
assert isinstance(result["image"], ImageEmbeddingItems)
assert result["image"].get_count() == 2
def test_image_embeddings_wrong_hidden_size_rejected(self):
"""Security: Image embeddings with wrong hidden_size should be rejected."""
expected_hidden_size = 768
wrong_hidden_size = 4096
parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
invalid_embeds = torch.randn(2, 100, wrong_hidden_size)
with pytest.raises(ValueError) as exc_info:
parser.parse_mm_data({"image": invalid_embeds})
error_msg = str(exc_info.value).lower()
assert "image" in error_msg
assert "hidden dimension mismatch" in error_msg
def test_audio_embeddings_wrong_hidden_size_rejected(self):
"""Security: Audio embeddings with wrong hidden_size should be rejected."""
expected_hidden_size = 768
wrong_hidden_size = 2048
parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
invalid_embeds = torch.randn(2, 100, wrong_hidden_size)
with pytest.raises(ValueError) as exc_info:
parser.parse_mm_data({"audio": invalid_embeds})
error_msg = str(exc_info.value).lower()
assert "audio" in error_msg
assert "hidden dimension mismatch" in error_msg
def test_video_embeddings_wrong_hidden_size_rejected(self):
"""Security: Video embeddings with wrong hidden_size should be rejected."""
expected_hidden_size = 768
wrong_hidden_size = 512
parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
invalid_embeds = torch.randn(2, 100, wrong_hidden_size)
with pytest.raises(ValueError) as exc_info:
parser.parse_mm_data({"video": invalid_embeds})
error_msg = str(exc_info.value).lower()
assert "video" in error_msg
assert "hidden dimension mismatch" in error_msg
def test_list_of_embeddings_validates_each(self):
"""Security: Each embedding in list should be validated."""
expected_hidden_size = 768
wrong_hidden_size = 1024
parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
# List with second tensor having wrong hidden_size
invalid_embeds = [
torch.randn(100, expected_hidden_size),
torch.randn(100, wrong_hidden_size),
]
with pytest.raises(ValueError) as exc_info:
parser.parse_mm_data({"image": invalid_embeds})
# Should identify which embedding failed
assert "[1]" in str(exc_info.value)
def test_validation_disabled_allows_any_size(self):
"""When validation disabled (legacy), any hidden_size allowed."""
parser = MultiModalDataParser(expected_hidden_size=None)
any_hidden_size = 12345
embeds = torch.randn(2, 100, any_hidden_size)
# Should not raise
result = parser.parse_mm_data({"image": embeds})
assert "image" in result
assert isinstance(result["image"], ImageEmbeddingItems)
class TestEmbeddingItemsDirectValidation:
"""Direct tests for EmbeddingItems hidden_size validation."""
def test_image_embedding_items_validates_batched_tensor(self):
"""Test validation for batched (3D) image embeddings."""
expected = 768
wrong = 1024
# Valid
valid = torch.randn(2, 100, expected)
items = ImageEmbeddingItems(valid, expected_hidden_size=expected)
assert items.get_count() == 2
# Invalid
invalid = torch.randn(2, 100, wrong)
with pytest.raises(ValueError) as exc_info:
ImageEmbeddingItems(invalid, expected_hidden_size=expected)
assert str(wrong) in str(exc_info.value)
assert str(expected) in str(exc_info.value)
def test_image_embedding_items_validates_list_of_tensors(self):
"""Test validation for list of 2D image embeddings."""
expected = 768
wrong = 512
# Valid list
valid_list = [torch.randn(100, expected), torch.randn(50, expected)]
items = ImageEmbeddingItems(valid_list, expected_hidden_size=expected)
assert items.get_count() == 2
# Invalid list
invalid_list = [torch.randn(100, expected), torch.randn(50, wrong)]
with pytest.raises(ValueError) as exc_info:
ImageEmbeddingItems(invalid_list, expected_hidden_size=expected)
assert "[1]" in str(exc_info.value)
def test_audio_embedding_items_validates(self):
"""Test validation for audio embeddings."""
expected = 768
wrong = 256
invalid = torch.randn(2, 100, wrong)
with pytest.raises(ValueError) as exc_info:
AudioEmbeddingItems(invalid, expected_hidden_size=expected)
assert "audio" in str(exc_info.value).lower()
def test_video_embedding_items_validates(self):
"""Test validation for video embeddings."""
expected = 768
wrong = 384
invalid = torch.randn(2, 100, wrong)
with pytest.raises(ValueError) as exc_info:
VideoEmbeddingItems(invalid, expected_hidden_size=expected)
assert "video" in str(exc_info.value).lower()
class TestShapeValidationIntegration:
"""Integration tests verifying attack scenarios are blocked."""
def test_attack_scenario_multimodal_image(self):
"""
Simulate attack through Chat API with image embeddings.
Verifies validation occurs in multimodal parser path.
"""
expected_hidden_size = 768
wrong_hidden_size = 4096
parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
attack_tensor = torch.randn(1, 100, wrong_hidden_size)
with pytest.raises(ValueError):
parser.parse_mm_data({"image": attack_tensor})
def test_attack_scenario_multimodal_audio(self):
"""
Simulate attack through Chat API with audio embeddings.
Verifies validation occurs in multimodal parser path.
"""
expected_hidden_size = 768
wrong_hidden_size = 2048
parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
attack_tensor = torch.randn(1, 100, wrong_hidden_size)
with pytest.raises(ValueError):
parser.parse_mm_data({"audio": attack_tensor})
def test_attack_scenario_multimodal_video(self):
"""
Simulate attack through Chat API with video embeddings.
Verifies validation occurs in multimodal parser path.
"""
expected_hidden_size = 768
wrong_hidden_size = 1024
parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)
attack_tensor = torch.randn(1, 100, wrong_hidden_size)
with pytest.raises(ValueError):
parser.parse_mm_data({"video": attack_tensor})

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.util
import json
import time
@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages(
assert (
"aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
response = await client.chat.completions.create(
model=model_name,
messages=[{"role": "user", "content": "What is the role of AI in medicine?"}],
temperature=0.0,
max_tokens=250,
)
choice = response.choices[0]
assert choice.finish_reason == "length", (
f"Expected finish_reason='length', got {choice.finish_reason}"
)
assert choice.message.content is not None, (
"Content should not be None when truncated"
)
assert len(choice.message.content) > 0, "Content should not be empty"

View File

@ -15,6 +15,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
ErrorResponse,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
@ -52,8 +53,19 @@ def with_tool_parser(request) -> bool:
return request.param
@pytest.fixture(
scope="module",
params=[True],
ids=["exclude_tools_when_tool_choice_none"],
)
def exclude_tools_when_tool_choice_none(request) -> bool:
return request.param
@pytest.fixture(scope="module")
def default_server_args(with_tool_parser: bool):
def default_server_args(
with_tool_parser: bool, exclude_tools_when_tool_choice_none: bool
):
args = [
# use half precision for speed and memory savings in CI environment
"--enforce-eager",
@ -72,6 +84,8 @@ def default_server_args(with_tool_parser: bool):
"--enable-auto-tool-choice",
]
)
if exclude_tools_when_tool_choice_none:
args.append("--exclude-tools-when-tool-choice-none")
return args
@ -335,6 +349,69 @@ async def test_gpt_oss_tool_message_array_content(
assert response_multi_array.choices[0].message is not None
@pytest.mark.asyncio
async def test_gpt_oss_tool_choice_none(
gptoss_client: OpenAI,
with_tool_parser: bool,
exclude_tools_when_tool_choice_none: bool,
):
if not (with_tool_parser and exclude_tools_when_tool_choice_none):
pytest.skip(
"skip tool_choice tests when non-tool or "
"--exclude-tools-when-tool-choice-none not set"
)
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {"type": "string"},
"state": {"type": "string"},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
}
]
messages = [
{
"role": "user",
"content": "What's the temperature(in degrees Celsius) in Dallas?",
},
]
tool_choice_auto = await gptoss_client.chat.completions.create(
model=GPT_OSS_MODEL_NAME,
messages=messages,
tools=tools,
tool_choice="auto",
temperature=0.0,
)
msg = tool_choice_auto.choices[0].message
assert len(msg.tool_calls) == 1
tool_choice_none = await gptoss_client.chat.completions.create(
model=GPT_OSS_MODEL_NAME,
messages=messages,
tools=tools,
tool_choice="none",
temperature=0.0,
)
msg = tool_choice_none.choices[0].message
assert len(msg.tool_calls) == 0
MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
CHAT_TEMPLATE = "Dummy chat template for testing {}"
@ -878,7 +955,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
],
)
@ -906,7 +982,6 @@ class TestServingChatWithHarmony:
input_messages_2,
[
{"role": "system"},
{"role": "developer"},
{"role": "user"},
# The analysis message should be dropped on subsequent inputs because
# of the subsequent assistant message to the final channel.
@ -966,7 +1041,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1047,7 +1122,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1128,7 +1203,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1178,7 +1253,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
verify_harmony_messages(
input_messages_3,
@ -1241,7 +1316,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
verify_harmony_messages(
input_messages_4,
@ -1297,7 +1372,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
# The reasoning that would have resulted in an analysis message is
# dropped because of a later assistant message to the final channel.
@ -1329,7 +1403,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
{
"role": "assistant",
@ -1359,7 +1432,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
{
"role": "assistant",
@ -1368,3 +1440,69 @@ class TestServingChatWithHarmony:
},
],
)
@pytest.mark.asyncio
async def test_tool_choice_validation_without_parser():
"""Test that tool_choice='required' or named tool without tool_parser
returns an appropriate error message."""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
models = OpenAIServingModels(
engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
)
# Create serving_chat without tool_parser (enable_auto_tools=False)
serving_chat = OpenAIServingChat(
mock_engine,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None,
enable_auto_tools=False, # No tool parser
)
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given location",
"parameters": {
"type": "object",
"properties": {"location": {"type": "string"}},
"required": ["location"],
},
},
}
]
# Test tool_choice="required" without tool_parser
req_required = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{"role": "user", "content": "What's the weather?"}],
tools=tools,
tool_choice="required",
)
response_required = await serving_chat.create_chat_completion(req_required)
assert isinstance(response_required, ErrorResponse)
assert "tool_choice" in response_required.error.message
assert "--tool-call-parser" in response_required.error.message
# Test named tool_choice without tool_parser
req_named = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{"role": "user", "content": "What's the weather?"}],
tools=tools,
tool_choice={"type": "function", "function": {"name": "get_weather"}},
)
response_named = await serving_chat.create_chat_completion(req_named)
assert isinstance(response_named, ErrorResponse)
assert "tool_choice" in response_named.error.message
assert "--tool-call-parser" in response_named.error.message

View File

@ -0,0 +1,212 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for harmony streaming delta extraction.
"""
from dataclasses import dataclass, field
from unittest.mock import patch
import pytest
from vllm.entrypoints.openai.serving_chat_stream_harmony import (
extract_harmony_streaming_delta,
)
@dataclass
class MockMessage:
"""Mock message object for testing."""
channel: str | None = None
recipient: str | None = None
@dataclass
class MockStreamableParser:
"""Mock StreamableParser for testing without openai_harmony dependency."""
messages: list[MockMessage] = field(default_factory=list)
class TestExtractHarmonyStreamingDelta:
"""Tests for extract_harmony_streaming_delta function."""
@pytest.mark.parametrize(
"delta_text,expected_content",
[
("Hello, world!", "Hello, world!"),
("", ""),
],
)
def test_final_channel_returns_content_delta(self, delta_text, expected_content):
"""Test that final channel returns a DeltaMessage with content."""
parser = MockStreamableParser()
delta_message, tools_streamed = extract_harmony_streaming_delta(
harmony_parser=parser,
cur_channel="final",
cur_recipient=None,
prev_recipient=None,
delta_text=delta_text,
include_reasoning=False,
)
assert delta_message is not None
assert delta_message.content == expected_content
assert tools_streamed is False
@pytest.mark.parametrize(
"include_reasoning,expected_has_message",
[
(True, True),
(False, False),
],
)
def test_analysis_channel_reasoning(self, include_reasoning, expected_has_message):
"""Test analysis channel respects include_reasoning flag."""
parser = MockStreamableParser()
delta_message, tools_streamed = extract_harmony_streaming_delta(
harmony_parser=parser,
cur_channel="analysis",
cur_recipient=None,
prev_recipient=None,
delta_text="Let me think...",
include_reasoning=include_reasoning,
)
if expected_has_message:
assert delta_message is not None
assert delta_message.reasoning == "Let me think..."
else:
assert delta_message is None
assert tools_streamed is False
@pytest.mark.parametrize("channel", ["commentary", "analysis"])
@patch("vllm.entrypoints.openai.serving_chat_stream_harmony.make_tool_call_id")
def test_new_tool_call(self, mock_make_tool_call_id, channel):
"""Test new tool call creation when recipient changes."""
mock_make_tool_call_id.return_value = "call_test123"
parser = MockStreamableParser()
delta_message, tools_streamed = extract_harmony_streaming_delta(
harmony_parser=parser,
cur_channel=channel,
cur_recipient="functions.get_weather",
prev_recipient=None,
delta_text="",
include_reasoning=False,
)
assert delta_message is not None
assert len(delta_message.tool_calls) == 1
tool_call = delta_message.tool_calls[0]
assert tool_call.id == "call_test123"
assert tool_call.type == "function"
assert tool_call.function.name == "get_weather"
assert tool_call.function.arguments == ""
assert tool_call.index == 0
assert tools_streamed is True
@pytest.mark.parametrize("channel", ["commentary", "analysis"])
def test_tool_call_argument_streaming(self, channel):
"""Test streaming tool call arguments (same recipient)."""
parser = MockStreamableParser()
delta_message, tools_streamed = extract_harmony_streaming_delta(
harmony_parser=parser,
cur_channel=channel,
cur_recipient="functions.get_weather",
prev_recipient="functions.get_weather",
delta_text='{"location": "Paris"}',
include_reasoning=False,
)
assert delta_message is not None
tool_call = delta_message.tool_calls[0]
assert tool_call.id is None
assert tool_call.function.arguments == '{"location": "Paris"}'
assert tool_call.index == 0
assert tools_streamed is True
@pytest.mark.parametrize("channel", ["commentary", "analysis"])
def test_tool_call_empty_arguments_returns_none(self, channel):
"""Test empty delta_text with same recipient returns None."""
parser = MockStreamableParser()
delta_message, tools_streamed = extract_harmony_streaming_delta(
harmony_parser=parser,
cur_channel=channel,
cur_recipient="functions.get_weather",
prev_recipient="functions.get_weather",
delta_text="",
include_reasoning=False,
)
assert delta_message is None
assert tools_streamed is False
def test_tool_call_index_from_previous_messages(self):
"""Test tool call index accounts for previous function messages."""
messages = [
MockMessage(channel="analysis", recipient=None), # Not counted
MockMessage(channel="commentary", recipient="functions.tool1"), # Counted
MockMessage(channel="final", recipient=None), # Not counted
]
parser = MockStreamableParser(messages=messages)
delta_message, _ = extract_harmony_streaming_delta(
harmony_parser=parser,
cur_channel="commentary",
cur_recipient="functions.tool2",
prev_recipient="functions.tool2",
delta_text="args",
include_reasoning=False,
)
assert delta_message.tool_calls[0].index == 1
@pytest.mark.parametrize(
"channel,recipient",
[
("commentary", None),
("commentary", "browser.search"),
],
)
def test_returns_tool_call_preambles(self, channel, recipient):
"""Test that invalid channel/recipient combinations return None."""
parser = MockStreamableParser()
delta_text = "some text"
delta_message, tools_streamed = extract_harmony_streaming_delta(
harmony_parser=parser,
cur_channel=channel,
cur_recipient=recipient,
prev_recipient=None,
delta_text=delta_text,
include_reasoning=True,
)
assert delta_message.content == delta_text
assert tools_streamed is False
@pytest.mark.parametrize(
"channel,recipient",
[
(None, None),
("unknown_channel", None),
],
)
def test_returns_none_for_invalid_inputs(self, channel, recipient):
"""Test that invalid channel/recipient combinations return None."""
parser = MockStreamableParser()
delta_message, tools_streamed = extract_harmony_streaming_delta(
harmony_parser=parser,
cur_channel=channel,
cur_recipient=recipient,
prev_recipient=None,
delta_text="some text",
include_reasoning=True,
)
assert delta_message is None
assert tools_streamed is False

View File

@ -13,6 +13,7 @@ DTYPES = [torch.bfloat16, torch.float16]
IS_NEOX = [True, False]
EPS_VALUES = [1e-5, 1e-6]
SEEDS = [13]
PARTIAL_ROPE = [True, False]
CUDA_DEVICES = ["cuda:0"]
@ -52,6 +53,7 @@ def _apply_qk_norm_rope(
@pytest.mark.parametrize("is_neox", IS_NEOX)
@pytest.mark.parametrize("eps", EPS_VALUES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
@torch.inference_mode()
def test_fused_qk_norm_rope_matches_reference(
device: str,
@ -59,6 +61,7 @@ def test_fused_qk_norm_rope_matches_reference(
is_neox: bool,
eps: float,
seed: int,
rotary_ratio: float,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
@ -76,10 +79,10 @@ def test_fused_qk_norm_rope_matches_reference(
k_norm.weight.data.normal_(mean=1.0, std=0.1)
q_weight = q_norm.weight.data
k_weight = k_norm.weight.data
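# rotary_ratio < 1.0 exercises partial RoPE: only the first rotary_dim
# dims of each head are rotated.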
rotary_dim = int(head_dim * rotary_ratio)
rope = RotaryEmbedding(
head_size=head_dim,
rotary_dim=head_dim,
rotary_dim=rotary_dim,
max_position_embeddings=4096,
base=10000.0,
is_neox_style=is_neox,

View File

@ -258,16 +258,16 @@ class Config:
f"{self.fe_supported_types()}."
)
# Check block quanization support
is_block_quatized = self.quant_block_shape is not None
if is_block_quatized and self.quant_dtype is None:
# Check block quantization support
is_block_quantized = self.quant_block_shape is not None
if is_block_quantized and self.quant_dtype is None:
return False, "No block quantization support."
if is_block_quatized and not self.is_block_quant_supported():
if is_block_quantized and not self.is_block_quant_supported():
return False, "Mismatched block quantization support."
# deep_gemm only works with block-quantized
if self.needs_deep_gemm() and not is_block_quatized:
if self.needs_deep_gemm() and not is_block_quantized:
return False, "Needs DeepGEMM but not block quantized."
# Check dependencies (turn into asserts?)

View File

@ -1,92 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# DeepGEMM Style Cutlass Grouped GEMM Test
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py
import random
import pytest
import torch
from tests.kernels.moe.utils import per_token_cast_to_fp8
from tests.kernels.utils import baseline_scaled_mm
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.deep_gemm import per_block_cast_to_fp8
from vllm.utils.math_utils import cdiv
@pytest.mark.parametrize(
"num_groups, expected_m_per_group, k, n",
[
(4, 8192, 7168, 4096),
(4, 8192, 2048, 7168),
(8, 4096, 7168, 4096),
(8, 4096, 2048, 7168),
(32, 1024, 7168, 4096),
(32, 1024, 2048, 7168),
],
)
@pytest.mark.parametrize("out_dtype", [torch.float16])
@pytest.mark.skipif(
(lambda x: x is None or x.to_int() != 100)(
current_platform.get_device_capability()
),
reason="Block Scaled Grouped GEMM is only supported on SM100.",
)
def test_cutlass_grouped_gemm(
num_groups: int,
expected_m_per_group: int,
k: int,
n: int,
out_dtype: torch.dtype,
):
device = "cuda"
alignment = 128
group_ms = [
int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups)
]
m = sum([cdiv(m, alignment) * alignment for m in group_ms])
x = torch.randn((m, k), device=device, dtype=out_dtype)
y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
out = torch.empty((m, n), device=device, dtype=out_dtype)
ref_out = torch.randn((m, n), device=device, dtype=out_dtype)
ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
pb_size = []
for i in range(num_groups):
pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
x_fp8 = per_token_cast_to_fp8(x)
y_fp8 = (
torch.empty_like(y, dtype=torch.float8_e4m3fn),
torch.empty(
(num_groups, cdiv(n, 128), k // 128), device=device, dtype=torch.float
),
)
for i in range(num_groups):
y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128])
for i in range(num_groups):
a = x_fp8[0][ep_offset[i] : ep_offset[i + 1]]
a_scale = x_fp8[1][ep_offset[i] : ep_offset[i + 1]]
b = y_fp8[0][i].t()
b_scale = y_fp8[1][i].t()
baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
ref_out[ep_offset[i] : ep_offset[i + 1]] = baseline
ops.cutlass_blockwise_scaled_grouped_mm(
out,
x_fp8[0],
y_fp8[0],
x_fp8[1],
y_fp8[1],
problem_sizes,
expert_offsets[:-1],
)
torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)

View File

@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
from vllm.v1.worker.workspace import init_workspace_manager
NUM_EXPERTS = [8, 64, 192]
EP_SIZE = [1, 4]
@ -487,6 +488,7 @@ def test_mixtral_moe(
monkeypatch.setenv("MASTER_ADDR", "localhost")
monkeypatch.setenv("MASTER_PORT", "12345")
init_distributed_environment()
init_workspace_manager(torch.cuda.current_device())
# Instantiate our and huggingface's MoE blocks
vllm_config.compilation_config.static_forward_context = dict()
@ -533,6 +535,11 @@ def test_mixtral_moe(
torch.cuda.synchronize()
torch.cuda.empty_cache()
# FIXME (zyongye) fix this after we move self.kernel
# assignment in FusedMoE.__init__
vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
# Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(hf_inputs)
vllm_states = vllm_moe.forward(vllm_inputs)

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
"""Pytest configuration for vLLM multimodal tests."""
import warnings
@ -9,16 +9,13 @@ import torch
from vllm.platforms import current_platform
def pytest_configure(config):
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF
Transformers accuracy issues.
"""
def pytest_collection_modifyitems(config, items):
"""Configure ROCm-specific settings based on collected tests."""
if not current_platform.is_rocm():
return
skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers

View File

@ -173,6 +173,13 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
vllm_runner_kwargs={
"attention_config": {
"backend": "ROCM_AITER_FA",
},
}
if current_platform.is_rocm()
else None,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[
pytest.mark.core_model,
@ -253,8 +260,19 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={
"model_impl": "transformers",
# TODO: [ROCm] Revert this once issue #30167 is resolved
**(
{
"mm_processor_kwargs": {
"min_pixels": 256 * 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
}
if current_platform.is_rocm()
else {}
),
},
marks=[large_gpu_mark(min_gb=32)],
marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
),
#### Extended model tests
"aria": VLMTestInfo(
@ -645,7 +663,17 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=80)],
marks=[
large_gpu_mark(min_gb=80),
# TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
pytest.mark.skipif(
current_platform.is_rocm(),
reason=(
"ROCm: Model too large for single GPU; "
"multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
),
),
],
),
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],

View File

@ -39,7 +39,7 @@ models = [MODEL_NAME]
def granite_speech_attention_config():
"""Return attention config for Granite Speech tests on ROCm."""
if current_platform.is_rocm():
return {"backend": "TRITON_ATTN"}
return {"backend": "ROCM_AITER_FA"}
return None

View File

@ -138,7 +138,7 @@ def create_batched_mm_kwargs(
)
# TODO(Isotr0py): Don't initalize model during test
# TODO(Isotr0py): Don't initialize model during test
@contextmanager
def initialize_dummy_model(
model_cls: type[nn.Module],

View File

@ -459,6 +459,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
"MiMoV2FlashForCausalLM": _HfExamplesInfo(
"XiaomiMiMo/MiMo-V2-Flash", trust_remote_code=True
),
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
}

View File

@ -0,0 +1,249 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for embedding shape validation.
Simple, fast unit tests that can run without server fixtures.
Run with: pytest tests/multimodal/test_embedding_shape_validation_unit.py -v
"""
import pytest
import torch
from vllm.multimodal.parse import (
AudioEmbeddingItems,
ImageEmbeddingItems,
)
class TestImageEmbedBasicValidation:
"""Test basic ndim validation in image embeddings via ImageEmbeddingItems."""
def test_valid_2d_tensor_accepted(self):
"""Baseline: 2D tensors should be accepted."""
valid_tensor = torch.randn(10, 768, dtype=torch.float32)
# Should not raise - 2D is valid
items = ImageEmbeddingItems(valid_tensor)
assert items.get_count() == 10
def test_valid_3d_tensor_accepted(self):
"""Baseline: 3D tensors should be accepted."""
valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32)
# Should not raise - 3D is valid
items = ImageEmbeddingItems(valid_tensor)
assert items.get_count() == 2
def test_valid_list_of_2d_tensors_accepted(self):
"""Baseline: List of 2D tensors should be accepted."""
tensors = [
torch.randn(10, 768, dtype=torch.float32),
torch.randn(15, 768, dtype=torch.float32),
]
# Should not raise
items = ImageEmbeddingItems(tensors)
assert items.get_count() == 2
def test_1d_tensor_rejected(self):
"""Security: 1D tensors should be rejected (invalid ndim)."""
invalid_tensor = torch.randn(768, dtype=torch.float32) # 1D
with pytest.raises(ValueError) as exc_info:
ImageEmbeddingItems(invalid_tensor)
assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
def test_4d_tensor_rejected(self):
"""Security: 4D tensors should be rejected (invalid ndim)."""
invalid_tensor = torch.randn(1, 2, 10, 768, dtype=torch.float32) # 4D
with pytest.raises(ValueError) as exc_info:
ImageEmbeddingItems(invalid_tensor)
assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
def test_hidden_size_validation_correct_size(self):
"""Embeddings with correct hidden size should be accepted."""
expected_hidden_size = 768
valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32)
# Should not raise
items = ImageEmbeddingItems(
valid_tensor, expected_hidden_size=expected_hidden_size
)
assert items.get_count() == 10
def test_hidden_size_validation_wrong_size_rejected(self):
"""Embeddings with wrong hidden size should be rejected."""
expected_hidden_size = 768
wrong_hidden_size = 4096
invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32)
with pytest.raises(ValueError) as exc_info:
ImageEmbeddingItems(
invalid_tensor, expected_hidden_size=expected_hidden_size
)
error_msg = str(exc_info.value)
assert "hidden dimension mismatch" in error_msg.lower()
assert str(wrong_hidden_size) in error_msg
assert str(expected_hidden_size) in error_msg
class TestAudioEmbedBasicValidation:
"""Test basic ndim validation in audio embeddings via AudioEmbeddingItems."""
def test_valid_2d_tensor_accepted(self):
"""Baseline: 2D tensors should be accepted."""
valid_tensor = torch.randn(10, 768, dtype=torch.float32)
# Should not raise - 2D is valid
items = AudioEmbeddingItems(valid_tensor)
assert items.get_count() == 10
def test_valid_3d_tensor_accepted(self):
"""Baseline: 3D tensors should be accepted."""
valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32)
# Should not raise - 3D is valid
items = AudioEmbeddingItems(valid_tensor)
assert items.get_count() == 2
def test_valid_list_of_2d_tensors_accepted(self):
"""Baseline: List of 2D tensors should be accepted."""
tensors = [
torch.randn(10, 768, dtype=torch.float32),
torch.randn(15, 768, dtype=torch.float32),
]
# Should not raise
items = AudioEmbeddingItems(tensors)
assert items.get_count() == 2
def test_1d_tensor_rejected(self):
"""Security: 1D tensors should be rejected (invalid ndim)."""
invalid_tensor = torch.randn(768, dtype=torch.float32) # 1D
with pytest.raises(ValueError) as exc_info:
AudioEmbeddingItems(invalid_tensor)
assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
def test_scalar_rejected(self):
"""Security: Scalar tensors should be rejected."""
invalid_tensor = torch.tensor(1.0) # 0D (scalar)
with pytest.raises(ValueError):
AudioEmbeddingItems(invalid_tensor)
def test_hidden_size_validation_correct_size(self):
"""Embeddings with correct hidden size should be accepted."""
expected_hidden_size = 768
valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32)
# Should not raise
items = AudioEmbeddingItems(
valid_tensor, expected_hidden_size=expected_hidden_size
)
assert items.get_count() == 10
def test_hidden_size_validation_wrong_size_rejected(self):
"""Embeddings with wrong hidden size should be rejected."""
expected_hidden_size = 768
wrong_hidden_size = 4096
invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32)
with pytest.raises(ValueError) as exc_info:
AudioEmbeddingItems(
invalid_tensor, expected_hidden_size=expected_hidden_size
)
error_msg = str(exc_info.value)
assert "hidden dimension mismatch" in error_msg.lower()
assert str(wrong_hidden_size) in error_msg
assert str(expected_hidden_size) in error_msg
class TestShapeValidationDoSPrevention:
"""
Tests for DoS prevention through shape validation.
Verifies that embeddings with incorrect shapes are rejected early,
preventing crashes during model inference.
"""
def test_prevent_crash_from_wrong_shape_image_embeds(self):
"""
Prevent crash scenario: wrong hidden size in image embeddings.
Without validation, this would pass initial checks but crash later
during model forward pass when dimensions don't match.
"""
expected_hidden_size = 768 # Typical model hidden size
wrong_hidden_size = 4096 # Wrong size (e.g., Llama-sized)
wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)
# Should be rejected at instantiation time, not during inference
with pytest.raises(ValueError) as exc_info:
ImageEmbeddingItems(
wrong_embedding, expected_hidden_size=expected_hidden_size
)
error_msg = str(exc_info.value)
assert "hidden dimension mismatch" in error_msg.lower()
assert str(expected_hidden_size) in error_msg # Expected
assert str(wrong_hidden_size) in error_msg # Received
def test_prevent_crash_from_wrong_shape_audio_embeds(self):
"""
Prevent crash scenario: wrong hidden size in audio embeddings.
"""
expected_hidden_size = 768
wrong_hidden_size = 4096
wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)
with pytest.raises(ValueError) as exc_info:
AudioEmbeddingItems(
wrong_embedding, expected_hidden_size=expected_hidden_size
)
error_msg = str(exc_info.value)
assert "hidden dimension mismatch" in error_msg.lower()
def test_extremely_large_hidden_size_rejected(self):
"""Security: Prevent DoS from extremely large embeddings."""
expected_hidden_size = 768
huge_hidden_size = 100000 # Large but not extreme to avoid test OOM
invalid_tensor = torch.randn(10, huge_hidden_size, dtype=torch.float32)
with pytest.raises(ValueError) as exc_info:
ImageEmbeddingItems(
invalid_tensor, expected_hidden_size=expected_hidden_size
)
assert "hidden dimension mismatch" in str(exc_info.value).lower()
def test_batch_with_mixed_hidden_sizes_rejected(self):
"""All embeddings in a list must have the same hidden size."""
expected_hidden_size = 768
# One correct, one wrong
batch = [
torch.randn(10, expected_hidden_size, dtype=torch.float32),
torch.randn(10, expected_hidden_size + 100, dtype=torch.float32), # Wrong!
]
# Should fail on the second one
with pytest.raises(ValueError) as exc_info:
ImageEmbeddingItems(batch, expected_hidden_size=expected_hidden_size)
assert "hidden dimension mismatch" in str(exc_info.value).lower()
if __name__ == "__main__":
pytest.main([__file__, "-v", "--tb=short"])
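For orientation, the validation these tests exercise lives in vllm.multimodal.parse; below is a simplified sketch of the checks, written here as an assumption for illustration, not the shipped implementation:

import torch

def _validate_embedding(t: torch.Tensor, expected_hidden_size: int | None = None) -> None:
    # Hypothetical stand-in for the checks in Image/AudioEmbeddingItems
    if t.ndim not in (2, 3):
        raise ValueError(f"Embeddings must be 2D or 3D, got {t.ndim}D")
    if expected_hidden_size is not None and t.shape[-1] != expected_hidden_size:
        raise ValueError(
            f"Hidden dimension mismatch: expected {expected_hidden_size}, "
            f"got {t.shape[-1]}"
        )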

View File

@ -83,7 +83,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
with vllm_runner(model_path, enforce_eager=True) as llm:
@ -161,7 +161,7 @@ def test_compressed_tensors_w8a8_logprobs(
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
@ -231,7 +231,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:

View File

@ -15,6 +15,7 @@ from vllm.model_executor.layers.quantization.fp8 import (
Fp8Config,
Fp8KVCacheMethod,
Fp8LinearMethod,
Fp8MoeBackend,
Fp8MoEMethod,
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@ -216,7 +217,7 @@ def test_scaled_fp8_quant(dtype) -> None:
ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
# Reference dynamic quantizaton
# Reference dynamic quantization
y = quantize_ref(x, inv_scale)
torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
@ -324,7 +325,10 @@ def test_fp8_reloading(
weight_loader=default_weight_loader,
)
# Fp8LinearMethod uses use_marlin
# Fp8MoEMethod uses fp8_backend
method.use_marlin = use_marlin
method.fp8_backend = Fp8MoeBackend.MARLIN if use_marlin else None
# capture weights format during loading
original_metadata = [

View File

@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`.
"""
import os
from typing import NoReturn
import pytest
import torch
@ -19,6 +20,28 @@ def enable_pickle(monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
def _skip(msg: str) -> NoReturn:
    # pytest.skip() always raises; the unreachable raise below satisfies the
    # NoReturn annotation for type checkers.
    pytest.skip(msg)
    raise RuntimeError(msg)
def _snapshot_download_or_skip(model_id: str) -> str:
try:
from huggingface_hub import snapshot_download
except Exception as e: # pragma: no cover
_skip(f"huggingface_hub is required to download {model_id}: {e}")
try:
return snapshot_download(
repo_id=model_id,
repo_type="model",
# These checkpoints are already small; download full repo for simplicity.
allow_patterns=["*"],
)
except Exception as e:
_skip(f"Failed to download {model_id} from the HF Hub: {e}")
@pytest.mark.skipif(
not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.",
@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output
print(f"ModelOpt FP8 output: {output}")
@pytest.mark.skipif(
not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
"""Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup."""
model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
model_path = _snapshot_download_or_skip(model_id)
with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
o_proj = layer.self_attn.o_proj
gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptFp8PcPtLinearMethod,
)
assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod)
assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod)
assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod)
assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod)
assert qkv_proj.weight.dtype == torch.float8_e4m3fn
assert o_proj.weight.dtype == torch.float8_e4m3fn
assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
assert down_proj.weight.dtype == torch.float8_e4m3fn
# Per-channel scales; activations are dynamically scaled per token.
assert hasattr(qkv_proj, "weight_scale")
assert qkv_proj.weight_scale.dtype == torch.float32
assert qkv_proj.weight_scale.dim() == 1
assert not hasattr(qkv_proj, "input_scale")
assert hasattr(o_proj, "weight_scale")
assert o_proj.weight_scale.dtype == torch.float32
assert o_proj.weight_scale.dim() == 1
assert not hasattr(o_proj, "input_scale")
assert hasattr(gate_up_proj, "weight_scale")
assert gate_up_proj.weight_scale.dtype == torch.float32
assert gate_up_proj.weight_scale.dim() == 1
assert not hasattr(gate_up_proj, "input_scale")
assert hasattr(down_proj, "weight_scale")
assert down_proj.weight_scale.dtype == torch.float32
assert down_proj.weight_scale.dim() == 1
assert not hasattr(down_proj, "input_scale")
llm.apply_model(check_model)
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output
print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}")
@pytest.mark.skipif(
not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
"""Test ModelOpt FP8_PB_WO checkpoint setup."""
model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
model_path = _snapshot_download_or_skip(model_id)
with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
o_proj = layer.self_attn.o_proj
gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptFp8PbWoLinearMethod,
)
assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod)
assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod)
assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod)
assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod)
assert qkv_proj.weight.dtype == torch.float8_e4m3fn
assert o_proj.weight.dtype == torch.float8_e4m3fn
assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
assert down_proj.weight.dtype == torch.float8_e4m3fn
# Block scales; should be materialized as a 2D [out_blk, in_blk] tensor.
assert hasattr(qkv_proj, "weight_scale")
assert qkv_proj.weight_scale.dtype == torch.float32
assert qkv_proj.weight_scale.dim() == 2
assert hasattr(o_proj, "weight_scale")
assert o_proj.weight_scale.dtype == torch.float32
assert o_proj.weight_scale.dim() == 2
assert hasattr(gate_up_proj, "weight_scale")
assert gate_up_proj.weight_scale.dtype == torch.float32
assert gate_up_proj.weight_scale.dim() == 2
assert hasattr(down_proj, "weight_scale")
assert down_proj.weight_scale.dtype == torch.float32
assert down_proj.weight_scale.dim() == 2
llm.apply_model(check_model)
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output
print(f"ModelOpt FP8_PB_WO output: {output}")

View File

@ -18,25 +18,37 @@ for i in {1..5}; do
echo "Checking metadata.json URL (attempt $i)..."
if curl --fail "$meta_json_url" > metadata.json; then
echo "INFO: metadata.json URL is valid."
# check whether it is valid json by python
# check whether it is valid json by python (printed to stdout)
if python3 -m json.tool metadata.json; then
echo "INFO: metadata.json is valid JSON. Proceeding with the test."
echo "INFO: metadata.json is valid JSON. Proceeding with the check."
# check whether there is an object in the json matching:
# "package_name": "vllm", and "platform_tag" matches the current architecture
# see `determine_wheel_url` in setup.py for more details
if python3 -c "import platform as p,json as j,sys as s; d = j.load(open('metadata.json')); \
s.exit(int(not any(o.get('package_name') == 'vllm' and p.machine() in o.get('platform_tag') \
for o in d)))" 2>/dev/null; then
echo "INFO: metadata.json contains a pre-compiled wheel for the current architecture."
break
else
echo "WARN: metadata.json does not have a pre-compiled wheel for the current architecture."
fi
else
echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!"
echo "INFO: metadata.json content:"
cat metadata.json
exit 1
fi
break
fi
# failure handling
# failure handling & retry logic
if [ $i -eq 5 ]; then
echo "ERROR: metadata.json URL is still not valid after 5 attempts."
echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists."
echo "ERROR: metadata is still not available after 5 attempts."
echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit is available."
echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes."
echo " NOTE: If it fails, please report in #sig-ci channel."
exit 1
else
echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..."
sleep 180
echo "WARNING: metadata is not available. Retrying after 5 minutes..."
sleep 300
fi
done
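Expanded for readability, the inline check above is equivalent to roughly the following Python (the metadata.json structure — a list of wheel descriptor objects — is inferred from the one-liner):

import json
import platform
import sys

with open("metadata.json") as f:
    entries = json.load(f)  # assumed: a list of wheel descriptor objects

# A wheel matches if it is the vllm package and its platform tag contains
# the current machine architecture (e.g. x86_64, aarch64).
ok = any(
    entry.get("package_name") == "vllm"
    and platform.machine() in entry.get("platform_tag", "")
    for entry in entries
)
sys.exit(0 if ok else 1)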

View File

@ -0,0 +1,169 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.attention.backends.abstract import (
AttentionBackend,
AttentionImpl,
)
from vllm.attention.backends.registry import (
AttentionBackendEnum,
MambaAttentionBackendEnum,
register_backend,
)
class CustomAttentionImpl(AttentionImpl):
"""Mock custom attention implementation for testing."""
def __init__(self, *args, **kwargs):
super().__init__()
def forward(self, *args, **kwargs):
"""Mock forward pass."""
pass
class CustomAttentionBackend(AttentionBackend):
"""Mock custom attention backend for testing."""
@staticmethod
def get_name():
return "CUSTOM"
@staticmethod
def get_impl_cls():
return CustomAttentionImpl
@staticmethod
def get_builder_cls():
"""Mock builder class."""
return None
@staticmethod
def get_required_kv_cache_layout():
"""Mock KV cache layout."""
return None
class CustomMambaAttentionImpl(AttentionImpl):
"""Mock custom mamba attention implementation for testing."""
def __init__(self, *args, **kwargs):
super().__init__()
def forward(self, *args, **kwargs):
"""Mock forward pass."""
pass
class CustomMambaAttentionBackend(AttentionBackend):
"""Mock custom mamba attention backend for testing."""
@staticmethod
def get_name():
return "CUSTOM_MAMBA"
@staticmethod
def get_impl_cls():
return CustomMambaAttentionImpl
@staticmethod
def get_builder_cls():
"""Mock builder class."""
return None
@staticmethod
def get_required_kv_cache_layout():
"""Mock KV cache layout."""
return None
def test_custom_is_not_alias_of_any_backend():
# Get all members of AttentionBackendEnum
all_backends = list(AttentionBackendEnum)
# Find any aliases of CUSTOM
aliases = []
for backend in all_backends:
if backend.name != "CUSTOM" and backend is AttentionBackendEnum.CUSTOM:
aliases.append(backend.name)
# CUSTOM should not be an alias of any other backend
assert len(aliases) == 0, (
f"BUG! CUSTOM is an alias of: {', '.join(aliases)}!\n"
f"CUSTOM.value = {repr(AttentionBackendEnum.CUSTOM.value)}\n"
f"This happens when CUSTOM has the same value as another backend.\n"
f"When you register to CUSTOM, you're actually registering to {aliases[0]}!\n"
f"All backend values:\n"
+ "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends)
)
# Verify CUSTOM has its own unique identity
assert AttentionBackendEnum.CUSTOM.name == "CUSTOM", (
f"CUSTOM.name should be 'CUSTOM', but got '{AttentionBackendEnum.CUSTOM.name}'"
)
def test_register_custom_backend_with_class_path():
# Register with explicit class path
register_backend(
backend=AttentionBackendEnum.CUSTOM,
class_path="tests.test_attention_backend_registry.CustomAttentionBackend",
is_mamba=False,
)
# Check that CUSTOM backend is registered
assert AttentionBackendEnum.CUSTOM.is_overridden(), (
"CUSTOM should be overridden after registration"
)
# Get the registered class path
class_path = AttentionBackendEnum.CUSTOM.get_path()
assert class_path == "tests.test_attention_backend_registry.CustomAttentionBackend"
# Get the backend class
backend_cls = AttentionBackendEnum.CUSTOM.get_class()
assert backend_cls.get_name() == "CUSTOM"
assert backend_cls.get_impl_cls() == CustomAttentionImpl
def test_mamba_custom_is_not_alias_of_any_backend():
# Get all mamba backends
all_backends = list(MambaAttentionBackendEnum)
# Find any aliases of CUSTOM
aliases = []
for backend in all_backends:
if backend.name != "CUSTOM" and backend is MambaAttentionBackendEnum.CUSTOM:
aliases.append(backend.name)
# CUSTOM should not be an alias of any other backend
assert len(aliases) == 0, (
f"BUG! MambaAttentionBackendEnum.CUSTOM is an alias of: {', '.join(aliases)}!\n"
f"CUSTOM.value = {repr(MambaAttentionBackendEnum.CUSTOM.value)}\n"
f"All mamba backend values:\n"
+ "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends)
)
def test_register_custom_mamba_backend_with_class_path():
# Register with explicit class path
register_backend(
backend=MambaAttentionBackendEnum.CUSTOM,
class_path="tests.test_attention_backend_registry.CustomMambaAttentionBackend",
is_mamba=True,
)
# Check that the backend is registered
assert MambaAttentionBackendEnum.CUSTOM.is_overridden()
# Get the registered class path
class_path = MambaAttentionBackendEnum.CUSTOM.get_path()
assert (
class_path
== "tests.test_attention_backend_registry.CustomMambaAttentionBackend"
)
# Get the backend class
backend_cls = MambaAttentionBackendEnum.CUSTOM.get_class()
assert backend_cls.get_name() == "CUSTOM_MAMBA"
assert backend_cls.get_impl_cls() == CustomMambaAttentionImpl

View File

@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device):
envs.environment_variables[env_name] = lambda s=strategy: s
# Test the select_experts method
topk_weights, topk_ids, _ = fused_moe.select_experts(
topk_weights, topk_ids = fused_moe.select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
)

View File

@ -2,12 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import importlib
import os
import signal
import time
import uuid
from dataclasses import dataclass
from threading import Thread
from types import SimpleNamespace
from typing import Any
from unittest.mock import MagicMock
@ -24,7 +26,11 @@ from vllm.usage.usage_lib import UsageContext
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient
from vllm.v1.engine.core_client import (
AsyncMPClient,
EngineCoreClient,
SyncMPClient,
)
from vllm.v1.engine.utils import CoreEngineProcManager
from vllm.v1.executor.abstract import Executor
@ -60,6 +66,91 @@ def make_request(
)
def _reload_envs_module():
import vllm.envs as envs_mod
cache_clear = getattr(getattr(envs_mod, "__getattr__", None), "cache_clear", None)
if cache_clear is not None:
cache_clear()
return importlib.reload(envs_mod)
def _reload_core_client_module():
module = importlib.import_module("vllm.v1.engine.core_client")
return importlib.reload(module)
def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
timeout_value = 654
monkeypatch.setenv("VLLM_ENGINE_READY_TIMEOUT_S", str(timeout_value))
# Ensure that the environment variable is loaded if caching is enabled
_reload_envs_module()
core_client_mod = _reload_core_client_module()
poll_timeouts: list[int] = []
class ShadowSocket:
def poll(self, timeout: int) -> int:
# Capture the timeout value for each poll call
poll_timeouts.append(timeout)
return 1
def recv_multipart(self):
return (b"\x00\x00", b"ready")
class DummySocket:
def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):
if track:
return SimpleNamespace(done=True)
def recv_multipart(self, *, copy: bool = False):
return (b"", b"")
def close(self, *, linger: int = 0):
pass
def bind(self, _address):
pass
def connect(self, _address):
pass
def setsockopt(self, *_args, **_kwargs):
pass
monkeypatch.setattr(core_client_mod.zmq.Socket, "shadow", lambda *_: ShadowSocket())
monkeypatch.setattr(
core_client_mod, "make_zmq_socket", lambda *_, **__: DummySocket()
)
parallel_config = SimpleNamespace(
data_parallel_size=1,
data_parallel_rank=0,
data_parallel_size_local=1,
data_parallel_rank_local=None,
data_parallel_hybrid_lb=False,
data_parallel_external_lb=False,
)
vllm_config = SimpleNamespace(parallel_config=parallel_config)
client = core_client_mod.MPClient(
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=object,
log_stats=False,
client_addresses={
"input_address": "inproc://input",
"output_address": "inproc://output",
},
)
try:
# timeout_value is in seconds, but poll receives milliseconds
assert poll_timeouts == [timeout_value * 1000]
finally:
client.shutdown()
def loop_until_done(client: EngineCoreClient, outputs: dict):
while True:
engine_core_outputs = client.get_output().outputs

View File

@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch.cuda
from vllm import LLM, SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
"""Test that preprocessing errors are handled gracefully."""
assert not torch.cuda.is_initialized(), (
"fork needs to be used for the engine "
"core process and this isn't possible if cuda is already initialized"
)
# Store original method to call for non-failing requests
original_preprocess = EngineCore.preprocess_add_request
    # Monkeypatch preprocess_add_request to raise an exception only for
    # requests whose first prompt token id is the sentinel value 333
def conditional_failing_preprocess(self, request: EngineCoreRequest):
# Fail if the first token id is 333
if request.prompt_token_ids and request.prompt_token_ids[0] == 333:
raise ValueError("Simulated preprocessing error!")
return original_preprocess(self, request)
monkeypatch.setattr(
EngineCore, "preprocess_add_request", conditional_failing_preprocess
)
llm = LLM(model=MODEL_NAME)
# Create a failing request by crafting a request with an invalid token
# We need to use a direct approach since LLM.generate tokenizes for us
from vllm.inputs import TokensPrompt
    # The preprocessing failure should surface as an errored request rather
    # than an exception; token id 333 triggers the simulated failure
failing_prompt = TokensPrompt(prompt_token_ids=[333])
outputs = llm.generate(failing_prompt, SamplingParams(max_tokens=10)) # type: ignore
assert len(outputs) == 1
assert len(outputs[0].outputs[0].token_ids) == 0
assert outputs[0].finished
assert outputs[0].outputs[0].finish_reason == "error"
# Verify the engine is still functional with a normal request
outputs = llm.generate("Hello, my name is", SamplingParams(max_tokens=10))
assert len(outputs) == 1
assert len(outputs[0].outputs[0].token_ids) > 0
assert outputs[0].outputs[0].finish_reason in ("stop", "length")

View File

@ -547,6 +547,13 @@ def test_spec_decode_logprobs(
sampling_params = SamplingParams(
temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False
)
penalty_sampling_params = SamplingParams(
temperature=0,
logprobs=top_logprobs,
max_tokens=10,
ignore_eos=False,
presence_penalty=-1.0,
)
method, model_name, spec_model_name = model_setup
max_model_len = 256
@ -558,14 +565,17 @@ def test_spec_decode_logprobs(
seed=42,
logprobs_mode=logprobs_mode,
gpu_memory_utilization=0.4,
enable_prefix_caching=False,
)
ref_results = ref_llm.generate(
[prompt, prompt], [sampling_params, penalty_sampling_params]
)
ref_results = ref_llm.generate([prompt], sampling_params)
# Collect logprobs outputs from reference LLM.
ref_logprobs = []
for output in ref_results[0].outputs:
for logprobs in output.logprobs:
for token_id in logprobs:
ref_logprobs.append(logprobs[token_id])
for results in ref_results:
for output in results.outputs:
for logprobs in output.logprobs:
ref_logprobs.extend(logprobs.values())
del ref_llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
@ -587,14 +597,17 @@ def test_spec_decode_logprobs(
# Force prefill chunking
enable_chunked_prefill=True,
max_num_batched_tokens=32,
enable_prefix_caching=False,
)
spec_results = spec_llm.generate(
[prompt, prompt], [sampling_params, penalty_sampling_params]
)
spec_results = spec_llm.generate([prompt], sampling_params)
# Collect logprobs outputs from spec decode LLM.
spec_logprobs = []
for output in spec_results[0].outputs:
for logprobs in output.logprobs:
for token_id in logprobs:
spec_logprobs.append(logprobs[token_id])
for results in spec_results:
for output in results.outputs:
for logprobs in output.logprobs:
spec_logprobs.extend(logprobs.values())
del spec_llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

View File

@ -761,7 +761,7 @@ class rocm_aiter_ops:
@classmethod
@if_aiter_supported
def is_linear_fp8_enaled(cls) -> bool:
def is_linear_fp8_enabled(cls) -> bool:
return cls.is_linear_enabled()
@classmethod

View File

@ -788,20 +788,6 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
def cutlass_blockwise_scaled_grouped_mm(
output: torch.Tensor,
a: torch.Tensor,
b: torch.Tensor,
scales_a: torch.Tensor,
scales_b: torch.Tensor,
problem_sizes: torch.Tensor,
expert_offsets: torch.Tensor,
):
torch.ops._C.cutlass_blockwise_scaled_grouped_mm(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets
)
def cutlass_scaled_fp4_mm(
a: torch.Tensor,
b: torch.Tensor,

View File

@ -77,7 +77,8 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
)
CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend"
# Placeholder for third-party/custom backends - must be registered before use
CUSTOM = ""
    # Use None instead of "" so CUSTOM cannot alias another backend whose value is an empty string
CUSTOM = None
def get_path(self, include_classname: bool = True) -> str:
"""Get the class path for this backend (respects overrides).
@ -139,7 +140,8 @@ class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend"
GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend"
# Placeholder for third-party/custom backends - must be registered before use
CUSTOM = ""
    # Use None instead of "" so CUSTOM cannot alias another backend whose value is an empty string
CUSTOM = None
def get_path(self, include_classname: bool = True) -> str:
"""Get the class path for this backend (respects overrides).

View File

@ -15,7 +15,7 @@ def merge_attn_states(
output_lse: torch.Tensor | None = None,
) -> None:
# NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel
# is not support for FP8 dtype, fallback to use Triton kernel.
    # does not support the FP8 dtype; fall back to the Triton kernel.
def supported_dtypes(o: torch.Tensor) -> bool:
return o.dtype in [torch.float32, torch.half, torch.bfloat16]

View File

@ -189,9 +189,14 @@ def kernel_unified_attention_2d(
+ 1
)
# adjust for potential padding in the last q_block by considering the
# actual sequence length
max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len)
if USE_MM_PREFIX:
        # Image bidirectional attention needs the full range, including
        # q_block padding, so that the document mask stays correct
max_seq_prefix_len = tl.maximum(max_seq_prefix_len, seq_len)
else:
# adjust for potential padding in the last q_block by considering the
# actual sequence length
max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len)
# calculate the number of tiles that need to be processed to
# cover the longest sequence prefix (due to causal masking, tiles beyond
@ -202,7 +207,8 @@ def kernel_unified_attention_2d(
# Default: keep previous global behavior
tile_start = 0
tile_end = num_tiles
if SLIDING_WINDOW > 0:
# TODO(Isotr0py): sliding window pruning with image bidirectional mask
if SLIDING_WINDOW > 0 and not USE_MM_PREFIX:
# Query rows covered by this Q-block
qpos_lo = q_block_local_idx * BLOCK_Q
qpos_hi = tl.minimum(
@ -357,6 +363,12 @@ def kernel_unified_attention_2d(
L = L * alpha + l_j
M = m_j
if SLIDING_WINDOW:
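        # Zero V for keys outside the sliding window of this Q-block's
        # first query row, so they contribute nothing to the accumulator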
qpos_lo = q_block_local_idx * BLOCK_Q
V = tl.where(
(context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0
)
# acc : (BLOCK_M, HEAD_SIZE_PADDED)
acc += tl.dot(P.to(V.dtype), V)
@ -672,6 +684,12 @@ def kernel_unified_attention_3d(
L = L * alpha + l_j
M = m_j
if SLIDING_WINDOW:
qpos_lo = q_block_local_idx * BLOCK_Q
V = tl.where(
(context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0
)
# acc : (BLOCK_M, HEAD_SIZE_PADDED)
acc += tl.dot(P.to(V.dtype), V)

View File

@ -18,6 +18,7 @@ from vllm.config.lora import LoRAConfig
from vllm.config.model import (
ModelConfig,
iter_architecture_defaults,
str_dtype_to_torch_dtype,
try_match_architecture_defaults,
)
from vllm.config.multimodal import MultiModalConfig
@ -72,6 +73,7 @@ __all__ = [
# From vllm.config.model
"ModelConfig",
"iter_architecture_defaults",
"str_dtype_to_torch_dtype",
"try_match_architecture_defaults",
# From vllm.config.multimodal
"MultiModalConfig",

View File

@ -71,7 +71,7 @@ else:
logger = init_logger(__name__)
RunnerOption = Literal["auto", RunnerType]
ConvertType = Literal["none", "embed", "classify", "reward"]
ConvertType = Literal["none", "embed", "classify", "reward", "mm_encoder_only"]
ConvertOption = Literal["auto", ConvertType]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
@ -843,12 +843,18 @@ class ModelConfig:
producer_name = quant_cfg.get("producer", {}).get("name")
if producer_name == "modelopt":
quant_algo = quant_cfg.get("quantization", {}).get("quant_algo")
if quant_algo == "FP8":
quant_cfg["quant_method"] = "modelopt"
elif quant_algo == "NVFP4":
quant_cfg["quant_method"] = "modelopt_fp4"
elif quant_algo is not None:
raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}")
if quant_algo is not None:
quant_algo_upper = str(quant_algo).upper()
if quant_algo_upper in {
"FP8",
"FP8_PER_CHANNEL_PER_TOKEN",
"FP8_PB_WO",
}:
quant_cfg["quant_method"] = "modelopt"
elif quant_algo_upper == "NVFP4":
quant_cfg["quant_method"] = "modelopt_fp4"
else:
raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}")
return quant_cfg
@ -1849,6 +1855,11 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
"bfloat16": torch.bfloat16,
}
def str_dtype_to_torch_dtype(type: str):
return _STR_DTYPE_TO_TORCH_DTYPE.get(type)
# model_type -> reason
_FLOAT16_NOT_SUPPORTED_MODELS = {
"gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",

View File

@ -71,7 +71,11 @@ class EngineClient(ABC):
truncate_prompt_tokens: int | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from a pooling model."""
"""Generate outputs for a request from a pooling model.
NOTE: truncate_prompt_tokens is deprecated in v0.14.
TODO: Remove this argument in v0.15.
"""
...
@abstractmethod

View File

@ -51,6 +51,9 @@ from vllm.entrypoints.openai.protocol import (
ToolCall,
UsageInfo,
)
from vllm.entrypoints.openai.serving_chat_stream_harmony import (
extract_harmony_streaming_delta,
)
from vllm.entrypoints.openai.serving_engine import (
GenerationError,
OpenAIServing,
@ -253,18 +256,31 @@ class OpenAIServingChat(OpenAIServing):
truncate_tool_call_ids(request)
validate_request_params(request)
if (
request.tool_choice == "auto"
and not (self.enable_auto_tools and tool_parser is not None)
# Check if tool parsing is unavailable (common condition)
tool_parsing_unavailable = (
tool_parser is None
and not isinstance(tokenizer, MistralTokenizer)
and not self.use_harmony
)
# Validate tool_choice when tool parsing is required but unavailable
if tool_parsing_unavailable and request.tool_choice not in (
None,
"none",
):
# for hf tokenizers, "auto" tools requires
# --enable-auto-tool-choice and --tool-call-parser
return self.create_error_response(
'"auto" tool choice requires '
"--enable-auto-tool-choice and --tool-call-parser to be set"
)
if request.tool_choice == "auto" and not self.enable_auto_tools:
# for hf tokenizers, "auto" tools requires
# --enable-auto-tool-choice and --tool-call-parser
return self.create_error_response(
'"auto" tool choice requires '
"--enable-auto-tool-choice and --tool-call-parser to be set"
)
elif request.tool_choice != "auto":
# "required" or named tool requires tool parser
return self.create_error_response(
f'tool_choice="{request.tool_choice}" requires '
"--tool-call-parser to be set"
)
if request.tools is None or (
request.tool_choice == "none"
@ -299,7 +315,10 @@ class OpenAIServingChat(OpenAIServing):
)
else:
# For GPT-OSS.
conversation, engine_prompts = self._make_request_with_harmony(request)
should_include_tools = tool_dicts is not None
conversation, engine_prompts = self._make_request_with_harmony(
request, should_include_tools
)
except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(f"{e} {e.__cause__}")
@ -792,6 +811,11 @@ class OpenAIServingChat(OpenAIServing):
delta_text += harmony_parser.last_content_delta or ""
cur_channel = harmony_parser.current_channel
cur_recipient = harmony_parser.current_recipient
                    # Handle the case where several tokens were generated at
                    # once, including the final token, leaving delta text
                    # while the current channel is still empty (start state)
if not cur_channel and delta_text:
cur_channel = "final"
else:
delta_text = output.text
@ -821,64 +845,17 @@ class OpenAIServingChat(OpenAIServing):
current_token_ids = as_list(output.token_ids)
if self.use_harmony:
if cur_channel == "final":
delta_message = DeltaMessage(content=delta_text)
elif cur_channel == "analysis":
if request.include_reasoning:
delta_message = DeltaMessage(reasoning=delta_text)
else:
delta_message = None
elif (
cur_channel == "commentary"
and cur_recipient
and cur_recipient.startswith("functions.")
):
# Count completed tool calls to determine index
base_index = 0
for msg in harmony_parser.messages:
if (
msg.channel == "commentary"
and msg.recipient
and msg.recipient.startswith("functions.")
):
base_index += 1
if prev_recipient != cur_recipient:
tool_name = cur_recipient.split("functions.", 1)[1]
delta_message = DeltaMessage(
tool_calls=[
DeltaToolCall(
id=make_tool_call_id(),
type="function",
function=DeltaFunctionCall(
name=tool_name,
arguments="",
),
index=base_index,
)
]
)
elif delta_text:
delta_message = DeltaMessage(
tool_calls=[
DeltaToolCall(
index=base_index,
function=DeltaFunctionCall(
arguments=delta_text
),
)
]
)
else:
delta_message = None
if delta_message is not None:
harmony_tools_streamed[i] = True
elif cur_channel == "commentary":
# Tool call preambles meant to be shown to the user
delta_message = DeltaMessage(content=delta_text)
else:
delta_message = None
delta_message, tools_streamed_flag = (
extract_harmony_streaming_delta(
harmony_parser=harmony_parser,
cur_channel=cur_channel,
cur_recipient=cur_recipient,
prev_recipient=prev_recipient,
delta_text=delta_text,
include_reasoning=request.include_reasoning,
)
)
harmony_tools_streamed[i] |= tools_streamed_flag
# handle streaming deltas for tools with named tool_choice
elif tool_choice_function_name:
if (
@ -1833,6 +1810,7 @@ class OpenAIServingChat(OpenAIServing):
def _make_request_with_harmony(
self,
request: ChatCompletionRequest,
should_include_tools: bool = True,
):
messages: list[OpenAIMessage] = []
@ -1850,13 +1828,16 @@ class OpenAIServingChat(OpenAIServing):
reasoning_effort=request.reasoning_effort,
browser_description=None,
python_description=None,
with_custom_tools=request.tools is not None,
with_custom_tools=should_include_tools,
)
messages.append(sys_msg)
# Add developer message.
dev_msg = get_developer_message(tools=request.tools)
messages.append(dev_msg)
if request.tools:
dev_msg = get_developer_message(
tools=request.tools if should_include_tools else None
)
messages.append(dev_msg)
# Add user message.
messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))

View File

@ -0,0 +1,101 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Harmony-specific streaming delta extraction for chat completions.
This module handles the extraction of DeltaMessage objects from
harmony parser state during streaming chat completions.
"""
from openai_harmony import StreamableParser
from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
DeltaFunctionCall,
DeltaMessage,
DeltaToolCall,
)
def extract_harmony_streaming_delta(
harmony_parser: StreamableParser,
cur_channel: str | None,
cur_recipient: str | None,
prev_recipient: str | None,
delta_text: str,
include_reasoning: bool,
) -> tuple[DeltaMessage | None, bool]:
"""
Extract a DeltaMessage from harmony parser state during streaming.
Args:
harmony_parser: The StreamableParser instance tracking parse state
cur_channel: Current channel ("final", "analysis", "commentary", etc.)
cur_recipient: Current recipient (e.g., "functions.my_func")
prev_recipient: Previous recipient for detecting tool call transitions
delta_text: The text delta to include in the message
include_reasoning: Whether to include reasoning content
Returns:
A tuple of (DeltaMessage or None, tools_streamed_flag)
"""
tools_streamed = False
if cur_channel == "final":
delta_message = DeltaMessage(content=delta_text)
elif (
(cur_channel == "commentary" or cur_channel == "analysis")
and cur_recipient
and cur_recipient.startswith("functions.")
):
# Count completed tool calls to determine index
base_index = 0
for msg in harmony_parser.messages:
if (
(msg.channel == "commentary" or msg.channel == "analysis")
and msg.recipient
and msg.recipient.startswith("functions.")
):
base_index += 1
if prev_recipient != cur_recipient:
tool_name = cur_recipient.split("functions.", 1)[1]
delta_message = DeltaMessage(
tool_calls=[
DeltaToolCall(
id=make_tool_call_id(),
type="function",
function=DeltaFunctionCall(
name=tool_name,
arguments="",
),
index=base_index,
)
]
)
elif delta_text:
delta_message = DeltaMessage(
tool_calls=[
DeltaToolCall(
index=base_index,
function=DeltaFunctionCall(arguments=delta_text),
)
]
)
else:
delta_message = None
if delta_message is not None:
tools_streamed = True
elif cur_channel == "commentary":
# Tool call preambles meant to be shown to the user
delta_message = DeltaMessage(content=delta_text)
elif cur_channel == "analysis":
if include_reasoning:
delta_message = DeltaMessage(reasoning=delta_text)
else:
delta_message = None
else:
delta_message = None
return delta_message, tools_streamed
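A minimal usage sketch of the helper above (mocked parser state; the "final" branch never inspects the parser, so a SimpleNamespace stand-in suffices):

from types import SimpleNamespace

delta, streamed = extract_harmony_streaming_delta(
    harmony_parser=SimpleNamespace(messages=[]),  # hypothetical stand-in
    cur_channel="final",
    cur_recipient=None,
    prev_recipient=None,
    delta_text="Hello",
    include_reasoning=True,
)
assert delta.content == "Hello" and streamed is False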

View File

@ -24,6 +24,7 @@ if TYPE_CHECKING:
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: str | None = None
VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
VLLM_ENGINE_READY_TIMEOUT_S: int = 600
VLLM_API_KEY: str | None = None
VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False
S3_ACCESS_KEY_ID: str | None = None
@ -604,6 +605,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
),
# Timeout in seconds for waiting for engine cores to become ready
# during startup. Default is 600 seconds (10 minutes).
"VLLM_ENGINE_READY_TIMEOUT_S": lambda: int(
os.environ.get("VLLM_ENGINE_READY_TIMEOUT_S", "600")
),
# API key for vLLM API server
"VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None),
# Whether to log responses from API Server for debugging
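For reference, a sketch of how a consumer reads the new variable — the seconds-to-milliseconds conversion mirrors the MPClient test above, but the exact call site here is an assumption:

import os
import zmq

timeout_s = int(os.environ.get("VLLM_ENGINE_READY_TIMEOUT_S", "600"))
sock = zmq.Context.instance().socket(zmq.PAIR)
# zmq's poll() takes milliseconds, hence the * 1000 asserted in the test
if not sock.poll(timeout=timeout_s * 1000):
    raise TimeoutError(f"Engine cores not ready after {timeout_s}s")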

View File

@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
UnquantizedFusedMoEMethod,
)
from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import (
ZeroExpertFusedMoE,
)
from vllm.triton_utils import HAS_TRITON
_config: dict[str, Any] | None = None
@ -54,6 +57,7 @@ __all__ = [
"FusedMoEPrepareAndFinalize",
"RoutingMethodType",
"SharedFusedMoE",
"ZeroExpertFusedMoE",
"activation_without_mul",
"override_config",
"get_config",

View File

@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
OCP_MX_Scheme,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_triton_kernels
from vllm.utils.math_utils import cdiv
@ -39,6 +40,7 @@ if has_triton_kernels():
def _get_config_dtype_str(
dtype: torch.dtype,
use_fp8_w8a8: bool = False,
use_fp8_w8a16: bool = False,
use_int8_w8a16: bool = False,
use_int4_w4a16: bool = False,
ocp_mx_scheme: str | None = None,
@ -50,6 +52,8 @@ def _get_config_dtype_str(
"""
if use_fp8_w8a8:
return "fp8_w8a8"
elif use_fp8_w8a16:
return "fp8_w8a16"
elif use_int8_w8a16:
return "int8_w8a16"
elif use_int4_w4a16:
@ -319,6 +323,10 @@ class FusedMoEQuantConfig:
def use_int8_w8a16(self) -> bool:
return self._a1.dtype is None and self._w1.dtype == torch.int8
@property
def use_fp8_w8a16(self) -> bool:
return self._a1.dtype is None and self._w1.dtype == current_platform.fp8_dtype()
@property
def use_int4_w4a16(self) -> bool:
return self._a1.dtype is None and self._w1.dtype == "int4"
@ -362,6 +370,7 @@ class FusedMoEQuantConfig:
"""
return _get_config_dtype_str(
use_fp8_w8a8=self.use_fp8_w8a8,
use_fp8_w8a16=self.use_fp8_w8a16,
use_int8_w8a16=self.use_int8_w8a16,
use_int4_w4a16=self.use_int4_w4a16,
ocp_mx_scheme=self.ocp_mx_scheme,
@ -680,7 +689,6 @@ def int4_w4a16_moe_quant_config(
) -> FusedMoEQuantConfig:
"""
Construct a quant config for 16-bit float activations and int4 weights.
Note: Activations are pre-quantized.
"""
group_shape = GroupShape(*block_shape) if block_shape is not None else None
return FusedMoEQuantConfig(
@ -691,6 +699,27 @@ def int4_w4a16_moe_quant_config(
)
def fp8_w8a16_moe_quant_config(
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
block_shape: list[int] | None = None,
) -> FusedMoEQuantConfig:
"""
Construct a quant config for 16-bit float activations and fp8 weights.
"""
group_shape = GroupShape(*block_shape) if block_shape is not None else None
return FusedMoEQuantConfig(
_a1=FusedMoEQuantDesc(),
_a2=FusedMoEQuantDesc(),
_w1=FusedMoEQuantDesc(
current_platform.fp8_dtype(), group_shape, w1_scale, None, None
),
_w2=FusedMoEQuantDesc(
current_platform.fp8_dtype(), group_shape, w2_scale, None, None
),
)
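# Example (hypothetical scale tensors s1, s2):
#   cfg = fp8_w8a16_moe_quant_config(w1_scale=s1, w2_scale=s2, block_shape=[128, 128])
#   cfg.use_fp8_w8a16  # -> True: fp8 weights with unquantized 16-bit activations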
def int8_w8a16_moe_quant_config(
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
@ -700,7 +729,6 @@ def int8_w8a16_moe_quant_config(
) -> FusedMoEQuantConfig:
"""
Construct a quant config for 16-bit float activations and int8 weights.
Note: Activations are pre-quantized.
"""
group_shape = GroupShape(*block_shape) if block_shape is not None else None
return FusedMoEQuantConfig(

View File

@ -21,7 +21,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate,
TopKWeightAndReduceNoOP,
)
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize, _resize_cache
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.scalar_type import scalar_types
logger = init_logger(__name__)
@ -896,162 +896,6 @@ def cutlass_moe_fp4(
)
def _valid_cutlass_block_scaled_grouped_gemm(
w1: torch.Tensor,
w2: torch.Tensor,
inplace: bool,
activation: str,
apply_router_weight_on_input: bool,
expert_map: torch.Tensor | None,
) -> bool:
def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int):
return N % 128 == 0 and K % 128 == 0
_, K, N = w2.size()
if not _valid_cutlass_block_scaled_grouped_gemm_shape(N, K):
logger.debug_once(
"CutlassBlockScaledGroupedGemm disabled: unaligned problem size. "
"N: %s, K: %s",
N,
K,
)
return False
if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn:
logger.debug_once(
"CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s). "
"w1.dtype: %s, w2.dtype: %s",
w1.dtype,
w2.dtype,
)
return False
if expert_map is not None:
logger.debug_once(
"CutlassBlockScaledGroupedGemm disabled: expert_parallel is not supported."
)
return False
if activation != "silu":
logger.debug_once(
"CutlassBlockScaledGroupedGemm disabled: only activation silu is supported."
)
return False
if apply_router_weight_on_input:
logger.debug_once(
"CutlassBlockScaledGroupedGemm disabled:"
" apply_router_weight_on_input is not supported."
)
return False
if inplace:
logger.debug_once(
"CutlassBlockScaledGroupedGemm disabled: inplace is not supported."
)
return False
return True
# TODO(bnell): would be nice combine/integrate with regular cutlass_fp8.
def run_cutlass_block_scaled_fused_experts(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
) -> torch.Tensor:
w1_q = w1.transpose(1, 2)
w2_q = w2.transpose(1, 2)
w1_scale = w1_scale.transpose(1, 2)
w2_scale = w2_scale.transpose(1, 2)
assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
assert a.shape[0] == topk_ids.shape[0], (
"a and topk_ids must have the same batch size"
)
assert w1_q.dtype == torch.float8_e4m3fn, "w1_q must be float8_e4m3fn"
assert w2_q.dtype == torch.float8_e4m3fn, "w2_q must be float8_e4m3fn"
assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1"
assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2"
assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
assert w1_q.shape[0] == w1_scale.shape[0], "w1_scale expert number mismatch"
assert w1_q.shape[0] == w2_scale.shape[0], "w2_scale expert number mismatch"
assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
out_dtype = a.dtype
num_experts = w1_q.size(0)
m = a.size(0)
k = w1_q.size(1)
n = w2_q.size(1)
topk = topk_ids.size(1)
a_q, a1_scale = _fp8_quantize(
a, A_scale=None, per_act_token=False, block_shape=[128, 128]
)
device = a_q.device
expert_offsets = torch.empty((num_experts + 1,), dtype=torch.int32, device=device)
problem_sizes1 = torch.empty((num_experts, 3), dtype=torch.int32, device=device)
problem_sizes2 = torch.empty((num_experts, 3), dtype=torch.int32, device=device)
a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
ops.get_cutlass_moe_mm_data(
topk_ids,
expert_offsets,
problem_sizes1,
problem_sizes2,
a_map,
c_map,
num_experts,
n,
k,
)
rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype)
rep_a1_scales = a1_scale[a_map]
c1 = torch.empty((m * topk, n * 2), dtype=out_dtype, device=device)
c2 = torch.empty((m * topk, k), dtype=out_dtype, device=device)
ops.cutlass_blockwise_scaled_grouped_mm(
c1,
rep_a_q,
w1_q,
rep_a1_scales,
w1_scale,
problem_sizes1,
expert_offsets[:-1],
)
intermediate = torch.empty((m * topk, n), dtype=out_dtype, device=device)
torch.ops._C.silu_and_mul(intermediate, c1)
intermediate_q, a2_scale = _fp8_quantize(
intermediate, A_scale=None, per_act_token=False, block_shape=[128, 128]
)
ops.cutlass_blockwise_scaled_grouped_mm(
c2,
intermediate_q,
w2_q,
a2_scale,
w2_scale,
problem_sizes2,
expert_offsets[:-1],
)
return (
c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype)
).sum(dim=1)
# W4A8
def run_cutlass_moe_w4a8_fp8(
output: torch.Tensor,

View File

@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
batched_moe_align_block_size,
moe_align_block_size,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate,
TopKWeightAndReduceNoOP,
@ -26,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_moe_intermediate_size,
marlin_quant_input,
)
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
@ -542,9 +540,11 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
is_k_full: bool = True,
):
# TODO (varun) : Enable activation quantization
assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, (
"Supports only mxfp4_w4a16 or int4_w4a16"
)
assert (
quant_config.use_mxfp4_w4a16
or quant_config.use_int4_w4a16
or quant_config.use_fp8_w8a16
), "Supports only mxfp4_w4a16, int4_w4a16 or fp8_w8a16"
self.w13_g_idx = w13_g_idx
self.w2_g_idx = w2_g_idx
self.w13_g_idx_sort_indices = w13_g_idx_sort_indices
@ -555,11 +555,17 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
@property
def quant_type_id(self) -> int:
# uint4b8 will be set for int4 weight and float4_e2m1f will be used for mxfp4
return (
scalar_types.uint4b8.id
if self.quant_config.use_int4_w4a16
else scalar_types.float4_e2m1f.id
)
if self.quant_config.use_int4_w4a16:
return scalar_types.uint4b8.id
elif self.quant_config.use_mxfp4_w4a16:
return scalar_types.float4_e2m1f.id
elif (
self.quant_config.use_fp8_w8a16
and current_platform.fp8_dtype() == torch.float8_e4m3fn
):
return scalar_types.float8_e4m3fn.id
else:
raise NotImplementedError("Unsupported quantization type.")
def moe_problem_size(
self,
@ -711,16 +717,6 @@ class MarlinExperts(MarlinExpertsBase):
ops.moe_sum(input, output)
def modular_marlin_fused_moe(
quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
) -> mk.FusedMoEModularKernel:
return mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
MarlinExperts(quant_config),
shared_experts,
)
class BatchedMarlinExperts(MarlinExpertsBase):
def __init__(
self,

View File

@ -25,10 +25,6 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
_get_config_dtype_str,
)
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
_valid_cutlass_block_scaled_grouped_gemm,
run_cutlass_block_scaled_fused_experts,
)
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
_valid_deep_gemm,
deep_gemm_moe_fp8,
@ -1678,11 +1674,9 @@ def fused_experts(
expert_map: torch.Tensor | None = None,
quant_config: FusedMoEQuantConfig | None = None,
allow_deep_gemm: bool = False,
allow_cutlass_block_scaled_grouped_gemm: bool = False,
) -> torch.Tensor:
if quant_config is None:
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
use_fp8_w8a8 = quant_config.use_fp8_w8a8
# For now, disable DeepGemm for small N (<= 512) until better
# permute/unpermute ops are available.
@ -1712,23 +1706,6 @@ def fused_experts(
a2_scale=quant_config.a2_scale,
apply_router_weight_on_input=apply_router_weight_on_input,
)
elif (
allow_cutlass_block_scaled_grouped_gemm
and use_fp8_w8a8
and _valid_cutlass_block_scaled_grouped_gemm(
w1, w2, inplace, activation, apply_router_weight_on_input, expert_map
)
):
assert quant_config is not None
return run_cutlass_block_scaled_fused_experts(
a=hidden_states,
w1=w1,
w2=w2,
w1_scale=quant_config.w1_scale,
w2_scale=quant_config.w2_scale,
topk_weights=topk_weights,
topk_ids=topk_ids,
)
else:
return dispatch_fused_experts_func(inplace)(
hidden_states=hidden_states,

View File

@ -92,7 +92,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -110,10 +110,4 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
expert_map=None if self.disable_expert_map else layer.expert_map,
)
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
return result, zero_expert_result
else:
return result
return result

View File

@ -32,7 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
RoutingMethodType,
)
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
init_aiter_topK_meta_data,
)
@ -350,8 +349,6 @@ class FusedMoE(CustomOp):
num_redundant_experts: int = 0,
has_bias: bool = False,
is_sequence_parallel=False,
zero_expert_num: int | None = 0,
zero_expert_type: str | None = None,
expert_mapping: list[tuple[str, str, int, str]] | None = None,
n_shared_experts: int | None = None,
routing_method_type: int | None = None,
@ -409,8 +406,6 @@ class FusedMoE(CustomOp):
self.global_num_experts = num_experts + num_redundant_experts
self.logical_num_experts = num_experts
self.zero_expert_num = zero_expert_num
self.zero_expert_type = zero_expert_type
# Expert mapping used in self.load_weights
self.expert_mapping = expert_mapping
@ -1525,15 +1520,15 @@ class FusedMoE(CustomOp):
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Route the input hidden states to the top-k experts based on the
router logits.
Returns:
(topk_weights, topk_ids, zero_expert_result)
(tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
The weights, expert ids, and zero expert computation result.
(topk_weights, topk_ids)
(tuple[torch.Tensor, torch.Tensor]):
The weights and expert ids.
**Compatibility**: When EPLB is not enabled, the returned ids are
equivalent to global logical ids, so should be compatible with
@ -1655,23 +1650,7 @@ class FusedMoE(CustomOp):
assert topk_ids.dtype == indices_type or indices_type is None
# Compute zero expert result if needed
if (
self.zero_expert_num is not None
and self.zero_expert_num > 0
and self.zero_expert_type is not None
and self.global_num_experts is not None
):
zero_expert_result = zero_experts_compute_triton(
expert_indices=topk_ids,
expert_scales=topk_weights,
num_experts=self.global_num_experts,
zero_expert_type=self.zero_expert_type,
hidden_states=hidden_states,
)
else:
zero_expert_result = None
return topk_weights, topk_ids, zero_expert_result
return topk_weights, topk_ids
def must_reduce_shared_expert_outputs(self) -> bool:
"""
@ -1736,14 +1715,7 @@ class FusedMoE(CustomOp):
fused_output = torch.ops.vllm.moe_forward(
hidden_states, router_logits, self.layer_name
)
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(fused_output, tuple)
fused_output, zero_expert_result = fused_output
return (reduce_output(fused_output) + zero_expert_result)[
..., :og_hidden_states
]
else:
return reduce_output(fused_output)[..., :og_hidden_states]
return reduce_output(fused_output)[..., :og_hidden_states]
else:
if current_platform.is_tpu() or current_platform.is_cpu():
# TODO: Once the OOM issue for the TPU backend is resolved, we
@ -1841,13 +1813,6 @@ class FusedMoE(CustomOp):
final_hidden_states,
)
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, tuple)
assert self.shared_experts is None
final_hidden_states, zero_expert_result = final_hidden_states
if zero_expert_result is not None:
final_hidden_states += zero_expert_result
if not skip_result_store:
if self.shared_experts is None:
full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
@ -2030,9 +1995,6 @@ class FusedMoE(CustomOp):
shared_output,
final_hidden_states,
)
elif self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, tuple)
final_hidden_states, zero_expert_result = final_hidden_states
def combine_output(states: torch.Tensor) -> torch.Tensor:
if do_naive_dispatch_combine:
@ -2051,9 +2013,6 @@ class FusedMoE(CustomOp):
final_hidden_states[0],
combine_output(final_hidden_states[1]),
)
elif self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, torch.Tensor)
return (combine_output(final_hidden_states), zero_expert_result)
else:
return combine_output(final_hidden_states)

View File

@ -6,6 +6,7 @@ import torch
import torch.nn.functional as F
import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._aiter_ops import rocm_aiter_ops
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
@ -23,6 +24,9 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEPermuteExpertsUnpermute,
FusedMoEPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.platforms.interface import CpuArchEnum
@ -30,9 +34,9 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
if current_platform.is_cuda_alike():
from .fused_batched_moe import BatchedTritonExperts
from .fused_moe import TritonExperts, fused_experts
from .fused_moe import TritonExperts
else:
fused_experts = None # type: ignore
TritonExperts = None # type: ignore
if current_platform.is_tpu():
from .moe_pallas import fused_moe as fused_moe_pallas
@ -265,6 +269,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
else:
layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
elif current_platform.is_cuda_alike():
self.moe_quant_config = self.get_fused_moe_quant_config(layer)
self.kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
TritonExperts(self.moe_quant_config),
shared_experts=None,
)
def apply(
self,
@ -278,9 +289,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
router_logits=router_logits,
)
def get_fused_moe_quant_config(
self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
if self.moe.has_bias:
return biased_moe_quant_config(
layer.w13_bias,
@ -295,7 +304,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -322,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
apply_router_weight_on_input=layer.apply_router_weight_on_input,
)
else:
result = fused_experts(
result = self.kernel(
hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
@ -330,19 +339,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
topk_ids=topk_ids,
inplace=True,
activation=layer.activation,
quant_config=self.moe_quant_config,
apply_router_weight_on_input=layer.apply_router_weight_on_input,
global_num_experts=layer.global_num_experts,
expert_map=layer.expert_map,
)
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
return result, zero_expert_result
else:
return result
return result
def forward_cpu(
self,

View File

@ -0,0 +1,189 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import contextmanager
import torch
from torch import nn
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
class ZeroExpertFusedMoE(FusedMoE):
"""
A FusedMoE operation that also computes the results of zero experts.
Zero experts perform identity operations (scaled pass-through) instead
of full MLP computations.
This class uses memoization to avoid redundant routing computation:
routing is computed once and reused for both zero expert computation
and the main FusedMoE forward pass.
"""
def __init__(
self,
zero_expert_num: int,
zero_expert_type: str,
router: nn.Module,
**kwargs,
):
# ZeroExpertFusedMoE manages its own custom_routing_function for memoization
assert (
"custom_routing_function" not in kwargs
or kwargs.get("custom_routing_function") is None
), (
"ZeroExpertFusedMoE does not support external custom_routing_function. "
"It manages its own for routing memoization."
)
# Automatically slice router's e_score_correction_bias to only include
# real experts (not zero_experts) for the base FusedMoE.
# The full bias will be used temporarily in forward() for routing.
if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs:
num_real_experts = kwargs["num_experts"]
router_bias = router.e_score_correction_bias
user_bias = kwargs.get("e_score_correction_bias")
# Use router's bias if:
# 1. User didn't provide bias, or
# 2. User provided full bias (same size as router)
if user_bias is None or user_bias.shape[0] == router_bias.shape[0]:
kwargs["e_score_correction_bias"] = router_bias[:num_real_experts]
# FusedMoE no longer accepts zero_expert_num/zero_expert_type.
# We handle zero experts ourselves in forward().
super().__init__(**kwargs)
# Store the actual zero_expert_num and zero_expert_type for our own use
self._actual_zero_expert_num = zero_expert_num
self._actual_zero_expert_type = zero_expert_type
self._router = router # Full router (includes zero experts)
# Expose zero_expert_num and zero_expert_type as attributes for
# compatibility with quantization methods that check these attributes
self.zero_expert_num = 0
self.zero_expert_type = None
# Memoization state for routing results
self._memoized_topk_weights: torch.Tensor | None = None
self._memoized_topk_ids: torch.Tensor | None = None
# Create custom_routing_function to reuse memoized routing results
def custom_routing_function(hidden_states, gating_output, topk, renormalize):
"""Return memoized `topk_weights` and `topk_ids`."""
if self._memoized_topk_weights is None or self._memoized_topk_ids is None:
raise RuntimeError(
"ZeroExpertFusedMoE: routing results not memoized. "
"Call select_experts first to compute routing."
)
return self._memoized_topk_weights, self._memoized_topk_ids
self.custom_routing_function = custom_routing_function
@contextmanager
def _temporarily_set_attrs(self, **attrs):
"""
Temporarily set attributes using object.__setattr__ and restore them.
This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues.
When PyTorch Dynamo traces the forward pass, it cannot handle
nn.Module.__setattr__ calls (which include parameter registration logic),
resulting in "Unsupported" errors. Using object.__setattr__ directly
sets the attribute without triggering nn.Module's custom __setattr__,
allowing Dynamo to trace the code successfully.
"""
originals = {key: getattr(self, key) for key in attrs}
try:
for key, value in attrs.items():
object.__setattr__(self, key, value)
yield
finally:
for key, value in originals.items():
object.__setattr__(self, key, value)
def _compute_zero_expert_result(
self,
hidden_states: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
) -> torch.Tensor | None:
"""Compute zero expert results using pre-computed routing."""
if (
self._actual_zero_expert_num is None
or self._actual_zero_expert_num <= 0
or self._actual_zero_expert_type is None
):
return None
return zero_experts_compute_triton(
expert_indices=topk_ids.clone(),
expert_scales=topk_weights.clone(),
num_experts=self.logical_num_experts,
zero_expert_type=self._actual_zero_expert_type,
hidden_states=hidden_states,
)
def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor, # Full logits including zero experts
) -> torch.Tensor:
"""
Forward pass with zero expert support and routing memoization.
Args:
hidden_states: Input hidden states
router_logits: Full router logits (including zero experts)
Returns:
Combined output from real experts and zero experts
"""
# Prepare temporary attribute overrides for routing computation
temp_attrs = {
"custom_routing_function": None, # Disable for first routing
}
if self._router is not None:
temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias
# Compute routing with temporary attributes
# Pass full router_logits (including zero experts) so that zero experts
# can be properly identified in topk_ids
with self._temporarily_set_attrs(**temp_attrs):
topk_weights, topk_ids = self.select_experts(
hidden_states=hidden_states,
router_logits=router_logits, # Full logits (includes zero experts)
)
# Compute zero expert result if needed
zero_expert_result = self._compute_zero_expert_result(
hidden_states=hidden_states,
topk_weights=topk_weights,
topk_ids=topk_ids,
)
# Memoize routing results for reuse in super().forward()
self._memoized_topk_weights = topk_weights
self._memoized_topk_ids = topk_ids
# Slice router_logits for real experts only
router_logits_sliced = router_logits[..., : self.logical_num_experts]
# Compute real expert results (will reuse memoized routing via
# custom_routing_function)
# zero_expert_num is already 0, so FusedMoE won't handle zero experts
fused_out = super().forward(
hidden_states=hidden_states,
router_logits=router_logits_sliced,
)
# Combine results
# Both zero_expert_result and fused_out are computed from the same
# hidden_states, so they should be on the same device.
if zero_expert_result is not None:
fused_out = fused_out + zero_expert_result
# Clear memoization after use
self._memoized_topk_weights = None
self._memoized_topk_ids = None
return fused_out
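For intuition (this mirrors the docstring's description, not the Triton kernel itself): a zero expert contributes an identity pass-through scaled by its routing weight, so the extra term per token is the sum of its zero-expert weights times the hidden states. A minimal reference sketch, assuming expert ids at or above `num_real_experts` denote zero experts:

```python
import torch

num_real_experts = 2
hidden = torch.randn(3, 8)                         # 3 tokens, hidden size 8
topk_ids = torch.tensor([[0, 2], [1, 0], [2, 1]])  # id 2 = zero expert
topk_weights = torch.rand(3, 2)

# Sum routing weights that landed on zero experts, then scale the input
# (identity "MLP") by that per-token sum.
zero_mask = topk_ids >= num_real_experts
zero_scale = (topk_weights * zero_mask).sum(dim=-1, keepdim=True)
zero_expert_result = zero_scale * hidden
```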

View File

@ -53,6 +53,8 @@ WEIGHT_LOADER_V2_SUPPORTED = [
"GPTQLinearMethod",
"FBGEMMFp8LinearMethod",
"ModelOptFp8LinearMethod",
"ModelOptFp8PcPtLinearMethod",
"ModelOptFp8PbWoLinearMethod",
"IPEXAWQLinearMethod",
"IPEXGPTQLinearMethod",
"HQQMarlinMethod",
@ -277,6 +279,7 @@ class LinearBase(CustomOp):
self.params_dtype = params_dtype
self.quant_config = quant_config
self.prefix = prefix
self.allow_fp8_block_shape_mismatch = False
if quant_config is None:
self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod()
else:
@ -475,6 +478,7 @@ class ColumnParallelLinear(LinearBase):
disable_tp=disable_tp,
)
self._maybe_allow_fp8_block_shape_mismatch()
self.gather_output = gather_output
if output_sizes is None:
@ -509,6 +513,33 @@ class ColumnParallelLinear(LinearBase):
self.register_parameter("bias", None)
self.update_param_tp_status()
def _maybe_allow_fp8_block_shape_mismatch(self) -> None:
quant_config = getattr(self, "quant_config", None)
weight_block = getattr(quant_config, "weight_block_size", None)
if (
weight_block is None
or len(weight_block) < 1
or len(self.output_partition_sizes) <= 1
):
return
try:
block_n = int(weight_block[0])
except (ValueError, TypeError):
return
if block_n <= 0:
return
if any(size % block_n != 0 for size in self.output_partition_sizes):
self.allow_fp8_block_shape_mismatch = True
logger.debug(
"Allowing FP8 block shape mismatch for %s (block_n=%d, partitions=%s)",
getattr(self, "prefix", "<unknown>"),
block_n,
self.output_partition_sizes,
)
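As an illustration (sizes are hypothetical), the detection above fires when any fused output partition is not a multiple of the weight block's first dimension:

```python
# Hypothetical fused layer: block_n = 128, fused partitions of 576 and 64.
block_n = 128
output_partition_sizes = [576, 64]
mismatch = any(size % block_n != 0 for size in output_partition_sizes)
assert mismatch  # 576 % 128 == 64, so strict validation must be relaxed
```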
def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
output_dim = getattr(param, "output_dim", None)
@ -906,9 +937,11 @@ class QKVParallelLinear(ColumnParallelLinear):
*,
return_bias: bool = True,
disable_tp: bool = False,
v_head_size: int | None = None,
):
self.hidden_size = hidden_size
self.head_size = head_size
self.v_head_size = v_head_size if v_head_size is not None else head_size
self.total_num_heads = total_num_heads
if total_num_kv_heads is None:
total_num_kv_heads = total_num_heads
@ -924,12 +957,14 @@ class QKVParallelLinear(ColumnParallelLinear):
self.num_kv_head_replicas = 1
input_size = self.hidden_size
output_size = (
(self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size
)
self.num_heads * self.head_size
+ self.num_kv_heads * self.head_size
+ self.num_kv_heads * self.v_head_size
) * tp_size
self.output_sizes = [
self.num_heads * self.head_size * tp_size, # q_proj
self.num_kv_heads * self.head_size * tp_size, # k_proj
self.num_kv_heads * self.head_size * tp_size, # v_proj
self.num_kv_heads * self.v_head_size * tp_size, # v_proj
]
super().__init__(
@ -950,7 +985,8 @@ class QKVParallelLinear(ColumnParallelLinear):
"q": 0,
"k": self.num_heads * self.head_size,
"v": (self.num_heads + self.num_kv_heads) * self.head_size,
"total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
"total": (self.num_heads + self.num_kv_heads) * self.head_size
+ self.num_kv_heads * self.v_head_size,
}
return shard_offset_mapping.get(loaded_shard_id)
@ -958,7 +994,7 @@ class QKVParallelLinear(ColumnParallelLinear):
shard_size_mapping = {
"q": self.num_heads * self.head_size,
"k": self.num_kv_heads * self.head_size,
"v": self.num_kv_heads * self.head_size,
"v": self.num_kv_heads * self.v_head_size,
}
return shard_size_mapping.get(loaded_shard_id)
@ -985,7 +1021,7 @@ class QKVParallelLinear(ColumnParallelLinear):
(
"v",
(self.total_num_heads + self.total_num_kv_heads) * self.head_size,
self.total_num_kv_heads * self.head_size,
self.total_num_kv_heads * self.v_head_size,
),
]
@ -1110,7 +1146,7 @@ class QKVParallelLinear(ColumnParallelLinear):
(
"v",
(self.total_num_heads + self.total_num_kv_heads) * self.head_size,
self.total_num_kv_heads * self.head_size,
self.total_num_kv_heads * self.v_head_size,
),
]
use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
@ -1139,11 +1175,12 @@ class QKVParallelLinear(ColumnParallelLinear):
"v": (
(self.total_num_heads + self.total_num_kv_heads)
* self.head_size,
self.total_num_kv_heads * self.head_size,
self.total_num_kv_heads * self.v_head_size,
),
"total": (
(self.total_num_heads + 2 * self.total_num_kv_heads)
* self.head_size,
(self.total_num_heads + self.total_num_kv_heads)
* self.head_size
+ self.total_num_kv_heads * self.v_head_size,
0,
),
}
@ -1170,7 +1207,7 @@ class QKVParallelLinear(ColumnParallelLinear):
shard_size = self.num_kv_heads * self.head_size
elif loaded_shard_id == "v":
shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size
shard_size = self.num_kv_heads * self.head_size
shard_size = self.num_kv_heads * self.v_head_size
# Special case for Quantized Weights.
# If quantized, we need to adjust the offset and size to account
# for the packing.
@ -1199,10 +1236,11 @@ class QKVParallelLinear(ColumnParallelLinear):
),
"v": (
(self.num_heads + self.num_kv_heads) * self.head_size,
self.num_kv_heads * self.head_size,
self.num_kv_heads * self.v_head_size,
),
"total": (
(self.num_heads + 2 * self.num_kv_heads) * self.head_size,
(self.num_heads + self.num_kv_heads) * self.head_size
+ self.num_kv_heads * self.v_head_size,
0,
),
}
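With a distinct value head size (e.g. layouts where V heads are narrower or wider than Q/K heads), the shard bookkeeping above works out as in this small sketch (numbers are illustrative):

```python
num_heads, num_kv_heads = 8, 2
head_size, v_head_size = 64, 128  # V heads wider than Q/K heads

q_size = num_heads * head_size        # 512
k_size = num_kv_heads * head_size     # 128
v_size = num_kv_heads * v_head_size   # 256

offsets = {"q": 0, "k": q_size, "v": q_size + k_size}
total = q_size + k_size + v_size      # 896 = the "total" entry above
```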

View File

@ -764,7 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert layer.activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -500,7 +500,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -574,7 +574,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
e_score_correction_bias=layer.e_score_correction_bias,
)
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1166,7 +1166,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1403,7 +1403,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1765,7 +1765,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
f"{layer.activation} not supported for Marlin MoE."
)
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1991,7 +1991,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -2607,7 +2607,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
"EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet."
)
assert self.moe_quant_config is not None
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -61,7 +61,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
)
self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
if self.weight_block_size is not None:
assert not self.is_static_input_scheme

View File

@ -142,7 +142,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from enum import Enum
from functools import partial
from typing import TYPE_CHECKING, Any, Optional
import torch
@ -33,8 +32,8 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
RoutingMethodType,
fp8_w8a8_moe_quant_config,
fp8_w8a16_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
from vllm.model_executor.layers.linear import (
LinearBase,
@ -51,7 +50,6 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
FlashinferMoeBackend,
apply_flashinfer_per_tensor_scale_fp8,
build_flashinfer_fp8_cutlass_moe_prepare_finalize,
flashinfer_cutlass_moe_fp8,
get_flashinfer_moe_backend,
register_moe_scaling_factors,
rotate_flashinfer_fp8_moe_weights,
@ -97,7 +95,6 @@ from vllm.model_executor.parameter import (
)
from vllm.model_executor.utils import replace_parameter, set_weight_attrs
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.utils.deep_gemm import (
is_deep_gemm_e8m0_used,
is_deep_gemm_supported,
@ -118,20 +115,21 @@ class Fp8MoeBackend(Enum):
FLASHINFER_TRTLLM = 1
FLASHINFER_CUTLASS = 2
DEEPGEMM = 3
CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4
MARLIN = 5
TRITON = 6
MARLIN = 4
TRITON = 5
def get_fp8_moe_backend(
block_quant: bool,
moe_parallel_config: FusedMoEParallelConfig,
with_lora_support: bool,
) -> Fp8MoeBackend:
) -> Fp8MoeBackend | None:
"""
Select the primary FP8 MoE backend.
Note: Shape-specific fallbacks may still occur at runtime.
"""
if current_platform.is_xpu():
return None
if with_lora_support:
return Fp8MoeBackend.TRITON
# Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
@ -191,17 +189,6 @@ def get_fp8_moe_backend(
logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
return Fp8MoeBackend.DEEPGEMM
# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
if (
current_platform.is_cuda()
and current_platform.is_device_capability_family(100)
and block_quant
):
logger.info_once(
"Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
)
return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
# default to Triton
logger.info_once("Using Triton backend for FP8 MoE")
return Fp8MoeBackend.TRITON
@ -306,6 +293,13 @@ class Fp8Config(QuantizationConfig):
return UnquantizedLinearMethod()
return XPUFp8LinearMethod(fp8_config)
elif isinstance(layer, FusedMoE):
if is_layer_skipped(
prefix=prefix,
ignored_layers=self.ignored_layers,
fused_mapping=self.packed_modules_mapping,
):
return UnquantizedFusedMoEMethod(layer.moe_config)
return XPUFp8MoEMethod(fp8_config, layer)
elif isinstance(layer, Attention):
return Fp8KVCacheMethod(self)
@ -420,7 +414,7 @@ class Fp8LinearMethod(LinearMethodBase):
if vllm_is_batch_invariant():
self.use_marlin = False
self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
self.use_deep_gemm = is_deep_gemm_supported()
self.weight_block_size = self.quant_config.weight_block_size
@ -734,27 +728,33 @@ class Fp8MoEMethod(FusedMoEMethodBase):
)
self.marlin_input_dtype = None
self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM
elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS
if self.block_quant:
assert self.weight_block_size == [128, 128], (
f"Only support weight_block_size == [128, 128], "
f"got {self.weight_block_size}"
if self.block_quant and self.weight_block_size != [128, 128]:
raise NotImplementedError(
"FlashInfer CUTLASS FP8 MoE backend only supports block "
"size [128, 128]."
)
if not self.block_quant:
if layer.renormalize or layer.custom_routing_function is not None:
raise NotImplementedError(
"FlashInfer CUTLASS FP8 MoE backend does custom routing "
f"function or renormalization, but got {layer.renormalize} and "
f"{layer.custom_routing_function}."
)
if layer.scoring_func != "sigmoid":
raise NotImplementedError(
"FlashInfer CUTLASS FP8 MoE backend only supports "
f"'sigmoid' scoring function, but got {layer.scoring_func}."
)
if layer.activation != "silu":
raise NotImplementedError(
"FlashInfer CUTLASS FP8 MoE backend only supports SiLU "
"activation function, but got {layer.activation}."
)
self.flashinfer_moe_fn = partial(
flashinfer_cutlass_moe_fp8,
moe=self.moe,
use_deepseek_fp8_block_scale=self.block_quant,
)
self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
self.allow_cutlass_block_scaled_grouped_gemm = (
self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
)
def create_weights(
self,
@ -943,7 +943,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
# DeepGemm scales need to be transposed and aligned. We try to do
# it ahead of time for performance reasons.
if self.allow_deep_gemm:
if self.fp8_backend == Fp8MoeBackend.DEEPGEMM:
dg_w13_weight, dg_w13_weight_scale_inv = (
deepgemm_post_process_fp8_weight_block(
wq=layer.w13_weight.data,
@ -1046,7 +1046,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight)
layer.w13_weight.data = w13_weight.data
if self.use_marlin:
if self.fp8_backend == Fp8MoeBackend.MARLIN:
prepare_moe_fp8_layer_for_marlin(
layer, False, input_dtype=self.marlin_input_dtype
)
@ -1054,13 +1054,82 @@ class Fp8MoEMethod(FusedMoEMethodBase):
del layer.w13_input_scale
del layer.w2_input_scale
# NOTE(rob): this is a WIP refactor. We are first migrating
# all of the kernels in the TP case to use mk. Once this is
# done, then we will initialize the TP case and DP/EP case
# via the same code path (i.e. via maybe_init_modular_kernel).
# NOTE(rob): in progress migrating all into this format.
if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
FlashInferAllGatherMoEPrepareAndFinalize,
)
config = self.get_fused_moe_quant_config(layer)
assert config is not None
self.moe_quant_config = config
self.kernel = mk.FusedMoEModularKernel(
FlashInferAllGatherMoEPrepareAndFinalize(
use_dp=(self.moe.dp_size > 1),
use_deepseek_fp8_block_scale=self.block_quant,
),
FlashInferExperts(
out_dtype=torch.get_default_dtype(),
quant_config=self.moe_quant_config,
ep_rank=self.moe.ep_rank,
ep_size=self.moe.ep_size,
tp_rank=self.moe.tp_rank,
tp_size=self.moe.tp_size,
use_dp=(self.moe.dp_size > 1),
use_deepseek_fp8_block_scale=self.block_quant,
),
)
self.use_inplace = False
elif self.fp8_backend in [
Fp8MoeBackend.DEEPGEMM,
Fp8MoeBackend.TRITON,
Fp8MoeBackend.MARLIN,
]:
from vllm.model_executor.layers.fused_moe import (
TritonOrDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
MarlinExperts,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
config = self.get_fused_moe_quant_config(layer)
assert config is not None
self.moe_quant_config = config
use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
moe_kernel = (
MarlinExperts(quant_config=self.moe_quant_config)
if use_marlin
else TritonOrDeepGemmExperts(
quant_config=self.moe_quant_config,
allow_deep_gemm=allow_deep_gemm,
)
)
self.kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), moe_kernel
)
self.use_inplace = True
def maybe_make_prepare_finalize(
self,
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> mk.FusedMoEPrepareAndFinalize | None:
if (
self.rocm_aiter_moe_enabled
or self.use_marlin
or self.fp8_backend == Fp8MoeBackend.MARLIN
or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
return None
@ -1092,7 +1161,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
TritonOrDeepGemmExperts,
)
assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
assert (
self.fp8_backend != Fp8MoeBackend.MARLIN
) and not self.rocm_aiter_moe_enabled, (
"Marlin and ROCm AITER are not supported with all2all yet."
)
@ -1106,7 +1177,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
assert max_num_tokens_per_rank is not None
experts_impl = (
BatchedDeepGemmExperts if self.allow_deep_gemm else BatchedTritonExperts
BatchedDeepGemmExperts
if self.fp8_backend == Fp8MoeBackend.DEEPGEMM
else BatchedTritonExperts
)
logger.debug(
"%s(%s): max_tokens_per_rank=%s, block_size=%s, per_act_token=%s",
@ -1141,14 +1214,18 @@ class Fp8MoEMethod(FusedMoEMethodBase):
)
return TritonOrDeepGemmExperts(
quant_config=self.moe_quant_config,
allow_deep_gemm=self.allow_deep_gemm,
allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM),
)
def get_fused_moe_quant_config(
self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
if self.use_marlin:
return None
if self.fp8_backend == Fp8MoeBackend.MARLIN:
return fp8_w8a16_moe_quant_config(
w1_scale=layer.w13_weight_scale,
w2_scale=layer.w2_weight_scale,
block_shape=self.weight_block_size,
)
return fp8_w8a8_moe_quant_config(
w1_scale=(
@ -1179,6 +1256,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
# TODO(rob): convert this to MK.
if layer.enable_eplb:
raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.")
assert layer.activation == "silu", (
@ -1231,18 +1309,17 @@ class Fp8MoEMethod(FusedMoEMethodBase):
apply_router_weight_on_input=layer.apply_router_weight_on_input,
)
select_result = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
topk_weights, topk_ids, zero_expert_result = select_result
if self.rocm_aiter_moe_enabled:
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501
rocm_aiter_fused_experts,
)
# TODO(rob): convert this to MK.
result = rocm_aiter_fused_experts(
x,
layer.w13_weight,
@ -1254,80 +1331,21 @@ class Fp8MoEMethod(FusedMoEMethodBase):
expert_map=layer.expert_map,
quant_config=self.moe_quant_config,
)
elif self.use_marlin:
assert layer.activation == "silu", (
f"{layer.activation} not supported for Marlin MoE."
)
result = fused_marlin_moe(
else:
result = self.kernel(
x,
layer.w13_weight,
layer.w2_weight,
None,
None,
layer.w13_weight_scale,
layer.w2_weight_scale,
router_logits,
topk_weights,
topk_ids,
quant_type_id=scalar_types.float8_e4m3fn.id,
apply_router_weight_on_input=layer.apply_router_weight_on_input,
global_num_experts=layer.global_num_experts,
expert_map=layer.expert_map,
input_dtype=self.marlin_input_dtype,
workspace=layer.workspace,
)
elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
assert layer.activation == "silu", (
f"Expected 'silu' activation but got {layer.activation}"
)
if not self.block_quant:
assert (
not layer.renormalize and layer.custom_routing_function is not None
)
assert layer.scoring_func == "sigmoid", (
f"Expected 'sigmoid' scoring func but got {layer.scoring_func}"
)
# Delegate to CUTLASS FlashInfer path; function already bound with
# use_deepseek_fp8_block_scale for block-quant when applicable
result = self.flashinfer_moe_fn(
x,
layer,
topk_weights,
topk_ids,
inplace=False,
inplace=self.use_inplace,
activation=layer.activation,
global_num_experts=layer.global_num_experts,
expert_map=layer.expert_map,
apply_router_weight_on_input=layer.apply_router_weight_on_input,
)
else:
from vllm.model_executor.layers.fused_moe import fused_experts
result = fused_experts(
hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
topk_weights=topk_weights,
topk_ids=topk_ids,
inplace=True,
activation=layer.activation,
global_num_experts=layer.global_num_experts,
apply_router_weight_on_input=layer.apply_router_weight_on_input,
expert_map=layer.expert_map,
quant_config=self.moe_quant_config,
allow_deep_gemm=self.allow_deep_gemm,
allow_cutlass_block_scaled_grouped_gemm=(
self.allow_cutlass_block_scaled_grouped_gemm
),
)
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
return result, zero_expert_result
else:
return result
return result
class Fp8OnlineMoEMethod(Fp8MoEMethod):
@ -1471,7 +1489,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
replace_parameter(layer, "w2_weight", shuffled_w2)
# Reshuffle weights for MARLIN if needed.
if self.use_marlin:
if self.fp8_backend == Fp8MoeBackend.MARLIN:
prepare_moe_fp8_layer_for_marlin(
layer, False, input_dtype=self.marlin_input_dtype
)

View File

@ -639,7 +639,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
"fused GGUF MoE method."
)
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -900,7 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert layer.activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -51,7 +51,7 @@ class QuantFP8(CustomOp):
self.column_major_scales = column_major_scales
self.use_ue8m0 = use_ue8m0
self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled()
self.use_aiter = rocm_aiter_ops.is_linear_fp8_enabled()
self.is_group_quant = group_shape.is_per_group()
if self.is_group_quant:

View File

@ -6,13 +6,8 @@ from typing import Any, Optional
import torch
from packaging import version
from torch.nn import Module
from torch.nn.parameter import Parameter
from vllm._ipex_ops import ipex_ops as ops
from vllm.model_executor.layers.fused_moe import (
FusedMoEMethodBase,
FusedMoeWeightScaleSupported,
)
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.linear import (
LinearBase,
@ -24,14 +19,14 @@ from vllm.model_executor.layers.quantization import (
QuantizationMethods,
)
from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
from vllm.model_executor.layers.quantization.fp8 import (
Fp8Config,
Fp8LinearMethod,
Fp8OnlineMoEMethod,
)
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
maybe_create_device_identity,
)
from vllm.model_executor.parameter import ModelWeightParameter
from vllm.model_executor.utils import set_weight_attrs
from vllm.model_executor.utils import replace_parameter
from vllm.platforms import current_platform
MIN_IPEX_VERSION = "2.6.0"
@ -309,44 +304,15 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
def __init__(self, quant_config: Fp8Config):
super().__init__(quant_config)
def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: list[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
maybe_create_device_identity()
output_size_per_partition = sum(output_partition_sizes)
weight_loader = extra_weight_attrs.get("weight_loader")
layer.logical_widths = output_partition_sizes
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
layer.orig_dtype = params_dtype
layer.weight_block_size = None
weight = ModelWeightParameter(
data=torch.empty(
output_size_per_partition,
input_size_per_partition,
dtype=params_dtype,
),
input_dim=1,
output_dim=0,
weight_loader=weight_loader,
)
layer.register_parameter("weight", weight)
def process_weights_after_loading(self, layer: Module) -> None:
if getattr(layer, "_already_called_process_weights_after_loading", False):
return
# If checkpoint not serialized fp8, quantize the weights.
if not self.quant_config.is_checkpoint_fp8_serialized:
qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
# Update the layer with the new values.
layer.weight = Parameter(qweight, requires_grad=False)
layer.weight_scale = Parameter(weight_scale, requires_grad=False)
replace_parameter(layer, "weight", qweight.data)
replace_parameter(layer, "weight_scale", weight_scale.data)
layer.input_scale = None
def apply(
@ -363,69 +329,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
return output
class XPUFp8MoEMethod(FusedMoEMethodBase):
class XPUFp8MoEMethod(Fp8OnlineMoEMethod):
def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
super().__init__(layer.moe_config)
super().__init__(quant_config, layer)
self.quant_config = quant_config
def create_weights(
self,
layer: Module,
num_experts: int,
hidden_size: int,
intermediate_size_per_partition: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
layer.intermediate_size_per_partition = intermediate_size_per_partition
layer.hidden_size = hidden_size
layer.num_experts = num_experts
layer.orig_dtype = params_dtype
layer.weight_block_size = None
# WEIGHTS
w13_weight = torch.nn.Parameter(
torch.empty(
num_experts,
2 * intermediate_size_per_partition,
hidden_size,
dtype=params_dtype,
),
requires_grad=False,
)
layer.register_parameter("w13_weight", w13_weight)
set_weight_attrs(w13_weight, extra_weight_attrs)
w2_weight = torch.nn.Parameter(
torch.empty(
num_experts,
hidden_size,
intermediate_size_per_partition,
dtype=params_dtype,
),
requires_grad=False,
)
layer.register_parameter("w2_weight", w2_weight)
set_weight_attrs(w2_weight, extra_weight_attrs)
# Allocate 2 scales for w1 and w3 respectively.
# They will be combined to a single scale after weight loading.
w13_weight_scale = torch.nn.Parameter(
torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
)
w2_weight_scale = torch.nn.Parameter(
torch.ones(num_experts, dtype=torch.float32), requires_grad=False
)
layer.register_parameter("w13_weight_scale", w13_weight_scale)
layer.register_parameter("w2_weight_scale", w2_weight_scale)
extra_weight_attrs.update(
{"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
)
# INPUT_SCALES
layer.w13_input_scale = None
layer.w2_input_scale = None
def process_weights_after_loading(self, layer: Module) -> None:
if getattr(layer, "_already_called_process_weights_after_loading", False):
return
if not self.quant_config.is_checkpoint_fp8_serialized:
fp8_dtype = current_platform.fp8_dtype()
w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
@ -448,8 +359,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
)
layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
replace_parameter(layer, "w13_weight", w13_weight)
replace_parameter(layer, "w2_weight", w2_weight)
import intel_extension_for_pytorch as ipex
ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts

View File

@ -30,7 +30,7 @@ from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
class MarlinLinearKernel(MPLinearKernel):
@classmethod
def get_min_capability(cls) -> int:
return 80
return 75
@classmethod
def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:

View File

@ -55,6 +55,9 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
select_cutlass_fp8_gemm_impl,
swap_w13_to_w31,
)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
get_marlin_input_dtype,
)
@ -72,9 +75,15 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp,
cutlass_block_fp8_supported,
requantize_with_max_scale,
)
from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter
from vllm.model_executor.parameter import (
BlockQuantScaleParameter,
ChannelQuantScaleParameter,
ModelWeightParameter,
PerTensorScaleParameter,
)
from vllm.scalar_type import scalar_types
from vllm.utils.flashinfer import (
flashinfer_scaled_fp4_mm,
@ -88,7 +97,16 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
QUANT_ALGOS = ["FP8", "NVFP4"]
QUANT_ALGOS = [
# FP8 (per-tensor weight + optional static activation scale).
"FP8",
# FP8 per-channel weight scale + per-token activation scale.
"FP8_PER_CHANNEL_PER_TOKEN",
# FP8 per-block weight-only (ModelOpt may emit this as lowercase).
"FP8_PB_WO",
# FP4
"NVFP4",
]
KV_CACHE_QUANT_ALGOS = ["FP8"]
@ -255,6 +273,9 @@ class ModelOptQuantConfigBase(QuantizationConfig):
if not quant_method:
raise ValueError("Missing 'quant_algo' in quantization config")
# Normalize quant_algo for robust matching (ModelOpt may emit lowercase).
quant_method = str(quant_method).upper()
if kv_cache_quant_method is None:
# No KV cache quantization, keep this branch just to have this comment
pass
@ -263,6 +284,8 @@ class ModelOptQuantConfigBase(QuantizationConfig):
f"kv_cache_quant_algo must be a string, got "
f"{type(kv_cache_quant_method)}"
)
else:
kv_cache_quant_method = kv_cache_quant_method.upper()
if not isinstance(exclude_modules, list):
raise ValueError(
@ -302,17 +325,34 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):
def __init__(
self,
quant_method: str,
is_checkpoint_fp8_serialized: bool,
kv_cache_quant_method: str | None,
exclude_modules: list[str],
) -> None:
super().__init__(exclude_modules)
self.quant_method = quant_method
self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
self.kv_cache_quant_method = kv_cache_quant_method
if is_checkpoint_fp8_serialized:
logger.warning(
"Detected ModelOpt fp8 checkpoint. Please note that"
" the format is experimental and could change."
"Detected ModelOpt fp8 checkpoint (quant_algo=%s). Please note "
"that the format is experimental and could change.",
quant_method,
)
# Select LinearMethod implementation based on quant_algo.
if self.quant_method == "FP8":
self.LinearMethodCls = ModelOptFp8LinearMethod
elif self.quant_method == "FP8_PER_CHANNEL_PER_TOKEN":
self.LinearMethodCls = ModelOptFp8PcPtLinearMethod
elif self.quant_method == "FP8_PB_WO":
self.LinearMethodCls = ModelOptFp8PbWoLinearMethod
else:
raise ValueError(
"Unsupported ModelOpt FP8 quant_algo for vLLM: "
f"{self.quant_method}. Supported: FP8 / "
"FP8_PER_CHANNEL_PER_TOKEN / FP8_PB_WO."
)
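Because `quant_algo` is upper-cased during config parsing, lowercase ModelOpt exports resolve to the same methods; a quick illustration of the mapping (hypothetical config fragments):

```python
# Hypothetical quant_algo strings and the LinearMethod each resolves to
# (lowercase exports are upper-cased during config parsing):
mapping = {
    "FP8": "ModelOptFp8LinearMethod",
    "fp8_per_channel_per_token": "ModelOptFp8PcPtLinearMethod",
    "fp8_pb_wo": "ModelOptFp8PbWoLinearMethod",
}
for algo in mapping:
    assert algo.upper() in {"FP8", "FP8_PER_CHANNEL_PER_TOKEN", "FP8_PB_WO"}
```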
def get_name(self) -> QuantizationMethods:
@ -346,13 +386,13 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):
if "quantization" in hf_quant_cfg:
quant_config = hf_quant_cfg["quantization"]
if isinstance(quant_config, dict):
quant_algo = quant_config.get("quant_algo", "")
if "FP8" in quant_algo:
quant_algo = str(quant_config.get("quant_algo", ""))
if "FP8" in quant_algo.upper():
return "modelopt"
else:
# Check for compressed-tensors style config with specific quant_algo
quant_algo = hf_quant_cfg.get("quant_algo", "")
if isinstance(quant_algo, str) and "FP8" in quant_algo:
quant_algo = str(hf_quant_cfg.get("quant_algo", ""))
if "FP8" in quant_algo.upper():
return "modelopt"
return None
@ -369,7 +409,12 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):
) -> "ModelOptFp8Config":
is_checkpoint_fp8_serialized = "FP8" in quant_method
return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules)
return cls(
quant_method,
is_checkpoint_fp8_serialized,
kv_cache_quant_method,
exclude_modules,
)
class ModelOptFp8LinearMethod(LinearMethodBase):
@ -464,6 +509,203 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
)
class ModelOptFp8PcPtLinearMethod(LinearMethodBase):
"""Linear method for ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoints.
Expected checkpoint structure (per Linear):
- weight: fp8-e4m3fn, shape [out, in]
- weight_scale: fp32, shape [out] (per-output-channel)
- no input_scale (activations are dynamically quantized per-token)
"""
def __init__(self, quant_config: ModelOptFp8Config) -> None:
self.quant_config = quant_config
self.fp8_linear = Fp8LinearOp(
act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN
)
def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: list[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
del input_size, output_size
if not self.quant_config.is_checkpoint_fp8_serialized:
raise ValueError(
"FP8_PER_CHANNEL_PER_TOKEN currently only supports "
"FP8-serialized checkpoints."
)
output_size_per_partition = sum(output_partition_sizes)
weight_loader = extra_weight_attrs.get("weight_loader")
layer.logical_widths = output_partition_sizes
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
weight = ModelWeightParameter(
data=torch.empty(
output_size_per_partition,
input_size_per_partition,
dtype=torch.float8_e4m3fn,
),
input_dim=1,
output_dim=0,
weight_loader=weight_loader,
)
layer.register_parameter("weight", weight)
weight_scale = ChannelQuantScaleParameter(
data=torch.empty(output_size_per_partition, dtype=torch.float32),
output_dim=0,
weight_loader=weight_loader,
)
weight_scale[:] = torch.finfo(torch.float32).min
layer.register_parameter("weight_scale", weight_scale)
def process_weights_after_loading(self, layer: Module) -> None:
layer.weight = Parameter(layer.weight.t(), requires_grad=False)
layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: torch.Tensor | None = None,
) -> torch.Tensor:
return self.fp8_linear.apply(
input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
input_scale=None,
bias=bias,
)
class ModelOptFp8PbWoLinearMethod(LinearMethodBase):
"""Linear method for ModelOpt FP8_PB_WO checkpoints.
ModelOpt exports `weight_scale` as a 4D tensor:
[out_blk, 1, in_blk, 1]
where block size is typically 128 for both dims.
vLLM executes it as FP8 GEMM with *dynamic per-token-group* activation quant (groups of block_k along the hidden dim).
"""
_WEIGHT_BLOCK_SIZE: tuple[int, int] = (128, 128)
def __init__(self, quant_config: ModelOptFp8Config) -> None:
self.quant_config = quant_config
block_n, block_k = self._WEIGHT_BLOCK_SIZE
self.weight_block_size = list(self._WEIGHT_BLOCK_SIZE)
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(block_n, block_k),
act_quant_group_shape=GroupShape(1, block_k),
cutlass_block_fp8_supported=cutlass_block_fp8_supported(),
use_aiter_and_is_supported=False,
)
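Here `GroupShape(block_n, block_k)` describes the weight's 2-D quantization blocks and `GroupShape(1, block_k)` the activation's per-token groups; illustratively:

```python
# Hypothetical sizes: hidden = 512, block_k = 128 -> each token row is
# quantized in 512 // 128 = 4 groups, each with its own dynamic scale.
hidden, block_k = 512, 128
assert hidden // block_k == 4
```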
def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: list[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
del input_size, output_size
if not self.quant_config.is_checkpoint_fp8_serialized:
raise ValueError(
"FP8_PB_WO currently only supports FP8-serialized checkpoints."
)
output_size_per_partition = sum(output_partition_sizes)
weight_loader = extra_weight_attrs.get("weight_loader")
layer.logical_widths = output_partition_sizes
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
# Expose block size so the v2 weight loaders can translate offsets from
# element-space -> block-space for BlockQuantScaleParameter.
layer.weight_block_size = self.weight_block_size
weight = ModelWeightParameter(
data=torch.empty(
output_size_per_partition,
input_size_per_partition,
dtype=torch.float8_e4m3fn,
),
input_dim=1,
output_dim=0,
weight_loader=weight_loader,
)
layer.register_parameter("weight", weight)
block_n, block_k = self._WEIGHT_BLOCK_SIZE
if output_size_per_partition % block_n != 0:
raise ValueError(
"ModelOpt FP8_PB_WO requires out_features divisible by "
f"{block_n}, got {output_size_per_partition}."
)
if input_size_per_partition % block_k != 0:
raise ValueError(
"ModelOpt FP8_PB_WO requires in_features divisible by "
f"{block_k}, got {input_size_per_partition}."
)
out_blks = output_size_per_partition // block_n
in_blks = input_size_per_partition // block_k
# Match ModelOpt's exported shape so weight loading works without a
# custom loader: [out_blk, 1, in_blk, 1]
weight_scale = BlockQuantScaleParameter(
data=torch.empty((out_blks, 1, in_blks, 1), dtype=torch.float32),
input_dim=2,
output_dim=0,
weight_loader=weight_loader,
)
weight_scale[:] = torch.finfo(torch.float32).min
layer.register_parameter("weight_scale", weight_scale)
def process_weights_after_loading(self, layer: Module) -> None:
# Keep weight in [out, in] layout for W8A8BlockFp8LinearOp.
layer.weight = Parameter(layer.weight.data, requires_grad=False)
scale = layer.weight_scale
if scale.dim() == 4:
# [out_blk, 1, in_blk, 1] -> [out_blk, in_blk]
scale = scale.squeeze(1).squeeze(-1)
elif scale.dim() != 2:
raise ValueError(
"Unexpected ModelOpt FP8_PB_WO weight_scale shape: "
f"{tuple(scale.shape)}."
)
layer.weight_scale = Parameter(scale.contiguous(), requires_grad=False)
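The 4D-to-2D normalization above is just two squeezes over the singleton dims; for example:

```python
import torch

out_blks, in_blks = 16, 8
scale = torch.rand(out_blks, 1, in_blks, 1)  # ModelOpt export layout
scale_2d = scale.squeeze(1).squeeze(-1)      # -> [out_blk, in_blk]
assert scale_2d.shape == (out_blks, in_blks)
```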
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: torch.Tensor | None = None,
) -> torch.Tensor:
return self.w8a8_block_fp8_linear.apply(
input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
input_scale=None,
bias=bias,
)
class ModelOptFp8MoEMethod(FusedMoEMethodBase):
"""MoE method for ModelOpt FP8.
Supports loading FP8 checkpoints with static weight scale and
@ -796,7 +1038,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
)
# Expert selection
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1599,7 +1841,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
x_routing, _ = x
else:
x_routing = x
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x_routing,
router_logits=router_logits,
)

View File

@ -370,7 +370,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
from vllm.model_executor.layers.fused_moe import fused_experts
assert layer.activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -896,7 +896,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
raise NotImplementedError("EPLB is not supported for mxfp4")
if self.mxfp4_backend == Mxfp4Backend.MARLIN:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -989,7 +989,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
):
from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -338,7 +338,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -530,7 +530,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -738,7 +738,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -359,7 +359,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -31,6 +31,7 @@ from vllm.model_executor.utils import replace_parameter
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.deep_gemm import (
DeepGemmQuantScaleFMT,
fp8_gemm_nt,
is_deep_gemm_e8m0_used,
is_deep_gemm_supported,
@ -247,7 +248,6 @@ class W8A8BlockFp8LinearOp:
self.act_quant_group_shape = act_quant_group_shape
self.is_deep_gemm_supported = is_deep_gemm_supported()
self.is_hopper = current_platform.is_device_capability(90)
self.is_blackwell = current_platform.is_device_capability_family(100)
self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
# Get the correct blockscale mul and input quant operations.
@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp:
weight: torch.Tensor,
weight_scale: torch.Tensor,
) -> torch.Tensor:
if self.use_deep_gemm_e8m0 and self.is_blackwell:
if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
input_2d,
group_size=self.act_quant_group_shape.col,
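
The hardware check moves out of the linear op and into a single oracle: instead of combining `use_deep_gemm_e8m0` with an `is_blackwell` probe at every call site, the scale format is asked once from `DeepGemmQuantScaleFMT`. A hedged sketch of that dispatch pattern (the enum and probe below are illustrative, not vLLM's actual definitions):

```python
from enum import Enum

def _probe_blackwell() -> bool:
    # Illustrative stand-in; the real oracle consults current_platform
    # and the DeepGEMM build for the device capability family.
    return False

class QuantScaleFMT(Enum):
    FP32 = "fp32"    # plain float32 per-group scales
    UE8M0 = "ue8m0"  # packed power-of-two scales used on newer GPUs

    @classmethod
    def from_oracle(cls) -> "QuantScaleFMT":
        return cls.UE8M0 if _probe_blackwell() else cls.FP32

# Call sites become declarative comparisons, as in the diff above:
if QuantScaleFMT.from_oracle() == QuantScaleFMT.UE8M0:
    print("take the packed UE8M0 quantization path")
else:
    print("take the default FP32-scale path")
```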
@ -1252,6 +1252,14 @@ def validate_fp8_block_shape(
"""Validate block quantization shapes for tensor parallelism."""
from vllm.distributed import get_tensor_model_parallel_world_size
if getattr(layer, "allow_fp8_block_shape_mismatch", False):
logger.debug(
"Skipping FP8 block shape validation for layer %s because the"
" layer explicitly allows a block shape mismatch.",
getattr(layer, "prefix", "<unknown>"),
)
return
tp_size = getattr(layer, "tp_size", get_tensor_model_parallel_world_size())
block_n, block_k = block_size[0], block_size[1]
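
The new early return is an opt-out escape hatch: a layer can set `allow_fp8_block_shape_mismatch`, and the validator logs the skip at debug level instead of failing. A runnable sketch of the pattern (the namespace layer and the trailing divisibility check are assumptions for illustration):

```python
import logging
import types

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("fp8")

def validate_block_shape(layer, block_size: list[int]) -> None:
    if getattr(layer, "allow_fp8_block_shape_mismatch", False):
        logger.debug(
            "Skipping FP8 block shape validation for layer %s",
            getattr(layer, "prefix", "<unknown>"),
        )
        return
    block_n, block_k = block_size
    # Illustrative remainder of the check; the real function validates
    # the TP-sharded weight shapes against (block_n, block_k).
    assert layer.output_size_per_partition % block_n == 0

layer = types.SimpleNamespace(
    prefix="model.layers.0.mlp.down_proj",  # hypothetical prefix
    allow_fp8_block_shape_mismatch=True,
)
validate_block_shape(layer, [128, 128])  # logs the skip and returns
```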

View File

@ -48,7 +48,7 @@ def query_marlin_supported_quant_types(
-1 if capability_tuple is None else capability_tuple.to_int()
)
if device_capability < 80:
if device_capability < 75:
return []
# - has_zp is True: return quant_types that has zero points
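
The threshold drop from 80 to 75 widens Marlin support from Ampere down to Turing. vLLM encodes a compute capability `(major, minor)` as a single integer, which appears to be what `capability_tuple.to_int()` above produces, so the gate is a plain comparison (the encoding here is an assumption):

```python
def capability_to_int(major: int, minor: int) -> int:
    # Assumed encoding matching capability_tuple.to_int() above:
    # SM75 (Turing) -> 75, SM80 (Ampere) -> 80, SM90 (Hopper) -> 90.
    return major * 10 + minor

MARLIN_MIN_CAPABILITY = 75  # was 80 before this change

assert capability_to_int(7, 5) >= MARLIN_MIN_CAPABILITY  # Turing now passes
assert capability_to_int(8, 0) >= MARLIN_MIN_CAPABILITY  # Ampere still passes
assert capability_to_int(7, 0) < MARLIN_MIN_CAPABILITY   # Volta still rejected
```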
@ -594,9 +594,15 @@ def apply_awq_marlin_linear(
a_scales = None
if input_dtype == torch.int8:
assert quant_type == scalar_types.uint4, (
"W8A8-INT8 is not supported by the Marlin kernel."
)
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
a_scales = a_scales * input_global_scale
elif input_dtype == torch.float8_e4m3fn:
assert quant_type == scalar_types.uint4, (
"INT8 weight + FP8 activation is not supported."
)
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
output = ops.gptq_marlin_gemm(
@ -649,9 +655,15 @@ def apply_rtn_marlin_linear(
a_scales = None
if input_dtype == torch.int8:
assert quant_type == scalar_types.uint4b8, (
"W8A8-INT8 is not supported by the Marlin kernel."
)
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
a_scales = a_scales * input_global_scale
elif input_dtype == torch.float8_e4m3fn:
assert quant_type == scalar_types.uint4b8, (
"INT8 weight + FP8 activation is not supported."
)
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
output = ops.gptq_marlin_gemm(
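
Both `apply_awq_marlin_linear` and `apply_rtn_marlin_linear` gain the same pair of guards: quantized activations (INT8 or FP8) are only legal with 4-bit weights, so each branch asserts the weight's `quant_type` before calling `marlin_quant_input`. A condensed sketch of that decision table (the helper and its arguments are illustrative, not the kernel's API):

```python
import torch

def check_marlin_activation(weight_bits: int, input_dtype: torch.dtype | None) -> None:
    # Condensed form of the asserts above: W4A8 (INT8 or FP8 activations)
    # is allowed; W8A8 in any flavor is rejected by the Marlin kernel.
    if input_dtype not in (torch.int8, torch.float8_e4m3fn):
        return  # activations stay in the model dtype; nothing to check
    if weight_bits != 4:
        raise RuntimeError("Marlin W8A8 is not supported.")

check_marlin_activation(4, torch.int8)           # ok: W4A8-INT8
check_marlin_activation(4, torch.float8_e4m3fn)  # ok: W4A8-FP8
try:
    check_marlin_activation(8, torch.int8)       # rejected: W8A8
except RuntimeError as err:
    print(err)
```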

View File

@ -23,7 +23,7 @@ logger = init_logger(__name__)
def is_fp4_marlin_supported():
return current_platform.has_device_capability(80)
return current_platform.has_device_capability(75)
def nvfp4_marlin_process_scales(marlin_scales):
@ -154,6 +154,12 @@ def prepare_fp4_layer_for_marlin(
)
is_nvfp4 = hasattr(layer, "weight_scale_2")
if input_dtype is not None and input_dtype.itemsize == 1:
if is_nvfp4:
raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
elif input_dtype != torch.float8_e4m3fn:
raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
group_size = 16 if is_nvfp4 else 32
part_size_n = layer.output_size_per_partition
@ -231,6 +237,12 @@ def prepare_moe_fp4_layer_for_marlin(
)
is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
if input_dtype is not None and input_dtype.itemsize == 1:
if is_nvfp4:
raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
elif input_dtype != torch.float8_e4m3fn:
raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
group_size = 16 if is_nvfp4 else 32
e = layer.num_experts
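
The FP4 preparation paths add the mirror-image guards: NVFP4 weights (detected via the `weight_scale_2` attribute) never combine with 8-bit activations, while MXFP4 weights accept FP8 but not INT8 activations. A sketch with the guard factored into a reusable function (an assumption; the diff inlines it in both `prepare_*` helpers):

```python
import torch

def check_fp4_activation(is_nvfp4: bool, input_dtype: torch.dtype | None) -> None:
    if input_dtype is None or input_dtype.itemsize != 1:
        return  # activations are not 8-bit quantized; nothing to reject
    if is_nvfp4:
        raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
    if input_dtype != torch.float8_e4m3fn:
        raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")

check_fp4_activation(is_nvfp4=False, input_dtype=torch.float8_e4m3fn)  # ok: MXFP4 + FP8
try:
    check_fp4_activation(is_nvfp4=True, input_dtype=torch.int8)        # rejected
except RuntimeError as err:
    print(err)
```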

View File

@ -11,7 +11,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
marlin_make_workspace_new,
marlin_permute_bias,
marlin_permute_scales,
marlin_quant_input,
should_use_atomic_add_reduce,
)
from vllm.model_executor.utils import replace_parameter
@ -22,7 +21,7 @@ logger = init_logger(__name__)
def is_fp8_marlin_supported():
return current_platform.has_device_capability(80)
return current_platform.has_device_capability(75)
def fp8_fused_exponent_bias_into_scales(scales):
@ -63,13 +62,11 @@ def apply_fp8_marlin_linear(
inputs = reshaped_x
a_scales = None
if input_dtype is not None and input_dtype.itemsize == 1:
if input_dtype != torch.float8_e4m3fn:
raise RuntimeError("FP8 weight + INT8 activation is not supported.")
inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn)
# inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn)
raise RuntimeError("Marlin W8A8 is not supported.")
output = ops.gptq_marlin_gemm(
a=reshaped_x,
a=inputs,
c=None,
b_q_weight=weight,
b_bias=bias,
@ -102,6 +99,8 @@ def prepare_fp8_layer_for_marlin(
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
if input_dtype is not None and input_dtype.itemsize == 1:
raise RuntimeError("Marlin W8A8 is not supported.")
part_size_n = layer.output_size_per_partition
part_size_k = layer.input_size_per_partition
@ -145,10 +144,20 @@ def prepare_fp8_layer_for_marlin(
# marlin kernel only supports channel-wise and group-wise quantization
# we need to convert the scales
if weight_block_size is None:
logical_widths = getattr(layer, "logical_widths", [])
if scales.nelement() == 1:
# tensor-wise quantization -> channel-wise quantization
# (1, 1) =>(repeat)=> (1, size_n)
scales = scales.view(1, 1).repeat_interleave(part_size_n, 1)
elif scales.nelement() == len(logical_widths):
# tensor-wise quantization with logical_widths ->
# channel-wise quantization
assert sum(logical_widths) == part_size_n, (
f"Sum of logical_widths ({sum(logical_widths)}) must be equal "
f"to part_size_n ({part_size_n})"
)
lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64)
scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1)
elif scales.nelement() > 1 and scales.nelement() != part_size_n:
assert part_size_n % scales.nelement() == 0
s_size = scales.nelement()
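
The new `logical_widths` branch handles fused layers (for example QKV packed into one weight) that carry one per-tensor scale per logical partition: each scale is repeated across that partition's output channels with `repeat_interleave`, producing the channel-wise scales Marlin can consume. A small worked example with assumed widths:

```python
import torch

logical_widths = [8, 4, 4]              # assumed fused partition widths
part_size_n = sum(logical_widths)       # 16 output channels in total
scales = torch.tensor([0.5, 1.0, 2.0])  # one per-tensor scale per partition

lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64)
channelwise = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1)

assert channelwise.shape == (1, part_size_n)
# Channels 0-7 -> 0.5, channels 8-11 -> 1.0, channels 12-15 -> 2.0.
print(channelwise)
```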
@ -199,6 +208,8 @@ def prepare_moe_fp8_layer_for_marlin(
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
if input_dtype is not None and input_dtype.itemsize == 1:
raise RuntimeError("Marlin W8A8 is not supported.")
e = layer.num_experts
k = layer.hidden_size

View File

@ -178,6 +178,37 @@ class ApplyRotaryEmb(CustomOp):
output = output.to(origin_dtype)
return output
def _pre_process(
self,
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Size, torch.dtype]:
origin_shape = x.shape
if len(origin_shape) == 3:
# x: [seq_len, num_heads, head_size]
x = x.unsqueeze(0)
origin_dtype = x.dtype
if self.enable_fp32_compute:
x = x.float()
cos = cos.float()
sin = sin.float()
return x, cos, sin, origin_shape, origin_dtype
def _post_process(
self,
output: torch.Tensor,
origin_shape: torch.Size,
origin_dtype: torch.dtype,
) -> torch.Tensor:
if len(origin_shape) == 3:
output = output.squeeze(0)
if self.enable_fp32_compute:
output = output.to(origin_dtype)
return output
def forward_native(
self,
x: torch.Tensor,
@ -197,16 +228,7 @@ class ApplyRotaryEmb(CustomOp):
) -> torch.Tensor:
from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
origin_dtype = x.dtype
if self.enable_fp32_compute:
x = x.float()
cos = cos.float()
sin = sin.float()
origin_shape = x.shape
if len(origin_shape) == 3:
# x: [seq_len, num_heads, head_size]
x = x.unsqueeze(0)
x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)
"""
Arguments of apply_rotary_emb() in vllm_flash_attn:
@ -218,10 +240,7 @@ class ApplyRotaryEmb(CustomOp):
interleaved = not self.is_neox_style
output = apply_rotary_emb(x, cos, sin, interleaved)
if len(origin_shape) == 3:
output = output.squeeze(0)
if self.enable_fp32_compute:
output = output.to(origin_dtype)
output = self._post_process(output, origin_shape, origin_dtype)
return output
def forward_hip(
@ -231,16 +250,7 @@ class ApplyRotaryEmb(CustomOp):
sin: torch.Tensor,
) -> torch.Tensor:
if self.apply_rotary_emb_flash_attn is not None:
origin_dtype = x.dtype
if self.enable_fp32_compute:
x = x.float()
cos = cos.float()
sin = sin.float()
origin_shape = x.shape
if len(origin_shape) == 3:
# x: [seq_len, num_heads, head_size]
x = x.unsqueeze(0)
x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)
"""
Arguments of apply_rotary() in flash_attn:
@ -254,10 +264,7 @@ class ApplyRotaryEmb(CustomOp):
x, cos, sin, interleaved=interleaved
).type_as(x)
if len(origin_shape) == 3:
output = output.squeeze(0)
if self.enable_fp32_compute:
output = output.to(origin_dtype)
output = self._post_process(output, origin_shape, origin_dtype)
else:
# Falling back to PyTorch native implementation.
output = self.forward_native(x, cos, sin)
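
The rotary-embedding change is a pure deduplication: the shape normalization (adding a batch dim to 3-D inputs) and the optional FP32 upcast that each backend previously open-coded now live in `_pre_process`/`_post_process`. A minimal sketch of the factored shape/dtype round-trip (class name and the placeholder kernel are assumptions):

```python
import torch

class RotaryShim:
    def __init__(self, enable_fp32_compute: bool = False):
        self.enable_fp32_compute = enable_fp32_compute

    def _pre_process(self, x, cos, sin):
        origin_shape, origin_dtype = x.shape, x.dtype
        if x.dim() == 3:        # [seq_len, num_heads, head_size]
            x = x.unsqueeze(0)  # kernels expect a leading batch dim
        if self.enable_fp32_compute:
            x, cos, sin = x.float(), cos.float(), sin.float()
        return x, cos, sin, origin_shape, origin_dtype

    def _post_process(self, out, origin_shape, origin_dtype):
        if len(origin_shape) == 3:
            out = out.squeeze(0)  # undo the batch dim we added
        if self.enable_fp32_compute:
            out = out.to(origin_dtype)
        return out

    def forward(self, x, cos, sin):
        x, cos, sin, shape, dtype = self._pre_process(x, cos, sin)
        out = x  # placeholder for the backend rotary kernel call
        return self._post_process(out, shape, dtype)

shim = RotaryShim(enable_fp32_compute=True)
x = torch.randn(5, 8, 64, dtype=torch.float16)  # [seq, heads, head_size]
cos, sin = torch.randn(5, 32), torch.randn(5, 32)
y = shim.forward(x, cos, sin)
assert y.shape == x.shape and y.dtype == x.dtype
```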

Some files were not shown because too many files have changed in this diff.