commit 20d87ee2f0
Merge branch 'main' into elvischenv/update-flashinfer
.buildkite/performance-benchmarks/README.md

@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models.
 
 **Benchmarking Duration**: about 1hr.

@@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 
 Runtime environment variables:
 
-- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).

@@ -34,8 +34,9 @@ Runtime environment variables:
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
-For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
+>
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
+> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
 
 ### Latency test
 
 Here is an example of one test inside `latency-tests.json`:
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh (24 lines changed; Normal file → Executable file)

@@ -49,7 +49,11 @@ check_cpus() {
     echo "Need at least 1 NUMA to run benchmarking."
     exit 1
   fi
-  declare -g gpu_type="cpu"
+  if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
+    declare -g gpu_type="arm64-cpu"
+  else
+    declare -g gpu_type="cpu"
+  fi
   echo "GPU type is $gpu_type"
 }

@@ -207,8 +211,8 @@ run_latency_tests() {
 
     # check if there is enough GPU to run the test
     tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1')
       world_size=$(($tp*$pp))
       if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
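The `// 1` added to the `jq` filter supplies a default when a test entry omits `pipeline_parallel_size`; without it, a missing key yields the string `null`, which the shell arithmetic then mishandles. A minimal Python sketch of the same default-and-skip check (the `params` dict and `numa_count` value are illustrative, not taken from the repo):

```python
# Sketch of the per-test world-size check the script performs.
# jq's '.pipeline_parallel_size // 1' behaves like dict.get(..., 1) here.
def should_skip(params: dict, numa_count: int, remote_host: str = "") -> bool:
    tp = params["tensor_parallel_size"]
    pp = params.get("pipeline_parallel_size", 1)  # default pp=1, like '// 1'
    world_size = tp * pp
    # Skip when the host has fewer NUMA nodes than the required world size
    # and no remote host is available to supply the rest.
    return numa_count < world_size and not remote_host

print(should_skip({"tensor_parallel_size": 2}, numa_count=1))  # True -> skip
```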
@@ -276,8 +280,8 @@ run_throughput_tests() {
 
     # check if there is enough GPU to run the test
     tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1')
       world_size=$(($tp*$pp))
       if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."

@@ -393,8 +397,8 @@ run_serving_tests() {
 
     # check if there is enough resources to run the test
     tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ]; then
-      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+    if [[ "$ON_CPU" == "1" ]]; then
+      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
       world_size=$(($tp*$pp))
       if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."

@@ -496,9 +500,9 @@ run_serving_tests() {
 main() {
   local ARCH
   ARCH=''
-  if [ "$ON_CPU" == "1" ];then
-    check_cpus
-    ARCH='-cpu'
+  if [[ "$ON_CPU" == "1" ]]; then
+    check_cpus
+    ARCH="-$gpu_type"
   else
     check_gpus
     ARCH="$arch_suffix"
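With `check_cpus` now distinguishing aarch64 hosts, `main` derives the suite suffix from `$gpu_type` instead of hardcoding `-cpu`, so an Arm host automatically picks up the new `*-arm64-cpu.json` suites below. A rough Python re-expression of that selection (filename layout taken from the README note; this is a sketch, not the script itself):

```python
import platform

def suite_files() -> dict[str, str]:
    """Pick per-arch benchmark suites, mirroring check_cpus()/main()."""
    machine = platform.machine()
    gpu_type = "arm64-cpu" if machine in ("aarch64", "arm64") else "cpu"
    arch = f"-{gpu_type}"  # main() sets ARCH="-$gpu_type" when ON_CPU=1
    return {
        kind: f"tests/{kind}-tests{arch}.json"
        for kind in ("latency", "throughput", "serving")
    }

print(suite_files())  # on Arm: {'latency': 'tests/latency-tests-arm64-cpu.json', ...}
```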
.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json (new file; name per the README note above)

@@ -0,0 +1,26 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json (new file; name per the README note above)

@@ -0,0 +1,130 @@
+{
+    "defaults": {
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "ignore-eos": "",
+            "num_prompts": 200
+        }
+    },
+    "tests": [
+        {
+            "test_name": "serving_llama8B_tp1_sharegpt",
+            "server_parameters": {
+                "tensor_parallel_size": 1
+            },
+            "client_parameters": {
+                "dataset_name": "sharegpt",
+                "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+            }
+        },
+        {
+            "test_name": "serving_llama8B_tp2_sharegpt",
+            "server_parameters": {
+                "tensor_parallel_size": 2
+            },
+            "client_parameters": {
+                "dataset_name": "sharegpt",
+                "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+            }
+        },
+        {
+            "test_name": "serving_llama8B_tp1_random_128_128",
+            "server_parameters": {
+                "tensor_parallel_size": 1
+            },
+            "client_parameters": {
+                "dataset_name": "random",
+                "random-input-len": 128,
+                "random-output-len": 128
+            }
+        },
+        {
+            "test_name": "serving_llama8B_tp2_random_128_128",
+            "server_parameters": {
+                "tensor_parallel_size": 2
+            },
+            "client_parameters": {
+                "dataset_name": "random",
+                "random-input-len": 128,
+                "random-output-len": 128
+            }
+        },
+        {
+            "test_name": "serving_llama8B_tp1_random_128_2048",
+            "server_parameters": {
+                "tensor_parallel_size": 1
+            },
+            "client_parameters": {
+                "dataset_name": "random",
+                "random-input-len": 128,
+                "random-output-len": 2048
+            }
+        },
+        {
+            "test_name": "serving_llama8B_tp2_random_128_2048",
+            "server_parameters": {
+                "tensor_parallel_size": 2
+            },
+            "client_parameters": {
+                "dataset_name": "random",
+                "random-input-len": 128,
+                "random-output-len": 2048
+            }
+        },
+        {
+            "test_name": "serving_llama8B_tp1_random_2048_128",
+            "server_parameters": {
+                "tensor_parallel_size": 1
+            },
+            "client_parameters": {
+                "dataset_name": "random",
+                "random-input-len": 2048,
+                "random-output-len": 128
+            }
+        },
+        {
+            "test_name": "serving_llama8B_tp2_random_2048_128",
+            "server_parameters": {
+                "tensor_parallel_size": 2
+            },
+            "client_parameters": {
+                "dataset_name": "random",
+                "random-input-len": 2048,
+                "random-output-len": 128
+            }
+        }
+    ]
+}
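Unlike the flat latency and throughput suites, this serving suite factors shared settings into a top-level `defaults` object that each entry under `tests` then overrides (here, only `tensor_parallel_size` and the client dataset). A plausible merge, assuming the harness does a shallow per-section overlay (the exact merge rule belongs to the harness and is not shown in this commit):

```python
import json

def materialize(test: dict, defaults: dict) -> dict:
    """Overlay one test entry onto the suite-wide defaults, section by section."""
    merged = {k: (dict(v) if isinstance(v, dict) else v) for k, v in defaults.items()}
    for key, value in test.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key].update(value)  # e.g. override tensor_parallel_size only
        else:
            merged[key] = value
    return merged

suite = {"defaults": {"server_parameters": {"tensor_parallel_size": 1, "dtype": "bfloat16"}},
         "tests": [{"test_name": "serving_llama8B_tp2_sharegpt",
                    "server_parameters": {"tensor_parallel_size": 2}}]}
print(json.dumps(materialize(suite["tests"][0], suite["defaults"]), indent=2))
```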
.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json (new file; name per the README note above)

@@ -0,0 +1,27 @@
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
.buildkite/scripts/generate-nightly-index.py

@@ -291,6 +291,7 @@ if __name__ == "__main__":
     """
     Arguments:
     --version <version> : version string for the current build (e.g., commit hash)
+    --wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
     --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
     --output-dir <output_directory> : directory to store generated index files
     --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant

@@ -318,6 +319,12 @@ if __name__ == "__main__":
         required=True,
         help="Directory to store generated index files",
     )
+    parser.add_argument(
+        "--wheel-dir",
+        type=str,
+        default=None,
+        help="Directory containing wheel files (default to be same as `version`)",
+    )
     parser.add_argument(
         "--alias-to-default",
         type=str,

@@ -372,7 +379,7 @@ if __name__ == "__main__":
 
     print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
 
-    # keep only "official" files for a non-nightly version (specifed by cli args)
+    # keep only "official" files for a non-nightly version (specified by cli args)
     PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
     if PY_VERSION_RE.match(version):
         # upload-wheels.sh ensures no "dev" is in args.version

@@ -384,9 +391,10 @@ if __name__ == "__main__":
         print("Nightly version detected, keeping all wheel files.")
 
     # Generate index and metadata, assuming wheels and indices are stored as:
-    # s3://vllm-wheels/{version}/<wheel files>
+    # s3://vllm-wheels/{wheel_dir}/<wheel files>
     # s3://vllm-wheels/<anything>/<index files>
-    wheel_base_dir = Path(output_dir).parent / version
+    wheel_dir = args.wheel_dir or version
+    wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
     index_base_dir = Path(output_dir)
 
     generate_index_and_metadata(
.buildkite/scripts/upload-wheels.sh

@@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then
     echo "Re-generating indices for /$pure_version/"
     rm -rf "$INDICES_OUTPUT_DIR/*"
     mkdir -p "$INDICES_OUTPUT_DIR"
-    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
+    # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
+    $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
     aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
 fi
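Together these two hunks decouple where the index lives (`/$pure_version/`) from where the wheels live (the per-commit directory passed via `--wheel-dir`). A small sketch of the resolution the Python script now performs (path values are illustrative only):

```python
from pathlib import Path

def resolve_wheel_base_dir(output_dir: str, version: str, wheel_dir: str | None) -> Path:
    # --wheel-dir overrides the default "same directory name as the version".
    chosen = wheel_dir or version
    return Path(output_dir).parent / chosen.strip().rstrip("/")

# Stable-version indices can now point back into the commit directory:
print(resolve_wheel_base_dir("wheels/0.8.0", "0.8.0", wheel_dir="20d87ee2f0/"))
# -> wheels/20d87ee2f0  (indices under /0.8.0/ reference wheels under the commit dir)
```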
@@ -349,7 +349,9 @@ steps:
 - label: V1 Test e2e + engine # 65min
   timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental]
-  agent_pool: mi325_4
+  # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
+  # See discussion here: https://github.com/vllm-project/vllm/pull/31040
+  agent_pool: mi325_8
   # grade: Blocking
   source_file_dependencies:
   - vllm/

@@ -964,7 +966,7 @@ steps:
   - pytest -v -s models/multimodal/processing
 
 - label: Multi-Modal Models Test (Standard) # 60min
-  timeout_in_minutes: 80
+  timeout_in_minutes: 100
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking

@@ -973,13 +975,15 @@ steps:
   - vllm/
   - tests/models/multimodal
   commands:
+  - export MIOPEN_DEBUG_CONV_DIRECT=0
+  - export MIOPEN_DEBUG_CONV_GEMM=0
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pip freeze | grep -E 'torch'
   - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
   - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
-  timeout_in_minutes: 180
+- label: Multi-Modal Accuracy Eval (Small Models) # 5min
+  timeout_in_minutes: 10
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking

@@ -989,7 +993,9 @@ steps:
   - vllm/inputs/
   - vllm/v1/core/
   commands:
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+  - export MIOPEN_DEBUG_CONV_DIRECT=0
+  - export MIOPEN_DEBUG_CONV_GEMM=0
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt
 
 - label: Multi-Modal Models Test (Extended) 1 # 60min
   timeout_in_minutes: 120

@@ -1001,10 +1007,13 @@ steps:
   - vllm/
   - tests/models/multimodal
   commands:
+  - export MIOPEN_DEBUG_CONV_DIRECT=0
+  - export MIOPEN_DEBUG_CONV_GEMM=0
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
 
-- label: Multi-Modal Models Test (Extended) 2
+- label: Multi-Modal Models Test (Extended) 2 #60min
   timeout_in_minutes: 120
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   # grade: Blocking

@@ -1013,6 +1022,8 @@ steps:
   - vllm/
   - tests/models/multimodal
   commands:
+  - export MIOPEN_DEBUG_CONV_DIRECT=0
+  - export MIOPEN_DEBUG_CONV_GEMM=0
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

@@ -1026,6 +1037,8 @@ steps:
   - vllm/
   - tests/models/multimodal
   commands:
+  - export MIOPEN_DEBUG_CONV_DIRECT=0
+  - export MIOPEN_DEBUG_CONV_GEMM=0
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

@@ -1243,13 +1256,13 @@ steps:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+  - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+  - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 68min
   timeout_in_minutes: 90

@@ -1497,7 +1510,7 @@ steps:
   - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
-  - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
+  - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####

@@ -319,7 +319,10 @@ steps:
   # TODO: accuracy does not match, whether setting
   # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
   - pytest -v -s v1/e2e
-  - pytest -v -s v1/engine
+  # Run this test standalone for now;
+  # need to untangle the (implicit) use of spawn/fork across the tests.
+  - pytest -v -s v1/engine/test_preprocess_error_handling.py
+  - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
 
 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50

@@ -1106,13 +1109,13 @@ steps:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+  - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
   - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+  - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 68min
   timeout_in_minutes: 90

@@ -1331,7 +1334,7 @@ steps:
   - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####

@@ -1356,6 +1359,7 @@ steps:
   - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
   commands:
   - nvidia-smi
   - bash .buildkite/scripts/run-prime-rl-test.sh
 
 - label: DeepSeek V2-Lite Accuracy

@@ -145,7 +145,7 @@ steps:
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
   - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
+  - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
 - label: Distributed Tests (2 GPUs)(B200)

@@ -171,7 +171,7 @@ steps:
   - tests/distributed/
   - tests/examples/offline_inference/data_parallel.py
   commands:
-  - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+  - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
 
 - label: Distributed NixlConnector PD accuracy (4 GPUs)
   timeout_in_minutes: 30
.github/CODEOWNERS (1 line changed; vendored)

@@ -15,6 +15,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
+/vllm/tool_parsers @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
 /vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
CMakeLists.txt

@@ -799,24 +799,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   else()
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
-    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
-  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
-                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
-                     "if you intend on running FP8 quantized MoE models on Blackwell.")
-    else()
-      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
-                     "in CUDA target architectures")
-    endif()
-  endif()
 
   #
   # Machete kernels
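With this, the SM100 blockwise-scaled grouped GEMM is no longer compiled or gated behind `ENABLE_CUTLASS_MOE_SM100`; the matching source, `csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu`, is deleted at the end of this commit.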
benchmarks/kernels/bench_nvfp4_quant.py (new file, 177 lines)

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools

import torch
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.triton_utils import triton
from vllm.utils.flashinfer import flashinfer_fp4_quantize

if not current_platform.has_device_capability(100):
    raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")

FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max

PROVIDER_CFGS = {
    "vllm": dict(backend="vllm", enabled=True),
    "flashinfer": dict(backend="flashinfer", enabled=True),
}

_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
    """Compute global scale for FP4 quantization."""
    amax = torch.abs(tensor).max().to(torch.float32)
    return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="us (lower is better)",
        plot_name="NVFP4 Input Quantization Latency (us)",
        args={},
    )
)
def benchmark(batch_size, provider, N, K):
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    # Create input tensor
    a = torch.randn((M, K), device=device, dtype=dtype)

    # Compute global scale for activation
    a_global_scale = compute_global_scale(a)

    quantiles = [0.5, 0.2, 0.8]

    cfg = PROVIDER_CFGS[provider]

    if cfg["backend"] == "vllm":
        # vLLM's FP4 quantization
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: ops.scaled_fp4_quant(a, a_global_scale),
            quantiles=quantiles,
        )
    elif cfg["backend"] == "flashinfer":
        # FlashInfer's FP4 quantization
        # Use is_sf_swizzled_layout=True to match vLLM's output format
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: flashinfer_fp4_quantize(
                a, a_global_scale, is_sf_swizzled_layout=True
            ),
            quantiles=quantiles,
        )

    # Convert ms to us for better readability at small batch sizes
    to_us = lambda t_ms: t_ms * 1000
    return to_us(ms), to_us(max_ms), to_us(min_ms)


def prepare_shapes(args):
    out = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_dim] //= tp_size
            KN.append(model)
            out.append(KN)
    return out


def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
    """Test accuracy between vLLM and FlashInfer FP4 quantization."""
    # Create input tensor
    a = torch.randn((M, K), device=device, dtype=dtype)

    # Compute global scale
    a_global_scale = compute_global_scale(a)

    # vLLM quantization
    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)

    # FlashInfer quantization (with swizzled layout to match vLLM's output)
    flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
        a, a_global_scale, is_sf_swizzled_layout=True
    )
    flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)

    # Compare outputs
    torch.testing.assert_close(
        vllm_fp4,
        flashinfer_fp4,
    )
    print(f"M={M}, K={K}, dtype={dtype}: PASSED")


def test_accuracy():
    """Run accuracy tests across various shapes."""
    print("\n" + "=" * 60)
    print("Running accuracy tests: vLLM vs FlashInfer")
    print("=" * 60)

    device = "cuda"
    dtype = torch.bfloat16

    # Test various batch sizes and hidden dimensions
    Ms = [1, 1024]
    Ks = [4096]

    for M in Ms:
        for K in Ks:
            _test_accuracy_once(M, K, dtype, device)

    print("\nAll accuracy tests passed!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark NVFP4 quantization: vLLM vs FlashInfer"
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    parser.add_argument(
        "--save-path",
        type=str,
        default=None,
        help="Path to save benchmark results",
    )
    parser.add_argument(
        "--accuracy",
        action="store_true",
        help="Run accuracy tests",
    )
    args = parser.parse_args()

    if args.accuracy:
        test_accuracy()

    for K, N, model in prepare_shapes(args):
        print(f"\n{model}, N={N} K={K}")
        benchmark.run(
            print_data=True,
            save_path=args.save_path,
            N=N,
            K=K,
        )

    print("\nBenchmark finished!")
```
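As written, the script doubles as a lightweight correctness harness: passing `--accuracy` runs the vLLM-versus-FlashInfer comparison via `torch.testing.assert_close` before the Triton latency sweep, and `--save-path` forwards to `benchmark.run` to persist the perf report.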
@@ -37,10 +37,12 @@ struct VecTypeTrait<c10::BFloat16> {
 };
 #endif
 
+#if !defined(__powerpc__)
 template <>
 struct VecTypeTrait<c10::Half> {
   using vec_t = vec_op::FP16Vec16;
 };
+#endif
 
 struct Counter {
   std::atomic<int64_t> counter;
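The new guard excludes the `c10::Half` specialization on PowerPC builds, presumably because `vec_op::FP16Vec16` is not provided by that target's vector backend.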
@@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel(
     void const* k_weight_void,       // RMSNorm weights for key
     void const* cos_sin_cache_void,  // Pre-computed cos/sin cache
     int64_t const* position_ids,     // Position IDs for RoPE
-    int const num_tokens             // Number of tokens
+    int const num_tokens,            // Number of tokens
+    int const rotary_dim             // Dimension for RoPE
 ) {
 #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
   if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||

@@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel(
 
     // Calculate cache pointer for this position - similar to
     // pos_encoding_kernels.cu
-    T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim;
-    int const embed_dim = head_dim / 2;
+    T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim;
+    int const embed_dim = rotary_dim / 2;
     T_cache const* cos_ptr = cache_ptr;
     T_cache const* sin_ptr = cache_ptr + embed_dim;
 
-    if constexpr (interleave) {
-      // Perform interleaving. Use pre-computed cos/sin values.
+    int const rotary_lanes = rotary_dim / numElemsPerThread;  // rotary range
+    if (laneId < rotary_lanes) {
+      if constexpr (interleave) {
+        // Perform interleaving. Use pre-computed cos/sin values.
 #pragma unroll
-      for (int i = 0; i < numElemsPerThread / 2; ++i) {
-        int const idx0 = 2 * i;
-        int const idx1 = 2 * i + 1;
+        for (int i = 0; i < numElemsPerThread / 2; ++i) {
+          int const idx0 = 2 * i;
+          int const idx1 = 2 * i + 1;
+          // Global dimension index in the head
+          int const dim_idx = laneId * numElemsPerThread + idx0;
 
-        float const val0 = elements[idx0];
-        float const val1 = elements[idx1];
+          float const val0 = elements[idx0];
+          float const val1 = elements[idx1];
 
-        int const dim_idx = laneId * numElemsPerThread + idx0;
-        int const half_dim = dim_idx / 2;
-        float const cos_val =
-            CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
-        float const sin_val =
-            CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+          int const half_dim = dim_idx / 2;
+          float const cos_val =
+              CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
+          float const sin_val =
+              CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
 
-        elements[idx0] = val0 * cos_val - val1 * sin_val;
-        elements[idx1] = val0 * sin_val + val1 * cos_val;
-      }
-    } else {
-      // Before data exchange with in warp, we need to sync.
-      __syncwarp();
-      // Get the data from the other half of the warp. Use pre-computed cos/sin
-      // values.
+          elements[idx0] = val0 * cos_val - val1 * sin_val;
+          elements[idx1] = val0 * sin_val + val1 * cos_val;
+        }
+      } else {
+        // Before data exchange with in warp, we need to sync.
+        __syncwarp();
+        int pairOffset = (rotary_dim / 2) / numElemsPerThread;
+        // Get the data from the other half of the warp. Use pre-computed
+        // cos/sin values.
 #pragma unroll
-      for (int i = 0; i < numElemsPerThread; i++) {
-        elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
-        if (laneId < 16) {
-          elements2[i] = -elements2[i];
-        }
-        int dim_idx = laneId * numElemsPerThread + i;
-        dim_idx = (dim_idx * 2) % head_dim;
-        int half_dim = dim_idx / 2;
-        // Use pre-computed cos/sin from cache
-        float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
-        float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+        for (int i = 0; i < numElemsPerThread; i++) {
+          elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset);
+          if (laneId < pairOffset) {
+            elements2[i] = -elements2[i];
+          }
+          int dim_idx = laneId * numElemsPerThread + i;
 
-        elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
+          dim_idx = (dim_idx * 2) % rotary_dim;
+          int half_dim = dim_idx / 2;
+          float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
+          float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
+
+          elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
+        }
+        // __shfl_xor_sync does not provide memfence. Need to sync again.
+        __syncwarp();
       }
-      // __shfl_xor_sync does not provide memfence. Need to sync again.
-      __syncwarp();
     }
 
     // Store.
     {
       vec_T vec;
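The net effect of threading `rotary_dim` through the kernel is partial rotary embedding: only the first `rotary_dim` of `head_dim` channels are rotated, and lanes past `rotary_dim / numElemsPerThread` leave their values untouched. A minimal Python sketch of the non-interleaved (Neox-style) math on one head, with made-up sizes; this mirrors the arithmetic, not the warp-level implementation, and the kernel reads cos/sin from a precomputed cache rather than computing them inline:

```python
import math

def partial_rope(x: list[float], pos: int, rotary_dim: int, base: float = 10000.0) -> list[float]:
    """Rotate the first rotary_dim channels of one head; pass the rest through."""
    head_dim = len(x)
    assert rotary_dim % 2 == 0 and rotary_dim <= head_dim
    out = list(x)
    half = rotary_dim // 2
    for i in range(half):
        theta = pos * base ** (-2.0 * i / rotary_dim)  # standard RoPE frequency
        cos_v, sin_v = math.cos(theta), math.sin(theta)
        a, b = x[i], x[i + half]  # Neox pairing: (i, i + rotary_dim/2)
        out[i] = a * cos_v - b * sin_v
        out[i + half] = a * sin_v + b * cos_v
    return out  # channels >= rotary_dim are returned unchanged

print(partial_rope([1.0] * 8, pos=3, rotary_dim=4))  # last 4 channels stay 1.0
```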
@@ -312,10 +316,10 @@ template <typename scalar_t_in, typename scalar_t_cache>
 void launchFusedQKNormRope(void* qkv, int const num_tokens,
                            int const num_heads_q, int const num_heads_k,
                            int const num_heads_v, int const head_dim,
-                           float const eps, void const* q_weight,
-                           void const* k_weight, void const* cos_sin_cache,
-                           bool const interleave, int64_t const* position_ids,
-                           cudaStream_t stream) {
+                           int const rotary_dim, float const eps,
+                           void const* q_weight, void const* k_weight,
+                           void const* cos_sin_cache, bool const interleave,
+                           int64_t const* position_ids, cudaStream_t stream) {
   constexpr int blockSize = 256;
 
   int const warpsPerBlock = blockSize / 32;

@@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
         fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
             <<<gridDim, blockDim, 0, stream>>>(
                 qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
-                k_weight, cos_sin_cache, position_ids, num_tokens);
+                k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
       });
       break;
     case 128:

@@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
         fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
             <<<gridDim, blockDim, 0, stream>>>(
                 qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
-                k_weight, cos_sin_cache, position_ids, num_tokens);
+                k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
       });
       break;
     case 256:

@@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
         fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
             <<<gridDim, blockDim, 0, stream>>>(
                 qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
-                k_weight, cos_sin_cache, position_ids, num_tokens);
+                k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
       });
       break;
     default:

@@ -392,8 +396,11 @@ void fused_qk_norm_rope(
               "Query weights size must match head dimension");
   TORCH_CHECK(k_weight.size(0) == head_dim,
               "Key weights size must match head dimension");
-  TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
-              "Cos/sin cache dimension must match head_dim");
+
+  TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even");
+  TORCH_CHECK(cos_sin_cache.size(1) <= head_dim,
+              "rotary_dim must be less than or equal to head_dim");
+
   TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
                   qkv.scalar_type() == k_weight.scalar_type(),
               "qkv, q_weight and k_weight must have the same dtype");

@@ -419,7 +426,8 @@ void fused_qk_norm_rope(
         qkv.data_ptr(), static_cast<int>(num_tokens),
         static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
         static_cast<int>(num_heads_v), static_cast<int>(head_dim),
-        static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(),
+        static_cast<int>(cos_sin_cache.size(1)), static_cast<float>(eps),
+        q_weight.data_ptr(), k_weight.data_ptr(),
         cos_sin_cache.data_ptr(), !is_neox,
         reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
         stream);
@@ -74,6 +74,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  // Precompute SF layout parameter (constant for entire kernel).
+  int32_t const numKTiles = (numCols + 63) / 64;
+
   // Get the global scaling factor, which will be applied to the SF.
   // Note SFScale is the same as next GEMM's alpha, which is
   // (448.f / (Alpha_A / 6.f)).

@@ -101,7 +104,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
       auto sf_out =
           cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                              CVT_FP4_NUM_THREADS_PER_SF>(
-              rowIdx, colIdx, numCols, SFout);
+              rowIdx, colIdx, numKTiles, SFout);
 
       out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
                                                      sf_out);
@@ -25,6 +25,7 @@
 #include <cuda_fp8.h>
 #include "dispatch_utils.h"
 
+#include "cuda_utils.h"
 #include "nvfp4_utils.cuh"
 #include "launch_bounds_utils.h"

@@ -44,6 +45,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  // Precompute SF layout parameter (constant for entire kernel).
+  int32_t const numKTiles = (numCols + 63) / 64;
+
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;

@@ -112,17 +116,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   // (448.f / (Alpha_A / 6.f)).
   float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
 
-  int factor = CVT_FP4_SF_VEC_SIZE * 4;
-  // The actual output_scales dim is computed from the padded numCols.
-  int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
-  int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
   uint32_t* SFout_in_expert =
-      SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
+      SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
 
   auto sf_out =
       cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                          CVT_FP4_NUM_THREADS_PER_SF>(
-          rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+          rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
 
   out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
 }

@@ -140,6 +140,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  // Precompute SF layout parameter (constant for entire kernel).
+  int32_t const numKTiles = (numCols + 63) / 64;
+
   extern __shared__ uint32_t shared_input_offsets[];
 
   // Load input offsets into shared memory.

@@ -202,16 +206,13 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
 
   float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
 
-  int factor = CVT_FP4_SF_VEC_SIZE * 4;
-  int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
-  int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
   uint32_t* SFout_in_expert =
-      SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
+      SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
 
   auto sf_out =
       cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                          CVT_FP4_NUM_THREADS_PER_SF>(
-          rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+          rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
 
   out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
 }
@@ -222,12 +223,8 @@ void quant_impl(void* output, void* output_scale, void* input,
                 void* input_global_scale, void* input_offset_by_experts,
                 void* output_scale_offset_by_experts, int m_topk, int k,
                 int n_experts, cudaStream_t stream) {
-  // TODO: this multiProcessorCount should be cached.
-  int device;
-  cudaGetDevice(&device);
-  int multiProcessorCount;
-  cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount,
-                         device);
+  int multiProcessorCount =
+      get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
 
   // Grid, Block size.
   // Each thread converts 8 values.
@@ -38,6 +38,12 @@ __host__ __device__ inline Int round_up(Int x, Int y) {
   return (x + y - 1) / y * y;
 }
 
+// Compute effective rows for grid configuration with swizzled SF layouts.
+inline int computeEffectiveRows(int m) {
+  constexpr int ROW_TILE = 128;
+  return round_up(m, ROW_TILE);
+}
+
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
 __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))

@@ -49,6 +55,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
                 "Vec size is not matched.");
 
+  // Precompute SF layout parameter (constant for entire kernel).
+  int32_t const numKTiles = (numCols + 63) / 64;
+
   int sf_m = round_up<int>(numRows, 128);
   int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
   int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;

@@ -79,7 +88,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
       auto sf_out =
           cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                              CVT_FP4_NUM_THREADS_PER_SF>(
-              rowIdx, colIdx, numCols, SFout);
+              rowIdx, colIdx, numKTiles, SFout);
 
       out_pos =
           cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);

@@ -87,43 +96,6 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
   }
 }
 
-template <typename T>
-void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
-                           int64_t* output, int32_t* SFOuput, bool useUE8M0,
-                           int multiProcessorCount, cudaStream_t stream) {
-  // Grid, Block size.
-  // Each thread converts 8 values.
-  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
-  // Get number of blocks per SM
-  int const numBlocksPerSM =
-      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
-  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
-
-  // Launch the cvt kernel.
-  if (useUE8M0) {
-    cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
-        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
-        reinterpret_cast<uint32_t*>(SFOuput));
-  } else {
-    cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
-        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
-        reinterpret_cast<uint32_t*>(SFOuput));
-  }
-}
-
-// Instantiate the function.
-template void invokeFP4Quantization(int m, int n, half const* input,
-                                    float const* SFScale, int64_t* output,
-                                    int32_t* SFOuput, bool useUE8M0,
-                                    int multiProcessorCount,
-                                    cudaStream_t stream);
-
-template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
-                                    float const* SFScale, int64_t* output,
-                                    int32_t* SFOuput, bool useUE8M0,
-                                    int multiProcessorCount,
-                                    cudaStream_t stream);
-
 }  // namespace vllm
 
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,

@@ -147,13 +119,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
 
-  // We don't support e8m0 scales at this moment.
-  bool useUE8M0 = false;
+  // Grid, Block size. Each thread converts 8 values.
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
+  int const numBlocksPerSM =
+      vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
+  int effectiveRows = vllm::computeEffectiveRows(m);
+  dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
 
   VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
     using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
     auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
-    vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr,
-                                sf_out, useUE8M0, multiProcessorCount, stream);
+    // NOTE: We don't support e8m0 scales at this moment.
+    vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
+        m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
+        reinterpret_cast<uint32_t*>(sf_out));
   });
 }
@@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
   return b;
 }
 
-template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
-                                                       int numCols,
-                                                       SFType* SFout) {
-  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
-                CVT_FP4_NUM_THREADS_PER_SF == 2);
-
-  // One pair of threads write one SF to global memory.
-  // TODO: stage through smem for packed STG.32
-  // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    // SF vector index (16 elements share one SF in the K dimension).
-    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
-    int32_t mIdx = rowIdx;
-
-    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
-    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
-
-    int32_t mTileIdx = mIdx / (32 * 4);
-    // SF vector size 16.
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numKTiles = (numCols + factor - 1) / factor;
-    int64_t mTileStride = numKTiles * 32 * 4 * 4;
-
-    int32_t kTileIdx = (kIdx / 4);
-    int64_t kTileStride = 32 * 4 * 4;
-
-    // M tile layout [32, 4] is column-major.
-    int32_t outerMIdx = (mIdx % 32);
-    int64_t outerMStride = 4 * 4;
-
-    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
-    int64_t innerMStride = 4;
-
-    int32_t innerKIdx = (kIdx % 4);
-    int64_t innerKStride = 1;
-
-    // Compute the global offset.
-    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
-                       outerMIdx * outerMStride + innerMIdx * innerMStride +
-                       innerKIdx * innerKStride;
-
-    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-  }
-  return nullptr;
-}
+// Compute SF output offset for swizzled tensor core layout.
+// SF layout: [numMTiles, numKTiles, 32, 4, 4]
+// Caller must precompute: numKTiles = (numCols + 63) / 64
+template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
+__device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(
+    int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) {
+  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
+                CVT_FP4_NUM_THREADS_PER_SF == 2);
+
+  // One pair of threads write one SF to global memory.
+  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) {
+    return nullptr;
+  }
+
+  // SF vector index (16 elements share one SF in the K dimension).
+  int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+  int32_t mIdx = rowIdx;
+
+  // Decompose indices using bitwise ops (all divisors are powers of 2).
+  // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
+  int32_t mTileIdx = mIdx >> 7;         // mIdx / 128
+  int32_t outerMIdx = mIdx & 31;        // mIdx % 32
+  int32_t innerMIdx = (mIdx >> 5) & 3;  // (mIdx / 32) % 4
+  int32_t kTileIdx = kIdx >> 2;         // kIdx / 4
+  int32_t innerKIdx = kIdx & 3;         // kIdx % 4
+
+  // Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 +
+  // outerMIdx * 16 + innerMIdx * 4 + innerKIdx
+  // Use bitwise OR for non-overlapping lower bits.
+  int64_t SFOffset = (static_cast<int64_t>(mTileIdx) * numKTiles + kTileIdx)
+                             << 9 |
+                     (outerMIdx << 4) | (innerMIdx << 2) | innerKIdx;
+
+  return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
+}
 
 // Quantizes the provided PackedVec into the uint32_t output
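The rewrite hoists `numKTiles` out of the helper (the callers above now compute it once per kernel) and replaces the per-call divisions with shifts and masks. Since 32 x 4 x 4 = 512 = 2^9, the tile strides collapse into a single shift, and the low fields (outerMIdx < 32, innerMIdx < 4, innerKIdx < 4) occupy disjoint bit ranges, so OR equals addition. A quick Python check of that equivalence (a pure re-implementation for illustration, not the kernel code):

```python
def sf_offset_arith(m_idx: int, k_idx: int, num_k_tiles: int) -> int:
    # Original formulation: explicit strides over [numMTiles, numKTiles, 32, 4, 4].
    m_tile, k_tile = m_idx // 128, k_idx // 4
    outer_m, inner_m, inner_k = m_idx % 32, (m_idx // 32) % 4, k_idx % 4
    return (m_tile * num_k_tiles * 512 + k_tile * 512
            + outer_m * 16 + inner_m * 4 + inner_k)

def sf_offset_bitwise(m_idx: int, k_idx: int, num_k_tiles: int) -> int:
    # Rewritten formulation: shifts and masks; OR is safe because the
    # sub-fields occupy bits 4-8, 2-3, and 0-1 respectively.
    m_tile, k_tile = m_idx >> 7, k_idx >> 2
    outer_m, inner_m, inner_k = m_idx & 31, (m_idx >> 5) & 3, k_idx & 3
    return ((m_tile * num_k_tiles + k_tile) << 9) | (outer_m << 4) | (inner_m << 2) | inner_k

num_k_tiles = (4096 + 63) // 64  # as the callers now precompute once per kernel
assert all(
    sf_offset_arith(m, k, num_k_tiles) == sf_offset_bitwise(m, k, num_k_tiles)
    for m in range(0, 512, 7) for k in range(0, 256, 3)
)
print("bitwise and arithmetic SF offsets agree")
```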
@ -1,373 +0,0 @@
#include "core/registration.h"

#include <torch/all.h>
#include <cutlass/arch/arch.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/group_array_problem_shape.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"

#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include <cassert>

using namespace cute;

template <typename ElementAB, typename ElementC, typename ElementAccumulator,
          typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
__global__ void get_ggemm_starts(
    int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
    ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
    ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
    ElementAB* b_base_as_int, ElementC* out_base_as_int,
    ElementAccumulator* a_scale_base_as_int,
    ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
    LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
  int expert_id = threadIdx.x;

  if (expert_id >= gridDim.x * blockDim.x) {
    return;
  }

  int m = problem_sizes[expert_id * 3];
  int n = problem_sizes[expert_id * 3 + 1];
  int k = problem_sizes[expert_id * 3 + 2];

  int32_t expert_offset = expert_offsets[expert_id];
  int a_stride = expert_offset * k;
  int b_stride = expert_id * k * n;
  int a_scale_stride = expert_offset * k / 128;
  int b_scale_stride = expert_id * k * n / 128 / 128;

  a_offsets[expert_id] = a_base_as_int + a_stride;
  b_offsets[expert_id] = b_base_as_int + b_stride;
  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
  a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
  b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;

  LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
  LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;

  *layout_sfa_ptr =
      ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
  *layout_sfb_ptr =
      ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
}

#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
                                 ScaleConfig)                                 \
  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                           \
    get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA,        \
                     LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>( \
        static_cast<int32_t*>(expert_offsets.data_ptr()),                    \
        static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),             \
        static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()),             \
        static_cast<C_TYPE**>(out_ptrs.data_ptr()),                          \
        static_cast<float**>(a_scales_ptrs.data_ptr()),                      \
        static_cast<float**>(b_scales_ptrs.data_ptr()),                      \
        static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),           \
        static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()),           \
        static_cast<C_TYPE*>(out_tensors.data_ptr()),                        \
        static_cast<float*>(a_scales.data_ptr()),                            \
        static_cast<float*>(b_scales.data_ptr()),                            \
        reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),                 \
        reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()),                 \
        static_cast<int*>(problem_sizes.data_ptr()));                        \
  }

template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
void run_get_ggemm_starts(
    torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
    torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
    torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
    torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
    torch::Tensor out_tensors, torch::Tensor const& a_scales,
    torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
    torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
  TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
  TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);

  int num_experts = (int)expert_offsets.size(0);
  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());

  if (false) {
  }
  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
                           LayoutSFB, ScaleConfig)
  __CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
                           LayoutSFB, ScaleConfig)
  else {
    TORCH_CHECK(false, "Unsupported output tensor type");
  }
}

template <typename OutType, typename ScheduleConfig, typename LayoutD>
void run_blockwise_scaled_group_mm(
    torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
    const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
    const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
    const torch::Tensor& stride_b, const torch::Tensor& stride_c,
    const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
  using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;

  // Types
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementC = OutType;
  using ElementD = ElementC;
  using ElementAccumulator = float;
  using LayoutA = cutlass::layout::RowMajor;
  using LayoutB = cutlass::layout::ColumnMajor;
  using LayoutC = LayoutD;

  // Alignments
  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;

  using ArchTag = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
          typename ScheduleConfig::ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
          ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
          AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;

  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass, ElementA,
          cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
          AlignmentA, ElementB,
          cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
          AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
          typename ScheduleConfig::ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          typename ScheduleConfig::KernelSchedule>::CollectiveOp;

  using GemmKernel =
      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
                                           CollectiveEpilogue, void>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
  using StrideD = typename Gemm::GemmKernel::InternalStrideD;

  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
  int num_experts = (int)expert_offsets.size(0);

  Gemm gemm_op;

  // Mainloop Arguments
  typename GemmKernel::MainloopArguments mainloop_args{
      static_cast<const ElementA**>(a_ptrs.data_ptr()),
      static_cast<StrideA*>(stride_a.data_ptr()),
      static_cast<const ElementB**>(b_ptrs.data_ptr()),
      static_cast<StrideB*>(stride_b.data_ptr()),
      static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
      reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
          layout_sfa.data_ptr()),
      static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
      reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
          layout_sfb.data_ptr())};

  int device_id = a_ptrs.device().index();
  static const cutlass::KernelHardwareInfo hw_info{
      device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
                     device_id)};

  // Epilogue Arguments
  typename GemmKernel::EpilogueArguments epilogue_args{
      {},  // epilogue.thread
      nullptr,
      static_cast<StrideC*>(stride_c.data_ptr()),
      static_cast<ElementD**>(out_ptrs.data_ptr()),
      static_cast<StrideC*>(stride_c.data_ptr())};

  UnderlyingProblemShape* problem_sizes_as_shapes =
      static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());

  // Gemm Arguments
  typename GemmKernel::Arguments args{
      cutlass::gemm::GemmUniversalMode::kGrouped,
      {num_experts, problem_sizes_as_shapes, nullptr},
      mainloop_args,
      epilogue_args,
      hw_info};

  at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
  const cudaStream_t stream =
      at::cuda::getCurrentCUDAStream(a_ptrs.get_device());

  auto can_implement_status = gemm_op.can_implement(args);
  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
              "Failed to implement GEMM");

  size_t workspace_size = gemm_op.get_workspace_size(args);
  auto const workspace_options =
      torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
  auto workspace = torch::empty(workspace_size, workspace_options);

  auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");

  status = gemm_op.run(stream);
  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}

template <typename OutType>
void blockwise_scaled_group_mm_dispatch_shape(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
  struct MmaConfig {
    using ElementA = cutlass::float_e4m3_t;
    using KernelSchedule =
        cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
    using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
        1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
    using LayoutC = cutlass::layout::RowMajor;
    using MmaTileShape = Shape<_128, _128, _128>;
    using ClusterShape = Shape<_1, _1, _1>;
  };

  int num_experts = (int)expert_offsets.size(0);

  auto a_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto b_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto out_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto a_scales_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto b_scales_ptrs = torch::empty(
      {num_experts},
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));

  auto layout_sfa = torch::empty(
      {num_experts, 5},
      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
  auto layout_sfb = torch::empty(
      {num_experts, 5},
      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));

  auto stride_a = torch::full(
      {num_experts}, a.size(1),
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto stride_b = torch::full(
      {num_experts}, a.size(1),
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
  auto stride_c = torch::full(
      {num_experts}, output.size(1),
      torch::TensorOptions().dtype(torch::kInt64).device(a.device()));

  torch::TensorOptions options_int =
      torch::TensorOptions().dtype(torch::kInt64).device(a.device());

  run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
                       typename MmaConfig::LayoutSFB,
                       typename MmaConfig::ScaleConfig>(
      expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
      b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);

  run_blockwise_scaled_group_mm<OutType, MmaConfig,
                                typename MmaConfig::LayoutC>(
      out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
      stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
      expert_offsets);
}

void cutlass_blockwise_scaled_grouped_mm(
    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
  TORCH_CHECK(problem_sizes.size(1) == 3,
              "problem_sizes must have shape (num_experts, 3)");
  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
              "Number of experts in problem_sizes must match expert_offsets");
  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
              "problem_sizes must be int32");
  TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
              "a must be kFloat8_e4m3fn");
  TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
              "b must be kFloat8_e4m3fn");
  TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
                  output.scalar_type() == torch::kHalf,
              "output must be bfloat16 or half");
  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
              "scales_a must be float32");
  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
              "scales_b must be float32");
  TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
              "expert_offsets must be int32");

  TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
  TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
  TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
  TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
  TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
  TORCH_CHECK(problem_sizes.size(1) == 3,
              "problem_sizes must have shape (num_experts, 3)");
  TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
              "Number of experts in problem_sizes must match expert_offsets");
  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
              "problem_sizes must be int32");
  TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");

#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
  if (output.scalar_type() == torch::kBFloat16) {
    blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
  } else if (output.scalar_type() == torch::kFloat16) {
    blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
  } else {
    TORCH_CHECK(false, "Unsupported output tensor type");
  }
#endif
}

TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("cutlass_blockwise_scaled_grouped_mm",
         &cutlass_blockwise_scaled_grouped_mm);
}
@ -416,13 +416,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      " Tensor alpha) -> ()");
  ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);

  // cutlass blockwise scaled group GEMM
  ops.def(
      "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
      "Tensor scales_a, Tensor scales_b, "
      "Tensor problem_sizes, Tensor expert_offsets) -> ()");
  // conditionally compiled so impl registration is in source file

  // cutlass nvfp4 block scaled group GEMM
  ops.def(
      "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"

@ -130,6 +130,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    && uv pip install --system *.whl

ARG COMMON_WORKDIR
ARG BASE_IMAGE

# Copy over the benchmark scripts as well
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
@ -144,4 +145,9 @@ ENV SAFETENSORS_FAST_GPU=1
# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1

# Workaround for ROCm profiler limits
RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt

CMD ["/bin/bash"]

@ -1,15 +1,15 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_BRANCH="89075173"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_BRANCH="v0.24.1"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="59bd8ff2"
ARG AITER_BRANCH="6af8b687"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"

FROM ${BASE_IMAGE} AS base
@ -162,4 +162,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar
## Manually Trigger the benchmark

Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
For CPU environment, please use the image with "-cpu" postfix.
For x86 CPU environments, please use the image with the "-cpu" suffix. For AArch64 CPU environments, please use the image with the "-arm64-cpu" suffix.

Here is an example for docker run command for CPU.
Here is an example docker run command for CPU. For GPUs, skip setting the `ON_CPU` env var.

```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch
export HF_TOKEN=<valid Hugging Face token>
if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then
  IMG_SUFFIX="arm64-cpu"
else
  IMG_SUFFIX="cpu"
fi
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_ARM64_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX}
```

Then, run the command below inside the docker instance.
@ -26,7 +33,7 @@ When run, benchmark script generates results under **benchmark/results** folder,

### Runtime environment variables

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).

@ -77,25 +77,20 @@ This complicates the process as we cannot use the out-of-the-box
- `.buildkite/release-pipeline.yaml`
- `.buildkite/scripts/upload-wheels.sh`

## Address long vLLM build time
## Manually running vLLM builds on Buildkite CI

When building vLLM with a new PyTorch/CUDA version, no cache will exist
in the vLLM sccache S3 bucket, causing the build job on CI to potentially take more than 5 hours
and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mode,
it doesn't populate the cache, so re-running it to warm up the cache
is ineffective.
When building vLLM with a new PyTorch/CUDA version, the vLLM sccache S3 bucket
will not have any cached artifacts, which can cause CI build jobs to exceed 5 hours.
Furthermore, vLLM's fastcheck pipeline operates in read-only mode and does not
populate the cache, making it ineffective for cache warm-up purposes.

While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
when manually triggering a build on Buildkite. This branch accomplishes two things:
To address this, manually trigger a build on Buildkite to accomplish two objectives:

1. Increase the timeout limit to 10 hours so that the build doesn't time out.
2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket
to warm it up so that future builds are faster.
1. Run the complete test suite against the PyTorch RC build by setting the environment variables `RUN_ALL=1` and `NIGHTLY=1` (see the example below)
2. Populate the vLLM sccache S3 bucket with compiled artifacts, enabling faster subsequent builds

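As a sketch of what that looks like, the environment variable field in the Buildkite new-build popup (pictured below) would carry the two variables named in step 1; these are the only values the step requires:

```bash
RUN_ALL=1
NIGHTLY=1
```
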
<p align="center" width="100%">
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/3b07f71b-bb18-4ca3-aeaf-da0fe79d315f" />
</p>

## Update all the different vLLM platforms

@ -139,18 +139,18 @@ token data.
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
```

<figure markdown="span">
  ![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" }
</figure>
<p align="center">
  <img src="../assets/design/paged_attention/query.png" alt="query" width="70%" />
</p>

Each thread defines its own `q_ptr` which points to the assigned
query token data on global memory. For example, if `VEC_SIZE` is 4
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
a total of 128 elements divided into 128 / 4 = 32 vecs.

<figure markdown="span">
  ![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" }
</figure>
<p align="center">
  <img src="../assets/design/paged_attention/q_vecs.png" alt="q_vecs" width="70%" />
</p>

```cpp
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@ -187,9 +187,9 @@ key token at different iterations. As shown above, that `k_ptr`
points to key token data based on `k_cache` at assigned block,
assigned head and assigned token.

<figure markdown="span">
  ![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" }
</figure>
<p align="center">
  <img src="../assets/design/paged_attention/key.png" alt="key" width="70%" />
</p>

The diagram above illustrates the memory layout for key data. It
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@ -202,9 +202,9 @@ iterations. Inside each rectangle, there are a total 32 vecs (128
elements for one token) that will be processed by 2 threads (one
thread group) separately.

<figure markdown="span">
  ![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" }
</figure>
<p align="center">
  <img src="../assets/design/paged_attention/k_vecs.png" alt="k_vecs" width="70%" />
</p>

```cpp
K_vec k_vecs[NUM_VECS_PER_THREAD]
@ -361,17 +361,17 @@ later steps. Now, it should store the normalized softmax result of

## Value

<figure markdown="span">
  ![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" }
</figure>
<p align="center">
  <img src="../assets/design/paged_attention/value.png" alt="value" width="70%" />
</p>

<figure markdown="span">
  ![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" }
</figure>
<p align="center">
  <img src="../assets/design/paged_attention/logits_vec.png" alt="logits_vec" width="50%" />
</p>

<figure markdown="span">
  ![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" }
</figure>
<p align="center">
  <img src="../assets/design/paged_attention/v_vec.png" alt="v_vec" width="70%" />
</p>

Now we need to retrieve the value data and perform dot multiplication
with `logits`. Unlike query and key, there is no thread group

@ -8,6 +8,16 @@ We recommend installing the library with:
pip install nvidia-modelopt
```

## Supported ModelOpt checkpoint formats

vLLM detects ModelOpt checkpoints via `hf_quant_config.json` and supports the
following `quantization.quant_algo` values (a minimal example config follows the list):

- `FP8`: per-tensor weight scale (+ optional static activation scale).
- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization.
- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks).
- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`).

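For reference, a minimal `hf_quant_config.json` for an FP8 checkpoint might look like the sketch below. Only the `quantization.quant_algo` nesting is taken from the detection logic described above; any other fields emitted by ModelOpt vary by version, so treat this as illustrative rather than a complete schema:

```json
{
  "quantization": {
    "quant_algo": "FP8"
  }
}
```
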
## Quantizing HuggingFace Models with PTQ

You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
@ -80,3 +90,24 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
    if __name__ == "__main__":
        main()
    ```

## Running the OpenAI-compatible server

To serve a local ModelOpt checkpoint via the OpenAI-compatible API:

```bash
vllm serve <path_to_exported_checkpoint> \
  --quantization modelopt \
  --host 0.0.0.0 --port 8000
```

## Testing (local checkpoints)

vLLM's ModelOpt unit tests are gated by local checkpoint paths and are skipped
by default in CI. To run the tests locally:

```bash
export VLLM_TEST_MODELOPT_FP8_PC_PT_MODEL_PATH=<path_to_fp8_pc_pt_checkpoint>
export VLLM_TEST_MODELOPT_FP8_PB_WO_MODEL_PATH=<path_to_fp8_pb_wo_checkpoint>
pytest -q tests/quantization/test_modelopt.py
```

@ -17,6 +17,16 @@ The E4M3 format offers higher precision compared to E5M2. However, due to its sm

For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel).

### How FP8 KV Cache Works

The FP8 KV cache implementation follows this workflow:

1. **Storage**: Key and Value tensors are quantized to FP8 format using scaling factors before being stored in the KV cache
2. **Retrieval**: When needed for attention computation, cached KV tensors are dequantized back to higher precision (FP16/BF16)
3. **Attention**: The attention-value multiplication (softmax output × V) is performed using the dequantized higher-precision V tensor

This means the final attention computation operates on dequantized values, not FP8 tensors. The quantization reduces memory usage during storage but maintains computation accuracy by using higher precision during the actual attention operations.

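As a rough illustration of the quantize-on-store / dequantize-on-load flow, here is a simplified per-tensor sketch in plain PyTorch. This is not vLLM's actual kernel code; the function names and the scale derivation are illustrative only:

```python
import torch

def quantize_for_cache(kv: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Step 1 (Storage): scale down and cast to FP8 E4M3 before caching.
    return (kv / scale).to(torch.float8_e4m3fn)

def dequantize_from_cache(kv_fp8: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Step 2 (Retrieval): cast back to FP16 and rescale before attention.
    return kv_fp8.to(torch.float16) * scale

v = torch.randn(16, 128, dtype=torch.float16)
scale = v.abs().max() / 448.0  # 448 is roughly the max representable E4M3 value
v_restored = dequantize_from_cache(quantize_for_cache(v, scale), scale)
# Step 3 (Attention): softmax(Q @ K^T) @ v_restored runs at FP16 precision.
```
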
### Performance Impact

The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either:

@ -352,10 +352,17 @@ Supported models:
* `zai-org/GLM-4.5`
* `zai-org/GLM-4.5-Air`
* `zai-org/GLM-4.6`
* `zai-org/GLM-4.6-Air`

Flags: `--tool-call-parser glm45`

### GLM-4.7 Models (`glm47`)

Supported models:

* `zai-org/GLM-4.7`

Flags: `--tool-call-parser glm47`

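For example, a typical server launch with this parser enabled (pairing the flag above with auto tool choice, as elsewhere in this guide) would look like:

```bash
vllm serve zai-org/GLM-4.7 \
  --tool-call-parser glm47 \
  --enable-auto-tool-choice
```
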
### Qwen3-Coder Models (`qwen3_xml`)

Supported models:

@ -19,12 +19,12 @@ Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels c

```bash
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl
```

??? console "pip"
    ```bash
    pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu
    pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl
    ```

!!! warning "set `LD_PRELOAD`"

@ -23,12 +23,12 @@ Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')

# use uv
uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index --torch-backend cpu
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl --torch-backend cpu
```
??? console "pip"
    ```bash
    # use pip
    pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --extra-index-url https://download.pytorch.org/whl/cpu
    pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cpu
    ```
!!! warning "set `LD_PRELOAD`"
    Before using vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`:

@ -387,7 +387,7 @@ th {
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
@ -415,9 +415,10 @@ th {
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ |
| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ |
| `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ |
| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | | ✅︎ |
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 | `MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 | `MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |

@ -5,130 +5,91 @@ Usage:
Single node:
    python examples/offline_inference/data_parallel.py \
        --model="ibm-research/PowerMoE-3b" \
        --dp-size=2 \
        --tp-size=2
        -dp=2 \
        -tp=2

Multi-node:
    Node 0 (assume the node has ip of 10.99.48.128):
        python examples/offline_inference/data_parallel.py \
            --model="ibm-research/PowerMoE-3b" \
            --dp-size=2 \
            --tp-size=2 \
            --node-size=2 \
            --node-rank=0 \
            --master-addr=10.99.48.128 \
            --master-port=13345
            -dp=2 \
            -tp=2 \
            --dp-num-nodes=2 \
            --dp-node-rank=0 \
            --dp-master-addr=10.99.48.128 \
            --dp-master-port=13345
    Node 1:
        python examples/offline_inference/data_parallel.py \
            --model="ibm-research/PowerMoE-3b" \
            --dp-size=2 \
            --tp-size=2 \
            --node-size=2 \
            --node-rank=1 \
            --master-addr=10.99.48.128 \
            --master-port=13345
            -dp=2 \
            -tp=2 \
            --dp-num-nodes=2 \
            --dp-node-rank=1 \
            --dp-master-addr=10.99.48.128 \
            --dp-master-port=13345
"""

import os
from time import sleep

from vllm import LLM, SamplingParams
from vllm import LLM, EngineArgs, SamplingParams
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import get_open_port


def parse_args():
    import argparse
def create_parser():
    parser = FlexibleArgumentParser(description="Data Parallel Inference")

    parser = argparse.ArgumentParser(description="Data Parallel Inference")
    # Add all engine args
    EngineArgs.add_cli_args(parser)
    parser.set_defaults(
        model="ibm-research/PowerMoE-3b",
        enable_expert_parallel=True,
    )

    # Add DP-specific args (separate from engine args to avoid conflicts)
    parser.add_argument(
        "--model",
        "--dp-num-nodes",
        type=int,
        default=1,
        help="Total number of nodes for data parallel.",
    )
    parser.add_argument(
        "--dp-node-rank",
        type=int,
        default=0,
        help="Rank of the current node for data parallel.",
    )
    parser.add_argument(
        "--dp-master-addr",
        type=str,
        default="ibm-research/PowerMoE-3b",
        help="Model name or path",
    )
    parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
    parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument(
        "--node-size", type=int, default=1, help="Total number of nodes"
        default="",
        help="Master node IP address for DP coordination.",
    )
    parser.add_argument(
        "--node-rank", type=int, default=0, help="Rank of the current node"
    )
    parser.add_argument(
        "--master-addr", type=str, default="", help="Master node IP address"
    )
    parser.add_argument("--master-port", type=int, default=0, help="Master node port")
    parser.add_argument(
        "--enforce-eager", action="store_true", help="Enforce eager mode execution."
    )
    parser.add_argument(
        "--trust-remote-code", action="store_true", help="Trust remote code."
    )
    parser.add_argument(
        "--max-num-seqs",
        "--dp-master-port",
        type=int,
        default=64,
        help=("Maximum number of sequences to be processed in a single iteration."),
    )
    parser.add_argument(
        "--max-model-len",
        type=int,
        help=("Maximum number of tokens to be processed in a single iteration."),
        default=0,
        help="Master node port for DP coordination.",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=300,
        help=("Number of seconds before unresponsive process is killed."),
        help="Number of seconds before unresponsive process is killed.",
    )
    parser.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.8,
        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
    )
    parser.add_argument(
        "--enable-dbo",
        action="store_true",
        help=("Enable microbatched execution"),
    )
    parser.add_argument(
        "--compilation-config",
        type=int,
        help=("Compilation optimization (O) mode 0-3."),
    )
    parser.add_argument(
        "--quantization",
        type=str,
    )
    parser.add_argument(
        "--disable-expert-parallel",
        dest="enable_expert_parallel",
        action="store_false",
        help="Disable expert parallel (default: enabled).",
    )
    parser.set_defaults(enable_expert_parallel=True)
    return parser.parse_args()

    return parser


def main(
    model,
    dp_size,
    local_dp_rank,
    global_dp_rank,
    dp_master_ip,
    dp_master_port,
    GPUs_per_dp_rank,
    enforce_eager,
    enable_expert_parallel,
    trust_remote_code,
    max_num_seqs,
    max_model_len,
    compilation_config,
    gpu_memory_utilization,
    enable_dbo,
    quantization,
    engine_args,
):
    os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
    os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@ -173,19 +134,7 @@ def main(
    )

    # Create an LLM.
    llm = LLM(
        model=model,
        tensor_parallel_size=GPUs_per_dp_rank,
        enforce_eager=enforce_eager,
        enable_expert_parallel=enable_expert_parallel,
        trust_remote_code=trust_remote_code,
        max_num_seqs=max_num_seqs,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        enable_dbo=enable_dbo,
        quantization=quantization,
        compilation_config=compilation_config,
    )
    llm = LLM(**engine_args)
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for i, output in enumerate(outputs):
@ -204,22 +153,29 @@ def main(


if __name__ == "__main__":
    args = parse_args()
    parser = create_parser()
    args = vars(parser.parse_args())

    dp_size = args.dp_size
    tp_size = args.tp_size
    node_size = args.node_size
    node_rank = args.node_rank
    # Extract DP-specific args (pop to remove from engine_args)
    dp_size = args.pop("data_parallel_size")
    dp_num_nodes = args.pop("dp_num_nodes")
    dp_node_rank = args.pop("dp_node_rank")
    dp_master_addr = args.pop("dp_master_addr")
    dp_master_port = args.pop("dp_master_port")
    timeout = args.pop("timeout")

    if node_size == 1:
    # Remaining args are engine args
    engine_args = args

    if dp_num_nodes == 1:
        dp_master_ip = "127.0.0.1"
        dp_master_port = get_open_port()
        dp_master_port_val = get_open_port()
    else:
        dp_master_ip = args.master_addr
        dp_master_port = args.master_port
        dp_master_ip = dp_master_addr
        dp_master_port_val = dp_master_port

    assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
    dp_per_node = dp_size // node_size
    assert dp_size % dp_num_nodes == 0, "dp_size should be divisible by dp_num_nodes"
    dp_per_node = dp_size // dp_num_nodes

    from multiprocessing import Process

@ -230,34 +186,24 @@ if __name__ == "__main__":

    procs = []
    for local_dp_rank, global_dp_rank in enumerate(
        range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
        range(dp_node_rank * dp_per_node, (dp_node_rank + 1) * dp_per_node)
    ):
        proc = Process(
            target=main,
            args=(
                args.model,
                dp_size,
                local_dp_rank,
                global_dp_rank,
                dp_master_ip,
                dp_master_port,
                tp_size,
                args.enforce_eager,
                args.enable_expert_parallel,
                args.trust_remote_code,
                args.max_num_seqs,
                args.max_model_len,
                args.compilation_config,
                args.gpu_memory_utilization,
                args.enable_dbo,
                args.quantization,
                dp_master_port_val,
                engine_args,
            ),
        )
        proc.start()
        procs.append(proc)
    exit_code = 0
    for proc in procs:
        proc.join(timeout=args.timeout)
        proc.join(timeout=timeout)
        if proc.exitcode is None:
            print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
            proc.kill()

@ -38,6 +38,8 @@ Encoder engines should be launched with the following flags:

- `--max-num-batched-tokens=<large value>` **(default: 2048)** – This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.

- `--convert "mm_encoder_only"` **(Optional)** - The language model is skipped during initialization to reduce device memory usage. **Models using this option must implement the `get_language_model_spec` interface.** A combined launch sketch follows this list.

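Putting the flags above together, an encoder-only launch might look like the following sketch; the model name is a placeholder and the batched-token value is just "a very high value" as described above:

```bash
vllm serve <multimodal_model> \
  --max-num-batched-tokens 1000000 \
  --convert "mm_encoder_only"
```
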
## Local media inputs

To support local image inputs (from your `MEDIA_PATH` directory), add the following flag to the encoder instance:

@ -313,7 +313,7 @@ async def test_chat_streaming_input_audio(
                    "format": "wav",
                },
            },
            {"type": "text", "text": "What's happening in this audio?"},
            {"type": "text", "text": "What's a short title for this audio?"},
        ],
    }
]

223
tests/entrypoints/openai/test_embedding_shape_validation.py
Normal file
@ -0,0 +1,223 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Embedding shape validation in multimodal APIs.

Tests verify that embeddings with correct ndim but incorrect hidden_size
are rejected before they can cause crashes during model inference.

Validation is performed by the parser (MultiModalDataParser) and EmbeddingItems
classes, not by CompletionRenderer or MediaIO classes.
"""

import pytest
import torch

from vllm.multimodal.parse import (
    AudioEmbeddingItems,
    ImageEmbeddingItems,
    MultiModalDataParser,
    VideoEmbeddingItems,
)


class TestMultiModalParserShapeValidation:
    """Test hidden_size validation in MultiModalDataParser."""

    def test_image_embeddings_correct_hidden_size_accepted(self):
        """Baseline: Image embeddings with correct hidden_size should work."""
        expected_hidden_size = 768
        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)

        valid_embeds = torch.randn(2, 100, expected_hidden_size)

        result = parser.parse_mm_data({"image": valid_embeds})

        assert "image" in result
        assert isinstance(result["image"], ImageEmbeddingItems)
        assert result["image"].get_count() == 2

    def test_image_embeddings_wrong_hidden_size_rejected(self):
        """Security: Image embeddings with wrong hidden_size should be rejected."""
        expected_hidden_size = 768
        wrong_hidden_size = 4096
        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)

        invalid_embeds = torch.randn(2, 100, wrong_hidden_size)

        with pytest.raises(ValueError) as exc_info:
            parser.parse_mm_data({"image": invalid_embeds})

        error_msg = str(exc_info.value).lower()
        assert "image" in error_msg
        assert "hidden dimension mismatch" in error_msg

    def test_audio_embeddings_wrong_hidden_size_rejected(self):
        """Security: Audio embeddings with wrong hidden_size should be rejected."""
        expected_hidden_size = 768
        wrong_hidden_size = 2048
        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)

        invalid_embeds = torch.randn(2, 100, wrong_hidden_size)

        with pytest.raises(ValueError) as exc_info:
            parser.parse_mm_data({"audio": invalid_embeds})

        error_msg = str(exc_info.value).lower()
        assert "audio" in error_msg
        assert "hidden dimension mismatch" in error_msg

    def test_video_embeddings_wrong_hidden_size_rejected(self):
        """Security: Video embeddings with wrong hidden_size should be rejected."""
        expected_hidden_size = 768
        wrong_hidden_size = 512
        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)

        invalid_embeds = torch.randn(2, 100, wrong_hidden_size)

        with pytest.raises(ValueError) as exc_info:
            parser.parse_mm_data({"video": invalid_embeds})

        error_msg = str(exc_info.value).lower()
        assert "video" in error_msg
        assert "hidden dimension mismatch" in error_msg

    def test_list_of_embeddings_validates_each(self):
        """Security: Each embedding in list should be validated."""
        expected_hidden_size = 768
        wrong_hidden_size = 1024
        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)

        # List with second tensor having wrong hidden_size
        invalid_embeds = [
            torch.randn(100, expected_hidden_size),
            torch.randn(100, wrong_hidden_size),
        ]

        with pytest.raises(ValueError) as exc_info:
            parser.parse_mm_data({"image": invalid_embeds})

        # Should identify which embedding failed
        assert "[1]" in str(exc_info.value)

    def test_validation_disabled_allows_any_size(self):
        """When validation disabled (legacy), any hidden_size allowed."""
        parser = MultiModalDataParser(expected_hidden_size=None)

        any_hidden_size = 12345
        embeds = torch.randn(2, 100, any_hidden_size)

        # Should not raise
        result = parser.parse_mm_data({"image": embeds})
        assert "image" in result
        assert isinstance(result["image"], ImageEmbeddingItems)


class TestEmbeddingItemsDirectValidation:
    """Direct tests for EmbeddingItems hidden_size validation."""

    def test_image_embedding_items_validates_batched_tensor(self):
        """Test validation for batched (3D) image embeddings."""
        expected = 768
        wrong = 1024

        # Valid
        valid = torch.randn(2, 100, expected)
        items = ImageEmbeddingItems(valid, expected_hidden_size=expected)
        assert items.get_count() == 2

        # Invalid
        invalid = torch.randn(2, 100, wrong)
        with pytest.raises(ValueError) as exc_info:
            ImageEmbeddingItems(invalid, expected_hidden_size=expected)

        assert str(wrong) in str(exc_info.value)
        assert str(expected) in str(exc_info.value)

    def test_image_embedding_items_validates_list_of_tensors(self):
        """Test validation for list of 2D image embeddings."""
        expected = 768
        wrong = 512

        # Valid list
        valid_list = [torch.randn(100, expected), torch.randn(50, expected)]
        items = ImageEmbeddingItems(valid_list, expected_hidden_size=expected)
        assert items.get_count() == 2

        # Invalid list
        invalid_list = [torch.randn(100, expected), torch.randn(50, wrong)]
        with pytest.raises(ValueError) as exc_info:
            ImageEmbeddingItems(invalid_list, expected_hidden_size=expected)

        assert "[1]" in str(exc_info.value)

    def test_audio_embedding_items_validates(self):
        """Test validation for audio embeddings."""
        expected = 768
        wrong = 256

        invalid = torch.randn(2, 100, wrong)
        with pytest.raises(ValueError) as exc_info:
            AudioEmbeddingItems(invalid, expected_hidden_size=expected)

        assert "audio" in str(exc_info.value).lower()

    def test_video_embedding_items_validates(self):
        """Test validation for video embeddings."""
        expected = 768
        wrong = 384

        invalid = torch.randn(2, 100, wrong)
        with pytest.raises(ValueError) as exc_info:
            VideoEmbeddingItems(invalid, expected_hidden_size=expected)

        assert "video" in str(exc_info.value).lower()


class TestShapeValidationIntegration:
    """Integration tests verifying attack scenarios are blocked."""

    def test_attack_scenario_multimodal_image(self):
        """
        Simulate attack through Chat API with image embeddings.

        Verifies validation occurs in multimodal parser path.
        """
        expected_hidden_size = 768
        wrong_hidden_size = 4096
        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)

        attack_tensor = torch.randn(1, 100, wrong_hidden_size)

        with pytest.raises(ValueError):
            parser.parse_mm_data({"image": attack_tensor})

    def test_attack_scenario_multimodal_audio(self):
        """
        Simulate attack through Chat API with audio embeddings.

        Verifies validation occurs in multimodal parser path.
        """
        expected_hidden_size = 768
        wrong_hidden_size = 2048
        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)

        attack_tensor = torch.randn(1, 100, wrong_hidden_size)

        with pytest.raises(ValueError):
            parser.parse_mm_data({"audio": attack_tensor})

    def test_attack_scenario_multimodal_video(self):
        """
        Simulate attack through Chat API with video embeddings.

        Verifies validation occurs in multimodal parser path.
        """
        expected_hidden_size = 768
        wrong_hidden_size = 1024
        parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size)

        attack_tensor = torch.randn(1, 100, wrong_hidden_size)

        with pytest.raises(ValueError):
            parser.parse_mm_data({"video": attack_tensor})
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import importlib
|
||||
import importlib.util
|
||||
import json
|
||||
import time
|
||||
|
||||
@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages(
|
||||
assert (
|
||||
"aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
|
||||
response = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=[{"role": "user", "content": "What is the role of AI in medicine?"}],
|
||||
temperature=0.0,
|
||||
max_tokens=250,
|
||||
)
|
||||
|
||||
choice = response.choices[0]
|
||||
assert choice.finish_reason == "length", (
|
||||
f"Expected finish_reason='length', got {choice.finish_reason}"
|
||||
)
|
||||
assert choice.message.content is not None, (
|
||||
"Content should not be None when truncated"
|
||||
)
|
||||
assert len(choice.message.content) > 0, "Content should not be empty"
|
||||
|
||||
@ -15,6 +15,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ErrorResponse,
|
||||
RequestResponseMetadata,
|
||||
)
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
@ -52,8 +53,19 @@ def with_tool_parser(request) -> bool:
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
scope="module",
|
||||
params=[True],
|
||||
ids=["exclude_tools_when_tool_choice_none"],
|
||||
)
|
||||
def exclude_tools_when_tool_choice_none(request) -> bool:
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args(with_tool_parser: bool):
|
||||
def default_server_args(
|
||||
with_tool_parser: bool, exclude_tools_when_tool_choice_none: bool
|
||||
):
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--enforce-eager",
|
||||
@ -72,6 +84,8 @@ def default_server_args(with_tool_parser: bool):
|
||||
"--enable-auto-tool-choice",
|
||||
]
|
||||
)
|
||||
if exclude_tools_when_tool_choice_none:
|
||||
args.append("--exclude-tools-when-tool-choice-none")
|
||||
return args
|
||||
|
||||
|
||||
@ -335,6 +349,69 @@ async def test_gpt_oss_tool_message_array_content(
|
||||
assert response_multi_array.choices[0].message is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gpt_oss_tool_choice_none(
|
||||
gptoss_client: OpenAI,
|
||||
with_tool_parser: bool,
|
||||
exclude_tools_when_tool_choice_none: bool,
|
||||
):
|
||||
if not (with_tool_parser and exclude_tools_when_tool_choice_none):
|
||||
pytest.skip(
|
||||
"skip tool_choice tests when non-tool or "
|
||||
"--exclude-tools-when-tool-choice-none not set"
|
||||
)
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {"type": "string"},
|
||||
"state": {"type": "string"},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's the temperature(in degrees Celsius) in Dallas?",
|
||||
},
|
||||
]
|
||||
|
||||
tool_choice_auto = await gptoss_client.chat.completions.create(
|
||||
model=GPT_OSS_MODEL_NAME,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
tool_choice="auto",
|
||||
temperature=0.0,
|
||||
)
|
||||
msg = tool_choice_auto.choices[0].message
|
||||
assert len(msg.tool_calls) == 1
|
||||
|
||||
tool_choice_none = await gptoss_client.chat.completions.create(
|
||||
model=GPT_OSS_MODEL_NAME,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
tool_choice="none",
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
msg = tool_choice_none.choices[0].message
|
||||
assert len(msg.tool_calls) == 0
|
||||
|
||||
|
||||
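The contract exercised above — tool_choice="auto" may emit calls while tool_choice="none" must not — lifts naturally into a reusable helper. A sketch (the helper name is ours, not from this diff):

async def expect_tool_call_count(client, model, messages, tools, tool_choice, expected):
    # Hypothetical helper mirroring the assertions above.
    resp = await client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools,
        tool_choice=tool_choice,
        temperature=0.0,
    )
    calls = resp.choices[0].message.tool_calls or []
    assert len(calls) == expected, (tool_choice, calls)
    return resp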
MODEL_NAME = "openai-community/gpt2"
|
||||
MODEL_NAME_SHORT = "gpt2"
|
||||
CHAT_TEMPLATE = "Dummy chat template for testing {}"
|
||||
@ -878,7 +955,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user", "content": messages[0]["content"]},
|
||||
],
|
||||
)
|
||||
@ -906,7 +982,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages_2,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user"},
|
||||
# The analysis message should be dropped on subsequent inputs because
|
||||
# of the subsequent assistant message to the final channel.
|
||||
@ -966,7 +1041,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
@ -1047,7 +1122,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
@ -1128,7 +1203,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
@ -1178,7 +1253,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the third turn's input
|
||||
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
|
||||
verify_harmony_messages(
|
||||
input_messages_3,
|
||||
@ -1241,7 +1316,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the fourth turn's input
|
||||
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
|
||||
verify_harmony_messages(
|
||||
input_messages_4,
|
||||
@ -1297,7 +1372,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user", "content": messages[0]["content"]},
|
||||
# The reasoning that would have resulted in an analysis message is
|
||||
# dropped because of a later assistant message to the final channel.
|
||||
@ -1329,7 +1403,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user", "content": messages[0]["content"]},
|
||||
{
|
||||
"role": "assistant",
|
||||
@ -1359,7 +1432,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user", "content": messages[0]["content"]},
|
||||
{
|
||||
"role": "assistant",
|
||||
@ -1368,3 +1440,69 @@ class TestServingChatWithHarmony:
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_choice_validation_without_parser():
|
||||
"""Test that tool_choice='required' or named tool without tool_parser
|
||||
returns an appropriate error message."""
|
||||
mock_engine = MagicMock(spec=AsyncLLM)
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = MockModelConfig()
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
models = OpenAIServingModels(
|
||||
engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
)
|
||||
# Create serving_chat without tool_parser (enable_auto_tools=False)
|
||||
serving_chat = OpenAIServingChat(
|
||||
mock_engine,
|
||||
models,
|
||||
response_role="assistant",
|
||||
chat_template=CHAT_TEMPLATE,
|
||||
chat_template_content_format="auto",
|
||||
request_logger=None,
|
||||
enable_auto_tools=False, # No tool parser
|
||||
)
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {"location": {"type": "string"}},
|
||||
"required": ["location"],
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
|
||||
# Test tool_choice="required" without tool_parser
|
||||
req_required = ChatCompletionRequest(
|
||||
model=MODEL_NAME,
|
||||
messages=[{"role": "user", "content": "What's the weather?"}],
|
||||
tools=tools,
|
||||
tool_choice="required",
|
||||
)
|
||||
response_required = await serving_chat.create_chat_completion(req_required)
|
||||
assert isinstance(response_required, ErrorResponse)
|
||||
assert "tool_choice" in response_required.error.message
|
||||
assert "--tool-call-parser" in response_required.error.message
|
||||
|
||||
# Test named tool_choice without tool_parser
|
||||
req_named = ChatCompletionRequest(
|
||||
model=MODEL_NAME,
|
||||
messages=[{"role": "user", "content": "What's the weather?"}],
|
||||
tools=tools,
|
||||
tool_choice={"type": "function", "function": {"name": "get_weather"}},
|
||||
)
|
||||
response_named = await serving_chat.create_chat_completion(req_named)
|
||||
assert isinstance(response_named, ErrorResponse)
|
||||
assert "tool_choice" in response_named.error.message
|
||||
assert "--tool-call-parser" in response_named.error.message
|
||||
|
||||
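The guard being tested reduces to a simple predicate on the request. A hedged sketch of the shape of that check (function name and message text are illustrative, not the exact serving-layer code):

def tool_choice_error(tool_choice, has_tool_parser: bool) -> str | None:
    # Illustrative only: "required" and named tools need a tool parser.
    needs_parser = tool_choice == "required" or isinstance(tool_choice, dict)
    if needs_parser and not has_tool_parser:
        return (
            "tool_choice requires a tool parser; "
            "start the server with --tool-call-parser"
        )
    return None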
212
tests/entrypoints/openai/test_serving_chat_stream_harmony.py
Normal file
@ -0,0 +1,212 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Unit tests for harmony streaming delta extraction.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.serving_chat_stream_harmony import (
|
||||
extract_harmony_streaming_delta,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockMessage:
|
||||
"""Mock message object for testing."""
|
||||
|
||||
channel: str | None = None
|
||||
recipient: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockStreamableParser:
|
||||
"""Mock StreamableParser for testing without openai_harmony dependency."""
|
||||
|
||||
messages: list[MockMessage] = field(default_factory=list)
|
||||
|
||||
|
||||
class TestExtractHarmonyStreamingDelta:
|
||||
"""Tests for extract_harmony_streaming_delta function."""
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"delta_text,expected_content",
|
||||
[
|
||||
("Hello, world!", "Hello, world!"),
|
||||
("", ""),
|
||||
],
|
||||
)
|
||||
def test_final_channel_returns_content_delta(self, delta_text, expected_content):
|
||||
"""Test that final channel returns a DeltaMessage with content."""
|
||||
parser = MockStreamableParser()
|
||||
delta_message, tools_streamed = extract_harmony_streaming_delta(
|
||||
harmony_parser=parser,
|
||||
cur_channel="final",
|
||||
cur_recipient=None,
|
||||
prev_recipient=None,
|
||||
delta_text=delta_text,
|
||||
include_reasoning=False,
|
||||
)
|
||||
|
||||
assert delta_message is not None
|
||||
assert delta_message.content == expected_content
|
||||
assert tools_streamed is False
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"include_reasoning,expected_has_message",
|
||||
[
|
||||
(True, True),
|
||||
(False, False),
|
||||
],
|
||||
)
|
||||
def test_analysis_channel_reasoning(self, include_reasoning, expected_has_message):
|
||||
"""Test analysis channel respects include_reasoning flag."""
|
||||
parser = MockStreamableParser()
|
||||
delta_message, tools_streamed = extract_harmony_streaming_delta(
|
||||
harmony_parser=parser,
|
||||
cur_channel="analysis",
|
||||
cur_recipient=None,
|
||||
prev_recipient=None,
|
||||
delta_text="Let me think...",
|
||||
include_reasoning=include_reasoning,
|
||||
)
|
||||
|
||||
if expected_has_message:
|
||||
assert delta_message is not None
|
||||
assert delta_message.reasoning == "Let me think..."
|
||||
else:
|
||||
assert delta_message is None
|
||||
assert tools_streamed is False
|
||||
|
||||
@pytest.mark.parametrize("channel", ["commentary", "analysis"])
|
||||
@patch("vllm.entrypoints.openai.serving_chat_stream_harmony.make_tool_call_id")
|
||||
def test_new_tool_call(self, mock_make_tool_call_id, channel):
|
||||
"""Test new tool call creation when recipient changes."""
|
||||
mock_make_tool_call_id.return_value = "call_test123"
|
||||
parser = MockStreamableParser()
|
||||
|
||||
delta_message, tools_streamed = extract_harmony_streaming_delta(
|
||||
harmony_parser=parser,
|
||||
cur_channel=channel,
|
||||
cur_recipient="functions.get_weather",
|
||||
prev_recipient=None,
|
||||
delta_text="",
|
||||
include_reasoning=False,
|
||||
)
|
||||
|
||||
assert delta_message is not None
|
||||
assert len(delta_message.tool_calls) == 1
|
||||
tool_call = delta_message.tool_calls[0]
|
||||
assert tool_call.id == "call_test123"
|
||||
assert tool_call.type == "function"
|
||||
assert tool_call.function.name == "get_weather"
|
||||
assert tool_call.function.arguments == ""
|
||||
assert tool_call.index == 0
|
||||
assert tools_streamed is True
|
||||
|
||||
@pytest.mark.parametrize("channel", ["commentary", "analysis"])
|
||||
def test_tool_call_argument_streaming(self, channel):
|
||||
"""Test streaming tool call arguments (same recipient)."""
|
||||
parser = MockStreamableParser()
|
||||
|
||||
delta_message, tools_streamed = extract_harmony_streaming_delta(
|
||||
harmony_parser=parser,
|
||||
cur_channel=channel,
|
||||
cur_recipient="functions.get_weather",
|
||||
prev_recipient="functions.get_weather",
|
||||
delta_text='{"location": "Paris"}',
|
||||
include_reasoning=False,
|
||||
)
|
||||
|
||||
assert delta_message is not None
|
||||
tool_call = delta_message.tool_calls[0]
|
||||
assert tool_call.id is None
|
||||
assert tool_call.function.arguments == '{"location": "Paris"}'
|
||||
assert tool_call.index == 0
|
||||
assert tools_streamed is True
|
||||
|
||||
@pytest.mark.parametrize("channel", ["commentary", "analysis"])
|
||||
def test_tool_call_empty_arguments_returns_none(self, channel):
|
||||
"""Test empty delta_text with same recipient returns None."""
|
||||
parser = MockStreamableParser()
|
||||
|
||||
delta_message, tools_streamed = extract_harmony_streaming_delta(
|
||||
harmony_parser=parser,
|
||||
cur_channel=channel,
|
||||
cur_recipient="functions.get_weather",
|
||||
prev_recipient="functions.get_weather",
|
||||
delta_text="",
|
||||
include_reasoning=False,
|
||||
)
|
||||
|
||||
assert delta_message is None
|
||||
assert tools_streamed is False
|
||||
|
||||
def test_tool_call_index_from_previous_messages(self):
|
||||
"""Test tool call index accounts for previous function messages."""
|
||||
messages = [
|
||||
MockMessage(channel="analysis", recipient=None), # Not counted
|
||||
MockMessage(channel="commentary", recipient="functions.tool1"), # Counted
|
||||
MockMessage(channel="final", recipient=None), # Not counted
|
||||
]
|
||||
parser = MockStreamableParser(messages=messages)
|
||||
|
||||
delta_message, _ = extract_harmony_streaming_delta(
|
||||
harmony_parser=parser,
|
||||
cur_channel="commentary",
|
||||
cur_recipient="functions.tool2",
|
||||
prev_recipient="functions.tool2",
|
||||
delta_text="args",
|
||||
include_reasoning=False,
|
||||
)
|
||||
|
||||
assert delta_message.tool_calls[0].index == 1
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"channel,recipient",
|
||||
[
|
||||
("commentary", None),
|
||||
("commentary", "browser.search"),
|
||||
],
|
||||
)
|
||||
def test_returns_tool_call_preambles(self, channel, recipient):
|
||||
"""Test that invalid channel/recipient combinations return None."""
|
||||
parser = MockStreamableParser()
|
||||
delta_text = "some text"
|
||||
delta_message, tools_streamed = extract_harmony_streaming_delta(
|
||||
harmony_parser=parser,
|
||||
cur_channel=channel,
|
||||
cur_recipient=recipient,
|
||||
prev_recipient=None,
|
||||
delta_text=delta_text,
|
||||
include_reasoning=True,
|
||||
)
|
||||
|
||||
assert delta_message.content == delta_text
|
||||
assert tools_streamed is False
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"channel,recipient",
|
||||
[
|
||||
(None, None),
|
||||
("unknown_channel", None),
|
||||
],
|
||||
)
|
||||
def test_returns_none_for_invalid_inputs(self, channel, recipient):
|
||||
"""Test that invalid channel/recipient combinations return None."""
|
||||
parser = MockStreamableParser()
|
||||
|
||||
delta_message, tools_streamed = extract_harmony_streaming_delta(
|
||||
harmony_parser=parser,
|
||||
cur_channel=channel,
|
||||
cur_recipient=recipient,
|
||||
prev_recipient=None,
|
||||
delta_text="some text",
|
||||
include_reasoning=True,
|
||||
)
|
||||
|
||||
assert delta_message is None
|
||||
assert tools_streamed is False
|
||||
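For orientation, the function under test slots into a streaming loop roughly as follows. This is a sketch assuming a real parser that tracks the current channel and recipient; the attribute names follow the mocks above and may differ from openai_harmony:

from vllm.entrypoints.openai.serving_chat_stream_harmony import (
    extract_harmony_streaming_delta,
)

def stream_deltas(parser, token_texts, include_reasoning=False):
    # Sketch: drive the tested helper once per decoded token.
    prev_recipient = None
    for text in token_texts:
        delta, _tools_streamed = extract_harmony_streaming_delta(
            harmony_parser=parser,
            cur_channel=getattr(parser, "current_channel", None),
            cur_recipient=getattr(parser, "current_recipient", None),
            prev_recipient=prev_recipient,
            delta_text=text,
            include_reasoning=include_reasoning,
        )
        prev_recipient = getattr(parser, "current_recipient", None)
        if delta is not None:
            yield delta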
@ -13,6 +13,7 @@ DTYPES = [torch.bfloat16, torch.float16]
|
||||
IS_NEOX = [True, False]
|
||||
EPS_VALUES = [1e-5, 1e-6]
|
||||
SEEDS = [13]
|
||||
PARTIAL_ROPE = [True, False]
|
||||
CUDA_DEVICES = ["cuda:0"]
|
||||
|
||||
|
||||
@ -52,6 +53,7 @@ def _apply_qk_norm_rope(
|
||||
@pytest.mark.parametrize("is_neox", IS_NEOX)
|
||||
@pytest.mark.parametrize("eps", EPS_VALUES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
|
||||
@torch.inference_mode()
|
||||
def test_fused_qk_norm_rope_matches_reference(
|
||||
device: str,
|
||||
@ -59,6 +61,7 @@ def test_fused_qk_norm_rope_matches_reference(
|
||||
is_neox: bool,
|
||||
eps: float,
|
||||
seed: int,
|
||||
rotary_ratio: float,
|
||||
):
|
||||
torch.set_default_device(device)
|
||||
current_platform.seed_everything(seed)
|
||||
@ -76,10 +79,10 @@ def test_fused_qk_norm_rope_matches_reference(
|
||||
k_norm.weight.data.normal_(mean=1.0, std=0.1)
|
||||
q_weight = q_norm.weight.data
|
||||
k_weight = k_norm.weight.data
|
||||
|
||||
rotary_dim = int(head_dim * rotary_ratio)
|
||||
rope = RotaryEmbedding(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim,
|
||||
rotary_dim=rotary_dim,
|
||||
max_position_embeddings=4096,
|
||||
base=10000.0,
|
||||
is_neox_style=is_neox,
|
||||
|
||||
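The new rotary_ratio knob simply shrinks the rotated slice of each head, as the replaced rotary_dim line shows. For example:

head_dim = 128
for rotary_ratio in (1.0, 0.5, 0.25):
    rotary_dim = int(head_dim * rotary_ratio)  # 128, 64, 32
    # Only the first rotary_dim channels of each head get RoPE;
    # the remaining head_dim - rotary_dim channels pass through unrotated.
    print(rotary_ratio, rotary_dim)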
@ -258,16 +258,16 @@ class Config:
|
||||
f"{self.fe_supported_types()}."
|
||||
)
|
||||
|
||||
# Check block quanization support
|
||||
is_block_quatized = self.quant_block_shape is not None
|
||||
if is_block_quatized and self.quant_dtype is None:
|
||||
# Check block quantization support
|
||||
is_block_quantized = self.quant_block_shape is not None
|
||||
if is_block_quantized and self.quant_dtype is None:
|
||||
return False, "No block quantization support."
|
||||
|
||||
if is_block_quatized and not self.is_block_quant_supported():
|
||||
if is_block_quantized and not self.is_block_quant_supported():
|
||||
return False, "Mismatched block quantization support."
|
||||
|
||||
# deep_gemm only works with block-quantized
|
||||
if self.needs_deep_gemm() and not is_block_quatized:
|
||||
if self.needs_deep_gemm() and not is_block_quantized:
|
||||
return False, "Needs DeepGEMM but not block quantized."
|
||||
|
||||
# Check dependencies (turn into asserts?)
|
||||
|
||||
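Spelled out, the corrected checks amount to the following predicate (a paraphrase of the logic above, not the full Config class):

def block_quant_ok(quant_block_shape, quant_dtype,
                   block_quant_supported: bool, needs_deep_gemm: bool):
    # Restatement of the three block-quantization guards above.
    is_block_quantized = quant_block_shape is not None
    if is_block_quantized and quant_dtype is None:
        return False, "No block quantization support."
    if is_block_quantized and not block_quant_supported:
        return False, "Mismatched block quantization support."
    if needs_deep_gemm and not is_block_quantized:
        return False, "Needs DeepGEMM but not block quantized."
    return True, ""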
@ -1,92 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# DeepGEMM Style Cutlass Grouped GEMM Test
|
||||
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.kernels.moe.utils import per_token_cast_to_fp8
|
||||
from tests.kernels.utils import baseline_scaled_mm
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.deep_gemm import per_block_cast_to_fp8
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_groups, expected_m_per_group, k, n",
|
||||
[
|
||||
(4, 8192, 7168, 4096),
|
||||
(4, 8192, 2048, 7168),
|
||||
(8, 4096, 7168, 4096),
|
||||
(8, 4096, 2048, 7168),
|
||||
(32, 1024, 7168, 4096),
|
||||
(32, 1024, 2048, 7168),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("out_dtype", [torch.float16])
|
||||
@pytest.mark.skipif(
|
||||
(lambda x: x is None or x.to_int() != 100)(
|
||||
current_platform.get_device_capability()
|
||||
),
|
||||
reason="Block Scaled Grouped GEMM is only supported on SM100.",
|
||||
)
|
||||
def test_cutlass_grouped_gemm(
|
||||
num_groups: int,
|
||||
expected_m_per_group: int,
|
||||
k: int,
|
||||
n: int,
|
||||
out_dtype: torch.dtype,
|
||||
):
|
||||
device = "cuda"
|
||||
alignment = 128
|
||||
group_ms = [
|
||||
int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups)
|
||||
]
|
||||
m = sum([cdiv(m, alignment) * alignment for m in group_ms])
|
||||
|
||||
x = torch.randn((m, k), device=device, dtype=out_dtype)
|
||||
y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
|
||||
out = torch.empty((m, n), device=device, dtype=out_dtype)
|
||||
ref_out = torch.randn((m, n), device=device, dtype=out_dtype)
|
||||
|
||||
ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
|
||||
pb_size = []
|
||||
for i in range(num_groups):
|
||||
pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
|
||||
problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
|
||||
expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
|
||||
|
||||
x_fp8 = per_token_cast_to_fp8(x)
|
||||
y_fp8 = (
|
||||
torch.empty_like(y, dtype=torch.float8_e4m3fn),
|
||||
torch.empty(
|
||||
(num_groups, cdiv(n, 128), k // 128), device=device, dtype=torch.float
|
||||
),
|
||||
)
|
||||
for i in range(num_groups):
|
||||
y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128])
|
||||
|
||||
for i in range(num_groups):
|
||||
a = x_fp8[0][ep_offset[i] : ep_offset[i + 1]]
|
||||
a_scale = x_fp8[1][ep_offset[i] : ep_offset[i + 1]]
|
||||
b = y_fp8[0][i].t()
|
||||
b_scale = y_fp8[1][i].t()
|
||||
baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
|
||||
ref_out[ep_offset[i] : ep_offset[i + 1]] = baseline
|
||||
|
||||
ops.cutlass_blockwise_scaled_grouped_mm(
|
||||
out,
|
||||
x_fp8[0],
|
||||
y_fp8[0],
|
||||
x_fp8[1],
|
||||
y_fp8[1],
|
||||
problem_sizes,
|
||||
expert_offsets[:-1],
|
||||
)
|
||||
|
||||
torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)
|
||||
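The offset bookkeeping in the removed test is worth keeping in mind for any replacement: the deleted code rounds the total row count up to the 128-row alignment and builds prefix-sum offsets per group. A simplified pure-Python illustration (padding every group, slightly stricter than the deleted code):

group_ms = [100, 250, 60]
alignment = 128
# Pad each group's row count to the alignment, then prefix-sum into offsets.
padded = [((m + alignment - 1) // alignment) * alignment for m in group_ms]
m_total = sum(padded)               # 128 + 256 + 128 = 512
offsets = [0]
for m in padded:
    offsets.append(offsets[-1] + m)  # [0, 128, 384, 512]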
@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
|
||||
from vllm.model_executor.models.mixtral import MixtralMoE
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
from vllm.v1.worker.workspace import init_workspace_manager
|
||||
|
||||
NUM_EXPERTS = [8, 64, 192]
|
||||
EP_SIZE = [1, 4]
|
||||
@ -487,6 +488,7 @@ def test_mixtral_moe(
|
||||
monkeypatch.setenv("MASTER_ADDR", "localhost")
|
||||
monkeypatch.setenv("MASTER_PORT", "12345")
|
||||
init_distributed_environment()
|
||||
init_workspace_manager(torch.cuda.current_device())
|
||||
|
||||
# Instantiate our and huggingface's MoE blocks
|
||||
vllm_config.compilation_config.static_forward_context = dict()
|
||||
@ -533,6 +535,11 @@ def test_mixtral_moe(
|
||||
torch.cuda.synchronize()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# FIXME (zyongye) fix this after we move self.kernel
|
||||
# assignment in FusedMoE.__init__
|
||||
|
||||
vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
|
||||
|
||||
# Run forward passes for both MoE blocks
|
||||
hf_states, _ = hf_moe.forward(hf_inputs)
|
||||
vllm_states = vllm_moe.forward(vllm_inputs)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Pytest configuration for vLLM tests."""
|
||||
"""Pytest configuration for vLLM multimodal tests."""
|
||||
|
||||
import warnings
|
||||
|
||||
@ -9,16 +9,13 @@ import torch
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF
|
||||
Transformers accuracy issues.
|
||||
"""
|
||||
def pytest_collection_modifyitems(config, items):
|
||||
"""Configure ROCm-specific settings based on collected tests."""
|
||||
if not current_platform.is_rocm():
|
||||
return
|
||||
|
||||
skip_patterns = ["test_granite_speech.py"]
|
||||
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
|
||||
# Skip disabling SDP for Granite Speech tests on ROCm
|
||||
return
|
||||
|
||||
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
|
||||
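The hunk cuts off before the actual SDP toggles. For context, disabling these kernels on ROCm typically looks like the following — an assumption about the elided body using the standard torch.backends switches, not necessarily the exact lines in this file:

import torch

def _disable_flash_and_mem_efficient_sdp():
    # Assumed shape of the elided section: force math SDP so the
    # HF Transformers reference runs are numerically stable on ROCm.
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)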
@ -173,6 +173,13 @@ VLM_TEST_SETTINGS = {
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
|
||||
vllm_runner_kwargs={
|
||||
"attention_config": {
|
||||
"backend": "ROCM_AITER_FA",
|
||||
},
|
||||
}
|
||||
if current_platform.is_rocm()
|
||||
else None,
|
||||
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[
|
||||
pytest.mark.core_model,
|
||||
@ -253,8 +260,19 @@ VLM_TEST_SETTINGS = {
|
||||
image_size_factors=[(0.25, 0.2, 0.15)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "transformers",
|
||||
# TODO: [ROCm] Revert this once issue #30167 is resolved
|
||||
**(
|
||||
{
|
||||
"mm_processor_kwargs": {
|
||||
"min_pixels": 256 * 28 * 28,
|
||||
"max_pixels": 1280 * 28 * 28,
|
||||
},
|
||||
}
|
||||
if current_platform.is_rocm()
|
||||
else {}
|
||||
),
|
||||
},
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
|
||||
),
|
||||
#### Extended model tests
|
||||
"aria": VLMTestInfo(
|
||||
@ -645,7 +663,17 @@ VLM_TEST_SETTINGS = {
|
||||
hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
|
||||
patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
marks=[large_gpu_mark(min_gb=80)],
|
||||
marks=[
|
||||
large_gpu_mark(min_gb=80),
|
||||
# TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
|
||||
pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason=(
|
||||
"ROCm: Model too large for single GPU; "
|
||||
"multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
|
||||
),
|
||||
),
|
||||
],
|
||||
),
|
||||
"molmo": VLMTestInfo(
|
||||
models=["allenai/Molmo-7B-D-0924"],
|
||||
|
||||
@ -39,7 +39,7 @@ models = [MODEL_NAME]
|
||||
def granite_speech_attention_config():
|
||||
"""Return attention config for Granite Speech tests on ROCm."""
|
||||
if current_platform.is_rocm():
|
||||
return {"backend": "TRITON_ATTN"}
|
||||
return {"backend": "ROCM_AITER_FA"}
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@ -138,7 +138,7 @@ def create_batched_mm_kwargs(
|
||||
)
|
||||
|
||||
|
||||
# TODO(Isotr0py): Don't initalize model during test
|
||||
# TODO(Isotr0py): Don't initialize model during test
|
||||
@contextmanager
|
||||
def initialize_dummy_model(
|
||||
model_cls: type[nn.Module],
|
||||
|
||||
@ -459,6 +459,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
),
|
||||
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
|
||||
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
|
||||
"MiMoV2FlashForCausalLM": _HfExamplesInfo(
|
||||
"XiaomiMiMo/MiMo-V2-Flash", trust_remote_code=True
|
||||
),
|
||||
"Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"),
|
||||
}
|
||||
|
||||
|
||||
249
tests/multimodal/test_embedding_shape_validation_unit.py
Normal file
@ -0,0 +1,249 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Unit tests for embedding shape validation.
|
||||
|
||||
Simple, fast unit tests that can run without server fixtures.
|
||||
Run with: pytest tests/multimodal/test_embedding_shape_validation_unit.py -v
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.parse import (
|
||||
AudioEmbeddingItems,
|
||||
ImageEmbeddingItems,
|
||||
)
|
||||
|
||||
|
||||
class TestImageEmbedBasicValidation:
|
||||
"""Test basic ndim validation in image embeddings via ImageEmbeddingItems."""
|
||||
|
||||
def test_valid_2d_tensor_accepted(self):
|
||||
"""Baseline: 2D tensors should be accepted."""
|
||||
valid_tensor = torch.randn(10, 768, dtype=torch.float32)
|
||||
|
||||
# Should not raise - 2D is valid
|
||||
items = ImageEmbeddingItems(valid_tensor)
|
||||
assert items.get_count() == 10
|
||||
|
||||
def test_valid_3d_tensor_accepted(self):
|
||||
"""Baseline: 3D tensors should be accepted."""
|
||||
valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32)
|
||||
|
||||
# Should not raise - 3D is valid
|
||||
items = ImageEmbeddingItems(valid_tensor)
|
||||
assert items.get_count() == 2
|
||||
|
||||
def test_valid_list_of_2d_tensors_accepted(self):
|
||||
"""Baseline: List of 2D tensors should be accepted."""
|
||||
tensors = [
|
||||
torch.randn(10, 768, dtype=torch.float32),
|
||||
torch.randn(15, 768, dtype=torch.float32),
|
||||
]
|
||||
|
||||
# Should not raise
|
||||
items = ImageEmbeddingItems(tensors)
|
||||
assert items.get_count() == 2
|
||||
|
||||
def test_1d_tensor_rejected(self):
|
||||
"""Security: 1D tensors should be rejected (invalid ndim)."""
|
||||
invalid_tensor = torch.randn(768, dtype=torch.float32) # 1D
|
||||
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
ImageEmbeddingItems(invalid_tensor)
|
||||
|
||||
assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
|
||||
|
||||
def test_4d_tensor_rejected(self):
|
||||
"""Security: 4D tensors should be rejected (invalid ndim)."""
|
||||
invalid_tensor = torch.randn(1, 2, 10, 768, dtype=torch.float32) # 4D
|
||||
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
ImageEmbeddingItems(invalid_tensor)
|
||||
|
||||
assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
|
||||
|
||||
def test_hidden_size_validation_correct_size(self):
|
||||
"""Embeddings with correct hidden size should be accepted."""
|
||||
expected_hidden_size = 768
|
||||
valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32)
|
||||
|
||||
# Should not raise
|
||||
items = ImageEmbeddingItems(
|
||||
valid_tensor, expected_hidden_size=expected_hidden_size
|
||||
)
|
||||
assert items.get_count() == 10
|
||||
|
||||
def test_hidden_size_validation_wrong_size_rejected(self):
|
||||
"""Embeddings with wrong hidden size should be rejected."""
|
||||
expected_hidden_size = 768
|
||||
wrong_hidden_size = 4096
|
||||
invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32)
|
||||
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
ImageEmbeddingItems(
|
||||
invalid_tensor, expected_hidden_size=expected_hidden_size
|
||||
)
|
||||
|
||||
error_msg = str(exc_info.value)
|
||||
assert "hidden dimension mismatch" in error_msg.lower()
|
||||
assert str(wrong_hidden_size) in error_msg
|
||||
assert str(expected_hidden_size) in error_msg
|
||||
|
||||
|
||||
class TestAudioEmbedBasicValidation:
|
||||
"""Test basic ndim validation in audio embeddings via AudioEmbeddingItems."""
|
||||
|
||||
def test_valid_2d_tensor_accepted(self):
|
||||
"""Baseline: 2D tensors should be accepted."""
|
||||
valid_tensor = torch.randn(10, 768, dtype=torch.float32)
|
||||
|
||||
# Should not raise - 2D is valid
|
||||
items = AudioEmbeddingItems(valid_tensor)
|
||||
assert items.get_count() == 10
|
||||
|
||||
def test_valid_3d_tensor_accepted(self):
|
||||
"""Baseline: 3D tensors should be accepted."""
|
||||
valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32)
|
||||
|
||||
# Should not raise - 3D is valid
|
||||
items = AudioEmbeddingItems(valid_tensor)
|
||||
assert items.get_count() == 2
|
||||
|
||||
def test_valid_list_of_2d_tensors_accepted(self):
|
||||
"""Baseline: List of 2D tensors should be accepted."""
|
||||
tensors = [
|
||||
torch.randn(10, 768, dtype=torch.float32),
|
||||
torch.randn(15, 768, dtype=torch.float32),
|
||||
]
|
||||
|
||||
# Should not raise
|
||||
items = AudioEmbeddingItems(tensors)
|
||||
assert items.get_count() == 2
|
||||
|
||||
def test_1d_tensor_rejected(self):
|
||||
"""Security: 1D tensors should be rejected (invalid ndim)."""
|
||||
invalid_tensor = torch.randn(768, dtype=torch.float32) # 1D
|
||||
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
AudioEmbeddingItems(invalid_tensor)
|
||||
|
||||
assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
|
||||
|
||||
def test_scalar_rejected(self):
|
||||
"""Security: Scalar tensors should be rejected."""
|
||||
invalid_tensor = torch.tensor(1.0) # 0D (scalar)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
AudioEmbeddingItems(invalid_tensor)
|
||||
|
||||
def test_hidden_size_validation_correct_size(self):
|
||||
"""Embeddings with correct hidden size should be accepted."""
|
||||
expected_hidden_size = 768
|
||||
valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32)
|
||||
|
||||
# Should not raise
|
||||
items = AudioEmbeddingItems(
|
||||
valid_tensor, expected_hidden_size=expected_hidden_size
|
||||
)
|
||||
assert items.get_count() == 10
|
||||
|
||||
def test_hidden_size_validation_wrong_size_rejected(self):
|
||||
"""Embeddings with wrong hidden size should be rejected."""
|
||||
expected_hidden_size = 768
|
||||
wrong_hidden_size = 4096
|
||||
invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32)
|
||||
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
AudioEmbeddingItems(
|
||||
invalid_tensor, expected_hidden_size=expected_hidden_size
|
||||
)
|
||||
|
||||
error_msg = str(exc_info.value)
|
||||
assert "hidden dimension mismatch" in error_msg.lower()
|
||||
assert str(wrong_hidden_size) in error_msg
|
||||
assert str(expected_hidden_size) in error_msg
|
||||
|
||||
|
||||
class TestShapeValidationDoSPrevention:
|
||||
"""
|
||||
Tests for DoS prevention through shape validation.
|
||||
|
||||
Verifies that embeddings with incorrect shapes are rejected early,
|
||||
preventing crashes during model inference.
|
||||
"""
|
||||
|
||||
def test_prevent_crash_from_wrong_shape_image_embeds(self):
|
||||
"""
|
||||
Prevent crash scenario: wrong hidden size in image embeddings.
|
||||
|
||||
Without validation, this would pass initial checks but crash later
|
||||
during model forward pass when dimensions don't match.
|
||||
"""
|
||||
expected_hidden_size = 768 # Typical model hidden size
|
||||
wrong_hidden_size = 4096 # Wrong size (e.g., Llama-sized)
|
||||
|
||||
wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)
|
||||
|
||||
# Should be rejected at instantiation time, not during inference
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
ImageEmbeddingItems(
|
||||
wrong_embedding, expected_hidden_size=expected_hidden_size
|
||||
)
|
||||
|
||||
error_msg = str(exc_info.value)
|
||||
assert "hidden dimension mismatch" in error_msg.lower()
|
||||
assert str(expected_hidden_size) in error_msg # Expected
|
||||
assert str(wrong_hidden_size) in error_msg # Received
|
||||
|
||||
def test_prevent_crash_from_wrong_shape_audio_embeds(self):
|
||||
"""
|
||||
Prevent crash scenario: wrong hidden size in audio embeddings.
|
||||
"""
|
||||
expected_hidden_size = 768
|
||||
wrong_hidden_size = 4096
|
||||
|
||||
wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)
|
||||
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
AudioEmbeddingItems(
|
||||
wrong_embedding, expected_hidden_size=expected_hidden_size
|
||||
)
|
||||
|
||||
error_msg = str(exc_info.value)
|
||||
assert "hidden dimension mismatch" in error_msg.lower()
|
||||
|
||||
def test_extremely_large_hidden_size_rejected(self):
|
||||
"""Security: Prevent DoS from extremely large embeddings."""
|
||||
expected_hidden_size = 768
|
||||
huge_hidden_size = 100000 # Large but not extreme to avoid test OOM
|
||||
|
||||
invalid_tensor = torch.randn(10, huge_hidden_size, dtype=torch.float32)
|
||||
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
ImageEmbeddingItems(
|
||||
invalid_tensor, expected_hidden_size=expected_hidden_size
|
||||
)
|
||||
|
||||
assert "hidden dimension mismatch" in str(exc_info.value).lower()
|
||||
|
||||
def test_batch_with_mixed_hidden_sizes_rejected(self):
|
||||
"""All embeddings in a list must have the same hidden size."""
|
||||
expected_hidden_size = 768
|
||||
|
||||
# One correct, one wrong
|
||||
batch = [
|
||||
torch.randn(10, expected_hidden_size, dtype=torch.float32),
|
||||
torch.randn(10, expected_hidden_size + 100, dtype=torch.float32), # Wrong!
|
||||
]
|
||||
|
||||
# Should fail on the second one
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
ImageEmbeddingItems(batch, expected_hidden_size=expected_hidden_size)
|
||||
|
||||
assert "hidden dimension mismatch" in str(exc_info.value).lower()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v", "--tb=short"])
|
||||
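The rule these tests enforce is compact enough to state directly. A sketch of the equivalent standalone check (names are ours; the real validation lives inside the ImageEmbeddingItems/AudioEmbeddingItems constructors):

import torch

def validate_embedding(t: torch.Tensor, expected_hidden_size: int | None = None):
    # Illustrative restatement of the tested contract.
    if t.ndim not in (2, 3):
        raise ValueError(f"embedding must be 2D or 3D, got {t.ndim}D")
    if expected_hidden_size is not None and t.shape[-1] != expected_hidden_size:
        raise ValueError(
            f"hidden dimension mismatch: expected {expected_hidden_size}, "
            f"got {t.shape[-1]}"
        )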
@ -83,7 +83,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
|
||||
current_platform.is_rocm()
|
||||
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
|
||||
):
|
||||
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
|
||||
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
|
||||
|
||||
with vllm_runner(model_path, enforce_eager=True) as llm:
|
||||
|
||||
@ -161,7 +161,7 @@ def test_compressed_tensors_w8a8_logprobs(
|
||||
current_platform.is_rocm()
|
||||
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
|
||||
):
|
||||
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
|
||||
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
|
||||
|
||||
if use_aiter:
|
||||
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
|
||||
@ -231,7 +231,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
|
||||
current_platform.is_rocm()
|
||||
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
|
||||
):
|
||||
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
|
||||
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
|
||||
|
||||
if use_aiter:
|
||||
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
|
||||
|
||||
@ -15,6 +15,7 @@ from vllm.model_executor.layers.quantization.fp8 import (
|
||||
Fp8Config,
|
||||
Fp8KVCacheMethod,
|
||||
Fp8LinearMethod,
|
||||
Fp8MoeBackend,
|
||||
Fp8MoEMethod,
|
||||
)
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
@ -216,7 +217,7 @@ def test_scaled_fp8_quant(dtype) -> None:
|
||||
ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
|
||||
ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
|
||||
|
||||
# Reference dynamic quantizaton
|
||||
# Reference dynamic quantization
|
||||
y = quantize_ref(x, inv_scale)
|
||||
torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
|
||||
|
||||
@ -324,7 +325,10 @@ def test_fp8_reloading(
|
||||
weight_loader=default_weight_loader,
|
||||
)
|
||||
|
||||
# Fp8LinearMethod uses use_marlin
|
||||
# Fp8MoEMethod uses fp8_backend
|
||||
method.use_marlin = use_marlin
|
||||
method.fp8_backend = Fp8MoeBackend.MARLIN if use_marlin else None
|
||||
|
||||
# capture weights format during loading
|
||||
original_metadata = [
|
||||
|
||||
@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import NoReturn
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -19,6 +20,28 @@ def enable_pickle(monkeypatch):
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
|
||||
def _skip(msg: str) -> NoReturn:
|
||||
pytest.skip(msg)
|
||||
raise RuntimeError(msg)
|
||||
|
||||
|
||||
def _snapshot_download_or_skip(model_id: str) -> str:
|
||||
try:
|
||||
from huggingface_hub import snapshot_download
|
||||
except Exception as e: # pragma: no cover
|
||||
_skip(f"huggingface_hub is required to download {model_id}: {e}")
|
||||
|
||||
try:
|
||||
return snapshot_download(
|
||||
repo_id=model_id,
|
||||
repo_type="model",
|
||||
# These checkpoints are already small; download full repo for simplicity.
|
||||
allow_patterns=["*"],
|
||||
)
|
||||
except Exception as e:
|
||||
_skip(f"Failed to download {model_id} from the HF Hub: {e}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("modelopt"),
|
||||
reason="ModelOpt FP8 is not supported on this GPU type.",
|
||||
@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
|
||||
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
assert output
|
||||
print(f"ModelOpt FP8 output: {output}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("modelopt"),
|
||||
reason="ModelOpt FP8 is not supported on this GPU type.",
|
||||
)
|
||||
def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
|
||||
"""Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup."""
|
||||
model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
|
||||
model_path = _snapshot_download_or_skip(model_id)
|
||||
|
||||
with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
o_proj = layer.self_attn.o_proj
|
||||
gate_up_proj = layer.mlp.gate_up_proj
|
||||
down_proj = layer.mlp.down_proj
|
||||
|
||||
from vllm.model_executor.layers.quantization.modelopt import (
|
||||
ModelOptFp8PcPtLinearMethod,
|
||||
)
|
||||
|
||||
assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod)
|
||||
assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod)
|
||||
assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod)
|
||||
assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod)
|
||||
|
||||
assert qkv_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert o_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert down_proj.weight.dtype == torch.float8_e4m3fn
|
||||
|
||||
# Per-channel scales; activations are dynamically scaled per token.
|
||||
assert hasattr(qkv_proj, "weight_scale")
|
||||
assert qkv_proj.weight_scale.dtype == torch.float32
|
||||
assert qkv_proj.weight_scale.dim() == 1
|
||||
assert not hasattr(qkv_proj, "input_scale")
|
||||
|
||||
assert hasattr(o_proj, "weight_scale")
|
||||
assert o_proj.weight_scale.dtype == torch.float32
|
||||
assert o_proj.weight_scale.dim() == 1
|
||||
assert not hasattr(o_proj, "input_scale")
|
||||
|
||||
assert hasattr(gate_up_proj, "weight_scale")
|
||||
assert gate_up_proj.weight_scale.dtype == torch.float32
|
||||
assert gate_up_proj.weight_scale.dim() == 1
|
||||
assert not hasattr(gate_up_proj, "input_scale")
|
||||
|
||||
assert hasattr(down_proj, "weight_scale")
|
||||
assert down_proj.weight_scale.dtype == torch.float32
|
||||
assert down_proj.weight_scale.dim() == 1
|
||||
assert not hasattr(down_proj, "input_scale")
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
assert output
|
||||
print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("modelopt"),
|
||||
reason="ModelOpt FP8 is not supported on this GPU type.",
|
||||
)
|
||||
def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
|
||||
"""Test ModelOpt FP8_PB_WO checkpoint setup."""
|
||||
model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
|
||||
model_path = _snapshot_download_or_skip(model_id)
|
||||
|
||||
with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
o_proj = layer.self_attn.o_proj
|
||||
gate_up_proj = layer.mlp.gate_up_proj
|
||||
down_proj = layer.mlp.down_proj
|
||||
|
||||
from vllm.model_executor.layers.quantization.modelopt import (
|
||||
ModelOptFp8PbWoLinearMethod,
|
||||
)
|
||||
|
||||
assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod)
|
||||
assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod)
|
||||
assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod)
|
||||
assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod)
|
||||
|
||||
assert qkv_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert o_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
|
||||
assert down_proj.weight.dtype == torch.float8_e4m3fn
|
||||
|
||||
# Block scales; should be materialized as a 2D [out_blk, in_blk] tensor.
|
||||
assert hasattr(qkv_proj, "weight_scale")
|
||||
assert qkv_proj.weight_scale.dtype == torch.float32
|
||||
assert qkv_proj.weight_scale.dim() == 2
|
||||
|
||||
assert hasattr(o_proj, "weight_scale")
|
||||
assert o_proj.weight_scale.dtype == torch.float32
|
||||
assert o_proj.weight_scale.dim() == 2
|
||||
|
||||
assert hasattr(gate_up_proj, "weight_scale")
|
||||
assert gate_up_proj.weight_scale.dtype == torch.float32
|
||||
assert gate_up_proj.weight_scale.dim() == 2
|
||||
|
||||
assert hasattr(down_proj, "weight_scale")
|
||||
assert down_proj.weight_scale.dtype == torch.float32
|
||||
assert down_proj.weight_scale.dim() == 2
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
assert output
|
||||
print(f"ModelOpt FP8_PB_WO output: {output}")
|
||||
|
||||
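The two checkpoint formats differ only in scale granularity, which the dim() asserts above encode. Concretely (128 is the assumed block size, matching common FP8 block schemes; the exact value is not stated in this diff):

out_features, in_features, block = 4096, 11008, 128
# Per-channel / per-token: one scale per output channel -> weight_scale.dim() == 1
per_channel_scale_shape = (out_features,)
# Per-block weight-only: one scale per [128 x 128] weight tile -> dim() == 2
per_block_scale_shape = (
    (out_features + block - 1) // block,
    (in_features + block - 1) // block,
)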
@ -18,25 +18,37 @@ for i in {1..5}; do
|
||||
echo "Checking metadata.json URL (attempt $i)..."
|
||||
if curl --fail "$meta_json_url" > metadata.json; then
|
||||
echo "INFO: metadata.json URL is valid."
|
||||
# check whether it is valid json by python
|
||||
# check whether it is valid json by python (printed to stdout)
|
||||
if python3 -m json.tool metadata.json; then
|
||||
echo "INFO: metadata.json is valid JSON. Proceeding with the test."
|
||||
echo "INFO: metadata.json is valid JSON. Proceeding with the check."
|
||||
# check whether there is an object in the json matching:
|
||||
# "package_name": "vllm", and "platform_tag" matches the current architecture
|
||||
# see `determine_wheel_url` in setup.py for more details
|
||||
if python3 -c "import platform as p,json as j,sys as s; d = j.load(open('metadata.json')); \
|
||||
s.exit(int(not any(o.get('package_name') == 'vllm' and p.machine() in o.get('platform_tag') \
|
||||
for o in d)))" 2>/dev/null; then
|
||||
echo "INFO: metadata.json contains a pre-compiled wheel for the current architecture."
|
||||
break
|
||||
else
|
||||
echo "WARN: metadata.json does not have a pre-compiled wheel for the current architecture."
|
||||
fi
|
||||
else
|
||||
echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!"
|
||||
echo "INFO: metadata.json content:"
|
||||
cat metadata.json
|
||||
exit 1
|
||||
fi
|
||||
break
|
||||
fi
|
||||
# failure handling
|
||||
# failure handling & retry logic
|
||||
if [ $i -eq 5 ]; then
|
||||
echo "ERROR: metadata.json URL is still not valid after 5 attempts."
|
||||
echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists."
|
||||
echo "ERROR: metadata is still not available after 5 attempts."
|
||||
echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit is available."
|
||||
echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes."
|
||||
echo " NOTE: If it fails, please report in #sig-ci channel."
|
||||
exit 1
|
||||
else
|
||||
echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..."
|
||||
sleep 180
|
||||
echo "WARNING: metadata is not available. Retrying after 5 minutes..."
|
||||
sleep 300
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
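The inline python -c above is dense; unrolled into a standalone script, the same wheel-matching check reads as follows (same logic, illustrative formatting):

import json
import platform
import sys

def has_precompiled_wheel(path: str = "metadata.json") -> bool:
    # Mirrors the one-liner in the CI script: look for a vllm entry whose
    # platform_tag covers the current machine architecture.
    with open(path) as f:
        objects = json.load(f)
    return any(
        o.get("package_name") == "vllm"
        and platform.machine() in o.get("platform_tag", "")
        for o in objects
    )

if __name__ == "__main__":
    sys.exit(0 if has_precompiled_wheel() else 1)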
169
tests/test_attention_backend_registry.py
Normal file
@ -0,0 +1,169 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from vllm.attention.backends.abstract import (
|
||||
AttentionBackend,
|
||||
AttentionImpl,
|
||||
)
|
||||
from vllm.attention.backends.registry import (
|
||||
AttentionBackendEnum,
|
||||
MambaAttentionBackendEnum,
|
||||
register_backend,
|
||||
)
|
||||
|
||||
|
||||
class CustomAttentionImpl(AttentionImpl):
|
||||
"""Mock custom attention implementation for testing."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__()
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
"""Mock forward pass."""
|
||||
pass
|
||||
|
||||
|
||||
class CustomAttentionBackend(AttentionBackend):
|
||||
"""Mock custom attention backend for testing."""
|
||||
|
||||
@staticmethod
|
||||
def get_name():
|
||||
return "CUSTOM"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls():
|
||||
return CustomAttentionImpl
|
||||
|
||||
@staticmethod
|
||||
def get_builder_cls():
|
||||
"""Mock builder class."""
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_required_kv_cache_layout():
|
||||
"""Mock KV cache layout."""
|
||||
return None
|
||||
|
||||
|
||||
class CustomMambaAttentionImpl(AttentionImpl):
|
||||
"""Mock custom mamba attention implementation for testing."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__()
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
"""Mock forward pass."""
|
||||
pass
|
||||
|
||||
|
||||
class CustomMambaAttentionBackend(AttentionBackend):
|
||||
"""Mock custom mamba attention backend for testing."""
|
||||
|
||||
@staticmethod
|
||||
def get_name():
|
||||
return "CUSTOM_MAMBA"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls():
|
||||
return CustomMambaAttentionImpl
|
||||
|
||||
@staticmethod
|
||||
def get_builder_cls():
|
||||
"""Mock builder class."""
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_required_kv_cache_layout():
|
||||
"""Mock KV cache layout."""
|
||||
return None
|
||||
|
||||
|
||||
def test_custom_is_not_alias_of_any_backend():
|
||||
# Get all members of AttentionBackendEnum
|
||||
all_backends = list(AttentionBackendEnum)
|
||||
|
||||
# Find any aliases of CUSTOM
|
||||
aliases = []
|
||||
for backend in all_backends:
|
||||
if backend.name != "CUSTOM" and backend is AttentionBackendEnum.CUSTOM:
|
||||
aliases.append(backend.name)
|
||||
|
||||
# CUSTOM should not be an alias of any other backend
|
||||
assert len(aliases) == 0, (
|
||||
f"BUG! CUSTOM is an alias of: {', '.join(aliases)}!\n"
|
||||
f"CUSTOM.value = {repr(AttentionBackendEnum.CUSTOM.value)}\n"
|
||||
f"This happens when CUSTOM has the same value as another backend.\n"
|
||||
f"When you register to CUSTOM, you're actually registering to {aliases[0]}!\n"
|
||||
f"All backend values:\n"
|
||||
+ "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends)
|
||||
)
|
||||
|
||||
# Verify CUSTOM has its own unique identity
|
||||
assert AttentionBackendEnum.CUSTOM.name == "CUSTOM", (
|
||||
f"CUSTOM.name should be 'CUSTOM', but got '{AttentionBackendEnum.CUSTOM.name}'"
|
||||
)
|
||||
|
||||
|
||||
def test_register_custom_backend_with_class_path():
|
||||
# Register with explicit class path
|
||||
register_backend(
|
||||
backend=AttentionBackendEnum.CUSTOM,
|
||||
class_path="tests.test_attention_backend_registry.CustomAttentionBackend",
|
||||
is_mamba=False,
|
||||
)
|
||||
|
||||
# Check that CUSTOM backend is registered
|
||||
assert AttentionBackendEnum.CUSTOM.is_overridden(), (
|
||||
"CUSTOM should be overridden after registration"
|
||||
)
|
||||
|
||||
# Get the registered class path
|
||||
class_path = AttentionBackendEnum.CUSTOM.get_path()
|
||||
assert class_path == "tests.test_attention_backend_registry.CustomAttentionBackend"
|
||||
|
||||
# Get the backend class
|
||||
backend_cls = AttentionBackendEnum.CUSTOM.get_class()
|
||||
assert backend_cls.get_name() == "CUSTOM"
|
||||
assert backend_cls.get_impl_cls() == CustomAttentionImpl
|
||||
|
||||
|
||||
def test_mamba_custom_is_not_alias_of_any_backend():
|
||||
# Get all mamba backends
|
||||
all_backends = list(MambaAttentionBackendEnum)
|
||||
|
||||
# Find any aliases of CUSTOM
|
||||
aliases = []
|
||||
for backend in all_backends:
|
||||
if backend.name != "CUSTOM" and backend is MambaAttentionBackendEnum.CUSTOM:
|
||||
aliases.append(backend.name)
|
||||
|
||||
# CUSTOM should not be an alias of any other backend
|
||||
assert len(aliases) == 0, (
|
||||
f"BUG! MambaAttentionBackendEnum.CUSTOM is an alias of: {', '.join(aliases)}!\n"
|
||||
f"CUSTOM.value = {repr(MambaAttentionBackendEnum.CUSTOM.value)}\n"
|
||||
f"All mamba backend values:\n"
|
||||
+ "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends)
|
||||
)
|
||||
|
||||
|
||||
def test_register_custom_mamba_backend_with_class_path():
|
||||
# Register with explicit class path
|
||||
register_backend(
|
||||
backend=MambaAttentionBackendEnum.CUSTOM,
|
||||
class_path="tests.test_attention_backend_registry.CustomMambaAttentionBackend",
|
||||
is_mamba=True,
|
||||
)
|
||||
|
||||
# Check that the backend is registered
|
||||
assert MambaAttentionBackendEnum.CUSTOM.is_overridden()
|
||||
|
||||
# Get the registered class path
|
||||
class_path = MambaAttentionBackendEnum.CUSTOM.get_path()
|
||||
assert (
|
||||
class_path
|
||||
== "tests.test_attention_backend_registry.CustomMambaAttentionBackend"
|
||||
)
|
||||
|
||||
# Get the backend class
|
||||
backend_cls = MambaAttentionBackendEnum.CUSTOM.get_class()
|
||||
assert backend_cls.get_name() == "CUSTOM_MAMBA"
|
||||
assert backend_cls.get_impl_cls() == CustomMambaAttentionImpl
|
||||
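Putting the registry API exercised here together, an out-of-tree plugin would register roughly like this (the class path below is hypothetical; these tests use their own module path instead):

from vllm.attention.backends.registry import AttentionBackendEnum, register_backend

register_backend(
    backend=AttentionBackendEnum.CUSTOM,
    class_path="my_plugin.backends.MyAttentionBackend",  # hypothetical plugin path
    is_mamba=False,
)
# Resolves and imports the registered class on demand.
backend_cls = AttentionBackendEnum.CUSTOM.get_class()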
@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device):
|
||||
envs.environment_variables[env_name] = lambda s=strategy: s
|
||||
|
||||
# Test the select_experts method
|
||||
topk_weights, topk_ids, _ = fused_moe.select_experts(
|
||||
topk_weights, topk_ids = fused_moe.select_experts(
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -2,12 +2,14 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import importlib
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from threading import Thread
|
||||
from types import SimpleNamespace
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
@ -24,7 +26,11 @@ from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.core import EngineCore
|
||||
from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient
|
||||
from vllm.v1.engine.core_client import (
|
||||
AsyncMPClient,
|
||||
EngineCoreClient,
|
||||
SyncMPClient,
|
||||
)
|
||||
from vllm.v1.engine.utils import CoreEngineProcManager
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
|
||||
@ -60,6 +66,91 @@ def make_request(
|
||||
)
|
||||
|
||||
|
||||
def _reload_envs_module():
|
||||
import vllm.envs as envs_mod
|
||||
|
||||
cache_clear = getattr(getattr(envs_mod, "__getattr__", None), "cache_clear", None)
|
||||
if cache_clear is not None:
|
||||
cache_clear()
|
||||
return importlib.reload(envs_mod)
|
||||
|
||||
|
||||
def _reload_core_client_module():
|
||||
module = importlib.import_module("vllm.v1.engine.core_client")
|
||||
return importlib.reload(module)
|
||||
|
||||
|
||||
def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
|
||||
timeout_value = 654
|
||||
monkeypatch.setenv("VLLM_ENGINE_READY_TIMEOUT_S", str(timeout_value))
|
||||
|
||||
# Ensure that the environment variable is loaded if caching is enabled
|
||||
_reload_envs_module()
|
||||
core_client_mod = _reload_core_client_module()
|
||||
|
||||
poll_timeouts: list[int] = []
|
||||
|
||||
class ShadowSocket:
|
||||
def poll(self, timeout: int) -> int:
|
||||
# Capture the timeout value for each poll call
|
||||
poll_timeouts.append(timeout)
|
||||
return 1
|
||||
|
||||
def recv_multipart(self):
|
||||
return (b"\x00\x00", b"ready")
|
||||
|
||||
class DummySocket:
|
||||
def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):
|
||||
if track:
|
||||
return SimpleNamespace(done=True)
|
||||
|
||||
def recv_multipart(self, *, copy: bool = False):
|
||||
return (b"", b"")
|
||||
|
||||
def close(self, *, linger: int = 0):
|
||||
pass
|
||||
|
||||
def bind(self, _address):
|
||||
pass
|
||||
|
||||
def connect(self, _address):
|
||||
pass
|
||||
|
||||
def setsockopt(self, *_args, **_kwargs):
|
||||
pass
|
||||
|
||||
monkeypatch.setattr(core_client_mod.zmq.Socket, "shadow", lambda *_: ShadowSocket())
|
||||
monkeypatch.setattr(
|
||||
core_client_mod, "make_zmq_socket", lambda *_, **__: DummySocket()
|
||||
)
|
||||
|
||||
parallel_config = SimpleNamespace(
|
||||
data_parallel_size=1,
|
||||
data_parallel_rank=0,
|
||||
data_parallel_size_local=1,
|
||||
data_parallel_rank_local=None,
|
||||
data_parallel_hybrid_lb=False,
|
||||
data_parallel_external_lb=False,
|
||||
)
|
||||
vllm_config = SimpleNamespace(parallel_config=parallel_config)
|
||||
|
||||
client = core_client_mod.MPClient(
|
||||
asyncio_mode=False,
|
||||
vllm_config=vllm_config,
|
||||
executor_class=object,
|
||||
log_stats=False,
|
||||
client_addresses={
|
||||
"input_address": "inproc://input",
|
||||
"output_address": "inproc://output",
|
||||
},
|
||||
)
|
||||
try:
|
||||
# timeout_value is in seconds, but poll receives milliseconds
|
||||
assert poll_timeouts == [timeout_value * 1000]
|
||||
finally:
|
||||
client.shutdown()
|
||||
|
||||
|
||||
def loop_until_done(client: EngineCoreClient, outputs: dict):
|
||||
while True:
|
||||
engine_core_outputs = client.get_output().outputs
|
||||
|
||||
56
tests/v1/engine/test_preprocess_error_handling.py
Normal file
@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch.cuda

from vllm import LLM, SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore

MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"


def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
    """Test that preprocessing errors are handled gracefully."""

    assert not torch.cuda.is_initialized(), (
        "fork needs to be used for the engine "
        "core process and this isn't possible if cuda is already initialized"
    )

    # Store original method to call for non-failing requests
    original_preprocess = EngineCore.preprocess_add_request

    # Monkeypatch to make preprocess_add_request raise an exception
    # only for requests whose first prompt token id is the sentinel 333
    def conditional_failing_preprocess(self, request: EngineCoreRequest):
        # Fail if the first token id is 333
        if request.prompt_token_ids and request.prompt_token_ids[0] == 333:
            raise ValueError("Simulated preprocessing error!")
        return original_preprocess(self, request)

    monkeypatch.setattr(
        EngineCore, "preprocess_add_request", conditional_failing_preprocess
    )

    llm = LLM(model=MODEL_NAME)

    # Create a failing request by crafting a request with an invalid token
    # We need to use a direct approach since LLM.generate tokenizes for us
    from vllm.inputs import TokensPrompt

    # The preprocessing failure should be handled gracefully rather than
    # propagated: the request finishes with an "error" finish reason.
    # Special token id to trigger the failure
    failing_prompt = TokensPrompt(prompt_token_ids=[333])
    outputs = llm.generate(failing_prompt, SamplingParams(max_tokens=10))  # type: ignore
    assert len(outputs) == 1
    assert len(outputs[0].outputs[0].token_ids) == 0
    assert outputs[0].finished
    assert outputs[0].outputs[0].finish_reason == "error"

    # Verify the engine is still functional with a normal request
    outputs = llm.generate("Hello, my name is", SamplingParams(max_tokens=10))
    assert len(outputs) == 1
    assert len(outputs[0].outputs[0].token_ids) > 0
    assert outputs[0].outputs[0].finish_reason in ("stop", "length")
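The wrap-and-delegate monkeypatch in this test is a reusable pattern. A minimal sketch outside vLLM (the class and sentinel value are illustrative only):

```python
class Engine:
    def preprocess(self, token_ids: list[int]) -> list[int]:
        return token_ids

# Keep a reference to the real method, fail only on a sentinel input, and
# delegate everything else so the normal path is still exercised.
original = Engine.preprocess

def conditional_failing(self, token_ids: list[int]) -> list[int]:
    if token_ids and token_ids[0] == 333:  # sentinel, mirroring the test
        raise ValueError("Simulated preprocessing error!")
    return original(self, token_ids)

Engine.preprocess = conditional_failing
```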
@ -547,6 +547,13 @@ def test_spec_decode_logprobs(
    sampling_params = SamplingParams(
        temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False
    )
    penalty_sampling_params = SamplingParams(
        temperature=0,
        logprobs=top_logprobs,
        max_tokens=10,
        ignore_eos=False,
        presence_penalty=-1.0,
    )
    method, model_name, spec_model_name = model_setup
    max_model_len = 256

@ -558,14 +565,17 @@ def test_spec_decode_logprobs(
        seed=42,
        logprobs_mode=logprobs_mode,
        gpu_memory_utilization=0.4,
        enable_prefix_caching=False,
    )
    ref_results = ref_llm.generate(
        [prompt, prompt], [sampling_params, penalty_sampling_params]
    )
    ref_results = ref_llm.generate([prompt], sampling_params)
    # Collect logprobs outputs from reference LLM.
    ref_logprobs = []
    for output in ref_results[0].outputs:
        for logprobs in output.logprobs:
            for token_id in logprobs:
                ref_logprobs.append(logprobs[token_id])
    for results in ref_results:
        for output in results.outputs:
            for logprobs in output.logprobs:
                ref_logprobs.extend(logprobs.values())
    del ref_llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()
@ -587,14 +597,17 @@ def test_spec_decode_logprobs(
        # Force prefill chunking
        enable_chunked_prefill=True,
        max_num_batched_tokens=32,
        enable_prefix_caching=False,
    )
    spec_results = spec_llm.generate(
        [prompt, prompt], [sampling_params, penalty_sampling_params]
    )
    spec_results = spec_llm.generate([prompt], sampling_params)
    # Collect logprobs outputs from spec decode LLM.
    spec_logprobs = []
    for output in spec_results[0].outputs:
        for logprobs in output.logprobs:
            for token_id in logprobs:
                spec_logprobs.append(logprobs[token_id])
    for results in spec_results:
        for output in results.outputs:
            for logprobs in output.logprobs:
                spec_logprobs.extend(logprobs.values())
    del spec_llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()
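A worked example of the flattening loop the diff switches to, with dummy objects mimicking the assumed `RequestOutput` shape (each `output.logprobs` is a list of `{token_id: logprob}` dicts, one per generated token):

```python
from types import SimpleNamespace

results = [
    SimpleNamespace(outputs=[SimpleNamespace(logprobs=[{7: -0.1}, {9: -0.4}])]),
    SimpleNamespace(outputs=[SimpleNamespace(logprobs=[{7: -0.2}])]),
]
flat = []
for result in results:          # one entry per prompt
    for output in result.outputs:
        for logprobs in output.logprobs:
            flat.extend(logprobs.values())
assert flat == [-0.1, -0.4, -0.2]
```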
@ -761,7 +761,7 @@ class rocm_aiter_ops:

    @classmethod
    @if_aiter_supported
    def is_linear_fp8_enaled(cls) -> bool:
    def is_linear_fp8_enabled(cls) -> bool:
        return cls.is_linear_enabled()

    @classmethod
@ -788,20 +788,6 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
    return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)


def cutlass_blockwise_scaled_grouped_mm(
    output: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    scales_a: torch.Tensor,
    scales_b: torch.Tensor,
    problem_sizes: torch.Tensor,
    expert_offsets: torch.Tensor,
):
    torch.ops._C.cutlass_blockwise_scaled_grouped_mm(
        output, a, b, scales_a, scales_b, problem_sizes, expert_offsets
    )


def cutlass_scaled_fp4_mm(
    a: torch.Tensor,
    b: torch.Tensor,
@ -77,7 +77,8 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
    )
    CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend"
    # Placeholder for third-party/custom backends - must be registered before use
    CUSTOM = ""
    # set to None to avoid aliasing another backend whose value is an empty string
    CUSTOM = None

    def get_path(self, include_classname: bool = True) -> str:
        """Get the class path for this backend (respects overrides).
@ -139,7 +140,8 @@ class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
    LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend"
    GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend"
    # Placeholder for third-party/custom backends - must be registered before use
    CUSTOM = ""
    # set to None to avoid aliasing another backend whose value is an empty string
    CUSTOM = None

    def get_path(self, include_classname: bool = True) -> str:
        """Get the class path for this backend (respects overrides).
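The pitfall this change avoids is a general property of Python enums: members with equal values are aliases of the first member, so a second `""`-valued member would silently collapse into an existing one. A minimal sketch with stand-in names:

```python
from enum import Enum

class Backend(Enum):
    SOME_BACKEND = ""
    CUSTOM = ""  # alias of SOME_BACKEND, not a new member!

assert Backend.CUSTOM is Backend.SOME_BACKEND
assert len(list(Backend)) == 1  # None as the value keeps CUSTOM distinct
```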
@ -15,7 +15,7 @@ def merge_attn_states(
    output_lse: torch.Tensor | None = None,
) -> None:
    # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel
    # is not support for FP8 dtype, fallback to use Triton kernel.
    # does not support FP8 dtype, fallback to use Triton kernel.
    def supported_dtypes(o: torch.Tensor) -> bool:
        return o.dtype in [torch.float32, torch.half, torch.bfloat16]
@ -189,9 +189,14 @@ def kernel_unified_attention_2d(
        + 1
    )

    # adjust for potential padding in the last q_block by considering the
    # actual sequence length
    max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len)
    if USE_MM_PREFIX:
        # image bidirectional attention ranges require a full range
        # including q_block padding to make sure doc mask is correct
        max_seq_prefix_len = tl.maximum(max_seq_prefix_len, seq_len)
    else:
        # adjust for potential padding in the last q_block by considering the
        # actual sequence length
        max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len)

    # calculate the number of tiles that need to be processed to
    # cover the longest sequence prefix (due to causal masking, tiles beyond
@ -202,7 +207,8 @@ def kernel_unified_attention_2d(
    # Default: keep previous global behavior
    tile_start = 0
    tile_end = num_tiles
    if SLIDING_WINDOW > 0:
    # TODO(Isotr0py): sliding window pruning with image bidirectional mask
    if SLIDING_WINDOW > 0 and not USE_MM_PREFIX:
        # Query rows covered by this Q-block
        qpos_lo = q_block_local_idx * BLOCK_Q
        qpos_hi = tl.minimum(
@ -357,6 +363,12 @@ def kernel_unified_attention_2d(
        L = L * alpha + l_j
        M = m_j

        if SLIDING_WINDOW:
            qpos_lo = q_block_local_idx * BLOCK_Q
            V = tl.where(
                (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0
            )

        # acc : (BLOCK_M, HEAD_SIZE_PADDED)
        acc += tl.dot(P.to(V.dtype), V)

@ -672,6 +684,12 @@ def kernel_unified_attention_3d(
        L = L * alpha + l_j
        M = m_j

        if SLIDING_WINDOW:
            qpos_lo = q_block_local_idx * BLOCK_Q
            V = tl.where(
                (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0
            )

        # acc : (BLOCK_M, HEAD_SIZE_PADDED)
        acc += tl.dot(P.to(V.dtype), V)
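A loose plain-torch rendering of the `tl.where` masking added to both kernels (assumed semantics, with made-up sizes): V rows whose key positions fall outside the sliding window for this Q-block are zeroed before the `P @ V` accumulation.

```python
import torch

SLIDING_WINDOW = 4
context_len, qpos_lo = 10, 0
seq_offset = torch.arange(12)                # key positions in the tile
V = torch.ones(12, 8)                        # (TILE_SIZE, HEAD_SIZE)
keep = (context_len + qpos_lo - seq_offset) < SLIDING_WINDOW
V = torch.where(keep[:, None], V, torch.zeros_like(V))
```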
@ -18,6 +18,7 @@ from vllm.config.lora import LoRAConfig
from vllm.config.model import (
    ModelConfig,
    iter_architecture_defaults,
    str_dtype_to_torch_dtype,
    try_match_architecture_defaults,
)
from vllm.config.multimodal import MultiModalConfig
@ -72,6 +73,7 @@ __all__ = [
    # From vllm.config.model
    "ModelConfig",
    "iter_architecture_defaults",
    "str_dtype_to_torch_dtype",
    "try_match_architecture_defaults",
    # From vllm.config.multimodal
    "MultiModalConfig",
@ -71,7 +71,7 @@ else:
logger = init_logger(__name__)

RunnerOption = Literal["auto", RunnerType]
ConvertType = Literal["none", "embed", "classify", "reward"]
ConvertType = Literal["none", "embed", "classify", "reward", "mm_encoder_only"]
ConvertOption = Literal["auto", ConvertType]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
@ -843,12 +843,18 @@ class ModelConfig:
        producer_name = quant_cfg.get("producer", {}).get("name")
        if producer_name == "modelopt":
            quant_algo = quant_cfg.get("quantization", {}).get("quant_algo")
            if quant_algo == "FP8":
                quant_cfg["quant_method"] = "modelopt"
            elif quant_algo == "NVFP4":
                quant_cfg["quant_method"] = "modelopt_fp4"
            elif quant_algo is not None:
                raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}")
            if quant_algo is not None:
                quant_algo_upper = str(quant_algo).upper()
                if quant_algo_upper in {
                    "FP8",
                    "FP8_PER_CHANNEL_PER_TOKEN",
                    "FP8_PB_WO",
                }:
                    quant_cfg["quant_method"] = "modelopt"
                elif quant_algo_upper == "NVFP4":
                    quant_cfg["quant_method"] = "modelopt_fp4"
                else:
                    raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}")

        return quant_cfg

@ -1849,6 +1855,11 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
    "bfloat16": torch.bfloat16,
}


def str_dtype_to_torch_dtype(type: str):
    return _STR_DTYPE_TO_TORCH_DTYPE.get(type)


# model_type -> reason
_FLOAT16_NOT_SUPPORTED_MODELS = {
    "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",
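A standalone sketch of the widened mapping: the three FP8 flavors all select `"modelopt"`, NVFP4 selects `"modelopt_fp4"`, and anything else is rejected, now case-insensitively.

```python
def map_modelopt_quant_method(quant_algo: str) -> str:
    algo = str(quant_algo).upper()
    if algo in {"FP8", "FP8_PER_CHANNEL_PER_TOKEN", "FP8_PB_WO"}:
        return "modelopt"
    if algo == "NVFP4":
        return "modelopt_fp4"
    raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}")

assert map_modelopt_quant_method("fp8_pb_wo") == "modelopt"  # lowercase now works
assert map_modelopt_quant_method("NVFP4") == "modelopt_fp4"
```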
@ -71,7 +71,11 @@ class EngineClient(ABC):
        truncate_prompt_tokens: int | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Generate outputs for a request from a pooling model."""
        """Generate outputs for a request from a pooling model.

        NOTE: truncate_prompt_tokens is deprecated in v0.14.
        TODO: Remove this argument in v0.15.
        """
        ...

    @abstractmethod
@ -51,6 +51,9 @@ from vllm.entrypoints.openai.protocol import (
    ToolCall,
    UsageInfo,
)
from vllm.entrypoints.openai.serving_chat_stream_harmony import (
    extract_harmony_streaming_delta,
)
from vllm.entrypoints.openai.serving_engine import (
    GenerationError,
    OpenAIServing,
@ -253,18 +256,31 @@ class OpenAIServingChat(OpenAIServing):
            truncate_tool_call_ids(request)
            validate_request_params(request)

            if (
                request.tool_choice == "auto"
                and not (self.enable_auto_tools and tool_parser is not None)
            # Check if tool parsing is unavailable (common condition)
            tool_parsing_unavailable = (
                tool_parser is None
                and not isinstance(tokenizer, MistralTokenizer)
                and not self.use_harmony
            )

            # Validate tool_choice when tool parsing is required but unavailable
            if tool_parsing_unavailable and request.tool_choice not in (
                None,
                "none",
            ):
                # for hf tokenizers, "auto" tools requires
                # --enable-auto-tool-choice and --tool-call-parser
                return self.create_error_response(
                    '"auto" tool choice requires '
                    "--enable-auto-tool-choice and --tool-call-parser to be set"
                )
                if request.tool_choice == "auto" and not self.enable_auto_tools:
                    # for hf tokenizers, "auto" tools requires
                    # --enable-auto-tool-choice and --tool-call-parser
                    return self.create_error_response(
                        '"auto" tool choice requires '
                        "--enable-auto-tool-choice and --tool-call-parser to be set"
                    )
                elif request.tool_choice != "auto":
                    # "required" or named tool requires tool parser
                    return self.create_error_response(
                        f'tool_choice="{request.tool_choice}" requires '
                        "--tool-call-parser to be set"
                    )

            if request.tools is None or (
                request.tool_choice == "none"
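A hedged distillation of the refactored validation as a pure function (parameter names are illustrative; the real code returns error responses rather than strings):

```python
def tool_choice_error(tool_choice, parser_available: bool,
                      enable_auto_tools: bool) -> str | None:
    # No validation needed if parsing is available or no tools were chosen.
    if parser_available or tool_choice in (None, "none"):
        return None
    if tool_choice == "auto" and not enable_auto_tools:
        return '"auto" requires --enable-auto-tool-choice and --tool-call-parser'
    if tool_choice != "auto":
        return f'tool_choice="{tool_choice}" requires --tool-call-parser'
    return None

assert tool_choice_error("none", False, False) is None
assert tool_choice_error("required", False, True) is not None
```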
@ -299,7 +315,10 @@ class OpenAIServingChat(OpenAIServing):
            )
            else:
                # For GPT-OSS.
                conversation, engine_prompts = self._make_request_with_harmony(request)
                should_include_tools = tool_dicts is not None
                conversation, engine_prompts = self._make_request_with_harmony(
                    request, should_include_tools
                )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(f"{e} {e.__cause__}")
@ -792,6 +811,11 @@ class OpenAIServingChat(OpenAIServing):
                delta_text += harmony_parser.last_content_delta or ""
                cur_channel = harmony_parser.current_channel
                cur_recipient = harmony_parser.current_recipient
                # handle the case where several tokens were generated at once,
                # including the final token, leading to a delta in the text
                # while the current channel is still empty (start state)
                if not cur_channel and delta_text:
                    cur_channel = "final"
            else:
                delta_text = output.text

@ -821,64 +845,17 @@ class OpenAIServingChat(OpenAIServing):
                current_token_ids = as_list(output.token_ids)

                if self.use_harmony:
                    if cur_channel == "final":
                        delta_message = DeltaMessage(content=delta_text)
                    elif cur_channel == "analysis":
                        if request.include_reasoning:
                            delta_message = DeltaMessage(reasoning=delta_text)
                        else:
                            delta_message = None
                    elif (
                        cur_channel == "commentary"
                        and cur_recipient
                        and cur_recipient.startswith("functions.")
                    ):
                        # Count completed tool calls to determine index
                        base_index = 0
                        for msg in harmony_parser.messages:
                            if (
                                msg.channel == "commentary"
                                and msg.recipient
                                and msg.recipient.startswith("functions.")
                            ):
                                base_index += 1

                        if prev_recipient != cur_recipient:
                            tool_name = cur_recipient.split("functions.", 1)[1]
                            delta_message = DeltaMessage(
                                tool_calls=[
                                    DeltaToolCall(
                                        id=make_tool_call_id(),
                                        type="function",
                                        function=DeltaFunctionCall(
                                            name=tool_name,
                                            arguments="",
                                        ),
                                        index=base_index,
                                    )
                                ]
                            )
                        elif delta_text:
                            delta_message = DeltaMessage(
                                tool_calls=[
                                    DeltaToolCall(
                                        index=base_index,
                                        function=DeltaFunctionCall(
                                            arguments=delta_text
                                        ),
                                    )
                                ]
                            )
                        else:
                            delta_message = None

                        if delta_message is not None:
                            harmony_tools_streamed[i] = True
                    elif cur_channel == "commentary":
                        # Tool call preambles meant to be shown to the user
                        delta_message = DeltaMessage(content=delta_text)
                    else:
                        delta_message = None
                    delta_message, tools_streamed_flag = (
                        extract_harmony_streaming_delta(
                            harmony_parser=harmony_parser,
                            cur_channel=cur_channel,
                            cur_recipient=cur_recipient,
                            prev_recipient=prev_recipient,
                            delta_text=delta_text,
                            include_reasoning=request.include_reasoning,
                        )
                    )
                    harmony_tools_streamed[i] |= tools_streamed_flag
                # handle streaming deltas for tools with named tool_choice
                elif tool_choice_function_name:
                    if (
@ -1833,6 +1810,7 @@ class OpenAIServingChat(OpenAIServing):
    def _make_request_with_harmony(
        self,
        request: ChatCompletionRequest,
        should_include_tools: bool = True,
    ):
        messages: list[OpenAIMessage] = []

@ -1850,13 +1828,16 @@ class OpenAIServingChat(OpenAIServing):
            reasoning_effort=request.reasoning_effort,
            browser_description=None,
            python_description=None,
            with_custom_tools=request.tools is not None,
            with_custom_tools=should_include_tools,
        )
        messages.append(sys_msg)

        # Add developer message.
        dev_msg = get_developer_message(tools=request.tools)
        messages.append(dev_msg)
        if request.tools:
            dev_msg = get_developer_message(
                tools=request.tools if should_include_tools else None
            )
            messages.append(dev_msg)

        # Add user message.
        messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))
101 vllm/entrypoints/openai/serving_chat_stream_harmony.py Normal file
@ -0,0 +1,101 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Harmony-specific streaming delta extraction for chat completions.

This module handles the extraction of DeltaMessage objects from
harmony parser state during streaming chat completions.
"""

from openai_harmony import StreamableParser

from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
)


def extract_harmony_streaming_delta(
    harmony_parser: StreamableParser,
    cur_channel: str | None,
    cur_recipient: str | None,
    prev_recipient: str | None,
    delta_text: str,
    include_reasoning: bool,
) -> tuple[DeltaMessage | None, bool]:
    """
    Extract a DeltaMessage from harmony parser state during streaming.

    Args:
        harmony_parser: The StreamableParser instance tracking parse state
        cur_channel: Current channel ("final", "analysis", "commentary", etc.)
        cur_recipient: Current recipient (e.g., "functions.my_func")
        prev_recipient: Previous recipient for detecting tool call transitions
        delta_text: The text delta to include in the message
        include_reasoning: Whether to include reasoning content

    Returns:
        A tuple of (DeltaMessage or None, tools_streamed_flag)
    """
    tools_streamed = False

    if cur_channel == "final":
        delta_message = DeltaMessage(content=delta_text)
    elif (
        (cur_channel == "commentary" or cur_channel == "analysis")
        and cur_recipient
        and cur_recipient.startswith("functions.")
    ):
        # Count completed tool calls to determine index
        base_index = 0
        for msg in harmony_parser.messages:
            if (
                (msg.channel == "commentary" or msg.channel == "analysis")
                and msg.recipient
                and msg.recipient.startswith("functions.")
            ):
                base_index += 1

        if prev_recipient != cur_recipient:
            tool_name = cur_recipient.split("functions.", 1)[1]
            delta_message = DeltaMessage(
                tool_calls=[
                    DeltaToolCall(
                        id=make_tool_call_id(),
                        type="function",
                        function=DeltaFunctionCall(
                            name=tool_name,
                            arguments="",
                        ),
                        index=base_index,
                    )
                ]
            )
        elif delta_text:
            delta_message = DeltaMessage(
                tool_calls=[
                    DeltaToolCall(
                        index=base_index,
                        function=DeltaFunctionCall(arguments=delta_text),
                    )
                ]
            )
        else:
            delta_message = None

        if delta_message is not None:
            tools_streamed = True
    elif cur_channel == "commentary":
        # Tool call preambles meant to be shown to the user
        delta_message = DeltaMessage(content=delta_text)
    elif cur_channel == "analysis":
        if include_reasoning:
            delta_message = DeltaMessage(reasoning=delta_text)
        else:
            delta_message = None
    else:
        delta_message = None

    return delta_message, tools_streamed
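A worked example of the `base_index` computation, with stand-in message objects: the streaming index of the next tool call equals the number of already-completed function calls on the commentary/analysis channels.

```python
from types import SimpleNamespace

messages = [
    SimpleNamespace(channel="commentary", recipient="functions.get_weather"),
    SimpleNamespace(channel="analysis", recipient=None),
    SimpleNamespace(channel="final", recipient=None),
]
base_index = 0
for msg in messages:
    if (
        (msg.channel == "commentary" or msg.channel == "analysis")
        and msg.recipient
        and msg.recipient.startswith("functions.")
    ):
        base_index += 1
assert base_index == 1  # the next streamed tool call gets index 1
```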
@ -24,6 +24,7 @@ if TYPE_CHECKING:
    LOCAL_RANK: int = 0
    CUDA_VISIBLE_DEVICES: str | None = None
    VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
    VLLM_ENGINE_READY_TIMEOUT_S: int = 600
    VLLM_API_KEY: str | None = None
    VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False
    S3_ACCESS_KEY_ID: str | None = None
@ -604,6 +605,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int(
        os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")
    ),
    # Timeout in seconds for waiting for engine cores to become ready
    # during startup. Default is 600 seconds (10 minutes).
    "VLLM_ENGINE_READY_TIMEOUT_S": lambda: int(
        os.environ.get("VLLM_ENGINE_READY_TIMEOUT_S", "600")
    ),
    # API key for vLLM API server
    "VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None),
    # Whether to log responses from API Server for debugging
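The new entry follows the registry pattern used throughout `vllm.envs`: each variable maps to a zero-argument callable so the lookup and int parsing happen lazily on attribute access rather than at import time. A self-contained sketch:

```python
import os

environment_variables = {
    "VLLM_ENGINE_READY_TIMEOUT_S": lambda: int(
        os.environ.get("VLLM_ENGINE_READY_TIMEOUT_S", "600")
    ),
}
# Evaluated only when called (assuming the variable is unset here):
assert environment_variables["VLLM_ENGINE_READY_TIMEOUT_S"]() == 600
```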
@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
    UnquantizedFusedMoEMethod,
)
from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import (
    ZeroExpertFusedMoE,
)
from vllm.triton_utils import HAS_TRITON

_config: dict[str, Any] | None = None
@ -54,6 +57,7 @@ __all__ = [
    "FusedMoEPrepareAndFinalize",
    "RoutingMethodType",
    "SharedFusedMoE",
    "ZeroExpertFusedMoE",
    "activation_without_mul",
    "override_config",
    "get_config",
@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
    OCP_MX_Scheme,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_triton_kernels
from vllm.utils.math_utils import cdiv
@ -39,6 +40,7 @@ if has_triton_kernels():
def _get_config_dtype_str(
    dtype: torch.dtype,
    use_fp8_w8a8: bool = False,
    use_fp8_w8a16: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    ocp_mx_scheme: str | None = None,
@ -50,6 +52,8 @@ def _get_config_dtype_str(
    """
    if use_fp8_w8a8:
        return "fp8_w8a8"
    elif use_fp8_w8a16:
        return "fp8_w8a16"
    elif use_int8_w8a16:
        return "int8_w8a16"
    elif use_int4_w4a16:
@ -319,6 +323,10 @@ class FusedMoEQuantConfig:
    def use_int8_w8a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == torch.int8

    @property
    def use_fp8_w8a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == current_platform.fp8_dtype()

    @property
    def use_int4_w4a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == "int4"
@ -362,6 +370,7 @@ class FusedMoEQuantConfig:
        """
        return _get_config_dtype_str(
            use_fp8_w8a8=self.use_fp8_w8a8,
            use_fp8_w8a16=self.use_fp8_w8a16,
            use_int8_w8a16=self.use_int8_w8a16,
            use_int4_w4a16=self.use_int4_w4a16,
            ocp_mx_scheme=self.ocp_mx_scheme,
@ -680,7 +689,6 @@ def int4_w4a16_moe_quant_config(
) -> FusedMoEQuantConfig:
    """
    Construct a quant config for 16-bit float activations and int4 weights.
    Note: Activations are pre-quantized.
    """
    group_shape = GroupShape(*block_shape) if block_shape is not None else None
    return FusedMoEQuantConfig(
@ -691,6 +699,27 @@ def int4_w4a16_moe_quant_config(
    )


def fp8_w8a16_moe_quant_config(
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    block_shape: list[int] | None = None,
) -> FusedMoEQuantConfig:
    """
    Construct a quant config for 16-bit float activations and fp8 weights.
    """
    group_shape = GroupShape(*block_shape) if block_shape is not None else None
    return FusedMoEQuantConfig(
        _a1=FusedMoEQuantDesc(),
        _a2=FusedMoEQuantDesc(),
        _w1=FusedMoEQuantDesc(
            current_platform.fp8_dtype(), group_shape, w1_scale, None, None
        ),
        _w2=FusedMoEQuantDesc(
            current_platform.fp8_dtype(), group_shape, w2_scale, None, None
        ),
    )


def int8_w8a16_moe_quant_config(
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
@ -700,7 +729,6 @@ def int8_w8a16_moe_quant_config(
) -> FusedMoEQuantConfig:
    """
    Construct a quant config for 16-bit float activations and int8 weights.
    Note: Activations are pre-quantized.
    """
    group_shape = GroupShape(*block_shape) if block_shape is not None else None
    return FusedMoEQuantConfig(
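A minimal predicate mirroring the new `use_fp8_w8a16` property: weight-only fp8 means the activation dtype is unset while the weight dtype is the platform fp8 type (`fp8_dtype` here stands in for `current_platform.fp8_dtype()`).

```python
import torch

def use_fp8_w8a16(a_dtype, w_dtype, fp8_dtype=torch.float8_e4m3fn) -> bool:
    return a_dtype is None and w_dtype == fp8_dtype

assert use_fp8_w8a16(None, torch.float8_e4m3fn)
assert not use_fp8_w8a16(torch.float8_e4m3fn, torch.float8_e4m3fn)  # that is w8a8
```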
@ -21,7 +21,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceDelegate,
    TopKWeightAndReduceNoOP,
)
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize, _resize_cache
from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.scalar_type import scalar_types

logger = init_logger(__name__)
@ -896,162 +896,6 @@ def cutlass_moe_fp4(
    )


def _valid_cutlass_block_scaled_grouped_gemm(
    w1: torch.Tensor,
    w2: torch.Tensor,
    inplace: bool,
    activation: str,
    apply_router_weight_on_input: bool,
    expert_map: torch.Tensor | None,
) -> bool:
    def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int):
        return N % 128 == 0 and K % 128 == 0

    _, K, N = w2.size()
    if not _valid_cutlass_block_scaled_grouped_gemm_shape(N, K):
        logger.debug_once(
            "CutlassBlockScaledGroupedGemm disabled: unaligned problem size. "
            "N: %s, K: %s",
            N,
            K,
        )
        return False

    if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn:
        logger.debug_once(
            "CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s). "
            "w1.dtype: %s, w2.dtype: %s",
            w1.dtype,
            w2.dtype,
        )
        return False

    if expert_map is not None:
        logger.debug_once(
            "CutlassBlockScaledGroupedGemm disabled: expert_parallel is not supported."
        )
        return False

    if activation != "silu":
        logger.debug_once(
            "CutlassBlockScaledGroupedGemm disabled: only activation silu is supported."
        )
        return False

    if apply_router_weight_on_input:
        logger.debug_once(
            "CutlassBlockScaledGroupedGemm disabled:"
            " apply_router_weight_on_input is not supported."
        )
        return False

    if inplace:
        logger.debug_once(
            "CutlassBlockScaledGroupedGemm disabled: inplace is not supported."
        )
        return False

    return True


# TODO(bnell): would be nice combine/integrate with regular cutlass_fp8.
def run_cutlass_block_scaled_fused_experts(
    a: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
) -> torch.Tensor:
    w1_q = w1.transpose(1, 2)
    w2_q = w2.transpose(1, 2)
    w1_scale = w1_scale.transpose(1, 2)
    w2_scale = w2_scale.transpose(1, 2)

    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
    assert a.shape[0] == topk_ids.shape[0], (
        "a and topk_ids must have the same batch size"
    )
    assert w1_q.dtype == torch.float8_e4m3fn, "w1_q must be float8_e4m3fn"
    assert w2_q.dtype == torch.float8_e4m3fn, "w2_q must be float8_e4m3fn"
    assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1"
    assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2"
    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
    assert w1_q.shape[0] == w1_scale.shape[0], "w1_scale expert number mismatch"
    assert w1_q.shape[0] == w2_scale.shape[0], "w2_scale expert number mismatch"
    assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype"

    out_dtype = a.dtype
    num_experts = w1_q.size(0)
    m = a.size(0)
    k = w1_q.size(1)
    n = w2_q.size(1)

    topk = topk_ids.size(1)

    a_q, a1_scale = _fp8_quantize(
        a, A_scale=None, per_act_token=False, block_shape=[128, 128]
    )
    device = a_q.device

    expert_offsets = torch.empty((num_experts + 1,), dtype=torch.int32, device=device)
    problem_sizes1 = torch.empty((num_experts, 3), dtype=torch.int32, device=device)
    problem_sizes2 = torch.empty((num_experts, 3), dtype=torch.int32, device=device)

    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)

    ops.get_cutlass_moe_mm_data(
        topk_ids,
        expert_offsets,
        problem_sizes1,
        problem_sizes2,
        a_map,
        c_map,
        num_experts,
        n,
        k,
    )

    rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype)
    rep_a1_scales = a1_scale[a_map]

    c1 = torch.empty((m * topk, n * 2), dtype=out_dtype, device=device)
    c2 = torch.empty((m * topk, k), dtype=out_dtype, device=device)

    ops.cutlass_blockwise_scaled_grouped_mm(
        c1,
        rep_a_q,
        w1_q,
        rep_a1_scales,
        w1_scale,
        problem_sizes1,
        expert_offsets[:-1],
    )

    intermediate = torch.empty((m * topk, n), dtype=out_dtype, device=device)
    torch.ops._C.silu_and_mul(intermediate, c1)

    intermediate_q, a2_scale = _fp8_quantize(
        intermediate, A_scale=None, per_act_token=False, block_shape=[128, 128]
    )

    ops.cutlass_blockwise_scaled_grouped_mm(
        c2,
        intermediate_q,
        w2_q,
        a2_scale,
        w2_scale,
        problem_sizes2,
        expert_offsets[:-1],
    )

    return (
        c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype)
    ).sum(dim=1)


# W4A8
def run_cutlass_moe_w4a8_fp8(
    output: torch.Tensor,
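The shape gate inside the removed helper is worth keeping as a standalone reference: blockwise-scaled grouped GEMM requires both N and K to be multiples of the 128x128 scale block.

```python
def valid_block_scaled_shape(n: int, k: int) -> bool:
    # Both dimensions must align with the 128x128 scale blocks.
    return n % 128 == 0 and k % 128 == 0

assert valid_block_scaled_shape(512, 256)
assert not valid_block_scaled_shape(300, 256)
```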
@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
    batched_moe_align_block_size,
    moe_align_block_size,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceDelegate,
    TopKWeightAndReduceNoOP,
@ -26,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    marlin_moe_intermediate_size,
    marlin_quant_input,
)
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types


@ -542,9 +540,11 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
        is_k_full: bool = True,
    ):
        # TODO (varun) : Enable activation quantization
        assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, (
            "Supports only mxfp4_w4a16 or int4_w4a16"
        )
        assert (
            quant_config.use_mxfp4_w4a16
            or quant_config.use_int4_w4a16
            or quant_config.use_fp8_w8a16
        ), "Supports only mxfp4_w4a16, int4_w4a16 or fp8_w8a16"
        self.w13_g_idx = w13_g_idx
        self.w2_g_idx = w2_g_idx
        self.w13_g_idx_sort_indices = w13_g_idx_sort_indices
@ -555,11 +555,17 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
    @property
    def quant_type_id(self) -> int:
        # uint4b8 will be set for int4 weight and float4_e2m1f will be used for mxfp4
        return (
            scalar_types.uint4b8.id
            if self.quant_config.use_int4_w4a16
            else scalar_types.float4_e2m1f.id
        )
        if self.quant_config.use_int4_w4a16:
            return scalar_types.uint4b8.id
        elif self.quant_config.use_mxfp4_w4a16:
            return scalar_types.float4_e2m1f.id
        elif (
            self.quant_config.use_fp8_w8a16
            and current_platform.fp8_dtype() == torch.float8_e4m3fn
        ):
            return scalar_types.float8_e4m3fn.id
        else:
            raise NotImplementedError("Unsupported quantization type.")

    def moe_problem_size(
        self,
@ -711,16 +717,6 @@ class MarlinExperts(MarlinExpertsBase):
        ops.moe_sum(input, output)


def modular_marlin_fused_moe(
    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
) -> mk.FusedMoEModularKernel:
    return mk.FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        MarlinExperts(quant_config),
        shared_experts,
    )


class BatchedMarlinExperts(MarlinExpertsBase):
    def __init__(
        self,
@ -25,10 +25,6 @@ from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEQuantConfig,
    _get_config_dtype_str,
)
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
    _valid_cutlass_block_scaled_grouped_gemm,
    run_cutlass_block_scaled_fused_experts,
)
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
    _valid_deep_gemm,
    deep_gemm_moe_fp8,
@ -1678,11 +1674,9 @@ def fused_experts(
    expert_map: torch.Tensor | None = None,
    quant_config: FusedMoEQuantConfig | None = None,
    allow_deep_gemm: bool = False,
    allow_cutlass_block_scaled_grouped_gemm: bool = False,
) -> torch.Tensor:
    if quant_config is None:
        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
    use_fp8_w8a8 = quant_config.use_fp8_w8a8

    # For now, disable DeepGemm for small N (<= 512) until better
    # permute/unpermute ops are available.
@ -1712,23 +1706,6 @@ def fused_experts(
            a2_scale=quant_config.a2_scale,
            apply_router_weight_on_input=apply_router_weight_on_input,
        )
    elif (
        allow_cutlass_block_scaled_grouped_gemm
        and use_fp8_w8a8
        and _valid_cutlass_block_scaled_grouped_gemm(
            w1, w2, inplace, activation, apply_router_weight_on_input, expert_map
        )
    ):
        assert quant_config is not None
        return run_cutlass_block_scaled_fused_experts(
            a=hidden_states,
            w1=w1,
            w2=w2,
            w1_scale=quant_config.w1_scale,
            w2_scale=quant_config.w2_scale,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
        )
    else:
        return dispatch_fused_experts_func(inplace)(
            hidden_states=hidden_states,
@ -92,7 +92,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@ -110,10 +110,4 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
            expert_map=None if self.disable_expert_map else layer.expert_map,
        )

        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
            assert not isinstance(result, tuple), (
                "Shared + zero experts are mutually exclusive not yet supported"
            )
            return result, zero_expert_result
        else:
            return result
        return result
@ -32,7 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEQuantConfig,
    RoutingMethodType,
)
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    init_aiter_topK_meta_data,
)
@ -350,8 +349,6 @@ class FusedMoE(CustomOp):
        num_redundant_experts: int = 0,
        has_bias: bool = False,
        is_sequence_parallel=False,
        zero_expert_num: int | None = 0,
        zero_expert_type: str | None = None,
        expert_mapping: list[tuple[str, str, int, str]] | None = None,
        n_shared_experts: int | None = None,
        routing_method_type: int | None = None,
@ -409,8 +406,6 @@ class FusedMoE(CustomOp):

        self.global_num_experts = num_experts + num_redundant_experts
        self.logical_num_experts = num_experts
        self.zero_expert_num = zero_expert_num
        self.zero_expert_type = zero_expert_type

        # Expert mapping used in self.load_weights
        self.expert_mapping = expert_mapping
@ -1525,15 +1520,15 @@ class FusedMoE(CustomOp):
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Route the input hidden states to the top-k experts based on the
        router logits.

        Returns:
            (topk_weights, topk_ids, zero_expert_result)
            (tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
            The weights, expert ids, and zero expert computation result.
            (topk_weights, topk_ids)
            (tuple[torch.Tensor, torch.Tensor]):
            The weights and expert ids.

        **Compatibility**: When EPLB is not enabled, the returned ids are
        equivalent to global logical ids, so should be compatible with
@ -1655,23 +1650,7 @@ class FusedMoE(CustomOp):

        assert topk_ids.dtype == indices_type or indices_type is None

        # Compute zero expert result if needed
        if (
            self.zero_expert_num is not None
            and self.zero_expert_num > 0
            and self.zero_expert_type is not None
            and self.global_num_experts is not None
        ):
            zero_expert_result = zero_experts_compute_triton(
                expert_indices=topk_ids,
                expert_scales=topk_weights,
                num_experts=self.global_num_experts,
                zero_expert_type=self.zero_expert_type,
                hidden_states=hidden_states,
            )
        else:
            zero_expert_result = None
        return topk_weights, topk_ids, zero_expert_result
        return topk_weights, topk_ids

    def must_reduce_shared_expert_outputs(self) -> bool:
        """
@ -1736,14 +1715,7 @@ class FusedMoE(CustomOp):
            fused_output = torch.ops.vllm.moe_forward(
                hidden_states, router_logits, self.layer_name
            )
            if self.zero_expert_num is not None and self.zero_expert_num > 0:
                assert isinstance(fused_output, tuple)
                fused_output, zero_expert_result = fused_output
                return (reduce_output(fused_output) + zero_expert_result)[
                    ..., :og_hidden_states
                ]
            else:
                return reduce_output(fused_output)[..., :og_hidden_states]
            return reduce_output(fused_output)[..., :og_hidden_states]
        else:
            if current_platform.is_tpu() or current_platform.is_cpu():
                # TODO: Once the OOM issue for the TPU backend is resolved, we
@ -1841,13 +1813,6 @@ class FusedMoE(CustomOp):
                final_hidden_states,
            )

            if self.zero_expert_num is not None and self.zero_expert_num > 0:
                assert isinstance(final_hidden_states, tuple)
                assert self.shared_experts is None
                final_hidden_states, zero_expert_result = final_hidden_states
                if zero_expert_result is not None:
                    final_hidden_states += zero_expert_result

            if not skip_result_store:
                if self.shared_experts is None:
                    full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
@ -2030,9 +1995,6 @@ class FusedMoE(CustomOp):
                shared_output,
                final_hidden_states,
            )
        elif self.zero_expert_num is not None and self.zero_expert_num > 0:
            assert isinstance(final_hidden_states, tuple)
            final_hidden_states, zero_expert_result = final_hidden_states

        def combine_output(states: torch.Tensor) -> torch.Tensor:
            if do_naive_dispatch_combine:
@ -2051,9 +2013,6 @@ class FusedMoE(CustomOp):
                final_hidden_states[0],
                combine_output(final_hidden_states[1]),
            )
        elif self.zero_expert_num is not None and self.zero_expert_num > 0:
            assert isinstance(final_hidden_states, torch.Tensor)
            return (combine_output(final_hidden_states), zero_expert_result)
        else:
            return combine_output(final_hidden_states)
@ -6,6 +6,7 @@ import torch
import torch.nn.functional as F

import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._aiter_ops import rocm_aiter_ops
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
@ -23,6 +24,9 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEPermuteExpertsUnpermute,
    FusedMoEPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.platforms.interface import CpuArchEnum
@ -30,9 +34,9 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe

if current_platform.is_cuda_alike():
    from .fused_batched_moe import BatchedTritonExperts
    from .fused_moe import TritonExperts, fused_experts
    from .fused_moe import TritonExperts
else:
    fused_experts = None  # type: ignore
    TritonExperts = None  # type: ignore

if current_platform.is_tpu():
    from .moe_pallas import fused_moe as fused_moe_pallas
@ -265,6 +269,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
            else:
                layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
        elif current_platform.is_cuda_alike():
            self.moe_quant_config = self.get_fused_moe_quant_config(layer)
            self.kernel = mk.FusedMoEModularKernel(
                MoEPrepareAndFinalizeNoEP(),
                TritonExperts(self.moe_quant_config),
                shared_experts=None,
            )

    def apply(
        self,
@ -278,9 +289,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            router_logits=router_logits,
        )

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
    def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
        if self.moe.has_bias:
            return biased_moe_quant_config(
                layer.w13_bias,
@ -295,7 +304,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@ -322,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                apply_router_weight_on_input=layer.apply_router_weight_on_input,
            )
        else:
            result = fused_experts(
            result = self.kernel(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
@ -330,19 +339,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                topk_ids=topk_ids,
                inplace=True,
                activation=layer.activation,
                quant_config=self.moe_quant_config,
                apply_router_weight_on_input=layer.apply_router_weight_on_input,
                global_num_experts=layer.global_num_experts,
                expert_map=layer.expert_map,
            )

        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
            assert not isinstance(result, tuple), (
                "Shared + zero experts are mutually exclusive not yet supported"
            )
            return result, zero_expert_result
        else:
            return result
        return result

    def forward_cpu(
        self,
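A hedged stand-in for the refactor in this file: the per-call `fused_experts()` entry point is replaced by a modular kernel composed once at setup and reused on every forward. Dummy classes below; the real ones are `FusedMoEModularKernel`, `MoEPrepareAndFinalizeNoEP`, and `TritonExperts` from the diff.

```python
class NoEPPrepareFinalize:
    def prepare(self, x):
        return x

class Experts:
    def __init__(self, quant_config):
        self.quant_config = quant_config

    def run(self, x):
        return [2.0 * v for v in x]  # placeholder for the expert compute

class ModularKernel:
    def __init__(self, prepare_finalize, experts):
        self.prepare_finalize = prepare_finalize
        self.experts = experts

    def __call__(self, x):
        return self.experts.run(self.prepare_finalize.prepare(x))

kernel = ModularKernel(NoEPPrepareFinalize(), Experts(quant_config=None))  # once
assert kernel([1.0, 2.0]) == [2.0, 4.0]                                    # per step
```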
189 vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py Normal file
@ -0,0 +1,189 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from contextlib import contextmanager

import torch
from torch import nn

from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
from vllm.model_executor.layers.fused_moe.layer import FusedMoE


class ZeroExpertFusedMoE(FusedMoE):
    """
    A FusedMoE operation that also computes the results of zero experts.
    Zero experts perform identity operations (scaled pass-through) instead
    of full MLP computations.

    This class uses memoization to avoid redundant routing computation:
    routing is computed once and reused for both zero expert computation
    and the main FusedMoE forward pass.
    """

    def __init__(
        self,
        zero_expert_num: int,
        zero_expert_type: str,
        router: nn.Module,
        **kwargs,
    ):
        # ZeroExpertFusedMoE manages its own custom_routing_function for memoization
        assert (
            "custom_routing_function" not in kwargs
            or kwargs.get("custom_routing_function") is None
        ), (
            "ZeroExpertFusedMoE does not support external custom_routing_function. "
            "It manages its own for routing memoization."
        )

        # Automatically slice router's e_score_correction_bias to only include
        # real experts (not zero_experts) for the base FusedMoE.
        # The full bias will be used temporarily in forward() for routing.
        if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs:
            num_real_experts = kwargs["num_experts"]
            router_bias = router.e_score_correction_bias
            user_bias = kwargs.get("e_score_correction_bias")

            # Use router's bias if:
            # 1. User didn't provide bias, or
            # 2. User provided full bias (same size as router)
            if user_bias is None or user_bias.shape[0] == router_bias.shape[0]:
                kwargs["e_score_correction_bias"] = router_bias[:num_real_experts]

        # FusedMoE no longer accepts zero_expert_num/zero_expert_type.
        # We handle zero experts ourselves in forward().
        super().__init__(**kwargs)
        # Store the actual zero_expert_num and zero_expert_type for our own use
        self._actual_zero_expert_num = zero_expert_num
        self._actual_zero_expert_type = zero_expert_type
        self._router = router  # Full router (includes zero experts)

        # Expose zero_expert_num and zero_expert_type as attributes for
        # compatibility with quantization methods that check these attributes
        self.zero_expert_num = 0
        self.zero_expert_type = None

        # Memoization state for routing results
        self._memoized_topk_weights: torch.Tensor | None = None
        self._memoized_topk_ids: torch.Tensor | None = None

        # Create custom_routing_function to reuse memoized routing results
        def custom_routing_function(hidden_states, gating_output, topk, renormalize):
            """Return memoized `topk_weights` and `topk_ids`."""
            if self._memoized_topk_weights is None or self._memoized_topk_ids is None:
                raise RuntimeError(
                    "ZeroExpertFusedMoE: routing results not memoized. "
                    "Call select_experts first to compute routing."
                )
            return self._memoized_topk_weights, self._memoized_topk_ids

        self.custom_routing_function = custom_routing_function

    @contextmanager
    def _temporarily_set_attrs(self, **attrs):
        """
        Temporarily set attributes using object.__setattr__ and restore them.

        This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues.
        When PyTorch Dynamo traces the forward pass, it cannot handle
        nn.Module.__setattr__ calls (which include parameter registration logic),
        resulting in "Unsupported" errors. Using object.__setattr__ directly
        sets the attribute without triggering nn.Module's custom __setattr__,
        allowing Dynamo to trace the code successfully.
        """
        originals = {key: getattr(self, key) for key in attrs}
        try:
            for key, value in attrs.items():
                object.__setattr__(self, key, value)
            yield
        finally:
            for key, value in originals.items():
                object.__setattr__(self, key, value)

    def _compute_zero_expert_result(
        self,
        hidden_states: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> torch.Tensor | None:
        """Compute zero expert results using pre-computed routing."""
        if (
            self._actual_zero_expert_num is None
            or self._actual_zero_expert_num <= 0
            or self._actual_zero_expert_type is None
        ):
            return None

        return zero_experts_compute_triton(
            expert_indices=topk_ids.clone(),
            expert_scales=topk_weights.clone(),
            num_experts=self.logical_num_experts,
            zero_expert_type=self._actual_zero_expert_type,
            hidden_states=hidden_states,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,  # Full logits including zero experts
    ) -> torch.Tensor:
        """
        Forward pass with zero expert support and routing memoization.

        Args:
            hidden_states: Input hidden states
            router_logits: Full router logits (including zero experts)

        Returns:
            Combined output from real experts and zero experts
        """
        # Prepare temporary attribute overrides for routing computation
        temp_attrs = {
            "custom_routing_function": None,  # Disable for first routing
        }
        if self._router is not None:
            temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias

        # Compute routing with temporary attributes
        # Pass full router_logits (including zero experts) so that zero experts
        # can be properly identified in topk_ids
        with self._temporarily_set_attrs(**temp_attrs):
            topk_weights, topk_ids = self.select_experts(
                hidden_states=hidden_states,
                router_logits=router_logits,  # Full logits (includes zero experts)
            )

        # Compute zero expert result if needed
        zero_expert_result = self._compute_zero_expert_result(
            hidden_states=hidden_states,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
        )

        # Memoize routing results for reuse in super().forward()
        self._memoized_topk_weights = topk_weights
        self._memoized_topk_ids = topk_ids

        # Slice router_logits for real experts only
        router_logits_sliced = router_logits[..., : self.logical_num_experts]

        # Compute real expert results (will reuse memoized routing via
        # custom_routing_function)
        # zero_expert_num is already 0, so FusedMoE won't handle zero experts
        fused_out = super().forward(
            hidden_states=hidden_states,
            router_logits=router_logits_sliced,
        )

        # Combine results
        # Both zero_expert_result and fused_out are computed from the same
        # hidden_states, so they should be on the same device.
        if zero_expert_result is not None:
            fused_out = fused_out + zero_expert_result

        # Clear memoization after use
        self._memoized_topk_weights = None
        self._memoized_topk_ids = None

        return fused_out
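The `_temporarily_set_attrs` trick generalizes to any object; a standalone sketch showing both the context manager and its restore-on-exit behavior (`object.__setattr__` sidesteps `nn.Module.__setattr__`, which Dynamo cannot trace through):

```python
from contextlib import contextmanager

@contextmanager
def temporarily_set_attrs(obj, **attrs):
    # Snapshot current values, override, and always restore on exit.
    originals = {key: getattr(obj, key) for key in attrs}
    try:
        for key, value in attrs.items():
            object.__setattr__(obj, key, value)
        yield
    finally:
        for key, value in originals.items():
            object.__setattr__(obj, key, value)

class Cfg:
    bias = "sliced"

cfg = Cfg()
with temporarily_set_attrs(cfg, bias="full"):
    assert cfg.bias == "full"
assert cfg.bias == "sliced"
```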
@ -53,6 +53,8 @@ WEIGHT_LOADER_V2_SUPPORTED = [
|
||||
"GPTQLinearMethod",
|
||||
"FBGEMMFp8LinearMethod",
|
||||
"ModelOptFp8LinearMethod",
|
||||
"ModelOptFp8PcPtLinearMethod",
|
||||
"ModelOptFp8PbWoLinearMethod",
|
||||
"IPEXAWQLinearMethod",
|
||||
"IPEXGPTQLinearMethod",
|
||||
"HQQMarlinMethod",
|
||||
@ -277,6 +279,7 @@ class LinearBase(CustomOp):
|
||||
self.params_dtype = params_dtype
|
||||
self.quant_config = quant_config
|
||||
self.prefix = prefix
|
||||
self.allow_fp8_block_shape_mismatch = False
|
||||
if quant_config is None:
|
||||
self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod()
|
||||
else:
|
||||
@ -475,6 +478,7 @@ class ColumnParallelLinear(LinearBase):
|
||||
disable_tp=disable_tp,
|
||||
)
|
||||
|
||||
self._maybe_allow_fp8_block_shape_mismatch()
|
||||
self.gather_output = gather_output
|
||||
|
||||
if output_sizes is None:
|
||||
@ -509,6 +513,33 @@ class ColumnParallelLinear(LinearBase):
|
||||
self.register_parameter("bias", None)
|
||||
self.update_param_tp_status()
|
||||
|
||||
def _maybe_allow_fp8_block_shape_mismatch(self) -> None:
|
||||
quant_config = getattr(self, "quant_config", None)
|
||||
weight_block = getattr(quant_config, "weight_block_size", None)
|
||||
if (
|
||||
weight_block is None
|
||||
or len(weight_block) < 1
|
||||
or len(self.output_partition_sizes) <= 1
|
||||
):
|
||||
return
|
||||
|
||||
try:
|
||||
block_n = int(weight_block[0])
|
||||
except (ValueError, TypeError):
|
||||
return
|
||||
|
||||
if block_n <= 0:
|
||||
return
|
||||
|
||||
if any(size % block_n != 0 for size in self.output_partition_sizes):
|
||||
self.allow_fp8_block_shape_mismatch = True
|
||||
logger.debug(
|
||||
"Allowing FP8 block shape mismatch for %s (block_n=%d, partitions=%s)",
|
||||
getattr(self, "prefix", "<unknown>"),
|
||||
block_n,
|
||||
self.output_partition_sizes,
|
||||
)
|
||||
|
||||
def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
|
||||
output_dim = getattr(param, "output_dim", None)
|
||||
|
||||
@@ -906,9 +937,11 @@ class QKVParallelLinear(ColumnParallelLinear):
        *,
        return_bias: bool = True,
        disable_tp: bool = False,
        v_head_size: int | None = None,
    ):
        self.hidden_size = hidden_size
        self.head_size = head_size
        self.v_head_size = v_head_size if v_head_size is not None else head_size
        self.total_num_heads = total_num_heads
        if total_num_kv_heads is None:
            total_num_kv_heads = total_num_heads
@@ -924,12 +957,14 @@ class QKVParallelLinear(ColumnParallelLinear):
            self.num_kv_head_replicas = 1
        input_size = self.hidden_size
        output_size = (
            (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size
        )
            self.num_heads * self.head_size
            + self.num_kv_heads * self.head_size
            + self.num_kv_heads * self.v_head_size
        ) * tp_size
        self.output_sizes = [
            self.num_heads * self.head_size * tp_size,  # q_proj
            self.num_kv_heads * self.head_size * tp_size,  # k_proj
            self.num_kv_heads * self.head_size * tp_size,  # v_proj
            self.num_kv_heads * self.v_head_size * tp_size,  # v_proj
        ]

        super().__init__(
@@ -950,7 +985,8 @@ class QKVParallelLinear(ColumnParallelLinear):
            "q": 0,
            "k": self.num_heads * self.head_size,
            "v": (self.num_heads + self.num_kv_heads) * self.head_size,
            "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
            "total": (self.num_heads + self.num_kv_heads) * self.head_size
            + self.num_kv_heads * self.v_head_size,
        }
        return shard_offset_mapping.get(loaded_shard_id)

@@ -958,7 +994,7 @@ class QKVParallelLinear(ColumnParallelLinear):
        shard_size_mapping = {
            "q": self.num_heads * self.head_size,
            "k": self.num_kv_heads * self.head_size,
            "v": self.num_kv_heads * self.head_size,
            "v": self.num_kv_heads * self.v_head_size,
        }
        return shard_size_mapping.get(loaded_shard_id)

@@ -985,7 +1021,7 @@ class QKVParallelLinear(ColumnParallelLinear):
            (
                "v",
                (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
                self.total_num_kv_heads * self.head_size,
                self.total_num_kv_heads * self.v_head_size,
            ),
        ]

@@ -1110,7 +1146,7 @@ class QKVParallelLinear(ColumnParallelLinear):
            (
                "v",
                (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
                self.total_num_kv_heads * self.head_size,
                self.total_num_kv_heads * self.v_head_size,
            ),
        ]
        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
@@ -1139,11 +1175,12 @@ class QKVParallelLinear(ColumnParallelLinear):
            "v": (
                (self.total_num_heads + self.total_num_kv_heads)
                * self.head_size,
                self.total_num_kv_heads * self.head_size,
                self.total_num_kv_heads * self.v_head_size,
            ),
            "total": (
                (self.total_num_heads + 2 * self.total_num_kv_heads)
                * self.head_size,
                (self.total_num_heads + self.total_num_kv_heads)
                * self.head_size
                + self.total_num_kv_heads * self.v_head_size,
                0,
            ),
        }
@@ -1170,7 +1207,7 @@ class QKVParallelLinear(ColumnParallelLinear):
                shard_size = self.num_kv_heads * self.head_size
            elif loaded_shard_id == "v":
                shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size
                shard_size = self.num_kv_heads * self.head_size
                shard_size = self.num_kv_heads * self.v_head_size
            # Special case for Quantized Weights.
            # If quantized, we need to adjust the offset and size to account
            # for the packing.
@@ -1199,10 +1236,11 @@ class QKVParallelLinear(ColumnParallelLinear):
            ),
            "v": (
                (self.num_heads + self.num_kv_heads) * self.head_size,
                self.num_kv_heads * self.head_size,
                self.num_kv_heads * self.v_head_size,
            ),
            "total": (
                (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
                (self.num_heads + self.num_kv_heads) * self.head_size
                + self.num_kv_heads * self.v_head_size,
                0,
            ),
        }

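For readers following the `v_head_size` changes above: the fused QKV weight is laid out as `[ Q | K | V ]`, and V may now use a head size different from Q/K. A small numeric sketch of the new shard layout (the values are illustrative):

```python
# Fused QKV layout once V may have a different head size than Q and K.
num_heads, num_kv_heads = 32, 8
head_size, v_head_size = 128, 192  # illustrative values

q_size = num_heads * head_size
k_size = num_kv_heads * head_size
v_size = num_kv_heads * v_head_size

offsets = {"q": 0, "k": q_size, "v": q_size + k_size}
total = q_size + k_size + v_size
assert offsets["v"] + v_size == total
```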
@@ -764,7 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        assert layer.activation == "silu", "Only SiLU activation is supported."

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

@@ -500,7 +500,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

@@ -574,7 +574,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
            e_score_correction_bias=layer.e_score_correction_bias,
        )

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@@ -1166,7 +1166,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@@ -1403,7 +1403,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@@ -1765,7 +1765,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
                f"{layer.activation} not supported for Marlin MoE."
            )

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@@ -1991,7 +1991,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@@ -2607,7 +2607,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet."
            )
        assert self.moe_quant_config is not None
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

@@ -61,7 +61,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
        )

        self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()

        if self.weight_block_size is not None:
            assert not self.is_static_input_scheme

@@ -142,7 +142,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

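The call sites above all track one signature change: `select_experts` now returns a two-tuple instead of a three-tuple whose third element carried the zero-expert result. A toy stand-in showing the new calling convention (names other than `select_experts` are illustrative):

```python
import torch


def select_experts(router_logits: torch.Tensor, topk: int):
    """Toy stand-in for layer.select_experts after the API change."""
    weights, ids = torch.topk(torch.softmax(router_logits, dim=-1), k=topk, dim=-1)
    return weights, ids  # no third zero-expert element anymore


topk_weights, topk_ids = select_experts(torch.randn(4, 8), topk=2)
```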
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from enum import Enum
from functools import partial
from typing import TYPE_CHECKING, Any, Optional

import torch
@@ -33,8 +32,8 @@ from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEQuantConfig,
    RoutingMethodType,
    fp8_w8a8_moe_quant_config,
    fp8_w8a16_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
from vllm.model_executor.layers.linear import (
    LinearBase,
@@ -51,7 +50,6 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
    FlashinferMoeBackend,
    apply_flashinfer_per_tensor_scale_fp8,
    build_flashinfer_fp8_cutlass_moe_prepare_finalize,
    flashinfer_cutlass_moe_fp8,
    get_flashinfer_moe_backend,
    register_moe_scaling_factors,
    rotate_flashinfer_fp8_moe_weights,
@@ -97,7 +95,6 @@ from vllm.model_executor.parameter import (
)
from vllm.model_executor.utils import replace_parameter, set_weight_attrs
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.utils.deep_gemm import (
    is_deep_gemm_e8m0_used,
    is_deep_gemm_supported,
@@ -118,20 +115,21 @@ class Fp8MoeBackend(Enum):
    FLASHINFER_TRTLLM = 1
    FLASHINFER_CUTLASS = 2
    DEEPGEMM = 3
    CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4
    MARLIN = 5
    TRITON = 6
    MARLIN = 4
    TRITON = 5


def get_fp8_moe_backend(
    block_quant: bool,
    moe_parallel_config: FusedMoEParallelConfig,
    with_lora_support: bool,
) -> Fp8MoeBackend:
) -> Fp8MoeBackend | None:
    """
    Select the primary FP8 MoE backend
    Note: Shape-specific fallbacks may still occur at runtime.
    """
    if current_platform.is_xpu():
        return None
    if with_lora_support:
        return Fp8MoeBackend.TRITON
    # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
@@ -191,17 +189,6 @@ def get_fp8_moe_backend(
    logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
    return Fp8MoeBackend.DEEPGEMM

    # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
    if (
        current_platform.is_cuda()
        and current_platform.is_device_capability_family(100)
        and block_quant
    ):
        logger.info_once(
            "Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
        )
        return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM

    # default to Triton
    logger.info_once("Using Triton backend for FP8 MoE")
    return Fp8MoeBackend.TRITON
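After this hunk, the selection order is: XPU returns `None`, LoRA forces Triton, FlashInfer is preferred on supported GPUs, then DeepGEMM, and finally Triton as the default; the dedicated CUTLASS grouped-GEMM branch is gone. A condensed sketch of that priority chain, with the predicate arguments assumed for illustration:

```python
from enum import Enum


class Fp8MoeBackend(Enum):
    FLASHINFER_TRTLLM = 1
    FLASHINFER_CUTLASS = 2
    DEEPGEMM = 3
    MARLIN = 4
    TRITON = 5


def pick_backend(is_xpu: bool, lora: bool, flashinfer_ok: bool, deepgemm_ok: bool):
    # Mirrors the priority order of get_fp8_moe_backend (heavily simplified).
    if is_xpu:
        return None
    if lora:
        return Fp8MoeBackend.TRITON
    if flashinfer_ok:
        return Fp8MoeBackend.FLASHINFER_CUTLASS
    if deepgemm_ok:
        return Fp8MoeBackend.DEEPGEMM
    return Fp8MoeBackend.TRITON
```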
@@ -306,6 +293,13 @@ class Fp8Config(QuantizationConfig):
                return UnquantizedLinearMethod()
            return XPUFp8LinearMethod(fp8_config)
        elif isinstance(layer, FusedMoE):
            if is_layer_skipped(
                prefix=prefix,
                ignored_layers=self.ignored_layers,
                fused_mapping=self.packed_modules_mapping,
            ):
                return UnquantizedFusedMoEMethod(layer.moe_config)

            return XPUFp8MoEMethod(fp8_config, layer)
        elif isinstance(layer, Attention):
            return Fp8KVCacheMethod(self)
@@ -420,7 +414,7 @@ class Fp8LinearMethod(LinearMethodBase):
        if vllm_is_batch_invariant():
            self.use_marlin = False

        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled()
        self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
        self.use_deep_gemm = is_deep_gemm_supported()

        self.weight_block_size = self.quant_config.weight_block_size
@@ -734,27 +728,33 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        )

        self.marlin_input_dtype = None
        self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
        self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
            self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM
        elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
            self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS
            if self.block_quant:
                assert self.weight_block_size == [128, 128], (
                    f"Only support weight_block_size == [128, 128], "
                    f"got {self.weight_block_size}"
            if self.block_quant and self.weight_block_size != [128, 128]:
                raise NotImplementedError(
                    "FlashInfer CUTLASS FP8 MoE backend only supports block "
                    "size [128, 128]."
                )
            if not self.block_quant:
                if layer.renormalize or layer.custom_routing_function is not None:
                    raise NotImplementedError(
                        "FlashInfer CUTLASS FP8 MoE backend does not support custom "
                        f"routing function or renormalization, but got {layer.renormalize} and "
                        f"{layer.custom_routing_function}."
                    )
                if layer.scoring_func != "sigmoid":
                    raise NotImplementedError(
                        "FlashInfer CUTLASS FP8 MoE backend only supports "
                        f"'sigmoid' scoring function, but got {layer.scoring_func}."
                    )
                if layer.activation != "silu":
                    raise NotImplementedError(
                        "FlashInfer CUTLASS FP8 MoE backend only supports SiLU "
                        f"activation function, but got {layer.activation}."
                    )
            self.flashinfer_moe_fn = partial(
                flashinfer_cutlass_moe_fp8,
                moe=self.moe,
                use_deepseek_fp8_block_scale=self.block_quant,
            )

        self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
        self.allow_cutlass_block_scaled_grouped_gemm = (
            self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
        )

    def create_weights(
        self,
@@ -943,7 +943,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):

        # DeepGemm scales need to be transposed and aligned. We try to do
        # it ahead of time for performance reasons.
        if self.allow_deep_gemm:
        if self.fp8_backend == Fp8MoeBackend.DEEPGEMM:
            dg_w13_weight, dg_w13_weight_scale_inv = (
                deepgemm_post_process_fp8_weight_block(
                    wq=layer.w13_weight.data,
@@ -1046,7 +1046,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight)
            layer.w13_weight.data = w13_weight.data

        if self.use_marlin:
        if self.fp8_backend == Fp8MoeBackend.MARLIN:
            prepare_moe_fp8_layer_for_marlin(
                layer, False, input_dtype=self.marlin_input_dtype
            )
@@ -1054,13 +1054,82 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            del layer.w13_input_scale
            del layer.w2_input_scale

        # NOTE(rob): this is a WIP refactor. We are first migrating
        # all of the kernels in the TP case to use mk. Once this is
        # done, then we will initialzie the TP case and DP/EP case
        # via the same code path (i.e. via maybe_init_modular_kernel).
        # NOTE(rob): in progress migrating all into this format.
        if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
            from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
                FlashInferExperts,
            )
            from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
                FlashInferAllGatherMoEPrepareAndFinalize,
            )

            config = self.get_fused_moe_quant_config(layer)
            assert config is not None
            self.moe_quant_config = config

            self.kernel = mk.FusedMoEModularKernel(
                FlashInferAllGatherMoEPrepareAndFinalize(
                    use_dp=(self.moe.dp_size > 1),
                    use_deepseek_fp8_block_scale=self.block_quant,
                ),
                FlashInferExperts(
                    out_dtype=torch.get_default_dtype(),
                    quant_config=self.moe_quant_config,
                    ep_rank=self.moe.ep_rank,
                    ep_size=self.moe.ep_size,
                    tp_rank=self.moe.tp_rank,
                    tp_size=self.moe.tp_size,
                    use_dp=(self.moe.dp_size > 1),
                    use_deepseek_fp8_block_scale=self.block_quant,
                ),
            )
            self.use_inplace = False

        elif self.fp8_backend in [
            Fp8MoeBackend.DEEPGEMM,
            Fp8MoeBackend.TRITON,
            Fp8MoeBackend.MARLIN,
        ]:
            from vllm.model_executor.layers.fused_moe import (
                TritonOrDeepGemmExperts,
            )
            from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
                MarlinExperts,
            )
            from vllm.model_executor.layers.fused_moe.prepare_finalize import (
                MoEPrepareAndFinalizeNoEP,
            )

            config = self.get_fused_moe_quant_config(layer)
            assert config is not None
            self.moe_quant_config = config
            use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
            allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
            moe_kernel = (
                MarlinExperts(quant_config=self.moe_quant_config)
                if use_marlin
                else TritonOrDeepGemmExperts(
                    quant_config=self.moe_quant_config,
                    allow_deep_gemm=allow_deep_gemm,
                )
            )

            self.kernel = mk.FusedMoEModularKernel(
                MoEPrepareAndFinalizeNoEP(), moe_kernel
            )
            self.use_inplace = True

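The modular-kernel wiring introduced above composes two pieces: a prepare/finalize object (token dispatch and combine) and an experts implementation (the GEMMs). A toy composition with the same shape, where all names are assumed for illustration:

```python
class ToyPrepareFinalize:
    def prepare(self, x):
        return x  # dispatch tokens to experts would happen here

    def finalize(self, y):
        return y  # combining expert outputs would happen here


class ToyExperts:
    def __call__(self, x):
        return x  # expert GEMMs would run here


class ToyModularKernel:
    """Sketch of the FusedMoEModularKernel composition pattern."""

    def __init__(self, prepare_finalize, experts):
        self.pf, self.experts = prepare_finalize, experts

    def __call__(self, x):
        return self.pf.finalize(self.experts(self.pf.prepare(x)))
```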
    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> mk.FusedMoEPrepareAndFinalize | None:
        if (
            self.rocm_aiter_moe_enabled
            or self.use_marlin
            or self.fp8_backend == Fp8MoeBackend.MARLIN
            or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
        ):
            return None
@@ -1092,7 +1161,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            TritonOrDeepGemmExperts,
        )

        assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
        assert (
            self.fp8_backend != Fp8MoeBackend.MARLIN
        ) and not self.rocm_aiter_moe_enabled, (
            "Marlin and ROCm AITER are not supported with all2all yet."
        )

@@ -1106,7 +1177,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            assert max_num_tokens_per_rank is not None

            experts_impl = (
                BatchedDeepGemmExperts if self.allow_deep_gemm else BatchedTritonExperts
                BatchedDeepGemmExperts
                if self.fp8_backend == Fp8MoeBackend.DEEPGEMM
                else BatchedTritonExperts
            )
            logger.debug(
                "%s(%s): max_tokens_per_rank=%s, block_size=%s, per_act_token=%s",
@@ -1141,14 +1214,18 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            )
            return TritonOrDeepGemmExperts(
                quant_config=self.moe_quant_config,
                allow_deep_gemm=self.allow_deep_gemm,
                allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM),
            )

    def get_fused_moe_quant_config(
        self, layer: torch.nn.Module
    ) -> FusedMoEQuantConfig | None:
        if self.use_marlin:
            return None
        if self.fp8_backend == Fp8MoeBackend.MARLIN:
            return fp8_w8a16_moe_quant_config(
                w1_scale=layer.w13_weight_scale,
                w2_scale=layer.w2_weight_scale,
                block_shape=self.weight_block_size,
            )

        return fp8_w8a8_moe_quant_config(
            w1_scale=(
@@ -1179,6 +1256,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
            # TODO(rob): convert this to MK.
            if layer.enable_eplb:
                raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.")
            assert layer.activation == "silu", (
@@ -1231,18 +1309,17 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                apply_router_weight_on_input=layer.apply_router_weight_on_input,
            )

        select_result = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

        topk_weights, topk_ids, zero_expert_result = select_result

        if self.rocm_aiter_moe_enabled:
            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
                rocm_aiter_fused_experts,
            )

            # TODO(rob): convert this to MK.
            result = rocm_aiter_fused_experts(
                x,
                layer.w13_weight,
@@ -1254,80 +1331,21 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                expert_map=layer.expert_map,
                quant_config=self.moe_quant_config,
            )
        elif self.use_marlin:
            assert layer.activation == "silu", (
                f"{layer.activation} not supported for Marlin MoE."
            )
            result = fused_marlin_moe(
        else:
            result = self.kernel(
                x,
                layer.w13_weight,
                layer.w2_weight,
                None,
                None,
                layer.w13_weight_scale,
                layer.w2_weight_scale,
                router_logits,
                topk_weights,
                topk_ids,
                quant_type_id=scalar_types.float8_e4m3fn.id,
                apply_router_weight_on_input=layer.apply_router_weight_on_input,
                global_num_experts=layer.global_num_experts,
                expert_map=layer.expert_map,
                input_dtype=self.marlin_input_dtype,
                workspace=layer.workspace,
            )
        elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
            assert layer.activation == "silu", (
                f"Expected 'silu' activation but got {layer.activation}"
            )
            if not self.block_quant:
                assert (
                    not layer.renormalize and layer.custom_routing_function is not None
                )
                assert layer.scoring_func == "sigmoid", (
                    f"Expected 'sigmoid' scoring func but got {layer.scoring_func}"
                )
            # Delegate to CUTLASS FlashInfer path; function already bound with
            # use_deepseek_fp8_block_scale for block-quant when applicable
            result = self.flashinfer_moe_fn(
                x,
                layer,
                topk_weights,
                topk_ids,
                inplace=False,
                inplace=self.use_inplace,
                activation=layer.activation,
                global_num_experts=layer.global_num_experts,
                expert_map=layer.expert_map,
                apply_router_weight_on_input=layer.apply_router_weight_on_input,
            )
        else:
            from vllm.model_executor.layers.fused_moe import fused_experts

            result = fused_experts(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                inplace=True,
                activation=layer.activation,
                global_num_experts=layer.global_num_experts,
                apply_router_weight_on_input=layer.apply_router_weight_on_input,
                expert_map=layer.expert_map,
                quant_config=self.moe_quant_config,
                allow_deep_gemm=self.allow_deep_gemm,
                allow_cutlass_block_scaled_grouped_gemm=(
                    self.allow_cutlass_block_scaled_grouped_gemm
                ),
            )

        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
            assert not isinstance(result, tuple), (
                "Shared + zero experts are mutually exclusive not yet supported"
            )
            return result, zero_expert_result
        else:
            return result
        return result


class Fp8OnlineMoEMethod(Fp8MoEMethod):
@@ -1471,7 +1489,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
        replace_parameter(layer, "w2_weight", shuffled_w2)

        # Reshuffle weights for MARLIN if needed.
        if self.use_marlin:
        if self.fp8_backend == Fp8MoeBackend.MARLIN:
            prepare_moe_fp8_layer_for_marlin(
                layer, False, input_dtype=self.marlin_input_dtype
            )

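The net effect of the `apply` rewrite above is that only the ROCm AITER and TensorRT-LLM paths remain bespoke; Marlin, FlashInfer CUTLASS, Triton, and DeepGEMM all run through the modular kernel built during `process_weights_after_loading`. A condensed, illustrative sketch of that dispatch (real signatures carry many more arguments):

```python
def apply_dispatch(moe, x, topk_weights, topk_ids):
    """Condensed sketch of the new Fp8MoEMethod.apply dispatch (illustrative)."""
    if moe.rocm_aiter_moe_enabled:
        return moe.rocm_aiter_path(x, topk_weights, topk_ids)  # bespoke path kept
    # Marlin, FlashInfer CUTLASS, Triton and DeepGEMM now all run through the
    # modular kernel assembled in process_weights_after_loading.
    return moe.kernel(x, topk_weights, topk_ids)
```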
@@ -639,7 +639,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
            "fused GGUF MoE method."
        )

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

@@ -900,7 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        assert layer.activation == "silu", "Only SiLU activation is supported."

        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

@@ -51,7 +51,7 @@ class QuantFP8(CustomOp):
        self.column_major_scales = column_major_scales
        self.use_ue8m0 = use_ue8m0

        self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled()
        self.use_aiter = rocm_aiter_ops.is_linear_fp8_enabled()

        self.is_group_quant = group_shape.is_per_group()
        if self.is_group_quant:

@@ -6,13 +6,8 @@ from typing import Any, Optional
import torch
from packaging import version
from torch.nn import Module
from torch.nn.parameter import Parameter

from vllm._ipex_ops import ipex_ops as ops
from vllm.model_executor.layers.fused_moe import (
    FusedMoEMethodBase,
    FusedMoeWeightScaleSupported,
)
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.linear import (
    LinearBase,
@@ -24,14 +19,14 @@ from vllm.model_executor.layers.quantization import (
    QuantizationMethods,
)
from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
from vllm.model_executor.layers.quantization.fp8 import (
    Fp8Config,
    Fp8LinearMethod,
    Fp8OnlineMoEMethod,
)
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    maybe_create_device_identity,
)
from vllm.model_executor.parameter import ModelWeightParameter
from vllm.model_executor.utils import set_weight_attrs
from vllm.model_executor.utils import replace_parameter
from vllm.platforms import current_platform

MIN_IPEX_VERSION = "2.6.0"
@@ -309,44 +304,15 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
    def __init__(self, quant_config: Fp8Config):
        super().__init__(quant_config)

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        maybe_create_device_identity()

        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")
        layer.logical_widths = output_partition_sizes
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition
        layer.orig_dtype = params_dtype
        layer.weight_block_size = None
        weight = ModelWeightParameter(
            data=torch.empty(
                output_size_per_partition,
                input_size_per_partition,
                dtype=params_dtype,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight", weight)

    def process_weights_after_loading(self, layer: Module) -> None:
        if getattr(layer, "_already_called_process_weights_after_loading", False):
            return
        # If checkpoint not serialized fp8, quantize the weights.
        if not self.quant_config.is_checkpoint_fp8_serialized:
            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
            # Update the layer with the new values.
            layer.weight = Parameter(qweight, requires_grad=False)
            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
            replace_parameter(layer, "weight", qweight.data)
            replace_parameter(layer, "weight_scale", weight_scale.data)
            layer.input_scale = None

    def apply(
@@ -363,69 +329,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
        return output


class XPUFp8MoEMethod(FusedMoEMethodBase):
class XPUFp8MoEMethod(Fp8OnlineMoEMethod):
    def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
        super().__init__(layer.moe_config)
        super().__init__(quant_config, layer)
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        layer.intermediate_size_per_partition = intermediate_size_per_partition
        layer.hidden_size = hidden_size
        layer.num_experts = num_experts
        layer.orig_dtype = params_dtype
        layer.weight_block_size = None
        # WEIGHTS
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                2 * intermediate_size_per_partition,
                hidden_size,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # Allocate 2 scales for w1 and w3 respectively.
        # They will be combined to a single scale after weight loading.
        w13_weight_scale = torch.nn.Parameter(
            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
        )
        w2_weight_scale = torch.nn.Parameter(
            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
        )
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
        layer.register_parameter("w2_weight_scale", w2_weight_scale)

        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
        )
        # INPUT_SCALES
        layer.w13_input_scale = None
        layer.w2_input_scale = None

    def process_weights_after_loading(self, layer: Module) -> None:
        if getattr(layer, "_already_called_process_weights_after_loading", False):
            return
        if not self.quant_config.is_checkpoint_fp8_serialized:
            fp8_dtype = current_platform.fp8_dtype()
            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
@@ -448,8 +359,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
                w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
                    ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
                )
            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
            replace_parameter(layer, "w13_weight", w13_weight)
            replace_parameter(layer, "w2_weight", w2_weight)

        import intel_extension_for_pytorch as ipex

        ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts

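The online-quantization branch above calls `ops.scaled_fp8_quant` per expert. A reference sketch of the underlying per-tensor FP8 quantization math, as an assumed stand-in for the real kernel (not vLLM's implementation):

```python
import torch


def scaled_fp8_quant_ref(w: torch.Tensor):
    """Reference per-tensor FP8 quantization (illustrative only)."""
    finfo = torch.finfo(torch.float8_e4m3fn)
    # One scale mapping the tensor's max magnitude onto the FP8 range.
    scale = w.abs().max().clamp(min=1e-12) / finfo.max
    q = (w / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return q, scale.float().reshape(1)
```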
@@ -30,7 +30,7 @@ from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
class MarlinLinearKernel(MPLinearKernel):
    @classmethod
    def get_min_capability(cls) -> int:
        return 80
        return 75

    @classmethod
    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:

@@ -55,6 +55,9 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
    select_cutlass_fp8_gemm_impl,
    swap_w13_to_w31,
)
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    W8A8BlockFp8LinearOp,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    get_marlin_input_dtype,
)
@@ -72,9 +75,15 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    Fp8LinearOp,
    cutlass_block_fp8_supported,
    requantize_with_max_scale,
)
from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter
from vllm.model_executor.parameter import (
    BlockQuantScaleParameter,
    ChannelQuantScaleParameter,
    ModelWeightParameter,
    PerTensorScaleParameter,
)
from vllm.scalar_type import scalar_types
from vllm.utils.flashinfer import (
    flashinfer_scaled_fp4_mm,
@@ -88,7 +97,16 @@ if TYPE_CHECKING:

logger = init_logger(__name__)

QUANT_ALGOS = ["FP8", "NVFP4"]
QUANT_ALGOS = [
    # FP8 (per-tensor weight + optional static activation scale).
    "FP8",
    # FP8 per-channel weight scale + per-token activation scale.
    "FP8_PER_CHANNEL_PER_TOKEN",
    # FP8 per-block weight-only (ModelOpt may emit this as lowercase).
    "FP8_PB_WO",
    # FP4
    "NVFP4",
]
KV_CACHE_QUANT_ALGOS = ["FP8"]


@@ -255,6 +273,9 @@ class ModelOptQuantConfigBase(QuantizationConfig):
        if not quant_method:
            raise ValueError("Missing 'quant_algo' in quantization config")

        # Normalize quant_algo for robust matching (ModelOpt may emit lowercase).
        quant_method = str(quant_method).upper()

        if kv_cache_quant_method is None:
            # No KV cache quantization, keep this branch just to have this comment
            pass
@@ -263,6 +284,8 @@ class ModelOptQuantConfigBase(QuantizationConfig):
                f"kv_cache_quant_algo must be a string, got "
                f"{type(kv_cache_quant_method)}"
            )
        else:
            kv_cache_quant_method = kv_cache_quant_method.upper()

        if not isinstance(exclude_modules, list):
            raise ValueError(
@@ -302,17 +325,34 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):

    def __init__(
        self,
        quant_method: str,
        is_checkpoint_fp8_serialized: bool,
        kv_cache_quant_method: str | None,
        exclude_modules: list[str],
    ) -> None:
        super().__init__(exclude_modules)
        self.quant_method = quant_method
        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
        self.kv_cache_quant_method = kv_cache_quant_method
        if is_checkpoint_fp8_serialized:
            logger.warning(
                "Detected ModelOpt fp8 checkpoint. Please note that"
                " the format is experimental and could change."
                "Detected ModelOpt fp8 checkpoint (quant_algo=%s). Please note "
                "that the format is experimental and could change.",
                quant_method,
            )

        # Select LinearMethod implementation based on quant_algo.
        if self.quant_method == "FP8":
            self.LinearMethodCls = ModelOptFp8LinearMethod
        elif self.quant_method == "FP8_PER_CHANNEL_PER_TOKEN":
            self.LinearMethodCls = ModelOptFp8PcPtLinearMethod
        elif self.quant_method == "FP8_PB_WO":
            self.LinearMethodCls = ModelOptFp8PbWoLinearMethod
        else:
            raise ValueError(
                "Unsupported ModelOpt FP8 quant_algo for vLLM: "
                f"{self.quant_method}. Supported: FP8 / "
                "FP8_PER_CHANNEL_PER_TOKEN / FP8_PB_WO."
            )

    def get_name(self) -> QuantizationMethods:
@@ -346,13 +386,13 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):
        if "quantization" in hf_quant_cfg:
            quant_config = hf_quant_cfg["quantization"]
            if isinstance(quant_config, dict):
                quant_algo = quant_config.get("quant_algo", "")
                if "FP8" in quant_algo:
                quant_algo = str(quant_config.get("quant_algo", ""))
                if "FP8" in quant_algo.upper():
                    return "modelopt"
        else:
            # Check for compressed-tensors style config with specific quant_algo
            quant_algo = hf_quant_cfg.get("quant_algo", "")
            if isinstance(quant_algo, str) and "FP8" in quant_algo:
            quant_algo = str(hf_quant_cfg.get("quant_algo", ""))
            if "FP8" in quant_algo.upper():
                return "modelopt"

        return None
@@ -369,7 +409,12 @@ class ModelOptFp8Config(ModelOptQuantConfigBase):
    ) -> "ModelOptFp8Config":
        is_checkpoint_fp8_serialized = "FP8" in quant_method

        return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules)
        return cls(
            quant_method,
            is_checkpoint_fp8_serialized,
            kv_cache_quant_method,
            exclude_modules,
        )


class ModelOptFp8LinearMethod(LinearMethodBase):
@@ -464,6 +509,203 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
        )


class ModelOptFp8PcPtLinearMethod(LinearMethodBase):
    """Linear method for ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoints.

    Expected checkpoint structure (per Linear):
    - weight: fp8-e4m3fn, shape [out, in]
    - weight_scale: fp32, shape [out] (per-output-channel)
    - no input_scale (activations are dynamically quantized per-token)
    """

    def __init__(self, quant_config: ModelOptFp8Config) -> None:
        self.quant_config = quant_config
        self.fp8_linear = Fp8LinearOp(
            act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN
        )

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        del input_size, output_size

        if not self.quant_config.is_checkpoint_fp8_serialized:
            raise ValueError(
                "FP8_PER_CHANNEL_PER_TOKEN currently only supports "
                "FP8-serialized checkpoints."
            )

        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")
        layer.logical_widths = output_partition_sizes
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition

        weight = ModelWeightParameter(
            data=torch.empty(
                output_size_per_partition,
                input_size_per_partition,
                dtype=torch.float8_e4m3fn,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight", weight)

        weight_scale = ChannelQuantScaleParameter(
            data=torch.empty(output_size_per_partition, dtype=torch.float32),
            output_dim=0,
            weight_loader=weight_loader,
        )
        weight_scale[:] = torch.finfo(torch.float32).min
        layer.register_parameter("weight_scale", weight_scale)

    def process_weights_after_loading(self, layer: Module) -> None:
        layer.weight = Parameter(layer.weight.t(), requires_grad=False)
        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        return self.fp8_linear.apply(
            input=x,
            weight=layer.weight,
            weight_scale=layer.weight_scale,
            input_scale=None,
            bias=bias,
        )


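For the per-channel-weight / per-token-activation scheme handled by `ModelOptFp8PcPtLinearMethod`, the math reduces to a scaled matmul with two independent scale vectors. A reference sketch of that computation (assumed for illustration, not vLLM's kernel):

```python
import torch


def w8a8_pc_pt_ref(x: torch.Tensor, w_fp8: torch.Tensor, w_scale: torch.Tensor):
    """Reference per-channel-weight / per-token-activation FP8 GEMM.

    w_fp8: [out, in] float8, w_scale: [out] float32, x: [tokens, in].
    """
    finfo = torch.finfo(torch.float8_e4m3fn)
    # Dynamic per-token activation scale.
    x_scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / finfo.max
    x_q = (x / x_scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    # Quantized matmul emulated in fp32, then rescale per (token, channel).
    y = x_q.float() @ w_fp8.float().t()
    return y * x_scale * w_scale.view(1, -1)
```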
class ModelOptFp8PbWoLinearMethod(LinearMethodBase):
    """Linear method for ModelOpt FP8_PB_WO checkpoints.

    ModelOpt exports `weight_scale` as a 4D tensor:
        [out_blk, 1, in_blk, 1]
    where block size is typically 128 for both dims.

    vLLM executes it as FP8 GEMM with *dynamic per-token* activation quant.
    """

    _WEIGHT_BLOCK_SIZE: tuple[int, int] = (128, 128)

    def __init__(self, quant_config: ModelOptFp8Config) -> None:
        self.quant_config = quant_config
        block_n, block_k = self._WEIGHT_BLOCK_SIZE
        self.weight_block_size = list(self._WEIGHT_BLOCK_SIZE)
        self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
            weight_group_shape=GroupShape(block_n, block_k),
            act_quant_group_shape=GroupShape(1, block_k),
            cutlass_block_fp8_supported=cutlass_block_fp8_supported(),
            use_aiter_and_is_supported=False,
        )

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        del input_size, output_size

        if not self.quant_config.is_checkpoint_fp8_serialized:
            raise ValueError(
                "FP8_PB_WO currently only supports FP8-serialized checkpoints."
            )

        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")
        layer.logical_widths = output_partition_sizes
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition

        # Expose block size so the v2 weight loaders can translate offsets from
        # element-space -> block-space for BlockQuantScaleParameter.
        layer.weight_block_size = self.weight_block_size

        weight = ModelWeightParameter(
            data=torch.empty(
                output_size_per_partition,
                input_size_per_partition,
                dtype=torch.float8_e4m3fn,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight", weight)

        block_n, block_k = self._WEIGHT_BLOCK_SIZE
        if output_size_per_partition % block_n != 0:
            raise ValueError(
                "ModelOpt FP8_PB_WO requires out_features divisible by "
                f"{block_n}, got {output_size_per_partition}."
            )
        if input_size_per_partition % block_k != 0:
            raise ValueError(
                "ModelOpt FP8_PB_WO requires in_features divisible by "
                f"{block_k}, got {input_size_per_partition}."
            )

        out_blks = output_size_per_partition // block_n
        in_blks = input_size_per_partition // block_k

        # Match ModelOpt's exported shape so weight loading works without a
        # custom loader: [out_blk, 1, in_blk, 1]
        weight_scale = BlockQuantScaleParameter(
            data=torch.empty((out_blks, 1, in_blks, 1), dtype=torch.float32),
            input_dim=2,
            output_dim=0,
            weight_loader=weight_loader,
        )
        weight_scale[:] = torch.finfo(torch.float32).min
        layer.register_parameter("weight_scale", weight_scale)

    def process_weights_after_loading(self, layer: Module) -> None:
        # Keep weight in [out, in] layout for W8A8BlockFp8LinearOp.
        layer.weight = Parameter(layer.weight.data, requires_grad=False)

        scale = layer.weight_scale
        if scale.dim() == 4:
            # [out_blk, 1, in_blk, 1] -> [out_blk, in_blk]
            scale = scale.squeeze(1).squeeze(-1)
        elif scale.dim() != 2:
            raise ValueError(
                "Unexpected ModelOpt FP8_PB_WO weight_scale shape: "
                f"{tuple(scale.shape)}."
            )

        layer.weight_scale = Parameter(scale.contiguous(), requires_grad=False)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        return self.w8a8_block_fp8_linear.apply(
            input=x,
            weight=layer.weight,
            weight_scale=layer.weight_scale,
            input_scale=None,
            bias=bias,
        )


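The scale handling in `process_weights_after_loading` squeezes ModelOpt's `[out_blk, 1, in_blk, 1]` export down to `[out_blk, in_blk]`. A self-contained sketch showing how such block scales expand back to element granularity when dequantizing (a reference, not the production path):

```python
import torch


def dequant_block_fp8_ref(w_q: torch.Tensor, scale4d: torch.Tensor,
                          block_n: int = 128, block_k: int = 128) -> torch.Tensor:
    """Expand [out_blk, 1, in_blk, 1] scales and dequantize a block-quantized weight."""
    scale = scale4d.squeeze(1).squeeze(-1)             # -> [out_blk, in_blk]
    scale = scale.repeat_interleave(block_n, dim=0)    # -> [out, in_blk]
    scale = scale.repeat_interleave(block_k, dim=1)    # -> [out, in]
    return w_q.float() * scale
```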
class ModelOptFp8MoEMethod(FusedMoEMethodBase):
    """MoE method for ModelOpt FP8.
    Supports loading FP8 checkpoints with static weight scale and
@@ -796,7 +1038,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
        )

        # Expert selection
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@@ -1599,7 +1841,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
            x_routing, _ = x
        else:
            x_routing = x
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x_routing,
            router_logits=router_logits,
        )

@@ -370,7 +370,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
        from vllm.model_executor.layers.fused_moe import fused_experts

        assert layer.activation == "silu", "Only SiLU activation is supported."
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

@@ -896,7 +896,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
            raise NotImplementedError("EPLB is not supported for mxfp4")

        if self.mxfp4_backend == Mxfp4Backend.MARLIN:
            topk_weights, topk_ids, _ = layer.select_experts(
            topk_weights, topk_ids = layer.select_experts(
                hidden_states=x,
                router_logits=router_logits,
            )
@@ -989,7 +989,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
        ):
            from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe

            topk_weights, topk_ids, _ = layer.select_experts(
            topk_weights, topk_ids = layer.select_experts(
                hidden_states=x,
                router_logits=router_logits,
            )

@@ -338,7 +338,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@@ -530,7 +530,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod):
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )
@@ -738,7 +738,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

@@ -359,7 +359,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
        x: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        topk_weights, topk_ids, _ = layer.select_experts(
        topk_weights, topk_ids = layer.select_experts(
            hidden_states=x,
            router_logits=router_logits,
        )

@@ -31,6 +31,7 @@ from vllm.model_executor.utils import replace_parameter
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.deep_gemm import (
    DeepGemmQuantScaleFMT,
    fp8_gemm_nt,
    is_deep_gemm_e8m0_used,
    is_deep_gemm_supported,
@@ -247,7 +248,6 @@ class W8A8BlockFp8LinearOp:
        self.act_quant_group_shape = act_quant_group_shape
        self.is_deep_gemm_supported = is_deep_gemm_supported()
        self.is_hopper = current_platform.is_device_capability(90)
        self.is_blackwell = current_platform.is_device_capability_family(100)
        self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()

        # Get the correct blockscale mul and input quant operations.
@@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp:
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
    ) -> torch.Tensor:
        if self.use_deep_gemm_e8m0 and self.is_blackwell:
        if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
            q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
                input_2d,
                group_size=self.act_quant_group_shape.col,
@@ -1252,6 +1252,14 @@ def validate_fp8_block_shape(
    """Validate block quantization shapes for tensor parallelism."""
    from vllm.distributed import get_tensor_model_parallel_world_size

    if getattr(layer, "allow_fp8_block_shape_mismatch", False):
        logger.debug(
            "Skipping FP8 block shape validation for layer %s due to detected"
            " mismatch allowance.",
            getattr(layer, "prefix", "<unknown>"),
        )
        return

    tp_size = getattr(layer, "tp_size", get_tensor_model_parallel_world_size())
    block_n, block_k = block_size[0], block_size[1]


@@ -48,7 +48,7 @@ def query_marlin_supported_quant_types(
        -1 if capability_tuple is None else capability_tuple.to_int()
    )

    if device_capability < 80:
    if device_capability < 75:
        return []

    # - has_zp is True: return quant_types that has zero points
@@ -594,9 +594,15 @@ def apply_awq_marlin_linear(

    a_scales = None
    if input_dtype == torch.int8:
        assert quant_type == scalar_types.uint4, (
            "W8A8-INT8 is not supported by marlin kernel."
        )
        reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
        a_scales = a_scales * input_global_scale
    elif input_dtype == torch.float8_e4m3fn:
        assert quant_type == scalar_types.uint4, (
            "INT8 weight + FP8 activation is not supported."
        )
        reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)

    output = ops.gptq_marlin_gemm(
@@ -649,9 +655,15 @@ def apply_rtn_marlin_linear(

    a_scales = None
    if input_dtype == torch.int8:
        assert quant_type == scalar_types.uint4b8, (
            "W8A8-INT8 is not supported by marlin kernel."
        )
        reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
        a_scales = a_scales * input_global_scale
    elif input_dtype == torch.float8_e4m3fn:
        assert quant_type == scalar_types.uint4b8, (
            "INT8 weight + FP8 activation is not supported."
        )
        reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)

    output = ops.gptq_marlin_gemm(

@@ -23,7 +23,7 @@ logger = init_logger(__name__)


def is_fp4_marlin_supported():
    return current_platform.has_device_capability(80)
    return current_platform.has_device_capability(75)


def nvfp4_marlin_process_scales(marlin_scales):
@@ -154,6 +154,12 @@ def prepare_fp4_layer_for_marlin(
    )

    is_nvfp4 = hasattr(layer, "weight_scale_2")
    if input_dtype is not None and input_dtype.itemsize == 1:
        if is_nvfp4:
            raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
        elif input_dtype != torch.float8_e4m3fn:
            raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")

    group_size = 16 if is_nvfp4 else 32

    part_size_n = layer.output_size_per_partition
@@ -231,6 +237,12 @@ def prepare_moe_fp4_layer_for_marlin(
    )

    is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
    if input_dtype is not None and input_dtype.itemsize == 1:
        if is_nvfp4:
            raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
        elif input_dtype != torch.float8_e4m3fn:
            raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")

    group_size = 16 if is_nvfp4 else 32

    e = layer.num_experts

@@ -11,7 +11,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    marlin_make_workspace_new,
    marlin_permute_bias,
    marlin_permute_scales,
    marlin_quant_input,
    should_use_atomic_add_reduce,
)
from vllm.model_executor.utils import replace_parameter
@@ -22,7 +21,7 @@ logger = init_logger(__name__)


def is_fp8_marlin_supported():
    return current_platform.has_device_capability(80)
    return current_platform.has_device_capability(75)


def fp8_fused_exponent_bias_into_scales(scales):
@@ -63,13 +62,11 @@ def apply_fp8_marlin_linear(
    inputs = reshaped_x
    a_scales = None
    if input_dtype is not None and input_dtype.itemsize == 1:
        if input_dtype != torch.float8_e4m3fn:
            raise RuntimeError("FP8 weight + INT8 activation is not supported.")

        inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn)
        # inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn)
        raise RuntimeError("Marlin W8A8 is not supported.")

    output = ops.gptq_marlin_gemm(
        a=reshaped_x,
        a=inputs,
        c=None,
        b_q_weight=weight,
        b_bias=bias,
@@ -102,6 +99,8 @@ def prepare_fp8_layer_for_marlin(
            "be used leveraging the Marlin kernel. This may degrade "
            "performance for compute-heavy workloads."
        )
    if input_dtype is not None and input_dtype.itemsize == 1:
        raise RuntimeError("Marlin W8A8 is not supported.")

    part_size_n = layer.output_size_per_partition
    part_size_k = layer.input_size_per_partition
@@ -145,10 +144,20 @@
    # marlin kernel only supports channel-wise and group-wise quantization,
    # so we need to convert the scales
    if weight_block_size is None:
        logical_widths = getattr(layer, "logical_widths", [])
        if scales.nelement() == 1:
            # tensor-wise quantization -> channel-wise quantization
            # (1, 1) =>(repeat)=> (1, size_n)
            scales = scales.view(1, 1).repeat_interleave(part_size_n, 1)
        elif scales.nelement() == len(logical_widths):
            # tensor-wise quantization with logical_widths ->
            # channel-wise quantization
            assert sum(logical_widths) == part_size_n, (
                f"Sum of logical_widths ({sum(logical_widths)}) must be equal "
                f"to part_size_n ({part_size_n})"
            )
            lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64)
            scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1)
        elif scales.nelement() > 1 and scales.nelement() != part_size_n:
            assert part_size_n % scales.nelement() == 0
            s_size = scales.nelement()
@@ -199,6 +208,8 @@ def prepare_moe_fp8_layer_for_marlin(
            "be used leveraging the Marlin kernel. This may degrade "
            "performance for compute-heavy workloads."
        )
    if input_dtype is not None and input_dtype.itemsize == 1:
        raise RuntimeError("Marlin W8A8 is not supported.")

    e = layer.num_experts
    k = layer.hidden_size

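The new `logical_widths` branch above expands one scale per fused shard into per-channel scales via `repeat_interleave`. A runnable mini-example with illustrative values:

```python
import torch

# One scale per logical shard of a fused weight (e.g. q/k/v widths) expands
# to one scale per output channel, matching prepare_fp8_layer_for_marlin.
logical_widths = [1024, 512, 512]
scales = torch.tensor([0.5, 0.25, 0.125])
lw = torch.tensor(logical_widths, dtype=torch.int64)
per_channel = scales.view(1, -1).repeat_interleave(lw, dim=1)
assert per_channel.shape == (1, sum(logical_widths))
```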
@@ -178,6 +178,37 @@ class ApplyRotaryEmb(CustomOp):
        output = output.to(origin_dtype)
        return output

    def _pre_process(
        self,
        x: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Size, torch.dtype]:
        origin_shape = x.shape
        if len(origin_shape) == 3:
            # x: [seq_len, num_heads, head_size]
            x = x.unsqueeze(0)

        origin_dtype = x.dtype
        if self.enable_fp32_compute:
            x = x.float()
            cos = cos.float()
            sin = sin.float()

        return x, cos, sin, origin_shape, origin_dtype

    def _post_process(
        self,
        output: torch.Tensor,
        origin_shape: torch.Size,
        origin_dtype: torch.dtype,
    ) -> torch.Tensor:
        if len(origin_shape) == 3:
            output = output.squeeze(0)
        if self.enable_fp32_compute:
            output = output.to(origin_dtype)
        return output

    def forward_native(
        self,
        x: torch.Tensor,
@@ -197,16 +228,7 @@ class ApplyRotaryEmb(CustomOp):
    ) -> torch.Tensor:
        from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb

        origin_dtype = x.dtype
        if self.enable_fp32_compute:
            x = x.float()
            cos = cos.float()
            sin = sin.float()

        origin_shape = x.shape
        if len(origin_shape) == 3:
            # x: [seq_len, num_heads, head_size]
            x = x.unsqueeze(0)
        x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)

        """
        Arguments of apply_rotary_emb() in vllm_flash_attn:
@@ -218,10 +240,7 @@ class ApplyRotaryEmb(CustomOp):
        interleaved = not self.is_neox_style
        output = apply_rotary_emb(x, cos, sin, interleaved)

        if len(origin_shape) == 3:
            output = output.squeeze(0)
        if self.enable_fp32_compute:
            output = output.to(origin_dtype)
        output = self._post_process(output, origin_shape, origin_dtype)
        return output

    def forward_hip(
@@ -231,16 +250,7 @@ class ApplyRotaryEmb(CustomOp):
        sin: torch.Tensor,
    ) -> torch.Tensor:
        if self.apply_rotary_emb_flash_attn is not None:
            origin_dtype = x.dtype
            if self.enable_fp32_compute:
                x = x.float()
                cos = cos.float()
                sin = sin.float()

            origin_shape = x.shape
            if len(origin_shape) == 3:
                # x: [seq_len, num_heads, head_size]
                x = x.unsqueeze(0)
            x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)

            """
            Arguments of apply_rotary() in flash_attn:
@@ -254,10 +264,7 @@ class ApplyRotaryEmb(CustomOp):
                x, cos, sin, interleaved=interleaved
            ).type_as(x)

            if len(origin_shape) == 3:
                output = output.squeeze(0)
            if self.enable_fp32_compute:
                output = output.to(origin_dtype)
            output = self._post_process(output, origin_shape, origin_dtype)
        else:
            # Falling back to PyTorch native implementation.
            output = self.forward_native(x, cos, sin)

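The `_pre_process`/`_post_process` refactor above centralizes a shape-and-dtype round-trip that was previously duplicated across `forward_native` and `forward_hip`. A standalone sketch of that round-trip (values are illustrative):

```python
import torch

# 3-D inputs gain a batch dim for the kernel and lose it afterwards; when
# fp32 compute is enabled, the original dtype is restored at the end.
x = torch.randn(16, 8, 64, dtype=torch.bfloat16)  # [seq, heads, head_size]
origin_shape, origin_dtype = x.shape, x.dtype
x4 = x.unsqueeze(0).float()                        # what _pre_process does
out = x4                                           # stand-in for the rotary kernel
out = out.squeeze(0).to(origin_dtype)              # what _post_process does
assert out.shape == origin_shape and out.dtype == origin_dtype
```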
Some files were not shown because too many files have changed in this diff.