diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 015f48c2520d6..64a262c6cb401 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http ## Performance benchmark quick overview -**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models. +**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models. **Benchmarking Duration**: about 1hr. @@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh Runtime environment variables: -- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. +- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). @@ -34,8 +34,9 @@ Runtime environment variables: See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. -For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. 
-> +> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. +> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead. + ### Latency test Here is an example of one test inside `latency-tests.json`: diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh old mode 100644 new mode 100755 index 34ceefe0996f2..6b6a7e472b9c8 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -49,7 +49,11 @@ check_cpus() { echo "Need at least 1 NUMA to run benchmarking." exit 1 fi - declare -g gpu_type="cpu" + if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then + declare -g gpu_type="arm64-cpu" + else + declare -g gpu_type="cpu" + fi echo "GPU type is $gpu_type" } @@ -207,8 +211,8 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." 
@@ -276,8 +280,8 @@ run_throughput_tests() { # check if there is enough GPU to run the test tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." @@ -393,8 +397,8 @@ run_serving_tests() { # check if there is enough resources to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." 
@@ -496,9 +500,9 @@ run_serving_tests() { main() { local ARCH ARCH='' - if [ "$ON_CPU" == "1" ];then - check_cpus - ARCH='-cpu' + if [[ "$ON_CPU" == "1" ]]; then + check_cpus + ARCH="-$gpu_type" else check_gpus ARCH="$arch_suffix" diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json new file mode 100644 index 0000000000000..fba695041e3ee --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json @@ -0,0 +1,26 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json new file mode 100644 index 0000000000000..63f1f8ab887b3 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json @@ -0,0 +1,130 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [ + 12, + 16, + 24, + 32, + 64, + 128, + 200 + ], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + 
"trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": 
"serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + } + ] +} \ No newline at end of file diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json new file mode 100644 index 0000000000000..da84dd4d0c67a --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json @@ -0,0 +1,27 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index d0965fbd56405..1794df9479e55 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -291,6 +291,7 @@ if __name__ == "__main__": """ Arguments: --version : version string for the current build (e.g., commit hash) + --wheel-dir : directory containing wheel files (default to be same as `version`) --current-objects : path to JSON file containing current S3 objects listing in this version directory --output-dir : directory to store generated index files --alias-to-default : (optional) alias variant name for the default variant @@ -318,6 +319,12 @@ if __name__ == "__main__": 
required=True, help="Directory to store generated index files", ) + parser.add_argument( + "--wheel-dir", + type=str, + default=None, + help="Directory containing wheel files (default to be same as `version`)", + ) parser.add_argument( "--alias-to-default", type=str, @@ -372,7 +379,7 @@ if __name__ == "__main__": print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") - # keep only "official" files for a non-nightly version (specifed by cli args) + # keep only "official" files for a non-nightly version (specified by cli args) PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") if PY_VERSION_RE.match(version): # upload-wheels.sh ensures no "dev" is in args.version @@ -384,9 +391,10 @@ if __name__ == "__main__": print("Nightly version detected, keeping all wheel files.") # Generate index and metadata, assuming wheels and indices are stored as: - # s3://vllm-wheels/{version}/ + # s3://vllm-wheels/{wheel_dir}/ # s3://vllm-wheels// - wheel_base_dir = Path(output_dir).parent / version + wheel_dir = args.wheel_dir or version + wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/") index_base_dir = Path(output_dir) generate_index_and_metadata( diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 3a218a4bb2e6d..1af7f476ae74b 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then echo "Re-generating indices for /$pure_version/" rm -rf "$INDICES_OUTPUT_DIR/*" mkdir -p "$INDICES_OUTPUT_DIR" - $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg + # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path + $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" 
--current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" fi diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 6e20ff3bf38d9..a4d89a46b01ac 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -349,7 +349,9 @@ steps: - label: V1 Test e2e + engine # 65min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 + # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. + # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + agent_pool: mi325_8 # grade: Blocking source_file_dependencies: - vllm/ @@ -964,7 +966,7 @@ steps: - pytest -v -s models/multimodal/processing - label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 80 + timeout_in_minutes: 100 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -973,13 +975,15 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work -- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min - timeout_in_minutes: 180 +- label: Multi-Modal Accuracy Eval (Small Models) # 5min + timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking @@ -989,7 +993,9 @@ steps: - vllm/inputs/ - vllm/v1/core/ commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt - label: Multi-Modal Models Test (Extended) 1 # 60min timeout_in_minutes: 120 @@ -1001,10 +1007,13 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing -- label: Multi-Modal Models Test (Extended) 2 +- label: Multi-Modal Models Test (Extended) 2 #60min + timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -1013,6 +1022,8 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' @@ -1026,6 +1037,8 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not 
core_model' @@ -1243,13 +1256,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1497,7 +1510,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v 
-s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index faf34d95735f4..7b664c4fa15fe 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -319,7 +319,10 @@ steps: # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - pytest -v -s v1/e2e - - pytest -v -s v1/engine + # Run this test standalone for now; + # need to untangle use (implicit) use of spawn/fork across the tests. 
+ - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 @@ -1106,13 +1109,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 
--dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1331,7 +1334,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### @@ -1356,6 +1359,7 @@ steps: - vllm/ - .buildkite/scripts/run-prime-rl-test.sh commands: + - nvidia-smi - bash .buildkite/scripts/run-prime-rl-test.sh - label: DeepSeek V2-Lite Accuracy diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 52d57c99fcfb5..65a981a9d6d00 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -145,7 +145,7 @@ steps: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 
examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - label: Distributed Tests (2 GPUs)(B200) @@ -171,7 +171,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d 
--rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d6447649cd89a..4d7a366f05e37 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,6 +15,7 @@ /vllm/lora @jeejeelee /vllm/reasoning @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang +/vllm/tool_parsers @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/distributed/kv_transfer @NickLucche @ApostaC CMakeLists.txt @tlrmchlsmth @LucasWilkinson diff --git a/CMakeLists.txt b/CMakeLists.txt index a14496e035d9a..c46fb18d7bfef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -799,24 +799,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() - if(${CMAKE_CUDA_COMPILER_VERSION} 
VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") - message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}") - else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is " - "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " - "if you intend on running FP8 quantized MoE models on Blackwell.") - else() - message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found " - "in CUDA target architectures") - endif() - endif() # # Machete kernels diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/bench_nvfp4_quant.py new file mode 100644 index 0000000000000..7517376535925 --- /dev/null +++ b/benchmarks/kernels/bench_nvfp4_quant.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types +from vllm.triton_utils import triton +from vllm.utils.flashinfer import flashinfer_fp4_quantize + +if not current_platform.has_device_capability(100): + raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)") + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +PROVIDER_CFGS = { + "vllm": dict(backend="vllm", enabled=True), + "flashinfer": dict(backend="flashinfer", enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() 
if v["enabled"]] + + +def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor: + """Compute global scale for FP4 quantization.""" + amax = torch.abs(tensor).max().to(torch.float32) + return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="us (lower is better)", + plot_name="NVFP4 Input Quantization Latency (us)", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensor + a = torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale for activation + a_global_scale = compute_global_scale(a) + + quantiles = [0.5, 0.2, 0.8] + + cfg = PROVIDER_CFGS[provider] + + if cfg["backend"] == "vllm": + # vLLM's FP4 quantization + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: ops.scaled_fp4_quant(a, a_global_scale), + quantiles=quantiles, + ) + elif cfg["backend"] == "flashinfer": + # FlashInfer's FP4 quantization + # Use is_sf_swizzled_layout=True to match vLLM's output format + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ), + quantiles=quantiles, + ) + + # Convert ms to us for better readability at small batch sizes + to_us = lambda t_ms: t_ms * 1000 + return to_us(ms), to_us(max_ms), to_us(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str): + """Test accuracy between vLLM and FlashInfer FP4 quantization.""" + # Create input tensor + a = 
torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale + a_global_scale = compute_global_scale(a) + + # vLLM quantization + vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale) + + # FlashInfer quantization (with swizzled layout to match vLLM's output) + flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ) + flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn) + + # Compare outputs + torch.testing.assert_close( + vllm_fp4, + flashinfer_fp4, + ) + print(f"M={M}, K={K}, dtype={dtype}: PASSED") + + +def test_accuracy(): + """Run accuracy tests across various shapes.""" + print("\n" + "=" * 60) + print("Running accuracy tests: vLLM vs FlashInfer") + print("=" * 60) + + device = "cuda" + dtype = torch.bfloat16 + + # Test various batch sizes and hidden dimensions + Ms = [1, 1024] + Ks = [4096] + + for M in Ms: + for K in Ks: + _test_accuracy_once(M, K, dtype, device) + + print("\nAll accuracy tests passed!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark NVFP4 quantization: vLLM vs FlashInfer" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + parser.add_argument( + "--save-path", + type=str, + default=None, + help="Path to save benchmark results", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="Run accuracy tests", + ) + args = parser.parse_args() + + if args.accuracy: + test_accuracy() + + for K, N, model in prepare_shapes(args): + print(f"\n{model}, N={N} K={K}") + benchmark.run( + print_data=True, + save_path=args.save_path, + N=N, + K=K, + ) + + print("\nBenchmark finished!") diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp index 8ab0bb039c014..682751d67b1cd 100644 --- a/csrc/cpu/utils.hpp +++ b/csrc/cpu/utils.hpp @@ 
-37,10 +37,12 @@ struct VecTypeTrait { }; #endif +#if !defined(__powerpc__) template <> struct VecTypeTrait { using vec_t = vec_op::FP16Vec16; }; +#endif struct Counter { std::atomic counter; diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu index baff8363162ef..a51e1a347e1d4 100644 --- a/csrc/fused_qknorm_rope_kernel.cu +++ b/csrc/fused_qknorm_rope_kernel.cu @@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel( void const* k_weight_void, // RMSNorm weights for key void const* cos_sin_cache_void, // Pre-computed cos/sin cache int64_t const* position_ids, // Position IDs for RoPE - int const num_tokens // Number of tokens + int const num_tokens, // Number of tokens + int const rotary_dim // Dimension for RoPE ) { #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM) if constexpr ((std::is_same_v) || @@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel( // Calculate cache pointer for this position - similar to // pos_encoding_kernels.cu - T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim; - int const embed_dim = head_dim / 2; + T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim; + int const embed_dim = rotary_dim / 2; T_cache const* cos_ptr = cache_ptr; T_cache const* sin_ptr = cache_ptr + embed_dim; - - if constexpr (interleave) { - // Perform interleaving. Use pre-computed cos/sin values. + int const rotary_lanes = rotary_dim / numElemsPerThread; // rotary range + if (laneId < rotary_lanes) { + if constexpr (interleave) { + // Perform interleaving. Use pre-computed cos/sin values. 
#pragma unroll - for (int i = 0; i < numElemsPerThread / 2; ++i) { - int const idx0 = 2 * i; - int const idx1 = 2 * i + 1; + for (int i = 0; i < numElemsPerThread / 2; ++i) { + int const idx0 = 2 * i; + int const idx1 = 2 * i + 1; + // Global dimension index in the head + int const dim_idx = laneId * numElemsPerThread + idx0; - float const val0 = elements[idx0]; - float const val1 = elements[idx1]; + float const val0 = elements[idx0]; + float const val1 = elements[idx1]; - int const dim_idx = laneId * numElemsPerThread + idx0; - int const half_dim = dim_idx / 2; - float const cos_val = - CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); - float const sin_val = - CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + int const half_dim = dim_idx / 2; + float const cos_val = + CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); + float const sin_val = + CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); - elements[idx0] = val0 * cos_val - val1 * sin_val; - elements[idx1] = val0 * sin_val + val1 * cos_val; - } - } else { - // Before data exchange with in warp, we need to sync. - __syncwarp(); - // Get the data from the other half of the warp. Use pre-computed cos/sin - // values. -#pragma unroll - for (int i = 0; i < numElemsPerThread; i++) { - elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16); - if (laneId < 16) { - elements2[i] = -elements2[i]; + elements[idx0] = val0 * cos_val - val1 * sin_val; + elements[idx1] = val0 * sin_val + val1 * cos_val; } + } else { + // Before data exchange with in warp, we need to sync. + __syncwarp(); + int pairOffset = (rotary_dim / 2) / numElemsPerThread; + // Get the data from the other half of the warp. Use pre-computed + // cos/sin values. 
+#pragma unroll + for (int i = 0; i < numElemsPerThread; i++) { + elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset); - int dim_idx = laneId * numElemsPerThread + i; - dim_idx = (dim_idx * 2) % head_dim; - int half_dim = dim_idx / 2; - // Use pre-computed cos/sin from cache - float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); - float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + if (laneId < pairOffset) { + elements2[i] = -elements2[i]; + } + int dim_idx = laneId * numElemsPerThread + i; - elements[i] = elements[i] * cos_val + elements2[i] * sin_val; + dim_idx = (dim_idx * 2) % rotary_dim; + int half_dim = dim_idx / 2; + float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); + float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + + elements[i] = elements[i] * cos_val + elements2[i] * sin_val; + } + // __shfl_xor_sync does not provide memfence. Need to sync again. + __syncwarp(); } - // __shfl_xor_sync does not provide memfence. Need to sync again. - __syncwarp(); } - // Store. 
{ vec_T vec; @@ -312,10 +316,10 @@ template void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_q, int const num_heads_k, int const num_heads_v, int const head_dim, - float const eps, void const* q_weight, - void const* k_weight, void const* cos_sin_cache, - bool const interleave, int64_t const* position_ids, - cudaStream_t stream) { + int const rotary_dim, float const eps, + void const* q_weight, void const* k_weight, + void const* cos_sin_cache, bool const interleave, + int64_t const* position_ids, cudaStream_t stream) { constexpr int blockSize = 256; int const warpsPerBlock = blockSize / 32; @@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; case 128: @@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; case 256: @@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; default: @@ -392,8 +396,11 @@ void fused_qk_norm_rope( "Query weights size must match head dimension"); TORCH_CHECK(k_weight.size(0) == head_dim, "Key weights size must match head dimension"); - TORCH_CHECK(cos_sin_cache.size(1) == head_dim, - "Cos/sin cache dimension must match head_dim"); + + TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even"); + TORCH_CHECK(cos_sin_cache.size(1) <= head_dim, + "rotary_dim must be less than or 
equal to head_dim"); + TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() && qkv.scalar_type() == k_weight.scalar_type(), "qkv, q_weight and k_weight must have the same dtype"); @@ -419,7 +426,8 @@ void fused_qk_norm_rope( qkv.data_ptr(), static_cast(num_tokens), static_cast(num_heads_q), static_cast(num_heads_k), static_cast(num_heads_v), static_cast(head_dim), - static_cast(eps), q_weight.data_ptr(), k_weight.data_ptr(), + static_cast(cos_sin_cache.size(1)), static_cast(eps), + q_weight.data_ptr(), k_weight.data_ptr(), cos_sin_cache.data_ptr(), !is_neox, reinterpret_cast(position_ids.data_ptr()), stream); diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 7539f836ecf37..e0438556dfe5c 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -74,6 +74,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + // Get the global scaling factor, which will be applied to the SF. // Note SFScale is the same as next GEMM's alpha, which is // (448.f / (Alpha_A / 6.f)). 
@@ -101,7 +104,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx, colIdx, numCols, SFout); + rowIdx, colIdx, numKTiles, SFout); out_pos = cvt_warp_fp16_to_fp4(out_silu_mul, SFScaleVal, sf_out); diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index 82c53c2375a31..20191a9bc6160 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -25,6 +25,7 @@ #include #include "dispatch_utils.h" +#include "cuda_utils.h" #include "nvfp4_utils.cuh" #include "launch_bounds_utils.h" @@ -44,6 +45,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + int tid = blockIdx.x * blockDim.x + threadIdx.x; int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD; @@ -112,17 +116,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) // (448.f / (Alpha_A / 6.f)). float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - int factor = CVT_FP4_SF_VEC_SIZE * 4; - // The actual output_scales dim is computed from the padded numCols. 
- int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + SFout + output_scale_offset_by_experts[expert_idx] * numKTiles; auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert); out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } @@ -140,6 +140,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + extern __shared__ uint32_t shared_input_offsets[]; // Load input offsets into shared memory. @@ -202,16 +206,13 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + SFout + output_scale_offset_by_experts[expert_idx] * numKTiles; auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert); out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } @@ -222,12 +223,8 @@ void quant_impl(void* output, void* output_scale, void* input, void* input_global_scale, void* input_offset_by_experts, void* output_scale_offset_by_experts, int m_topk, int k, int n_experts, cudaStream_t stream) { - // TODO: this multiProcessorCount should be cached. 
- int device; - cudaGetDevice(&device); - int multiProcessorCount; - cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, - device); + int multiProcessorCount = + get_device_attribute(cudaDevAttrMultiProcessorCount, -1); // Grid, Block size. // Each thread converts 8 values. diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 6d69852bb4e4f..6acadb4cefd2c 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -38,6 +38,12 @@ __host__ __device__ inline Int round_up(Int x, Int y) { return (x + y - 1) / y * y; } +// Compute effective rows for grid configuration with swizzled SF layouts. +inline int computeEffectiveRows(int m) { + constexpr int ROW_TILE = 128; + return round_up(m, ROW_TILE); +} + // Use UE4M3 by default. template __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) @@ -49,6 +55,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + int sf_m = round_up(numRows, 128); int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE; int sf_n_int = round_up(sf_n_unpadded, 4) / 4; @@ -79,7 +88,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx, colIdx, numCols, SFout); + rowIdx, colIdx, numKTiles, SFout); out_pos = cvt_warp_fp16_to_fp4(in_vec, global_scale, sf_out); @@ -87,43 +96,6 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) } } -template -void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, - int64_t* output, int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, cudaStream_t stream) { - // Grid, Block size. - // Each thread converts 8 values. 
- dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); - // Get number of blocks per SM - int const numBlocksPerSM = - vllm_runtime_blocks_per_sm(static_cast(block.x)); - dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); - - // Launch the cvt kernel. - if (useUE8M0) { - cvt_fp16_to_fp4<<>>( - m, n, input, SFScale, reinterpret_cast(output), - reinterpret_cast(SFOuput)); - } else { - cvt_fp16_to_fp4<<>>( - m, n, input, SFScale, reinterpret_cast(output), - reinterpret_cast(SFOuput)); - } -} - -// Instantiate the function. -template void invokeFP4Quantization(int m, int n, half const* input, - float const* SFScale, int64_t* output, - int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, - cudaStream_t stream); - -template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input, - float const* SFScale, int64_t* output, - int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, - cudaStream_t stream); - } // namespace vllm void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, @@ -147,13 +119,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); - // We don't support e8m0 scales at this moment. - bool useUE8M0 = false; + // Grid, Block size. Each thread converts 8 values. + dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); + int effectiveRows = vllm::computeEffectiveRows(m); + dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM)); VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] { using cuda_type = vllm::CUDATypeConverter::Type; auto input_ptr = static_cast(input.data_ptr()); - vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, - sf_out, useUE8M0, multiProcessorCount, stream); + // NOTE: We don't support e8m0 scales at this moment. 
+ vllm::cvt_fp16_to_fp4<<>>( + m, n, input_ptr, input_sf_ptr, reinterpret_cast(output_ptr), + reinterpret_cast(sf_out)); }); } diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh index 48e4959de9793..4c91af85e1514 100644 --- a/csrc/quantization/fp4/nvfp4_utils.cuh +++ b/csrc/quantization/fp4/nvfp4_utils.cuh @@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) { return b; } +// Compute SF output offset for swizzled tensor core layout. +// SF layout: [numMTiles, numKTiles, 32, 4, 4] +// Caller must precompute: numKTiles = (numCols + 63) / 64 template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { +__device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset( + int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) { static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2); // One pair of threads write one SF to global memory. // TODO: stage through smem for packed STG.32 // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. 
- int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) { + return nullptr; } - return nullptr; + + // SF vector index (16 elements share one SF in the K dimension). + int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // Decompose indices using bitwise ops (all divisors are powers of 2). + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + int32_t mTileIdx = mIdx >> 7; // mIdx / 128 + int32_t outerMIdx = mIdx & 31; // mIdx % 32 + int32_t innerMIdx = (mIdx >> 5) & 3; // (mIdx / 32) % 4 + int32_t kTileIdx = kIdx >> 2; // kIdx / 4 + int32_t innerKIdx = kIdx & 3; // kIdx % 4 + + // Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 + + // outerMIdx * 16 + innerMIdx * 4 + innerKIdx + // Use bitwise OR for non-overlapping lower bits. 
+ int64_t SFOffset = (static_cast(mTileIdx) * numKTiles + kTileIdx) + << 9 | + (outerMIdx << 4) | (innerMIdx << 2) | innerKIdx; + + return reinterpret_cast(SFout) + SFOffset; } // Quantizes the provided PackedVec into the uint32_t output diff --git a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu deleted file mode 100644 index 6c8f6309ef43f..0000000000000 --- a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu +++ /dev/null @@ -1,373 +0,0 @@ -#include "core/registration.h" - -#include -#include - -#include -#include -#include - -#include "cute/tensor.hpp" -#include "cutlass/tensor_ref.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/thread/linear_combination.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/gemm/group_array_problem_shape.hpp" -#include "cutlass/gemm/collective/collective_builder.hpp" -#include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/kernel/gemm_universal.hpp" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/distribution.h" -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/tensor_view_io.h" -#include "cutlass/util/reference/device/gemm.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/util/reference/host/tensor_fill.h" -#include "cutlass/util/reference/host/gett.hpp" -#include "cutlass/util/reference/host/tensor_norm.h" -#include "cutlass/util/reference/host/tensor_compare.h" -#include - -using namespace cute; - -template -__global__ void get_ggemm_starts( - int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, - ElementC** out_offsets, ElementAccumulator** a_scale_offsets, - ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int, - ElementAB* b_base_as_int, 
ElementC* out_base_as_int, - ElementAccumulator* a_scale_base_as_int, - ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int, - LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) { - int expert_id = threadIdx.x; - - if (expert_id >= gridDim.x * blockDim.x) { - return; - } - - int m = problem_sizes[expert_id * 3]; - int n = problem_sizes[expert_id * 3 + 1]; - int k = problem_sizes[expert_id * 3 + 2]; - - int32_t expert_offset = expert_offsets[expert_id]; - int a_stride = expert_offset * k; - int b_stride = expert_id * k * n; - int a_scale_stride = expert_offset * k / 128; - int b_scale_stride = expert_id * k * n / 128 / 128; - - a_offsets[expert_id] = a_base_as_int + a_stride; - b_offsets[expert_id] = b_base_as_int + b_stride; - out_offsets[expert_id] = out_base_as_int + expert_offset * n; - a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride; - b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride; - - LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id; - LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id; - - *layout_sfa_ptr = - ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1)); - *layout_sfb_ptr = - ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1)); -} - -#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \ - ScaleConfig) \ - else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ - get_ggemm_starts<<<1, num_experts, 0, stream>>>( \ - static_cast(expert_offsets.data_ptr()), \ - static_cast(a_ptrs.data_ptr()), \ - static_cast(b_ptrs.data_ptr()), \ - static_cast(out_ptrs.data_ptr()), \ - static_cast(a_scales_ptrs.data_ptr()), \ - static_cast(b_scales_ptrs.data_ptr()), \ - static_cast(a_tensors.data_ptr()), \ - static_cast(b_tensors.data_ptr()), \ - static_cast(out_tensors.data_ptr()), \ - static_cast(a_scales.data_ptr()), \ - static_cast(b_scales.data_ptr()), \ - reinterpret_cast(layout_sfa.data_ptr()), \ - 
reinterpret_cast(layout_sfb.data_ptr()), \ - static_cast(problem_sizes.data_ptr())); \ - } - -template -void run_get_ggemm_starts( - torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, - torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, - torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, - torch::Tensor const& a_tensors, torch::Tensor const& b_tensors, - torch::Tensor out_tensors, torch::Tensor const& a_scales, - torch::Tensor const& b_scales, torch::Tensor const& layout_sfa, - torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) { - TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0); - TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0); - - int num_experts = (int)expert_offsets.size(0); - auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); - - if (false) { - } - __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA, - LayoutSFB, ScaleConfig) - __CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA, - LayoutSFB, ScaleConfig) - else { - TORCH_CHECK(false, "Unsupported output tensor type"); - } -} - -template -void run_blockwise_scaled_group_mm( - torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs, - const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs, - const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a, - const torch::Tensor& stride_b, const torch::Tensor& stride_c, - const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - using ProblemShape = cutlass::gemm::GroupProblemShape>; - - // Types - using ElementA = cutlass::float_e4m3_t; - using ElementB = cutlass::float_e4m3_t; - using ElementC = 
OutType; - using ElementD = ElementC; - using ElementAccumulator = float; - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - using LayoutC = LayoutD; - - // Alignments - static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; - static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; - - using ArchTag = cutlass::arch::Sm100; - using OperatorClass = cutlass::arch::OpClassTensorOp; - - using CollectiveEpilogue = - typename cutlass::epilogue::collective::CollectiveBuilder< - ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape, - typename ScheduleConfig::ClusterShape, - cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, - ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*, - AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp; - - using CollectiveMainloop = - typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, OperatorClass, ElementA, - cute::tuple, - AlignmentA, ElementB, - cute::tuple, - AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape, - typename ScheduleConfig::ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - typename ScheduleConfig::KernelSchedule>::CollectiveOp; - - using GemmKernel = - cutlass::gemm::kernel::GemmUniversal; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - using StrideA = typename Gemm::GemmKernel::InternalStrideA; - using StrideB = typename Gemm::GemmKernel::InternalStrideB; - using StrideC = typename Gemm::GemmKernel::InternalStrideC; - using StrideD = typename Gemm::GemmKernel::InternalStrideD; - - using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape; - int num_experts = (int)expert_offsets.size(0); - - Gemm gemm_op; - - // Mainloop Arguments - typename GemmKernel::MainloopArguments mainloop_args{ - 
static_cast(a_ptrs.data_ptr()), - static_cast(stride_a.data_ptr()), - static_cast(b_ptrs.data_ptr()), - static_cast(stride_b.data_ptr()), - static_cast(a_scales_ptrs.data_ptr()), - reinterpret_cast( - layout_sfa.data_ptr()), - static_cast(b_scales_ptrs.data_ptr()), - reinterpret_cast( - layout_sfb.data_ptr())}; - - int device_id = a_ptrs.device().index(); - static const cutlass::KernelHardwareInfo hw_info{ - device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count( - device_id)}; - - // Epilogue Arguments - typename GemmKernel::EpilogueArguments epilogue_args{ - {}, // epilogue.thread - nullptr, - static_cast(stride_c.data_ptr()), - static_cast(out_ptrs.data_ptr()), - static_cast(stride_c.data_ptr())}; - - UnderlyingProblemShape* problem_sizes_as_shapes = - static_cast(problem_sizes.data_ptr()); - - // Gemm Arguments - typename GemmKernel::Arguments args{ - cutlass::gemm::GemmUniversalMode::kGrouped, - {num_experts, problem_sizes_as_shapes, nullptr}, - mainloop_args, - epilogue_args, - hw_info}; - - at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()}; - const cudaStream_t stream = - at::cuda::getCurrentCUDAStream(a_ptrs.get_device()); - - auto can_implement_status = gemm_op.can_implement(args); - TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, - "Failed to implement GEMM"); - - size_t workspace_size = gemm_op.get_workspace_size(args); - auto const workspace_options = - torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device()); - auto workspace = torch::empty(workspace_size, workspace_options); - - auto status = gemm_op.initialize(args, workspace.data_ptr(), stream); - TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM"); - - status = gemm_op.run(stream); - TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); -} - -template -void blockwise_scaled_group_mm_dispatch_shape( - torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, - const torch::Tensor& 
scales_a, const torch::Tensor& scales_b, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - struct MmaConfig { - using ElementA = cutlass::float_e4m3_t; - using KernelSchedule = - cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100; - using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; - using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig< - 1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>; - using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); - using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); - using LayoutC = cutlass::layout::RowMajor; - using MmaTileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_1, _1, _1>; - }; - - int num_experts = (int)expert_offsets.size(0); - - auto a_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto b_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto out_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto a_scales_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto b_scales_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - - auto layout_sfa = torch::empty( - {num_experts, 5}, - torch::TensorOptions().dtype(torch::kInt32).device(a.device())); - auto layout_sfb = torch::empty( - {num_experts, 5}, - torch::TensorOptions().dtype(torch::kInt32).device(a.device())); - - auto stride_a = torch::full( - {num_experts}, a.size(1), - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto stride_b = torch::full( - {num_experts}, a.size(1), - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto stride_c = torch::full( - {num_experts}, output.size(1), - 
torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - - torch::TensorOptions options_int = - torch::TensorOptions().dtype(torch::kInt64).device(a.device()); - - run_get_ggemm_starts( - expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a, - b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes); - - run_blockwise_scaled_group_mm( - out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a, - stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes, - expert_offsets); -} - -void cutlass_blockwise_scaled_grouped_mm( - torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, - const torch::Tensor& scales_a, const torch::Tensor& scales_b, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor"); - TORCH_CHECK(problem_sizes.size(1) == 3, - "problem_sizes must have shape (num_experts, 3)"); - TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0), - "Number of experts in problem_sizes must match expert_offsets"); - TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, - "problem_sizes must be int32"); - TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn, - "a must be kFloat8_e4m3fn"); - TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn, - "b must be kFloat8_e4m3fn"); - TORCH_CHECK(output.scalar_type() == torch::kBFloat16 || - output.scalar_type() == torch::kHalf, - "output must be bfloat16 or half"); - TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, - "scales_a must be float32"); - TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, - "scales_b must be float32"); - TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32, - "expert_offsets must be int32"); - - TORCH_CHECK(output.dim() == 2, "output must be 2D tensor"); - TORCH_CHECK(a.dim() == 2, "a must be 2D tensor"); - TORCH_CHECK(b.dim() == 3, "b must be 3D tensor"); - TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor"); - 
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor"); - TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor"); - TORCH_CHECK(problem_sizes.size(1) == 3, - "problem_sizes must have shape (num_experts, 3)"); - TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0), - "Number of experts in problem_sizes must match expert_offsets"); - TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, - "problem_sizes must be int32"); - TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor"); - -#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100 - if (output.scalar_type() == torch::kBFloat16) { - blockwise_scaled_group_mm_dispatch_shape( - output, a, b, scales_a, scales_b, problem_sizes, expert_offsets); - } else if (output.scalar_type() == torch::kFloat16) { - blockwise_scaled_group_mm_dispatch_shape( - output, a, b, scales_a, scales_b, problem_sizes, expert_offsets); - } else { - TORCH_CHECK(false, "Unsupported output tensor type"); - } -#endif -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("cutlass_blockwise_scaled_grouped_mm", - &cutlass_blockwise_scaled_grouped_mm); -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 83d4943d62776..461f74ca184fd 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -416,13 +416,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor alpha) -> ()"); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); - // cutlass blockwise scaledgroup GEMM - ops.def( - "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " - "Tensor scales_a, Tensor scales_b, " - "Tensor problem_sizes, Tensor expert_offsets) -> ()"); - // conditionally compiled so impl registration is in source file - // cutlass nvfp4 block scaled group GEMM ops.def( "cutlass_fp4_group_mm(Tensor! 
out, Tensor a, Tensor b," diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1b6bdabc7a539..4c09808a14333 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -130,6 +130,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ && uv pip install --system *.whl ARG COMMON_WORKDIR +ARG BASE_IMAGE # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks @@ -144,4 +145,9 @@ ENV SAFETENSORS_FAST_GPU=1 # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 +# Workaround for ROCm profiler limits +RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf +ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf" +RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt + CMD ["/bin/bash"] diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index a57ee728d9243..c5e94ee1f6928 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,15 +1,15 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete ARG TRITON_BRANCH="57c693b6" ARG TRITON_REPO="https://github.com/ROCm/triton.git" -ARG PYTORCH_BRANCH="1c57644d" -ARG PYTORCH_VISION_BRANCH="v0.23.0" +ARG PYTORCH_BRANCH="89075173" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" +ARG PYTORCH_VISION_BRANCH="v0.24.1" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_AUDIO_BRANCH="v2.9.0" ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git" ARG FA_BRANCH="0e60e394" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="59bd8ff2" +ARG AITER_BRANCH="6af8b687" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -162,4 +162,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: 
${AITER_BRANCH}" >> /app/versions.txt \ - && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/docs/benchmarking/dashboard.md b/docs/benchmarking/dashboard.md index 7cc4d23250df9..4cbc1a6a0a4fb 100644 --- a/docs/benchmarking/dashboard.md +++ b/docs/benchmarking/dashboard.md @@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar ## Manually Trigger the benchmark Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite. -For CPU environment, please use the image with "-cpu" postfix. +For x86 CPU environment, please use the image with "-cpu" postfix. For AArch64 CPU environment, please use the image with "-arm64-cpu" postfix. -Here is an example for docker run command for CPU. +Here is an example for docker run command for CPU. For GPUs skip setting the `ON_CPU` env var. ```bash -docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu +export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch +export HF_TOKEN= +if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then + IMG_SUFFIX="arm64-cpu" +else + IMG_SUFFIX="cpu" +fi +docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_ARM64_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX} ``` Then, run below command inside the docker instance. @@ -26,7 +33,7 @@ When run, benchmark script generates results under **benchmark/results** folder, ### Runtime environment variables -- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. 
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index 735bb2e205332..74c0beb779c7d 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -77,25 +77,20 @@ This complicates the process as we cannot use the out-of-the-box - `.buildkite/release-pipeline.yaml` - `.buildkite/scripts/upload-wheels.sh` -## Address long vLLM build time +## Manually running vLLM builds on BuildKiteCI -When building vLLM with a new PyTorch/CUDA version, no cache will exist -in the vLLM sccache S3 bucket, causing the build job on CI to potentially take more than 5 hours -and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mode, -it doesn't populate the cache, so re-running it to warm up the cache -is ineffective. +When building vLLM with a new PyTorch/CUDA version, the vLLM sccache S3 bucket +will not have any cached artifacts, which can cause CI build jobs to exceed 5 hours. +Furthermore, vLLM's fastcheck pipeline operates in read-only mode and does not +populate the cache, making it ineffective for cache warm-up purposes. -While ongoing efforts like -address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH` -to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`) -when manually triggering a build on Buildkite. This branch accomplishes two things: +To address this, manually trigger a build on Buildkite to accomplish two objectives: -1. 
Increase the timeout limit to 10 hours so that the build doesn't time out. -2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket -to warm it up so that future builds are faster. +1. Run the complete test suite against the PyTorch RC build by setting the environment variables: `RUN_ALL=1` and `NIGHTLY=1` +2. Populate the vLLM sccache S3 bucket with compiled artifacts, enabling faster subsequent builds

- Buildkite new build popup +Buildkite new build popup

## Update all the different vLLM platforms diff --git a/docs/design/paged_attention.md b/docs/design/paged_attention.md index 5cc5878425515..53368ab1a79fa 100644 --- a/docs/design/paged_attention.md +++ b/docs/design/paged_attention.md @@ -139,18 +139,18 @@ token data. const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` -
- ![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" } -
+

+ query +

Each thread defines its own `q_ptr` which points to the assigned query token data on global memory. For example, if `VEC_SIZE` is 4 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs. -
- ![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" } -
+

+ q_vecs +

```cpp __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; @@ -187,9 +187,9 @@ key token at different iterations. As shown above, that `k_ptr` points to key token data based on `k_cache` at assigned block, assigned head and assigned token. -
- ![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" } -
+

+ key +

The diagram above illustrates the memory layout for key data. It assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is @@ -202,9 +202,9 @@ iterations. Inside each rectangle, there are a total 32 vecs (128 elements for one token) that will be processed by 2 threads (one thread group) separately. -
- ![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" } -
+

+ k_vecs +

```cpp K_vec k_vecs[NUM_VECS_PER_THREAD] @@ -361,17 +361,17 @@ later steps. Now, it should store the normalized softmax result of ## Value -
- ![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" } -
+

+ value +

-
- ![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" } -
+

+ logits_vec +

-
- ![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" } -
+

+ v_vec +

Now we need to retrieve the value data and perform dot multiplication with `logits`. Unlike query and key, there is no thread group diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index b02d5ba9e89a2..5c846767bc5b8 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -8,6 +8,16 @@ We recommend installing the library with: pip install nvidia-modelopt ``` +## Supported ModelOpt checkpoint formats + +vLLM detects ModelOpt checkpoints via `hf_quant_config.json` and supports the +following `quantization.quant_algo` values: + +- `FP8`: per-tensor weight scale (+ optional static activation scale). +- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization. +- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks). +- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`). + ## Quantizing HuggingFace Models with PTQ You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory. @@ -80,3 +90,24 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll if __name__ == "__main__": main() ``` + +## Running the OpenAI-compatible server + +To serve a local ModelOpt checkpoint via the OpenAI-compatible API: + +```bash +vllm serve \ + --quantization modelopt \ + --host 0.0.0.0 --port 8000 +``` + +## Testing (local checkpoints) + +vLLM's ModelOpt unit tests are gated by local checkpoint paths and are skipped +by default in CI. 
To run the tests locally: + +```bash +export VLLM_TEST_MODELOPT_FP8_PC_PT_MODEL_PATH= +export VLLM_TEST_MODELOPT_FP8_PB_WO_MODEL_PATH= +pytest -q tests/quantization/test_modelopt.py +``` diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index d26a5e217f314..586117272d3ba 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -17,6 +17,16 @@ The E4M3 format offers higher precision compared to E5M2. However, due to its sm For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel). +### How FP8 KV Cache Works + +The FP8 KV cache implementation follows this workflow: + +1. **Storage**: Key and Value tensors are quantized to FP8 format using scaling factors before being stored in the KV cache +2. **Retrieval**: When needed for attention computation, cached KV tensors are dequantized back to higher precision (FP16/BF16) +3. **Attention**: The attention-value multiplication (softmax output × V) is performed using the dequantized higher-precision V tensor + +This means the final attention computation operates on dequantized values, not FP8 tensors. The quantization reduces memory usage during storage but maintains computation accuracy by using higher precision during the actual attention operations. + ### Performance Impact The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. 
This enables either: diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 70a11d6def566..7b6945cb71c27 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -352,10 +352,17 @@ Supported models: * `zai-org/GLM-4.5` * `zai-org/GLM-4.5-Air` * `zai-org/GLM-4.6` -* `zai-org/GLM-4.6-Air` Flags: `--tool-call-parser glm45` +### GLM-4.7 Models (`glm47`) + +Supported models: + +* `zai-org/GLM-4.7` + +Flags: `--tool-call-parser glm47` + ### Qwen3-Coder Models (`qwen3_xml`) Supported models: diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md index 4940e5781b29a..611e6edf6668b 100644 --- a/docs/getting_started/installation/cpu.arm.inc.md +++ b/docs/getting_started/installation/cpu.arm.inc.md @@ -19,12 +19,12 @@ Pre-built vLLM wheels for Arm are available since version 0.11.2. These wheels c ```bash export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') -uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl ``` ??? console "pip" ```bash - pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu + pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl ``` !!! warning "set `LD_PRELOAD`" diff --git a/docs/getting_started/installation/cpu.x86.inc.md b/docs/getting_started/installation/cpu.x86.inc.md index 01e34eee10539..013750bc537bf 100644 --- a/docs/getting_started/installation/cpu.x86.inc.md +++ b/docs/getting_started/installation/cpu.x86.inc.md @@ -23,12 +23,12 @@ Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. 
To export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') # use uv -uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --index-strategy first-index --torch-backend cpu +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl --torch-backend cpu ``` ??? console "pip" ```bash # use pip - pip install vllm==${VLLM_VERSION}+cpu --extra-index-url https://wheels.vllm.ai/${VLLM_VERSION}/cpu --extra-index-url https://download.pytorch.org/whl/cpu + pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cpu ``` !!! warning "set `LD_PRELOAD`" Before use vLLM CPU installed via wheels, make sure TCMalloc and Intel OpenMP are installed and added to `LD_PRELOAD`: diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 3ffbf63f9a18b..90d4ff96c52f7 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -387,7 +387,7 @@ th { | `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | | `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | -| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | | `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. 
| | ✅︎ | @@ -415,9 +415,10 @@ th { | `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | | `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | +| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | ︎| ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | -| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | +| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | | `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | | `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. 
| ✅︎ | ✅︎ | diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index be0b846995a92..287409fa2b5c1 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -5,130 +5,91 @@ Usage: Single node: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 + -dp=2 \ + -tp=2 Multi-node: Node 0 (assume the node has ip of 10.99.48.128): python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 \ - --node-size=2 \ - --node-rank=0 \ - --master-addr=10.99.48.128 \ - --master-port=13345 + -dp=2 \ + -tp=2 \ + --dp-num-nodes=2 \ + --dp-node-rank=0 \ + --dp-master-addr=10.99.48.128 \ + --dp-master-port=13345 Node 1: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 \ - --node-size=2 \ - --node-rank=1 \ - --master-addr=10.99.48.128 \ - --master-port=13345 + -dp=2 \ + -tp=2 \ + --dp-num-nodes=2 \ + --dp-node-rank=1 \ + --dp-master-addr=10.99.48.128 \ + --dp-master-port=13345 """ import os from time import sleep -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.platforms import current_platform +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.network_utils import get_open_port -def parse_args(): - import argparse +def create_parser(): + parser = FlexibleArgumentParser(description="Data Parallel Inference") - parser = argparse.ArgumentParser(description="Data Parallel Inference") + # Add all engine args + EngineArgs.add_cli_args(parser) + parser.set_defaults( + model="ibm-research/PowerMoE-3b", + enable_expert_parallel=True, + ) + + # Add DP-specific args (separate from engine args to avoid conflicts) parser.add_argument( - "--model", + "--dp-num-nodes", + type=int, + default=1, + help="Total number of nodes for data 
parallel.", + ) + parser.add_argument( + "--dp-node-rank", + type=int, + default=0, + help="Rank of the current node for data parallel.", + ) + parser.add_argument( + "--dp-master-addr", type=str, - default="ibm-research/PowerMoE-3b", - help="Model name or path", - ) - parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size") - parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size") - parser.add_argument( - "--node-size", type=int, default=1, help="Total number of nodes" + default="", + help="Master node IP address for DP coordination.", ) parser.add_argument( - "--node-rank", type=int, default=0, help="Rank of the current node" - ) - parser.add_argument( - "--master-addr", type=str, default="", help="Master node IP address" - ) - parser.add_argument("--master-port", type=int, default=0, help="Master node port") - parser.add_argument( - "--enforce-eager", action="store_true", help="Enforce eager mode execution." - ) - parser.add_argument( - "--trust-remote-code", action="store_true", help="Trust remote code." 
- ) - parser.add_argument( - "--max-num-seqs", + "--dp-master-port", type=int, - default=64, - help=("Maximum number of sequences to be processed in a single iteration."), - ) - parser.add_argument( - "--max-model-len", - type=int, - help=("Maximum number of tokens to be processed in a single iteration."), + default=0, + help="Master node port for DP coordination.", ) parser.add_argument( "--timeout", type=int, default=300, - help=("Number of seconds before unresponsive process is killed."), + help="Number of seconds before unresponsive process is killed.", ) - parser.add_argument( - "--gpu-memory-utilization", - type=float, - default=0.8, - help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."), - ) - parser.add_argument( - "--enable-dbo", - action="store_true", - help=("Enable microbatched execution"), - ) - parser.add_argument( - "--compilation-config", - type=int, - help=("Compilation optimization (O) mode 0-3."), - ) - parser.add_argument( - "--quantization", - type=str, - ) - parser.add_argument( - "--disable-expert-parallel", - dest="enable_expert_parallel", - action="store_false", - help="Disable expert parallel (default: enabled).", - ) - parser.set_defaults(enable_expert_parallel=True) - return parser.parse_args() + + return parser def main( - model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, - GPUs_per_dp_rank, - enforce_eager, - enable_expert_parallel, - trust_remote_code, - max_num_seqs, - max_model_len, - compilation_config, - gpu_memory_utilization, - enable_dbo, - quantization, + engine_args, ): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) @@ -173,19 +134,7 @@ def main( ) # Create an LLM. 
- llm = LLM( - model=model, - tensor_parallel_size=GPUs_per_dp_rank, - enforce_eager=enforce_eager, - enable_expert_parallel=enable_expert_parallel, - trust_remote_code=trust_remote_code, - max_num_seqs=max_num_seqs, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enable_dbo=enable_dbo, - quantization=quantization, - compilation_config=compilation_config, - ) + llm = LLM(**engine_args) outputs = llm.generate(prompts, sampling_params) # Print the outputs. for i, output in enumerate(outputs): @@ -204,22 +153,29 @@ def main( if __name__ == "__main__": - args = parse_args() + parser = create_parser() + args = vars(parser.parse_args()) - dp_size = args.dp_size - tp_size = args.tp_size - node_size = args.node_size - node_rank = args.node_rank + # Extract DP-specific args (pop to remove from engine_args) + dp_size = args.pop("data_parallel_size") + dp_num_nodes = args.pop("dp_num_nodes") + dp_node_rank = args.pop("dp_node_rank") + dp_master_addr = args.pop("dp_master_addr") + dp_master_port = args.pop("dp_master_port") + timeout = args.pop("timeout") - if node_size == 1: + # Remaining args are engine args + engine_args = args + + if dp_num_nodes == 1: dp_master_ip = "127.0.0.1" - dp_master_port = get_open_port() + dp_master_port_val = get_open_port() else: - dp_master_ip = args.master_addr - dp_master_port = args.master_port + dp_master_ip = dp_master_addr + dp_master_port_val = dp_master_port - assert dp_size % node_size == 0, "dp_size should be divisible by node_size" - dp_per_node = dp_size // node_size + assert dp_size % dp_num_nodes == 0, "dp_size should be divisible by dp_num_nodes" + dp_per_node = dp_size // dp_num_nodes from multiprocessing import Process @@ -230,34 +186,24 @@ if __name__ == "__main__": procs = [] for local_dp_rank, global_dp_rank in enumerate( - range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node) + range(dp_node_rank * dp_per_node, (dp_node_rank + 1) * dp_per_node) ): proc = Process( target=main, args=( 
- args.model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, - dp_master_port, - tp_size, - args.enforce_eager, - args.enable_expert_parallel, - args.trust_remote_code, - args.max_num_seqs, - args.max_model_len, - args.compilation_config, - args.gpu_memory_utilization, - args.enable_dbo, - args.quantization, + dp_master_port_val, + engine_args, ), ) proc.start() procs.append(proc) exit_code = 0 for proc in procs: - proc.join(timeout=args.timeout) + proc.join(timeout=timeout) if proc.exitcode is None: print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") proc.kill() diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md index b2c3bb974dfab..2a59f86d15fb7 100644 --- a/examples/online_serving/disaggregated_encoder/README.md +++ b/examples/online_serving/disaggregated_encoder/README.md @@ -38,6 +38,8 @@ Encoder engines should be launched with the following flags: - `--max-num-batched-tokens=` **(default: 2048)** – This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager. +- `--convert "mm_encoder_only"` **(Optional)** - The language model is skipped during initialization to reduce device memory usage. 
**Models using this option must implement the `get_language_model_spec` interface.** + ## Local media inputs To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance: diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 4cf864bdb2de9..af5f2fec402ed 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -313,7 +313,7 @@ async def test_chat_streaming_input_audio( "format": "wav", }, }, - {"type": "text", "text": "What's happening in this audio?"}, + {"type": "text", "text": "What's a short title for this audio?"}, ], } ] diff --git a/tests/entrypoints/openai/test_embedding_shape_validation.py b/tests/entrypoints/openai/test_embedding_shape_validation.py new file mode 100644 index 0000000000000..27060e0be5aee --- /dev/null +++ b/tests/entrypoints/openai/test_embedding_shape_validation.py @@ -0,0 +1,223 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Embedding shape validation in multimodal APIs. + +Tests verify that embeddings with correct ndim but incorrect hidden_size +are rejected before they can cause crashes during model inference. + +Validation is performed by the parser (MultiModalDataParser) and EmbeddingItems +classes, not by CompletionRenderer or MediaIO classes. 
+""" + +import pytest +import torch + +from vllm.multimodal.parse import ( + AudioEmbeddingItems, + ImageEmbeddingItems, + MultiModalDataParser, + VideoEmbeddingItems, +) + + +class TestMultiModalParserShapeValidation: + """Test hidden_size validation in MultiModalDataParser.""" + + def test_image_embeddings_correct_hidden_size_accepted(self): + """Baseline: Image embeddings with correct hidden_size should work.""" + expected_hidden_size = 768 + parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size) + + valid_embeds = torch.randn(2, 100, expected_hidden_size) + + result = parser.parse_mm_data({"image": valid_embeds}) + + assert "image" in result + assert isinstance(result["image"], ImageEmbeddingItems) + assert result["image"].get_count() == 2 + + def test_image_embeddings_wrong_hidden_size_rejected(self): + """Security: Image embeddings with wrong hidden_size should be rejected.""" + expected_hidden_size = 768 + wrong_hidden_size = 4096 + parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size) + + invalid_embeds = torch.randn(2, 100, wrong_hidden_size) + + with pytest.raises(ValueError) as exc_info: + parser.parse_mm_data({"image": invalid_embeds}) + + error_msg = str(exc_info.value).lower() + assert "image" in error_msg + assert "hidden dimension mismatch" in error_msg + + def test_audio_embeddings_wrong_hidden_size_rejected(self): + """Security: Audio embeddings with wrong hidden_size should be rejected.""" + expected_hidden_size = 768 + wrong_hidden_size = 2048 + parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size) + + invalid_embeds = torch.randn(2, 100, wrong_hidden_size) + + with pytest.raises(ValueError) as exc_info: + parser.parse_mm_data({"audio": invalid_embeds}) + + error_msg = str(exc_info.value).lower() + assert "audio" in error_msg + assert "hidden dimension mismatch" in error_msg + + def test_video_embeddings_wrong_hidden_size_rejected(self): + """Security: Video embeddings with wrong 
hidden_size should be rejected.""" + expected_hidden_size = 768 + wrong_hidden_size = 512 + parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size) + + invalid_embeds = torch.randn(2, 100, wrong_hidden_size) + + with pytest.raises(ValueError) as exc_info: + parser.parse_mm_data({"video": invalid_embeds}) + + error_msg = str(exc_info.value).lower() + assert "video" in error_msg + assert "hidden dimension mismatch" in error_msg + + def test_list_of_embeddings_validates_each(self): + """Security: Each embedding in list should be validated.""" + expected_hidden_size = 768 + wrong_hidden_size = 1024 + parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size) + + # List with second tensor having wrong hidden_size + invalid_embeds = [ + torch.randn(100, expected_hidden_size), + torch.randn(100, wrong_hidden_size), + ] + + with pytest.raises(ValueError) as exc_info: + parser.parse_mm_data({"image": invalid_embeds}) + + # Should identify which embedding failed + assert "[1]" in str(exc_info.value) + + def test_validation_disabled_allows_any_size(self): + """When validation disabled (legacy), any hidden_size allowed.""" + parser = MultiModalDataParser(expected_hidden_size=None) + + any_hidden_size = 12345 + embeds = torch.randn(2, 100, any_hidden_size) + + # Should not raise + result = parser.parse_mm_data({"image": embeds}) + assert "image" in result + assert isinstance(result["image"], ImageEmbeddingItems) + + +class TestEmbeddingItemsDirectValidation: + """Direct tests for EmbeddingItems hidden_size validation.""" + + def test_image_embedding_items_validates_batched_tensor(self): + """Test validation for batched (3D) image embeddings.""" + expected = 768 + wrong = 1024 + + # Valid + valid = torch.randn(2, 100, expected) + items = ImageEmbeddingItems(valid, expected_hidden_size=expected) + assert items.get_count() == 2 + + # Invalid + invalid = torch.randn(2, 100, wrong) + with pytest.raises(ValueError) as exc_info: + 
ImageEmbeddingItems(invalid, expected_hidden_size=expected) + + assert str(wrong) in str(exc_info.value) + assert str(expected) in str(exc_info.value) + + def test_image_embedding_items_validates_list_of_tensors(self): + """Test validation for list of 2D image embeddings.""" + expected = 768 + wrong = 512 + + # Valid list + valid_list = [torch.randn(100, expected), torch.randn(50, expected)] + items = ImageEmbeddingItems(valid_list, expected_hidden_size=expected) + assert items.get_count() == 2 + + # Invalid list + invalid_list = [torch.randn(100, expected), torch.randn(50, wrong)] + with pytest.raises(ValueError) as exc_info: + ImageEmbeddingItems(invalid_list, expected_hidden_size=expected) + + assert "[1]" in str(exc_info.value) + + def test_audio_embedding_items_validates(self): + """Test validation for audio embeddings.""" + expected = 768 + wrong = 256 + + invalid = torch.randn(2, 100, wrong) + with pytest.raises(ValueError) as exc_info: + AudioEmbeddingItems(invalid, expected_hidden_size=expected) + + assert "audio" in str(exc_info.value).lower() + + def test_video_embedding_items_validates(self): + """Test validation for video embeddings.""" + expected = 768 + wrong = 384 + + invalid = torch.randn(2, 100, wrong) + with pytest.raises(ValueError) as exc_info: + VideoEmbeddingItems(invalid, expected_hidden_size=expected) + + assert "video" in str(exc_info.value).lower() + + +class TestShapeValidationIntegration: + """Integration tests verifying attack scenarios are blocked.""" + + def test_attack_scenario_multimodal_image(self): + """ + Simulate attack through Chat API with image embeddings. + + Verifies validation occurs in multimodal parser path. 
+ """ + expected_hidden_size = 768 + wrong_hidden_size = 4096 + parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size) + + attack_tensor = torch.randn(1, 100, wrong_hidden_size) + + with pytest.raises(ValueError): + parser.parse_mm_data({"image": attack_tensor}) + + def test_attack_scenario_multimodal_audio(self): + """ + Simulate attack through Chat API with audio embeddings. + + Verifies validation occurs in multimodal parser path. + """ + expected_hidden_size = 768 + wrong_hidden_size = 2048 + parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size) + + attack_tensor = torch.randn(1, 100, wrong_hidden_size) + + with pytest.raises(ValueError): + parser.parse_mm_data({"audio": attack_tensor}) + + def test_attack_scenario_multimodal_video(self): + """ + Simulate attack through Chat API with video embeddings. + + Verifies validation occurs in multimodal parser path. + """ + expected_hidden_size = 768 + wrong_hidden_size = 1024 + parser = MultiModalDataParser(expected_hidden_size=expected_hidden_size) + + attack_tensor = torch.randn(1, 100, wrong_hidden_size) + + with pytest.raises(ValueError): + parser.parse_mm_data({"video": attack_tensor}) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 6f2a50020699c..8ef0d7f277d5f 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib +import importlib.util import json import time @@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages( assert ( "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chat_truncation_content_not_null(client: 
OpenAI, model_name: str): + response = await client.chat.completions.create( + model=model_name, + messages=[{"role": "user", "content": "What is the role of AI in medicine?"}], + temperature=0.0, + max_tokens=250, + ) + + choice = response.choices[0] + assert choice.finish_reason == "length", ( + f"Expected finish_reason='length', got {choice.finish_reason}" + ) + assert choice.message.content is not None, ( + "Content should not be None when truncated" + ) + assert len(choice.message.content) > 0, "Content should not be empty" diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 69d7b1ceedf59..d845913b8ee03 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -15,6 +15,7 @@ from vllm.entrypoints.openai.parser.harmony_utils import get_encoding from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, + ErrorResponse, RequestResponseMetadata, ) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat @@ -52,8 +53,19 @@ def with_tool_parser(request) -> bool: return request.param +@pytest.fixture( + scope="module", + params=[True], + ids=["exclude_tools_when_tool_choice_none"], +) +def exclude_tools_when_tool_choice_none(request) -> bool: + return request.param + + @pytest.fixture(scope="module") -def default_server_args(with_tool_parser: bool): +def default_server_args( + with_tool_parser: bool, exclude_tools_when_tool_choice_none: bool +): args = [ # use half precision for speed and memory savings in CI environment "--enforce-eager", @@ -72,6 +84,8 @@ def default_server_args(with_tool_parser: bool): "--enable-auto-tool-choice", ] ) + if exclude_tools_when_tool_choice_none: + args.append("--exclude-tools-when-tool-choice-none") return args @@ -335,6 +349,69 @@ async def test_gpt_oss_tool_message_array_content( assert response_multi_array.choices[0].message is not None +@pytest.mark.asyncio +async def 
test_gpt_oss_tool_choice_none( + gptoss_client: OpenAI, + with_tool_parser: bool, + exclude_tools_when_tool_choice_none: bool, +): + if not (with_tool_parser and exclude_tools_when_tool_choice_none): + pytest.skip( + "skip tool_choice tests when non-tool or " + "--exclude-tools-when-tool-choice-none not set" + ) + + tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + "state": {"type": "string"}, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["city", "state", "unit"], + }, + }, + } + ] + + messages = [ + { + "role": "user", + "content": "What's the temperature(in degrees Celsius) in Dallas?", + }, + ] + + tool_choice_auto = await gptoss_client.chat.completions.create( + model=GPT_OSS_MODEL_NAME, + messages=messages, + tools=tools, + tool_choice="auto", + temperature=0.0, + ) + msg = tool_choice_auto.choices[0].message + assert len(msg.tool_calls) == 1 + + tool_choice_none = await gptoss_client.chat.completions.create( + model=GPT_OSS_MODEL_NAME, + messages=messages, + tools=tools, + tool_choice="none", + temperature=0.0, + ) + + msg = tool_choice_none.choices[0].message + assert len(msg.tool_calls) == 0 + + MODEL_NAME = "openai-community/gpt2" MODEL_NAME_SHORT = "gpt2" CHAT_TEMPLATE = "Dummy chat template for testing {}" @@ -878,7 +955,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, ], ) @@ -906,7 +982,6 @@ class TestServingChatWithHarmony: input_messages_2, [ {"role": "system"}, - {"role": "developer"}, {"role": "user"}, # The analysis message should be dropped on subsequent inputs because # of the subsequent assistant message to the final channel. 
@@ -966,7 +1041,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1047,7 +1122,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1128,7 +1203,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1178,7 +1253,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the third turn's input - req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_3, _ = serving_chat._make_request_with_harmony(req_3) verify_harmony_messages( input_messages_3, @@ -1241,7 +1316,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the fourth turn's input - req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_4, _ = serving_chat._make_request_with_harmony(req_4) verify_harmony_messages( input_messages_4, @@ -1297,7 +1372,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": 
"developer"}, {"role": "user", "content": messages[0]["content"]}, # The reasoning that would have resulted in an analysis message is # dropped because of a later assistant message to the final channel. @@ -1329,7 +1403,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, { "role": "assistant", @@ -1359,7 +1432,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, { "role": "assistant", @@ -1368,3 +1440,69 @@ class TestServingChatWithHarmony: }, ], ) + + +@pytest.mark.asyncio +async def test_tool_choice_validation_without_parser(): + """Test that tool_choice='required' or named tool without tool_parser + returns an appropriate error message.""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + models = OpenAIServingModels( + engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + ) + # Create serving_chat without tool_parser (enable_auto_tools=False) + serving_chat = OpenAIServingChat( + mock_engine, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None, + enable_auto_tools=False, # No tool parser + ) + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given location", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, + "required": ["location"], + }, + }, + } + ] + + # Test tool_choice="required" without tool_parser + req_required = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "What's the weather?"}], + tools=tools, + 
tool_choice="required", + ) + response_required = await serving_chat.create_chat_completion(req_required) + assert isinstance(response_required, ErrorResponse) + assert "tool_choice" in response_required.error.message + assert "--tool-call-parser" in response_required.error.message + + # Test named tool_choice without tool_parser + req_named = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "What's the weather?"}], + tools=tools, + tool_choice={"type": "function", "function": {"name": "get_weather"}}, + ) + response_named = await serving_chat.create_chat_completion(req_named) + assert isinstance(response_named, ErrorResponse) + assert "tool_choice" in response_named.error.message + assert "--tool-call-parser" in response_named.error.message diff --git a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py new file mode 100644 index 0000000000000..1934d43d5cfb6 --- /dev/null +++ b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py @@ -0,0 +1,212 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Unit tests for harmony streaming delta extraction. 
+""" + +from dataclasses import dataclass, field +from unittest.mock import patch + +import pytest + +from vllm.entrypoints.openai.serving_chat_stream_harmony import ( + extract_harmony_streaming_delta, +) + + +@dataclass +class MockMessage: + """Mock message object for testing.""" + + channel: str | None = None + recipient: str | None = None + + +@dataclass +class MockStreamableParser: + """Mock StreamableParser for testing without openai_harmony dependency.""" + + messages: list[MockMessage] = field(default_factory=list) + + +class TestExtractHarmonyStreamingDelta: + """Tests for extract_harmony_streaming_delta function.""" + + @pytest.mark.parametrize( + "delta_text,expected_content", + [ + ("Hello, world!", "Hello, world!"), + ("", ""), + ], + ) + def test_final_channel_returns_content_delta(self, delta_text, expected_content): + """Test that final channel returns a DeltaMessage with content.""" + parser = MockStreamableParser() + delta_message, tools_streamed = extract_harmony_streaming_delta( + harmony_parser=parser, + cur_channel="final", + cur_recipient=None, + prev_recipient=None, + delta_text=delta_text, + include_reasoning=False, + ) + + assert delta_message is not None + assert delta_message.content == expected_content + assert tools_streamed is False + + @pytest.mark.parametrize( + "include_reasoning,expected_has_message", + [ + (True, True), + (False, False), + ], + ) + def test_analysis_channel_reasoning(self, include_reasoning, expected_has_message): + """Test analysis channel respects include_reasoning flag.""" + parser = MockStreamableParser() + delta_message, tools_streamed = extract_harmony_streaming_delta( + harmony_parser=parser, + cur_channel="analysis", + cur_recipient=None, + prev_recipient=None, + delta_text="Let me think...", + include_reasoning=include_reasoning, + ) + + if expected_has_message: + assert delta_message is not None + assert delta_message.reasoning == "Let me think..." 
+ else: + assert delta_message is None + assert tools_streamed is False + + @pytest.mark.parametrize("channel", ["commentary", "analysis"]) + @patch("vllm.entrypoints.openai.serving_chat_stream_harmony.make_tool_call_id") + def test_new_tool_call(self, mock_make_tool_call_id, channel): + """Test new tool call creation when recipient changes.""" + mock_make_tool_call_id.return_value = "call_test123" + parser = MockStreamableParser() + + delta_message, tools_streamed = extract_harmony_streaming_delta( + harmony_parser=parser, + cur_channel=channel, + cur_recipient="functions.get_weather", + prev_recipient=None, + delta_text="", + include_reasoning=False, + ) + + assert delta_message is not None + assert len(delta_message.tool_calls) == 1 + tool_call = delta_message.tool_calls[0] + assert tool_call.id == "call_test123" + assert tool_call.type == "function" + assert tool_call.function.name == "get_weather" + assert tool_call.function.arguments == "" + assert tool_call.index == 0 + assert tools_streamed is True + + @pytest.mark.parametrize("channel", ["commentary", "analysis"]) + def test_tool_call_argument_streaming(self, channel): + """Test streaming tool call arguments (same recipient).""" + parser = MockStreamableParser() + + delta_message, tools_streamed = extract_harmony_streaming_delta( + harmony_parser=parser, + cur_channel=channel, + cur_recipient="functions.get_weather", + prev_recipient="functions.get_weather", + delta_text='{"location": "Paris"}', + include_reasoning=False, + ) + + assert delta_message is not None + tool_call = delta_message.tool_calls[0] + assert tool_call.id is None + assert tool_call.function.arguments == '{"location": "Paris"}' + assert tool_call.index == 0 + assert tools_streamed is True + + @pytest.mark.parametrize("channel", ["commentary", "analysis"]) + def test_tool_call_empty_arguments_returns_none(self, channel): + """Test empty delta_text with same recipient returns None.""" + parser = MockStreamableParser() + + delta_message, 
tools_streamed = extract_harmony_streaming_delta( + harmony_parser=parser, + cur_channel=channel, + cur_recipient="functions.get_weather", + prev_recipient="functions.get_weather", + delta_text="", + include_reasoning=False, + ) + + assert delta_message is None + assert tools_streamed is False + + def test_tool_call_index_from_previous_messages(self): + """Test tool call index accounts for previous function messages.""" + messages = [ + MockMessage(channel="analysis", recipient=None), # Not counted + MockMessage(channel="commentary", recipient="functions.tool1"), # Counted + MockMessage(channel="final", recipient=None), # Not counted + ] + parser = MockStreamableParser(messages=messages) + + delta_message, _ = extract_harmony_streaming_delta( + harmony_parser=parser, + cur_channel="commentary", + cur_recipient="functions.tool2", + prev_recipient="functions.tool2", + delta_text="args", + include_reasoning=False, + ) + + assert delta_message.tool_calls[0].index == 1 + + @pytest.mark.parametrize( + "channel,recipient", + [ + ("commentary", None), + ("commentary", "browser.search"), + ], + ) + def test_returns_tool_call_preambles(self, channel, recipient): + """Test that tool call preamble text is streamed as content.""" + parser = MockStreamableParser() + delta_text = "some text" + delta_message, tools_streamed = extract_harmony_streaming_delta( + harmony_parser=parser, + cur_channel=channel, + cur_recipient=recipient, + prev_recipient=None, + delta_text=delta_text, + include_reasoning=True, + ) + + assert delta_message.content == delta_text + assert tools_streamed is False + + @pytest.mark.parametrize( + "channel,recipient", + [ + (None, None), + ("unknown_channel", None), + ], + ) + def test_returns_none_for_invalid_inputs(self, channel, recipient): + """Test that invalid channel/recipient combinations return None.""" + parser = MockStreamableParser() + + delta_message, tools_streamed = extract_harmony_streaming_delta( + harmony_parser=parser, + 
cur_channel=channel, + cur_recipient=recipient, + prev_recipient=None, + delta_text="some text", + include_reasoning=True, + ) + + assert delta_message is None + assert tools_streamed is False diff --git a/tests/kernels/core/test_fused_qk_norm_rope.py b/tests/kernels/core/test_fused_qk_norm_rope.py index a23959e353da9..05d61ec02fd29 100644 --- a/tests/kernels/core/test_fused_qk_norm_rope.py +++ b/tests/kernels/core/test_fused_qk_norm_rope.py @@ -13,6 +13,7 @@ DTYPES = [torch.bfloat16, torch.float16] IS_NEOX = [True, False] EPS_VALUES = [1e-5, 1e-6] SEEDS = [13] +PARTIAL_ROPE = [True, False] CUDA_DEVICES = ["cuda:0"] @@ -52,6 +53,7 @@ def _apply_qk_norm_rope( @pytest.mark.parametrize("is_neox", IS_NEOX) @pytest.mark.parametrize("eps", EPS_VALUES) @pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25]) @torch.inference_mode() def test_fused_qk_norm_rope_matches_reference( device: str, @@ -59,6 +61,7 @@ def test_fused_qk_norm_rope_matches_reference( is_neox: bool, eps: float, seed: int, + rotary_ratio: float, ): torch.set_default_device(device) current_platform.seed_everything(seed) @@ -76,10 +79,10 @@ def test_fused_qk_norm_rope_matches_reference( k_norm.weight.data.normal_(mean=1.0, std=0.1) q_weight = q_norm.weight.data k_weight = k_norm.weight.data - + rotary_dim = int(head_dim * rotary_ratio) rope = RotaryEmbedding( head_size=head_dim, - rotary_dim=head_dim, + rotary_dim=rotary_dim, max_position_embeddings=4096, base=10000.0, is_neox_style=is_neox, diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 6078ce44cee9f..537dcae4e74b4 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -258,16 +258,16 @@ class Config: f"{self.fe_supported_types()}." 
) - # Check block quanization support - is_block_quatized = self.quant_block_shape is not None - if is_block_quatized and self.quant_dtype is None: + # Check block quantization support + is_block_quantized = self.quant_block_shape is not None + if is_block_quantized and self.quant_dtype is None: return False, "No block quantization support." - if is_block_quatized and not self.is_block_quant_supported(): + if is_block_quantized and not self.is_block_quant_supported(): return False, "Mismatched block quantization support." # deep_gemm only works with block-quantized - if self.needs_deep_gemm() and not is_block_quatized: + if self.needs_deep_gemm() and not is_block_quantized: return False, "Needs DeepGEMM but not block quantized." # Check dependencies (turn into asserts?) diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py deleted file mode 100644 index 1c10cb3b2c699..0000000000000 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ /dev/null @@ -1,92 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# DeepGEMM Style Cutlass Grouped GEMM Test -# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py - -import random - -import pytest -import torch - -from tests.kernels.moe.utils import per_token_cast_to_fp8 -from tests.kernels.utils import baseline_scaled_mm -from vllm import _custom_ops as ops -from vllm.platforms import current_platform -from vllm.utils.deep_gemm import per_block_cast_to_fp8 -from vllm.utils.math_utils import cdiv - - -@pytest.mark.parametrize( - "num_groups, expected_m_per_group, k, n", - [ - (4, 8192, 7168, 4096), - (4, 8192, 2048, 7168), - (8, 4096, 7168, 4096), - (8, 4096, 2048, 7168), - (32, 1024, 7168, 4096), - (32, 1024, 2048, 7168), - ], -) -@pytest.mark.parametrize("out_dtype", [torch.float16]) -@pytest.mark.skipif( - (lambda x: x is None or x.to_int() != 100)( - 
current_platform.get_device_capability() - ), - reason="Block Scaled Grouped GEMM is only supported on SM100.", -) -def test_cutlass_grouped_gemm( - num_groups: int, - expected_m_per_group: int, - k: int, - n: int, - out_dtype: torch.dtype, -): - device = "cuda" - alignment = 128 - group_ms = [ - int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups) - ] - m = sum([cdiv(m, alignment) * alignment for m in group_ms]) - - x = torch.randn((m, k), device=device, dtype=out_dtype) - y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype) - out = torch.empty((m, n), device=device, dtype=out_dtype) - ref_out = torch.randn((m, n), device=device, dtype=out_dtype) - - ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m] - pb_size = [] - for i in range(num_groups): - pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k]) - problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32) - expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32) - - x_fp8 = per_token_cast_to_fp8(x) - y_fp8 = ( - torch.empty_like(y, dtype=torch.float8_e4m3fn), - torch.empty( - (num_groups, cdiv(n, 128), k // 128), device=device, dtype=torch.float - ), - ) - for i in range(num_groups): - y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128]) - - for i in range(num_groups): - a = x_fp8[0][ep_offset[i] : ep_offset[i + 1]] - a_scale = x_fp8[1][ep_offset[i] : ep_offset[i + 1]] - b = y_fp8[0][i].t() - b_scale = y_fp8[1][i].t() - baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype) - ref_out[ep_offset[i] : ep_offset[i + 1]] = baseline - - ops.cutlass_blockwise_scaled_grouped_mm( - out, - x_fp8[0], - y_fp8[0], - x_fp8[1], - y_fp8[1], - problem_sizes, - expert_offsets[:-1], - ) - - torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index ce99d9691fdc8..fd6ce6bfbd782 100644 --- a/tests/kernels/moe/test_moe.py +++ 
b/tests/kernels/moe/test_moe.py @@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +from vllm.v1.worker.workspace import init_workspace_manager NUM_EXPERTS = [8, 64, 192] EP_SIZE = [1, 4] @@ -487,6 +488,7 @@ def test_mixtral_moe( monkeypatch.setenv("MASTER_ADDR", "localhost") monkeypatch.setenv("MASTER_PORT", "12345") init_distributed_environment() + init_workspace_manager(torch.cuda.current_device()) # Instantiate our and huggingface's MoE blocks vllm_config.compilation_config.static_forward_context = dict() @@ -533,6 +535,11 @@ def test_mixtral_moe( torch.cuda.synchronize() torch.cuda.empty_cache() + # FIXME (zyongye) fix this after we move self.kernel + # assignment in FusedMoE.__init__ + + vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts) + # Run forward passes for both MoE blocks hf_states, _ = hf_moe.forward(hf_inputs) vllm_states = vllm_moe.forward(vllm_inputs) diff --git a/tests/models/multimodal/generation/conftest.py b/tests/models/multimodal/conftest.py similarity index 79% rename from tests/models/multimodal/generation/conftest.py rename to tests/models/multimodal/conftest.py index 26f8586742cea..4243298cdc896 100644 --- a/tests/models/multimodal/generation/conftest.py +++ b/tests/models/multimodal/conftest.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Pytest configuration for vLLM tests.""" +"""Pytest configuration for vLLM multimodal tests.""" import warnings @@ -9,16 +9,13 @@ import torch from vllm.platforms import current_platform -def pytest_configure(config): - """Disable Flash/MemEfficient SDP on ROCm to avoid HF - Transformers accuracy issues. 
- """ +def pytest_collection_modifyitems(config, items): + """Configure ROCm-specific settings based on collected tests.""" if not current_platform.is_rocm(): return skip_patterns = ["test_granite_speech.py"] if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns): - # Skip disabling SDP for Granite Speech tests on ROCm return # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index c5a0b6748f797..6640e1ff9474d 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -173,6 +173,13 @@ VLM_TEST_SETTINGS = { auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner, + vllm_runner_kwargs={ + "attention_config": { + "backend": "ROCM_AITER_FA", + }, + } + if current_platform.is_rocm() + else None, image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[ pytest.mark.core_model, @@ -253,8 +260,19 @@ VLM_TEST_SETTINGS = { image_size_factors=[(0.25, 0.2, 0.15)], vllm_runner_kwargs={ "model_impl": "transformers", + # TODO: [ROCm] Revert this once issue #30167 is resolved + **( + { + "mm_processor_kwargs": { + "min_pixels": 256 * 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, + } + if current_platform.is_rocm() + else {} + ), }, - marks=[large_gpu_mark(min_gb=32)], + marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)], ), #### Extended model tests "aria": VLMTestInfo( @@ -645,7 +663,17 @@ VLM_TEST_SETTINGS = { hf_output_post_proc=model_utils.minimax_vl_01_hf_output, patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner, auto_cls=AutoModelForImageTextToText, - marks=[large_gpu_mark(min_gb=80)], + marks=[ + large_gpu_mark(min_gb=80), + # TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1 + pytest.mark.skipif( + 
current_platform.is_rocm(), + reason=( + "ROCm: Model too large for single GPU; " + "multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn" + ), + ), + ], ), "molmo": VLMTestInfo( models=["allenai/Molmo-7B-D-0924"], diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index 489743c5a29b3..1519a50c1a0c3 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -39,7 +39,7 @@ models = [MODEL_NAME] def granite_speech_attention_config(): """Return attention config for Granite Speech tests on ROCm.""" if current_platform.is_rocm(): - return {"backend": "TRITON_ATTN"} + return {"backend": "ROCM_AITER_FA"} return None diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index cb875436857cf..46fd4249ea4f8 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -138,7 +138,7 @@ def create_batched_mm_kwargs( ) -# TODO(Isotr0py): Don't initalize model during test +# TODO(Isotr0py): Don't initialize model during test @contextmanager def initialize_dummy_model( model_cls: type[nn.Module], diff --git a/tests/models/registry.py b/tests/models/registry.py index fa70e94abd865..82b9303b2a21b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -459,6 +459,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ), "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"), "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True), + "MiMoV2FlashForCausalLM": _HfExamplesInfo( + "XiaomiMiMo/MiMo-V2-Flash", trust_remote_code=True + ), "Dots1ForCausalLM": _HfExamplesInfo("rednote-hilab/dots.llm1.inst"), } diff --git a/tests/multimodal/test_embedding_shape_validation_unit.py 
b/tests/multimodal/test_embedding_shape_validation_unit.py new file mode 100644 index 0000000000000..7966aad4e988c --- /dev/null +++ b/tests/multimodal/test_embedding_shape_validation_unit.py @@ -0,0 +1,249 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Unit tests for embedding shape validation. + +Simple, fast unit tests that can run without server fixtures. +Run with: pytest tests/multimodal/test_embedding_shape_validation_unit.py -v +""" + +import pytest +import torch + +from vllm.multimodal.parse import ( + AudioEmbeddingItems, + ImageEmbeddingItems, +) + + +class TestImageEmbedBasicValidation: + """Test basic ndim validation in image embeddings via ImageEmbeddingItems.""" + + def test_valid_2d_tensor_accepted(self): + """Baseline: 2D tensors should be accepted.""" + valid_tensor = torch.randn(10, 768, dtype=torch.float32) + + # Should not raise - 2D is valid + items = ImageEmbeddingItems(valid_tensor) + assert items.get_count() == 10 + + def test_valid_3d_tensor_accepted(self): + """Baseline: 3D tensors should be accepted.""" + valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32) + + # Should not raise - 3D is valid + items = ImageEmbeddingItems(valid_tensor) + assert items.get_count() == 2 + + def test_valid_list_of_2d_tensors_accepted(self): + """Baseline: List of 2D tensors should be accepted.""" + tensors = [ + torch.randn(10, 768, dtype=torch.float32), + torch.randn(15, 768, dtype=torch.float32), + ] + + # Should not raise + items = ImageEmbeddingItems(tensors) + assert items.get_count() == 2 + + def test_1d_tensor_rejected(self): + """Security: 1D tensors should be rejected (invalid ndim).""" + invalid_tensor = torch.randn(768, dtype=torch.float32) # 1D + + with pytest.raises(ValueError) as exc_info: + ImageEmbeddingItems(invalid_tensor) + + assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value) + + def test_4d_tensor_rejected(self): + """Security: 4D tensors 
should be rejected (invalid ndim).""" + invalid_tensor = torch.randn(1, 2, 10, 768, dtype=torch.float32) # 4D + + with pytest.raises(ValueError) as exc_info: + ImageEmbeddingItems(invalid_tensor) + + assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value) + + def test_hidden_size_validation_correct_size(self): + """Embeddings with correct hidden size should be accepted.""" + expected_hidden_size = 768 + valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32) + + # Should not raise + items = ImageEmbeddingItems( + valid_tensor, expected_hidden_size=expected_hidden_size + ) + assert items.get_count() == 10 + + def test_hidden_size_validation_wrong_size_rejected(self): + """Embeddings with wrong hidden size should be rejected.""" + expected_hidden_size = 768 + wrong_hidden_size = 4096 + invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32) + + with pytest.raises(ValueError) as exc_info: + ImageEmbeddingItems( + invalid_tensor, expected_hidden_size=expected_hidden_size + ) + + error_msg = str(exc_info.value) + assert "hidden dimension mismatch" in error_msg.lower() + assert str(wrong_hidden_size) in error_msg + assert str(expected_hidden_size) in error_msg + + +class TestAudioEmbedBasicValidation: + """Test basic ndim validation in audio embeddings via AudioEmbeddingItems.""" + + def test_valid_2d_tensor_accepted(self): + """Baseline: 2D tensors should be accepted.""" + valid_tensor = torch.randn(10, 768, dtype=torch.float32) + + # Should not raise - 2D is valid + items = AudioEmbeddingItems(valid_tensor) + assert items.get_count() == 10 + + def test_valid_3d_tensor_accepted(self): + """Baseline: 3D tensors should be accepted.""" + valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32) + + # Should not raise - 3D is valid + items = AudioEmbeddingItems(valid_tensor) + assert items.get_count() == 2 + + def test_valid_list_of_2d_tensors_accepted(self): + """Baseline: List of 2D tensors should be accepted.""" + 
tensors = [ + torch.randn(10, 768, dtype=torch.float32), + torch.randn(15, 768, dtype=torch.float32), + ] + + # Should not raise + items = AudioEmbeddingItems(tensors) + assert items.get_count() == 2 + + def test_1d_tensor_rejected(self): + """Security: 1D tensors should be rejected (invalid ndim).""" + invalid_tensor = torch.randn(768, dtype=torch.float32) # 1D + + with pytest.raises(ValueError) as exc_info: + AudioEmbeddingItems(invalid_tensor) + + assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value) + + def test_scalar_rejected(self): + """Security: Scalar tensors should be rejected.""" + invalid_tensor = torch.tensor(1.0) # 0D (scalar) + + with pytest.raises(ValueError): + AudioEmbeddingItems(invalid_tensor) + + def test_hidden_size_validation_correct_size(self): + """Embeddings with correct hidden size should be accepted.""" + expected_hidden_size = 768 + valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32) + + # Should not raise + items = AudioEmbeddingItems( + valid_tensor, expected_hidden_size=expected_hidden_size + ) + assert items.get_count() == 10 + + def test_hidden_size_validation_wrong_size_rejected(self): + """Embeddings with wrong hidden size should be rejected.""" + expected_hidden_size = 768 + wrong_hidden_size = 4096 + invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32) + + with pytest.raises(ValueError) as exc_info: + AudioEmbeddingItems( + invalid_tensor, expected_hidden_size=expected_hidden_size + ) + + error_msg = str(exc_info.value) + assert "hidden dimension mismatch" in error_msg.lower() + assert str(wrong_hidden_size) in error_msg + assert str(expected_hidden_size) in error_msg + + +class TestShapeValidationDoSPrevention: + """ + Tests for DoS prevention through shape validation. + + Verifies that embeddings with incorrect shapes are rejected early, + preventing crashes during model inference. 
+ """ + + def test_prevent_crash_from_wrong_shape_image_embeds(self): + """ + Prevent crash scenario: wrong hidden size in image embeddings. + + Without validation, this would pass initial checks but crash later + during model forward pass when dimensions don't match. + """ + expected_hidden_size = 768 # Typical model hidden size + wrong_hidden_size = 4096 # Wrong size (e.g., Llama-sized) + + wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32) + + # Should be rejected at instantiation time, not during inference + with pytest.raises(ValueError) as exc_info: + ImageEmbeddingItems( + wrong_embedding, expected_hidden_size=expected_hidden_size + ) + + error_msg = str(exc_info.value) + assert "hidden dimension mismatch" in error_msg.lower() + assert str(expected_hidden_size) in error_msg # Expected + assert str(wrong_hidden_size) in error_msg # Received + + def test_prevent_crash_from_wrong_shape_audio_embeds(self): + """ + Prevent crash scenario: wrong hidden size in audio embeddings. 
+ """ + expected_hidden_size = 768 + wrong_hidden_size = 4096 + + wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32) + + with pytest.raises(ValueError) as exc_info: + AudioEmbeddingItems( + wrong_embedding, expected_hidden_size=expected_hidden_size + ) + + error_msg = str(exc_info.value) + assert "hidden dimension mismatch" in error_msg.lower() + + def test_extremely_large_hidden_size_rejected(self): + """Security: Prevent DoS from extremely large embeddings.""" + expected_hidden_size = 768 + huge_hidden_size = 100000 # Large but not extreme to avoid test OOM + + invalid_tensor = torch.randn(10, huge_hidden_size, dtype=torch.float32) + + with pytest.raises(ValueError) as exc_info: + ImageEmbeddingItems( + invalid_tensor, expected_hidden_size=expected_hidden_size + ) + + assert "hidden dimension mismatch" in str(exc_info.value).lower() + + def test_batch_with_mixed_hidden_sizes_rejected(self): + """All embeddings in a list must have the same hidden size.""" + expected_hidden_size = 768 + + # One correct, one wrong + batch = [ + torch.randn(10, expected_hidden_size, dtype=torch.float32), + torch.randn(10, expected_hidden_size + 100, dtype=torch.float32), # Wrong! 
+ ] + + # Should fail on the second one + with pytest.raises(ValueError) as exc_info: + ImageEmbeddingItems(batch, expected_hidden_size=expected_hidden_size) + + assert "hidden dimension mismatch" in str(exc_info.value).lower() + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"]) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 412b21328a325..535f028202275 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -83,7 +83,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") with vllm_runner(model_path, enforce_eager=True) as llm: @@ -161,7 +161,7 @@ def test_compressed_tensors_w8a8_logprobs( current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") if use_aiter: if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL: @@ -231,7 +231,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token( current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") if use_aiter: if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 62203186510ce..a4b6d35987e13 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.quantization.fp8 import ( Fp8Config, Fp8KVCacheMethod, 
Fp8LinearMethod, + Fp8MoeBackend, Fp8MoEMethod, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -216,7 +217,7 @@ def test_scaled_fp8_quant(dtype) -> None: ref_y, inv_scale = ops.scaled_fp8_quant(x, None) ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype) - # Reference dynamic quantizaton + # Reference dynamic quantization y = quantize_ref(x, inv_scale) torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype)) @@ -324,7 +325,10 @@ def test_fp8_reloading( weight_loader=default_weight_loader, ) + # Fp8LinearMethod uses use_marlin + # Fp8MoEMethod uses fp8_backend method.use_marlin = use_marlin + method.fp8_backend = Fp8MoeBackend.MARLIN if use_marlin else None # capture weights format during loading original_metadata = [ diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index 0298994c396f6..154b29d7017ac 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`. """ import os +from typing import NoReturn import pytest import torch @@ -19,6 +20,28 @@ def enable_pickle(monkeypatch): monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") +def _skip(msg: str) -> NoReturn: + pytest.skip(msg) + raise RuntimeError(msg) + + +def _snapshot_download_or_skip(model_id: str) -> str: + try: + from huggingface_hub import snapshot_download + except Exception as e: # pragma: no cover + _skip(f"huggingface_hub is required to download {model_id}: {e}") + + try: + return snapshot_download( + repo_id=model_id, + repo_type="model", + # These checkpoints are already small; download full repo for simplicity. 
+ allow_patterns=["*"], + ) + except Exception as e: + _skip(f"Failed to download {model_id} from the HF Hub: {e}") + + @pytest.mark.skipif( not is_quant_method_supported("modelopt"), reason="ModelOpt FP8 is not supported on this GPU type.", @@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner): output = llm.generate_greedy(["Hello my name is"], max_tokens=4) assert output print(f"ModelOpt FP8 output: {output}") + + +@pytest.mark.skipif( + not is_quant_method_supported("modelopt"), + reason="ModelOpt FP8 is not supported on this GPU type.", +) +def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner): + """Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup.""" + model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt" + model_path = _snapshot_download_or_skip(model_id) + + with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8PcPtLinearMethod, + ) + + assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod) + + assert qkv_proj.weight.dtype == torch.float8_e4m3fn + assert o_proj.weight.dtype == torch.float8_e4m3fn + assert gate_up_proj.weight.dtype == torch.float8_e4m3fn + assert down_proj.weight.dtype == torch.float8_e4m3fn + + # Per-channel scales; activations are dynamically scaled per token. 
+ assert hasattr(qkv_proj, "weight_scale") + assert qkv_proj.weight_scale.dtype == torch.float32 + assert qkv_proj.weight_scale.dim() == 1 + assert not hasattr(qkv_proj, "input_scale") + + assert hasattr(o_proj, "weight_scale") + assert o_proj.weight_scale.dtype == torch.float32 + assert o_proj.weight_scale.dim() == 1 + assert not hasattr(o_proj, "input_scale") + + assert hasattr(gate_up_proj, "weight_scale") + assert gate_up_proj.weight_scale.dtype == torch.float32 + assert gate_up_proj.weight_scale.dim() == 1 + assert not hasattr(gate_up_proj, "input_scale") + + assert hasattr(down_proj, "weight_scale") + assert down_proj.weight_scale.dtype == torch.float32 + assert down_proj.weight_scale.dim() == 1 + assert not hasattr(down_proj, "input_scale") + + llm.apply_model(check_model) + + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) + assert output + print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}") + + +@pytest.mark.skipif( + not is_quant_method_supported("modelopt"), + reason="ModelOpt FP8 is not supported on this GPU type.", +) +def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner): + """Test ModelOpt FP8_PB_WO checkpoint setup.""" + model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo" + model_path = _snapshot_download_or_skip(model_id) + + with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8PbWoLinearMethod, + ) + + assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod) + + assert 
qkv_proj.weight.dtype == torch.float8_e4m3fn + assert o_proj.weight.dtype == torch.float8_e4m3fn + assert gate_up_proj.weight.dtype == torch.float8_e4m3fn + assert down_proj.weight.dtype == torch.float8_e4m3fn + + # Block scales; should be materialized as a 2D [out_blk, in_blk] tensor. + assert hasattr(qkv_proj, "weight_scale") + assert qkv_proj.weight_scale.dtype == torch.float32 + assert qkv_proj.weight_scale.dim() == 2 + + assert hasattr(o_proj, "weight_scale") + assert o_proj.weight_scale.dtype == torch.float32 + assert o_proj.weight_scale.dim() == 2 + + assert hasattr(gate_up_proj, "weight_scale") + assert gate_up_proj.weight_scale.dtype == torch.float32 + assert gate_up_proj.weight_scale.dim() == 2 + + assert hasattr(down_proj, "weight_scale") + assert down_proj.weight_scale.dtype == torch.float32 + assert down_proj.weight_scale.dim() == 2 + + llm.apply_model(check_model) + + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) + assert output + print(f"ModelOpt FP8_PB_WO output: {output}") diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index 2017e34030d60..ebf199a5056fb 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -18,25 +18,37 @@ for i in {1..5}; do echo "Checking metadata.json URL (attempt $i)..." if curl --fail "$meta_json_url" > metadata.json; then echo "INFO: metadata.json URL is valid." - # check whether it is valid json by python + # check whether it is valid json by python (printed to stdout) if python3 -m json.tool metadata.json; then - echo "INFO: metadata.json is valid JSON. Proceeding with the test." + echo "INFO: metadata.json is valid JSON. Proceeding with the check." 
+ # check whether there is an object in the json matching: + # "package_name": "vllm", and "platform_tag" matches the current architecture + # see `determine_wheel_url` in setup.py for more details + if python3 -c "import platform as p,json as j,sys as s; d = j.load(open('metadata.json')); \ + s.exit(int(not any(o.get('package_name') == 'vllm' and p.machine() in o.get('platform_tag') \ + for o in d)))" 2>/dev/null; then + echo "INFO: metadata.json contains a pre-compiled wheel for the current architecture." + break + else + echo "WARN: metadata.json does not have a pre-compiled wheel for the current architecture." + fi else echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!" + echo "INFO: metadata.json content:" + cat metadata.json exit 1 fi - break fi - # failure handling + # failure handling & retry logic if [ $i -eq 5 ]; then - echo "ERROR: metadata.json URL is still not valid after 5 attempts." - echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists." + echo "ERROR: metadata is still not available after 5 attempts." + echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit is available." echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes." echo " NOTE: If it fails, please report in #sig-ci channel." exit 1 else - echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..." - sleep 180 + echo "WARNING: metadata is not available. Retrying after 5 minutes..." 
+ sleep 300 fi done diff --git a/tests/test_attention_backend_registry.py b/tests/test_attention_backend_registry.py new file mode 100644 index 0000000000000..7b90b949aa457 --- /dev/null +++ b/tests/test_attention_backend_registry.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionImpl, +) +from vllm.attention.backends.registry import ( + AttentionBackendEnum, + MambaAttentionBackendEnum, + register_backend, +) + + +class CustomAttentionImpl(AttentionImpl): + """Mock custom attention implementation for testing.""" + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, *args, **kwargs): + """Mock forward pass.""" + pass + + +class CustomAttentionBackend(AttentionBackend): + """Mock custom attention backend for testing.""" + + @staticmethod + def get_name(): + return "CUSTOM" + + @staticmethod + def get_impl_cls(): + return CustomAttentionImpl + + @staticmethod + def get_builder_cls(): + """Mock builder class.""" + return None + + @staticmethod + def get_required_kv_cache_layout(): + """Mock KV cache layout.""" + return None + + +class CustomMambaAttentionImpl(AttentionImpl): + """Mock custom mamba attention implementation for testing.""" + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, *args, **kwargs): + """Mock forward pass.""" + pass + + +class CustomMambaAttentionBackend(AttentionBackend): + """Mock custom mamba attention backend for testing.""" + + @staticmethod + def get_name(): + return "CUSTOM_MAMBA" + + @staticmethod + def get_impl_cls(): + return CustomMambaAttentionImpl + + @staticmethod + def get_builder_cls(): + """Mock builder class.""" + return None + + @staticmethod + def get_required_kv_cache_layout(): + """Mock KV cache layout.""" + return None + + +def test_custom_is_not_alias_of_any_backend(): + # Get all members of 
AttentionBackendEnum + all_backends = list(AttentionBackendEnum) + + # Find any aliases of CUSTOM + aliases = [] + for backend in all_backends: + if backend.name != "CUSTOM" and backend is AttentionBackendEnum.CUSTOM: + aliases.append(backend.name) + + # CUSTOM should not be an alias of any other backend + assert len(aliases) == 0, ( + f"BUG! CUSTOM is an alias of: {', '.join(aliases)}!\n" + f"CUSTOM.value = {repr(AttentionBackendEnum.CUSTOM.value)}\n" + f"This happens when CUSTOM has the same value as another backend.\n" + f"When you register to CUSTOM, you're actually registering to {aliases[0]}!\n" + f"All backend values:\n" + + "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends) + ) + + # Verify CUSTOM has its own unique identity + assert AttentionBackendEnum.CUSTOM.name == "CUSTOM", ( + f"CUSTOM.name should be 'CUSTOM', but got '{AttentionBackendEnum.CUSTOM.name}'" + ) + + +def test_register_custom_backend_with_class_path(): + # Register with explicit class path + register_backend( + backend=AttentionBackendEnum.CUSTOM, + class_path="tests.test_attention_backend_registry.CustomAttentionBackend", + is_mamba=False, + ) + + # Check that CUSTOM backend is registered + assert AttentionBackendEnum.CUSTOM.is_overridden(), ( + "CUSTOM should be overridden after registration" + ) + + # Get the registered class path + class_path = AttentionBackendEnum.CUSTOM.get_path() + assert class_path == "tests.test_attention_backend_registry.CustomAttentionBackend" + + # Get the backend class + backend_cls = AttentionBackendEnum.CUSTOM.get_class() + assert backend_cls.get_name() == "CUSTOM" + assert backend_cls.get_impl_cls() == CustomAttentionImpl + + +def test_mamba_custom_is_not_alias_of_any_backend(): + # Get all mamba backends + all_backends = list(MambaAttentionBackendEnum) + + # Find any aliases of CUSTOM + aliases = [] + for backend in all_backends: + if backend.name != "CUSTOM" and backend is MambaAttentionBackendEnum.CUSTOM: + aliases.append(backend.name) + + # 
CUSTOM should not be an alias of any other backend + assert len(aliases) == 0, ( + f"BUG! MambaAttentionBackendEnum.CUSTOM is an alias of: {', '.join(aliases)}!\n" + f"CUSTOM.value = {repr(MambaAttentionBackendEnum.CUSTOM.value)}\n" + f"All mamba backend values:\n" + + "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends) + ) + + +def test_register_custom_mamba_backend_with_class_path(): + # Register with explicit class path + register_backend( + backend=MambaAttentionBackendEnum.CUSTOM, + class_path="tests.test_attention_backend_registry.CustomMambaAttentionBackend", + is_mamba=True, + ) + + # Check that the backend is registered + assert MambaAttentionBackendEnum.CUSTOM.is_overridden() + + # Get the registered class path + class_path = MambaAttentionBackendEnum.CUSTOM.get_path() + assert ( + class_path + == "tests.test_attention_backend_registry.CustomMambaAttentionBackend" + ) + + # Get the backend class + backend_cls = MambaAttentionBackendEnum.CUSTOM.get_class() + assert backend_cls.get_name() == "CUSTOM_MAMBA" + assert backend_cls.get_impl_cls() == CustomMambaAttentionImpl diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py index e8826eb441a24..44cbdeed45074 100644 --- a/tests/test_routing_simulator.py +++ b/tests/test_routing_simulator.py @@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device): envs.environment_variables[env_name] = lambda s=strategy: s # Test the select_experts method - topk_weights, topk_ids, _ = fused_moe.select_experts( + topk_weights, topk_ids = fused_moe.select_experts( hidden_states=hidden_states, router_logits=router_logits, ) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 770560a5e549e..8c840fd2ac7e0 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import 
importlib import os import signal import time import uuid from dataclasses import dataclass from threading import Thread +from types import SimpleNamespace from typing import Any from unittest.mock import MagicMock @@ -24,7 +26,11 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils.torch_utils import set_default_torch_num_threads from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core import EngineCore -from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient +from vllm.v1.engine.core_client import ( + AsyncMPClient, + EngineCoreClient, + SyncMPClient, +) from vllm.v1.engine.utils import CoreEngineProcManager from vllm.v1.executor.abstract import Executor @@ -60,6 +66,91 @@ def make_request( ) +def _reload_envs_module(): + import vllm.envs as envs_mod + + cache_clear = getattr(getattr(envs_mod, "__getattr__", None), "cache_clear", None) + if cache_clear is not None: + cache_clear() + return importlib.reload(envs_mod) + + +def _reload_core_client_module(): + module = importlib.import_module("vllm.v1.engine.core_client") + return importlib.reload(module) + + +def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch): + timeout_value = 654 + monkeypatch.setenv("VLLM_ENGINE_READY_TIMEOUT_S", str(timeout_value)) + + # Ensure that the environment variable is loaded if caching is enabled + _reload_envs_module() + core_client_mod = _reload_core_client_module() + + poll_timeouts: list[int] = [] + + class ShadowSocket: + def poll(self, timeout: int) -> int: + # Capture the timeout value for each poll call + poll_timeouts.append(timeout) + return 1 + + def recv_multipart(self): + return (b"\x00\x00", b"ready") + + class DummySocket: + def send_multipart(self, _msg, *, copy: bool = False, track: bool = False): + if track: + return SimpleNamespace(done=True) + + def recv_multipart(self, *, copy: bool = False): + return (b"", b"") + + def close(self, *, linger: int = 0): + pass + + def bind(self, _address): + pass 
+ + def connect(self, _address): + pass + + def setsockopt(self, *_args, **_kwargs): + pass + + monkeypatch.setattr(core_client_mod.zmq.Socket, "shadow", lambda *_: ShadowSocket()) + monkeypatch.setattr( + core_client_mod, "make_zmq_socket", lambda *_, **__: DummySocket() + ) + + parallel_config = SimpleNamespace( + data_parallel_size=1, + data_parallel_rank=0, + data_parallel_size_local=1, + data_parallel_rank_local=None, + data_parallel_hybrid_lb=False, + data_parallel_external_lb=False, + ) + vllm_config = SimpleNamespace(parallel_config=parallel_config) + + client = core_client_mod.MPClient( + asyncio_mode=False, + vllm_config=vllm_config, + executor_class=object, + log_stats=False, + client_addresses={ + "input_address": "inproc://input", + "output_address": "inproc://output", + }, + ) + try: + # timeout_value is in seconds, but poll receives milliseconds + assert poll_timeouts == [timeout_value * 1000] + finally: + client.shutdown() + + def loop_until_done(client: EngineCoreClient, outputs: dict): while True: engine_core_outputs = client.get_output().outputs diff --git a/tests/v1/engine/test_preprocess_error_handling.py b/tests/v1/engine/test_preprocess_error_handling.py new file mode 100644 index 0000000000000..0586cc64fa104 --- /dev/null +++ b/tests/v1/engine/test_preprocess_error_handling.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch.cuda + +from vllm import LLM, SamplingParams +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.core import EngineCore + +MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" + + +def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch): + """Test that preprocessing errors are handled gracefully.""" + + assert not torch.cuda.is_initialized(), ( + "fork needs to be used for the engine " + "core process and this isn't possible if cuda is already initialized" + ) + + # Store original method 
to call for non-failing requests + original_preprocess = EngineCore.preprocess_add_request + + # Monkeypatch to make preprocess_add_request raise an exception + # only for requests with "FAIL" in the first token + def conditional_failing_preprocess(self, request: EngineCoreRequest): + # Fail if the first token id is 333 + if request.prompt_token_ids and request.prompt_token_ids[0] == 333: + raise ValueError("Simulated preprocessing error!") + return original_preprocess(self, request) + + monkeypatch.setattr( + EngineCore, "preprocess_add_request", conditional_failing_preprocess + ) + + llm = LLM(model=MODEL_NAME) + + # Create a failing request by crafting a request with an invalid token + # We need to use a direct approach since LLM.generate tokenizes for us + from vllm.inputs import TokensPrompt + + # This should raise an exception due to the preprocessing failure + # Special token id to trigger the failure + failing_prompt = TokensPrompt(prompt_token_ids=[333]) + outputs = llm.generate(failing_prompt, SamplingParams(max_tokens=10)) # type: ignore + assert len(outputs) == 1 + assert len(outputs[0].outputs[0].token_ids) == 0 + assert outputs[0].finished + assert outputs[0].outputs[0].finish_reason == "error" + + # Verify the engine is still functional with a normal request + outputs = llm.generate("Hello, my name is", SamplingParams(max_tokens=10)) + assert len(outputs) == 1 + assert len(outputs[0].outputs[0].token_ids) > 0 + assert outputs[0].outputs[0].finish_reason in ("stop", "length") diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 76a0e8e25a4ae..1e2cc2241ba95 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -547,6 +547,13 @@ def test_spec_decode_logprobs( sampling_params = SamplingParams( temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False ) + penalty_sampling_params = SamplingParams( + temperature=0, + logprobs=top_logprobs, + max_tokens=10, + ignore_eos=False, + 
presence_penalty=-1.0, + ) method, model_name, spec_model_name = model_setup max_model_len = 256 @@ -558,14 +565,17 @@ def test_spec_decode_logprobs( seed=42, logprobs_mode=logprobs_mode, gpu_memory_utilization=0.4, + enable_prefix_caching=False, + ) + ref_results = ref_llm.generate( + [prompt, prompt], [sampling_params, penalty_sampling_params] ) - ref_results = ref_llm.generate([prompt], sampling_params) # Collect logprobs outputs from reference LLM. ref_logprobs = [] - for output in ref_results[0].outputs: - for logprobs in output.logprobs: - for token_id in logprobs: - ref_logprobs.append(logprobs[token_id]) + for results in ref_results: + for output in results.outputs: + for logprobs in output.logprobs: + ref_logprobs.extend(logprobs.values()) del ref_llm torch.cuda.empty_cache() cleanup_dist_env_and_memory() @@ -587,14 +597,17 @@ def test_spec_decode_logprobs( # Force prefill chunking enable_chunked_prefill=True, max_num_batched_tokens=32, + enable_prefix_caching=False, + ) + spec_results = spec_llm.generate( + [prompt, prompt], [sampling_params, penalty_sampling_params] ) - spec_results = spec_llm.generate([prompt], sampling_params) # Collect logprobs outputs from spec decode LLM. 
spec_logprobs = [] - for output in spec_results[0].outputs: - for logprobs in output.logprobs: - for token_id in logprobs: - spec_logprobs.append(logprobs[token_id]) + for results in spec_results: + for output in results.outputs: + for logprobs in output.logprobs: + spec_logprobs.extend(logprobs.values()) del spec_llm torch.cuda.empty_cache() cleanup_dist_env_and_memory() diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 0eae279acf5be..03e3bb7594910 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -761,7 +761,7 @@ class rocm_aiter_ops: @classmethod @if_aiter_supported - def is_linear_fp8_enaled(cls) -> bool: + def is_linear_fp8_enabled(cls) -> bool: return cls.is_linear_enabled() @classmethod diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index cf7f17a033be3..78bd8d4e64115 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -788,20 +788,6 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool: return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability) -def cutlass_blockwise_scaled_grouped_mm( - output: torch.Tensor, - a: torch.Tensor, - b: torch.Tensor, - scales_a: torch.Tensor, - scales_b: torch.Tensor, - problem_sizes: torch.Tensor, - expert_offsets: torch.Tensor, -): - torch.ops._C.cutlass_blockwise_scaled_grouped_mm( - output, a, b, scales_a, scales_b, problem_sizes, expert_offsets - ) - - def cutlass_scaled_fp4_mm( a: torch.Tensor, b: torch.Tensor, diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index ed0021db204ac..416b996df9f22 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -77,7 +77,8 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): ) CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend" # Placeholder for third-party/custom backends - must be registered before use - CUSTOM = "" + # set to None to avoid alias with other backend, whose value is an empty string + 
CUSTOM = None def get_path(self, include_classname: bool = True) -> str: """Get the class path for this backend (respects overrides). @@ -139,7 +140,8 @@ class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend" GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend" # Placeholder for third-party/custom backends - must be registered before use - CUSTOM = "" + # set to None to avoid alias with other backend, whose value is an empty string + CUSTOM = None def get_path(self, include_classname: bool = True) -> str: """Get the class path for this backend (respects overrides). diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py index 16106f3c93a6a..f347fb3fbba51 100644 --- a/vllm/attention/ops/merge_attn_states.py +++ b/vllm/attention/ops/merge_attn_states.py @@ -15,7 +15,7 @@ def merge_attn_states( output_lse: torch.Tensor | None = None, ) -> None: # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel - # is not support for FP8 dtype, fallback to use Triton kernel. + # does not support FP8 dtype, fallback to use Triton kernel. 
def supported_dtypes(o: torch.Tensor) -> bool: return o.dtype in [torch.float32, torch.half, torch.bfloat16] diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index f61c8e9b89c24..c946dbd8a2c4e 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -189,9 +189,14 @@ def kernel_unified_attention_2d( + 1 ) - # adjust for potential padding in the last q_block by considering the - # actual sequence length - max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + if USE_MM_PREFIX: + # image bidirectional attention ranges require a full range + # including q_block padding to make sure doc mask is correct + max_seq_prefix_len = tl.maximum(max_seq_prefix_len, seq_len) + else: + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) # calculate the number of tiles that need to be processed to # cover the longest sequence prefix (due to causal masking, tiles beyond @@ -202,7 +207,8 @@ def kernel_unified_attention_2d( # Default: keep previous global behavior tile_start = 0 tile_end = num_tiles - if SLIDING_WINDOW > 0: + # TODO(Isotr0py): sliding window pruning with image bidirectional mask + if SLIDING_WINDOW > 0 and not USE_MM_PREFIX: # Query rows covered by this Q-block qpos_lo = q_block_local_idx * BLOCK_Q qpos_hi = tl.minimum( @@ -357,6 +363,12 @@ def kernel_unified_attention_2d( L = L * alpha + l_j M = m_j + if SLIDING_WINDOW: + qpos_lo = q_block_local_idx * BLOCK_Q + V = tl.where( + (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0 + ) + # acc : (BLOCK_M, HEAD_SIZE_PADDED) acc += tl.dot(P.to(V.dtype), V) @@ -672,6 +684,12 @@ def kernel_unified_attention_3d( L = L * alpha + l_j M = m_j + if SLIDING_WINDOW: + qpos_lo = q_block_local_idx * BLOCK_Q + V = tl.where( + (context_len + qpos_lo - seq_offset[:, None]) < 
SLIDING_WINDOW, V, 0.0 + ) + # acc : (BLOCK_M, HEAD_SIZE_PADDED) acc += tl.dot(P.to(V.dtype), V) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 0e91dd57420a8..3c77fad41d077 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -18,6 +18,7 @@ from vllm.config.lora import LoRAConfig from vllm.config.model import ( ModelConfig, iter_architecture_defaults, + str_dtype_to_torch_dtype, try_match_architecture_defaults, ) from vllm.config.multimodal import MultiModalConfig @@ -72,6 +73,7 @@ __all__ = [ # From vllm.config.model "ModelConfig", "iter_architecture_defaults", + "str_dtype_to_torch_dtype", "try_match_architecture_defaults", # From vllm.config.multimodal "MultiModalConfig", diff --git a/vllm/config/model.py b/vllm/config/model.py index 1de9d15cf8c52..dd2b7b9d7a786 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -71,7 +71,7 @@ else: logger = init_logger(__name__) RunnerOption = Literal["auto", RunnerType] -ConvertType = Literal["none", "embed", "classify", "reward"] +ConvertType = Literal["none", "embed", "classify", "reward", "mm_encoder_only"] ConvertOption = Literal["auto", ConvertType] TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] @@ -843,12 +843,18 @@ class ModelConfig: producer_name = quant_cfg.get("producer", {}).get("name") if producer_name == "modelopt": quant_algo = quant_cfg.get("quantization", {}).get("quant_algo") - if quant_algo == "FP8": - quant_cfg["quant_method"] = "modelopt" - elif quant_algo == "NVFP4": - quant_cfg["quant_method"] = "modelopt_fp4" - elif quant_algo is not None: - raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") + if quant_algo is not None: + quant_algo_upper = str(quant_algo).upper() + if quant_algo_upper in { + "FP8", + "FP8_PER_CHANNEL_PER_TOKEN", + "FP8_PB_WO", + }: + quant_cfg["quant_method"] = "modelopt" + elif quant_algo_upper == "NVFP4": + 
quant_cfg["quant_method"] = "modelopt_fp4" + else: + raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") return quant_cfg @@ -1849,6 +1855,11 @@ _STR_DTYPE_TO_TORCH_DTYPE = { "bfloat16": torch.bfloat16, } + +def str_dtype_to_torch_dtype(type: str): + return _STR_DTYPE_TO_TORCH_DTYPE.get(type) + + # model_type -> reason _FLOAT16_NOT_SUPPORTED_MODELS = { "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index d94951a0cffc8..bf656cf23de65 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -71,7 +71,11 @@ class EngineClient(ABC): truncate_prompt_tokens: int | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from a pooling model.""" + """Generate outputs for a request from a pooling model. + + NOTE: truncate_prompt_tokens is deprecated in v0.14. + TODO: Remove this argument in v0.15. + """ ... 
@abstractmethod diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 04967cbe268dd..422a8c18e8e98 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -51,6 +51,9 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, UsageInfo, ) +from vllm.entrypoints.openai.serving_chat_stream_harmony import ( + extract_harmony_streaming_delta, +) from vllm.entrypoints.openai.serving_engine import ( GenerationError, OpenAIServing, @@ -253,18 +256,31 @@ class OpenAIServingChat(OpenAIServing): truncate_tool_call_ids(request) validate_request_params(request) - if ( - request.tool_choice == "auto" - and not (self.enable_auto_tools and tool_parser is not None) + # Check if tool parsing is unavailable (common condition) + tool_parsing_unavailable = ( + tool_parser is None and not isinstance(tokenizer, MistralTokenizer) and not self.use_harmony + ) + + # Validate tool_choice when tool parsing is required but unavailable + if tool_parsing_unavailable and request.tool_choice not in ( + None, + "none", ): - # for hf tokenizers, "auto" tools requires - # --enable-auto-tool-choice and --tool-call-parser - return self.create_error_response( - '"auto" tool choice requires ' - "--enable-auto-tool-choice and --tool-call-parser to be set" - ) + if request.tool_choice == "auto" and not self.enable_auto_tools: + # for hf tokenizers, "auto" tools requires + # --enable-auto-tool-choice and --tool-call-parser + return self.create_error_response( + '"auto" tool choice requires ' + "--enable-auto-tool-choice and --tool-call-parser to be set" + ) + elif request.tool_choice != "auto": + # "required" or named tool requires tool parser + return self.create_error_response( + f'tool_choice="{request.tool_choice}" requires ' + "--tool-call-parser to be set" + ) if request.tools is None or ( request.tool_choice == "none" @@ -299,7 +315,10 @@ class OpenAIServingChat(OpenAIServing): ) else: # For GPT-OSS. 
- conversation, engine_prompts = self._make_request_with_harmony(request) + should_include_tools = tool_dicts is not None + conversation, engine_prompts = self._make_request_with_harmony( + request, should_include_tools + ) except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(f"{e} {e.__cause__}") @@ -792,6 +811,11 @@ class OpenAIServingChat(OpenAIServing): delta_text += harmony_parser.last_content_delta or "" cur_channel = harmony_parser.current_channel cur_recipient = harmony_parser.current_recipient + # handle the case where several tokens were generated at once + # including the final token, leading to a delta in the text + # but the current channel to be empty (start state) + if not cur_channel and delta_text: + cur_channel = "final" else: delta_text = output.text @@ -821,64 +845,17 @@ class OpenAIServingChat(OpenAIServing): current_token_ids = as_list(output.token_ids) if self.use_harmony: - if cur_channel == "final": - delta_message = DeltaMessage(content=delta_text) - elif cur_channel == "analysis": - if request.include_reasoning: - delta_message = DeltaMessage(reasoning=delta_text) - else: - delta_message = None - elif ( - cur_channel == "commentary" - and cur_recipient - and cur_recipient.startswith("functions.") - ): - # Count completed tool calls to determine index - base_index = 0 - for msg in harmony_parser.messages: - if ( - msg.channel == "commentary" - and msg.recipient - and msg.recipient.startswith("functions.") - ): - base_index += 1 - - if prev_recipient != cur_recipient: - tool_name = cur_recipient.split("functions.", 1)[1] - delta_message = DeltaMessage( - tool_calls=[ - DeltaToolCall( - id=make_tool_call_id(), - type="function", - function=DeltaFunctionCall( - name=tool_name, - arguments="", - ), - index=base_index, - ) - ] - ) - elif delta_text: - delta_message = DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=base_index, -
function=DeltaFunctionCall( - arguments=delta_text - ), - ) - ] - ) - else: - delta_message = None - - if delta_message is not None: - harmony_tools_streamed[i] = True - elif cur_channel == "commentary": - # Tool call preambles meant to be shown to the user - delta_message = DeltaMessage(content=delta_text) - else: - delta_message = None + delta_message, tools_streamed_flag = ( + extract_harmony_streaming_delta( + harmony_parser=harmony_parser, + cur_channel=cur_channel, + cur_recipient=cur_recipient, + prev_recipient=prev_recipient, + delta_text=delta_text, + include_reasoning=request.include_reasoning, + ) + ) + harmony_tools_streamed[i] |= tools_streamed_flag # handle streaming deltas for tools with named tool_choice elif tool_choice_function_name: if ( @@ -1833,6 +1810,7 @@ class OpenAIServingChat(OpenAIServing): def _make_request_with_harmony( self, request: ChatCompletionRequest, + should_include_tools: bool = True, ): messages: list[OpenAIMessage] = [] @@ -1850,13 +1828,16 @@ class OpenAIServingChat(OpenAIServing): reasoning_effort=request.reasoning_effort, browser_description=None, python_description=None, - with_custom_tools=request.tools is not None, + with_custom_tools=should_include_tools, ) messages.append(sys_msg) # Add developer message. - dev_msg = get_developer_message(tools=request.tools) - messages.append(dev_msg) + if request.tools: + dev_msg = get_developer_message( + tools=request.tools if should_include_tools else None + ) + messages.append(dev_msg) # Add user message. 
messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) diff --git a/vllm/entrypoints/openai/serving_chat_stream_harmony.py b/vllm/entrypoints/openai/serving_chat_stream_harmony.py new file mode 100644 index 0000000000000..1b5ae620651c6 --- /dev/null +++ b/vllm/entrypoints/openai/serving_chat_stream_harmony.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Harmony-specific streaming delta extraction for chat completions. + +This module handles the extraction of DeltaMessage objects from +harmony parser state during streaming chat completions. +""" + +from openai_harmony import StreamableParser + +from vllm.entrypoints.chat_utils import make_tool_call_id +from vllm.entrypoints.openai.protocol import ( + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, +) + + +def extract_harmony_streaming_delta( + harmony_parser: StreamableParser, + cur_channel: str | None, + cur_recipient: str | None, + prev_recipient: str | None, + delta_text: str, + include_reasoning: bool, +) -> tuple[DeltaMessage | None, bool]: + """ + Extract a DeltaMessage from harmony parser state during streaming. + + Args: + harmony_parser: The StreamableParser instance tracking parse state + cur_channel: Current channel ("final", "analysis", "commentary", etc.) 
+ cur_recipient: Current recipient (e.g., "functions.my_func") + prev_recipient: Previous recipient for detecting tool call transitions + delta_text: The text delta to include in the message + include_reasoning: Whether to include reasoning content + + Returns: + A tuple of (DeltaMessage or None, tools_streamed_flag) + """ + tools_streamed = False + + if cur_channel == "final": + delta_message = DeltaMessage(content=delta_text) + elif ( + (cur_channel == "commentary" or cur_channel == "analysis") + and cur_recipient + and cur_recipient.startswith("functions.") + ): + # Count completed tool calls to determine index + base_index = 0 + for msg in harmony_parser.messages: + if ( + (msg.channel == "commentary" or msg.channel == "analysis") + and msg.recipient + and msg.recipient.startswith("functions.") + ): + base_index += 1 + + if prev_recipient != cur_recipient: + tool_name = cur_recipient.split("functions.", 1)[1] + delta_message = DeltaMessage( + tool_calls=[ + DeltaToolCall( + id=make_tool_call_id(), + type="function", + function=DeltaFunctionCall( + name=tool_name, + arguments="", + ), + index=base_index, + ) + ] + ) + elif delta_text: + delta_message = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=base_index, + function=DeltaFunctionCall(arguments=delta_text), + ) + ] + ) + else: + delta_message = None + + if delta_message is not None: + tools_streamed = True + elif cur_channel == "commentary": + # Tool call preambles meant to be shown to the user + delta_message = DeltaMessage(content=delta_text) + elif cur_channel == "analysis": + if include_reasoning: + delta_message = DeltaMessage(reasoning=delta_text) + else: + delta_message = None + else: + delta_message = None + + return delta_message, tools_streamed diff --git a/vllm/envs.py b/vllm/envs.py index f6db42e9124d6..1d4128d74b95c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: str | None = None VLLM_ENGINE_ITERATION_TIMEOUT_S: 
int = 60 + VLLM_ENGINE_READY_TIMEOUT_S: int = 600 VLLM_API_KEY: str | None = None VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False S3_ACCESS_KEY_ID: str | None = None @@ -604,6 +605,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int( os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60") ), + # Timeout in seconds for waiting for engine cores to become ready + # during startup. Default is 600 seconds (10 minutes). + "VLLM_ENGINE_READY_TIMEOUT_S": lambda: int( + os.environ.get("VLLM_ENGINE_READY_TIMEOUT_S", "600") + ), # API key for vLLM API server "VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None), # Whether to log responses from API Server for debugging diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 8fee4038b60b8..3d248e7fb9945 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.utils import activation_without_mul +from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import ( + ZeroExpertFusedMoE, +) from vllm.triton_utils import HAS_TRITON _config: dict[str, Any] | None = None @@ -54,6 +57,7 @@ __all__ = [ "FusedMoEPrepareAndFinalize", "RoutingMethodType", "SharedFusedMoE", + "ZeroExpertFusedMoE", "activation_without_mul", "override_config", "get_config", diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index a9a2990ca2b53..d581e91f36d03 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_Scheme, ) from vllm.model_executor.layers.quantization.utils.quant_utils import 
GroupShape +from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_triton_kernels from vllm.utils.math_utils import cdiv @@ -39,6 +40,7 @@ if has_triton_kernels(): def _get_config_dtype_str( dtype: torch.dtype, use_fp8_w8a8: bool = False, + use_fp8_w8a16: bool = False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, ocp_mx_scheme: str | None = None, @@ -50,6 +52,8 @@ def _get_config_dtype_str( """ if use_fp8_w8a8: return "fp8_w8a8" + elif use_fp8_w8a16: + return "fp8_w8a16" elif use_int8_w8a16: return "int8_w8a16" elif use_int4_w4a16: @@ -319,6 +323,10 @@ class FusedMoEQuantConfig: def use_int8_w8a16(self) -> bool: return self._a1.dtype is None and self._w1.dtype == torch.int8 + @property + def use_fp8_w8a16(self) -> bool: + return self._a1.dtype is None and self._w1.dtype == current_platform.fp8_dtype() + @property def use_int4_w4a16(self) -> bool: return self._a1.dtype is None and self._w1.dtype == "int4" @@ -362,6 +370,7 @@ class FusedMoEQuantConfig: """ return _get_config_dtype_str( use_fp8_w8a8=self.use_fp8_w8a8, + use_fp8_w8a16=self.use_fp8_w8a16, use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, ocp_mx_scheme=self.ocp_mx_scheme, @@ -680,7 +689,6 @@ def int4_w4a16_moe_quant_config( ) -> FusedMoEQuantConfig: """ Construct a quant config for 16-bit float activations and int4 weights. - Note: Activations are pre-quantized. """ group_shape = GroupShape(*block_shape) if block_shape is not None else None return FusedMoEQuantConfig( @@ -691,6 +699,27 @@ def int4_w4a16_moe_quant_config( ) +def fp8_w8a16_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for 16-bit float activations and fp8 weights. 
+ """ + group_shape = GroupShape(*block_shape) if block_shape is not None else None + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(), + _a2=FusedMoEQuantDesc(), + _w1=FusedMoEQuantDesc( + current_platform.fp8_dtype(), group_shape, w1_scale, None, None + ), + _w2=FusedMoEQuantDesc( + current_platform.fp8_dtype(), group_shape, w2_scale, None, None + ), + ) + + def int8_w8a16_moe_quant_config( w1_scale: torch.Tensor, w2_scale: torch.Tensor, @@ -700,7 +729,6 @@ def int8_w8a16_moe_quant_config( ) -> FusedMoEQuantConfig: """ Construct a quant config for 16-bit float activations and int8 weights. - Note: Activations are pre-quantized. """ group_shape = GroupShape(*block_shape) if block_shape is not None else None return FusedMoEQuantConfig( diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 4a0b4e82c1b39..9281780fca478 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP, ) -from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize, _resize_cache +from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -896,162 +896,6 @@ def cutlass_moe_fp4( ) -def _valid_cutlass_block_scaled_grouped_gemm( - w1: torch.Tensor, - w2: torch.Tensor, - inplace: bool, - activation: str, - apply_router_weight_on_input: bool, - expert_map: torch.Tensor | None, -) -> bool: - def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int): - return N % 128 == 0 and K % 128 == 0 - - _, K, N = w2.size() - if not _valid_cutlass_block_scaled_grouped_gemm_shape(N, K): - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: unaligned problem size. 
" - "N: %s, K: %s", - N, - K, - ) - return False - - if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn: - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s). " - "w1.dtype: %s, w2.dtype: %s", - w1.dtype, - w2.dtype, - ) - return False - - if expert_map is not None: - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: expert_parallel is not supported." - ) - return False - - if activation != "silu": - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: only activation silu is supported." - ) - return False - - if apply_router_weight_on_input: - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled:" - " apply_router_weight_on_input is not supported." - ) - return False - - if inplace: - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: inplace is not supported." - ) - return False - - return True - - -# TODO(bnell): would be nice combine/integrate with regular cutlass_fp8. -def run_cutlass_block_scaled_fused_experts( - a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, -) -> torch.Tensor: - w1_q = w1.transpose(1, 2) - w2_q = w2.transpose(1, 2) - w1_scale = w1_scale.transpose(1, 2) - w2_scale = w2_scale.transpose(1, 2) - - assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" - assert a.shape[0] == topk_ids.shape[0], ( - "a and topk_ids must have the same batch size" - ) - assert w1_q.dtype == torch.float8_e4m3fn, "w1_q must be float8_e4m3fn" - assert w2_q.dtype == torch.float8_e4m3fn, "w2_q must be float8_e4m3fn" - assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1" - assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2" - assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch" - assert w1_q.shape[0] == w1_scale.shape[0], "w1_scale expert number mismatch" - assert w1_q.shape[0] == w2_scale.shape[0], "w2_scale expert 
number mismatch" - assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype" - - out_dtype = a.dtype - num_experts = w1_q.size(0) - m = a.size(0) - k = w1_q.size(1) - n = w2_q.size(1) - - topk = topk_ids.size(1) - - a_q, a1_scale = _fp8_quantize( - a, A_scale=None, per_act_token=False, block_shape=[128, 128] - ) - device = a_q.device - - expert_offsets = torch.empty((num_experts + 1,), dtype=torch.int32, device=device) - problem_sizes1 = torch.empty((num_experts, 3), dtype=torch.int32, device=device) - problem_sizes2 = torch.empty((num_experts, 3), dtype=torch.int32, device=device) - - a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) - c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) - - ops.get_cutlass_moe_mm_data( - topk_ids, - expert_offsets, - problem_sizes1, - problem_sizes2, - a_map, - c_map, - num_experts, - n, - k, - ) - - rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype) - rep_a1_scales = a1_scale[a_map] - - c1 = torch.empty((m * topk, n * 2), dtype=out_dtype, device=device) - c2 = torch.empty((m * topk, k), dtype=out_dtype, device=device) - - ops.cutlass_blockwise_scaled_grouped_mm( - c1, - rep_a_q, - w1_q, - rep_a1_scales, - w1_scale, - problem_sizes1, - expert_offsets[:-1], - ) - - intermediate = torch.empty((m * topk, n), dtype=out_dtype, device=device) - torch.ops._C.silu_and_mul(intermediate, c1) - - intermediate_q, a2_scale = _fp8_quantize( - intermediate, A_scale=None, per_act_token=False, block_shape=[128, 128] - ) - - ops.cutlass_blockwise_scaled_grouped_mm( - c2, - intermediate_q, - w2_q, - a2_scale, - w2_scale, - problem_sizes2, - expert_offsets[:-1], - ) - - return ( - c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype) - ).sum(dim=1) - - # W4A8 def run_cutlass_moe_w4a8_fp8( output: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 
92d72b75656cd..295a2a28156ed 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( batched_moe_align_block_size, moe_align_block_size, ) -from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP, -) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP, @@ -26,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_moe_intermediate_size, marlin_quant_input, ) +from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -542,9 +540,11 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): is_k_full: bool = True, ): # TODO (varun) : Enable activation quantization - assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, ( - "Supports only mxfp4_w4a16 or int4_w4a16" - ) + assert ( + quant_config.use_mxfp4_w4a16 + or quant_config.use_int4_w4a16 + or quant_config.use_fp8_w8a16 + ), "Supports only mxfp4_w4a16, int4_w4a16 or fp8_w8a16" self.w13_g_idx = w13_g_idx self.w2_g_idx = w2_g_idx self.w13_g_idx_sort_indices = w13_g_idx_sort_indices @@ -555,11 +555,17 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): @property def quant_type_id(self) -> int: # uint4b8 will be set for int4 weight and float4_e2m1f will be used for mxfp4 - return ( - scalar_types.uint4b8.id - if self.quant_config.use_int4_w4a16 - else scalar_types.float4_e2m1f.id - ) + if self.quant_config.use_int4_w4a16: + return scalar_types.uint4b8.id + elif self.quant_config.use_mxfp4_w4a16: + return scalar_types.float4_e2m1f.id + elif ( + self.quant_config.use_fp8_w8a16 + and current_platform.fp8_dtype() == torch.float8_e4m3fn + ): + return scalar_types.float8_e4m3fn.id + else: + raise NotImplementedError("Unsupported 
quantization type.") def moe_problem_size( self, @@ -711,16 +717,6 @@ class MarlinExperts(MarlinExpertsBase): ops.moe_sum(input, output) -def modular_marlin_fused_moe( - quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None -) -> mk.FusedMoEModularKernel: - return mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), - MarlinExperts(quant_config), - shared_experts, - ) - - class BatchedMarlinExperts(MarlinExpertsBase): def __init__( self, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 37f8e7780f999..c8d80ae023d43 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -25,10 +25,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, _get_config_dtype_str, ) -from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - _valid_cutlass_block_scaled_grouped_gemm, - run_cutlass_block_scaled_fused_experts, -) from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( _valid_deep_gemm, deep_gemm_moe_fp8, @@ -1678,11 +1674,9 @@ def fused_experts( expert_map: torch.Tensor | None = None, quant_config: FusedMoEQuantConfig | None = None, allow_deep_gemm: bool = False, - allow_cutlass_block_scaled_grouped_gemm: bool = False, ) -> torch.Tensor: if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG - use_fp8_w8a8 = quant_config.use_fp8_w8a8 # For now, disable DeepGemm for small N (<= 512) until better # permute/unpermute ops are available. 
@@ -1712,23 +1706,6 @@ def fused_experts( a2_scale=quant_config.a2_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) - elif ( - allow_cutlass_block_scaled_grouped_gemm - and use_fp8_w8a8 - and _valid_cutlass_block_scaled_grouped_gemm( - w1, w2, inplace, activation, apply_router_weight_on_input, expert_map - ) - ): - assert quant_config is not None - return run_cutlass_block_scaled_fused_experts( - a=hidden_states, - w1=w1, - w2=w2, - w1_scale=quant_config.w1_scale, - w2_scale=quant_config.w2_scale, - topk_weights=topk_weights, - topk_ids=topk_ids, - ) else: return dispatch_fused_experts_func(inplace)( hidden_states=hidden_states, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 9c9bc2514bb4b..30ff1bf2f008a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -92,7 +92,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, zero_expert_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -110,10 +110,4 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): expert_map=None if self.disable_expert_map else layer.expert_map, ) - if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 6a65b06014bca..2e7267d56d838 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -32,7 
+32,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, RoutingMethodType, ) -from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) @@ -350,8 +349,6 @@ class FusedMoE(CustomOp): num_redundant_experts: int = 0, has_bias: bool = False, is_sequence_parallel=False, - zero_expert_num: int | None = 0, - zero_expert_type: str | None = None, expert_mapping: list[tuple[str, str, int, str]] | None = None, n_shared_experts: int | None = None, routing_method_type: int | None = None, @@ -409,8 +406,6 @@ class FusedMoE(CustomOp): self.global_num_experts = num_experts + num_redundant_experts self.logical_num_experts = num_experts - self.zero_expert_num = zero_expert_num - self.zero_expert_type = zero_expert_type # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping @@ -1525,15 +1520,15 @@ class FusedMoE(CustomOp): self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: + ) -> tuple[torch.Tensor, torch.Tensor]: """ Route the input hidden states to the top-k experts based on the router logits. Returns: - (topk_weights, topk_ids, zero_expert_result) - (tuple[torch.Tensor, torch.Tensor, torch.Tensor]): - The weights, expert ids, and zero expert computation result. + (topk_weights, topk_ids) + (tuple[torch.Tensor, torch.Tensor]): + The weights and expert ids. 
**Compatibility**: When EPLB is not enabled, the returned ids are equivalent to global logical ids, so should be compatible with @@ -1655,23 +1650,7 @@ class FusedMoE(CustomOp): assert topk_ids.dtype == indices_type or indices_type is None - # Compute zero expert result if needed - if ( - self.zero_expert_num is not None - and self.zero_expert_num > 0 - and self.zero_expert_type is not None - and self.global_num_experts is not None - ): - zero_expert_result = zero_experts_compute_triton( - expert_indices=topk_ids, - expert_scales=topk_weights, - num_experts=self.global_num_experts, - zero_expert_type=self.zero_expert_type, - hidden_states=hidden_states, - ) - else: - zero_expert_result = None - return topk_weights, topk_ids, zero_expert_result + return topk_weights, topk_ids def must_reduce_shared_expert_outputs(self) -> bool: """ @@ -1736,14 +1715,7 @@ class FusedMoE(CustomOp): fused_output = torch.ops.vllm.moe_forward( hidden_states, router_logits, self.layer_name ) - if self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(fused_output, tuple) - fused_output, zero_expert_result = fused_output - return (reduce_output(fused_output) + zero_expert_result)[ - ..., :og_hidden_states - ] - else: - return reduce_output(fused_output)[..., :og_hidden_states] + return reduce_output(fused_output)[..., :og_hidden_states] else: if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we @@ -1841,13 +1813,6 @@ class FusedMoE(CustomOp): final_hidden_states, ) - if self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, tuple) - assert self.shared_experts is None - final_hidden_states, zero_expert_result = final_hidden_states - if zero_expert_result is not None: - final_hidden_states += zero_expert_result - if not skip_result_store: if self.shared_experts is None: full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( @@ -2030,9 
+1995,6 @@ class FusedMoE(CustomOp): shared_output, final_hidden_states, ) - elif self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, tuple) - final_hidden_states, zero_expert_result = final_hidden_states def combine_output(states: torch.Tensor) -> torch.Tensor: if do_naive_dispatch_combine: @@ -2051,9 +2013,6 @@ class FusedMoE(CustomOp): final_hidden_states[0], combine_output(final_hidden_states[1]), ) - elif self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, torch.Tensor) - return (combine_output(final_hidden_states), zero_expert_result) else: return combine_output(final_hidden_states) diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 6182f10aa70f0..82dbccf3fa9da 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -6,6 +6,7 @@ import torch import torch.nn.functional as F import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._aiter_ops import rocm_aiter_ops from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp @@ -23,6 +24,9 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, ) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum @@ -30,9 +34,9 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe if current_platform.is_cuda_alike(): from .fused_batched_moe import BatchedTritonExperts - from .fused_moe import TritonExperts, fused_experts + from .fused_moe import TritonExperts else: - 
fused_experts = None # type: ignore + TritonExperts = None # type: ignore if current_platform.is_tpu(): from .moe_pallas import fused_moe as fused_moe_pallas @@ -265,6 +269,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) else: layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) + elif current_platform.is_cuda_alike(): + self.moe_quant_config = self.get_fused_moe_quant_config(layer) + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + TritonExperts(self.moe_quant_config), + shared_experts=None, + ) def apply( self, @@ -278,9 +289,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): router_logits=router_logits, ) - def get_fused_moe_quant_config( - self, layer: torch.nn.Module - ) -> FusedMoEQuantConfig | None: + def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: if self.moe.has_bias: return biased_moe_quant_config( layer.w13_bias, @@ -295,7 +304,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, zero_expert_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -322,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: - result = fused_experts( + result = self.kernel( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -330,19 +339,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): topk_ids=topk_ids, inplace=True, activation=layer.activation, - quant_config=self.moe_quant_config, apply_router_weight_on_input=layer.apply_router_weight_on_input, global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, ) - if layer.zero_expert_num != 0 and layer.zero_expert_type is not 
None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result def forward_cpu( self, diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py new file mode 100644 index 0000000000000..97d21767f4fc3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from contextlib import contextmanager + +import torch +from torch import nn + +from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +class ZeroExpertFusedMoE(FusedMoE): + """ + A FusedMoE operation that also computes the results of zero experts. + Zero experts perform identity operations (scaled pass-through) instead + of full MLP computations. + + This class uses memoization to avoid redundant routing computation: + routing is computed once and reused for both zero expert computation + and the main FusedMoE forward pass. + """ + + def __init__( + self, + zero_expert_num: int, + zero_expert_type: str, + router: nn.Module, + **kwargs, + ): + # ZeroExpertFusedMoE manages its own custom_routing_function for memoization + assert ( + "custom_routing_function" not in kwargs + or kwargs.get("custom_routing_function") is None + ), ( + "ZeroExpertFusedMoE does not support external custom_routing_function. " + "It manages its own for routing memoization." + ) + + # Automatically slice router's e_score_correction_bias to only include + # real experts (not zero_experts) for the base FusedMoE. + # The full bias will be used temporarily in forward() for routing. 
+ if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs: + num_real_experts = kwargs["num_experts"] + router_bias = router.e_score_correction_bias + user_bias = kwargs.get("e_score_correction_bias") + + # Use router's bias if: + # 1. User didn't provide bias, or + # 2. User provided full bias (same size as router) + if user_bias is None or user_bias.shape[0] == router_bias.shape[0]: + kwargs["e_score_correction_bias"] = router_bias[:num_real_experts] + + # FusedMoE no longer accepts zero_expert_num/zero_expert_type. + # We handle zero experts ourselves in forward(). + super().__init__(**kwargs) + # Store the actual zero_expert_num and zero_expert_type for our own use + self._actual_zero_expert_num = zero_expert_num + self._actual_zero_expert_type = zero_expert_type + self._router = router # Full router (includes zero experts) + + # Expose zero_expert_num and zero_expert_type as attributes for + # compatibility with quantization methods that check these attributes + self.zero_expert_num = 0 + self.zero_expert_type = None + + # Memoization state for routing results + self._memoized_topk_weights: torch.Tensor | None = None + self._memoized_topk_ids: torch.Tensor | None = None + + # Create custom_routing_function to reuse memoized routing results + def custom_routing_function(hidden_states, gating_output, topk, renormalize): + """Return memoized `topk_weights` and `topk_ids`.""" + if self._memoized_topk_weights is None or self._memoized_topk_ids is None: + raise RuntimeError( + "ZeroExpertFusedMoE: routing results not memoized. " + "Call select_experts first to compute routing." + ) + return self._memoized_topk_weights, self._memoized_topk_ids + + self.custom_routing_function = custom_routing_function + + @contextmanager + def _temporarily_set_attrs(self, **attrs): + """ + Temporarily set attributes using object.__setattr__ and restore them. + + This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues. 
+ When PyTorch Dynamo traces the forward pass, it cannot handle + nn.Module.__setattr__ calls (which include parameter registration logic), + resulting in "Unsupported" errors. Using object.__setattr__ directly + sets the attribute without triggering nn.Module's custom __setattr__, + allowing Dynamo to trace the code successfully. + """ + originals = {key: getattr(self, key) for key in attrs} + try: + for key, value in attrs.items(): + object.__setattr__(self, key, value) + yield + finally: + for key, value in originals.items(): + object.__setattr__(self, key, value) + + def _compute_zero_expert_result( + self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ) -> torch.Tensor | None: + """Compute zero expert results using pre-computed routing.""" + if ( + self._actual_zero_expert_num is None + or self._actual_zero_expert_num <= 0 + or self._actual_zero_expert_type is None + ): + return None + + return zero_experts_compute_triton( + expert_indices=topk_ids.clone(), + expert_scales=topk_weights.clone(), + num_experts=self.logical_num_experts, + zero_expert_type=self._actual_zero_expert_type, + hidden_states=hidden_states, + ) + + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, # Full logits including zero experts + ) -> torch.Tensor: + """ + Forward pass with zero expert support and routing memoization. 
+ + Args: + hidden_states: Input hidden states + router_logits: Full router logits (including zero experts) + + Returns: + Combined output from real experts and zero experts + """ + # Prepare temporary attribute overrides for routing computation + temp_attrs = { + "custom_routing_function": None, # Disable for first routing + } + if self._router is not None: + temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias + + # Compute routing with temporary attributes + # Pass full router_logits (including zero experts) so that zero experts + # can be properly identified in topk_ids + with self._temporarily_set_attrs(**temp_attrs): + topk_weights, topk_ids = self.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, # Full logits (includes zero experts) + ) + + # Compute zero expert result if needed + zero_expert_result = self._compute_zero_expert_result( + hidden_states=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + ) + + # Memoize routing results for reuse in super().forward() + self._memoized_topk_weights = topk_weights + self._memoized_topk_ids = topk_ids + + # Slice router_logits for real experts only + router_logits_sliced = router_logits[..., : self.logical_num_experts] + + # Compute real expert results (will reuse memoized routing via + # custom_routing_function) + # zero_expert_num is already 0, so FusedMoE won't handle zero experts + fused_out = super().forward( + hidden_states=hidden_states, + router_logits=router_logits_sliced, + ) + + # Combine results + # Both zero_expert_result and fused_out are computed from the same + # hidden_states, so they should be on the same device. 
+ if zero_expert_result is not None: + fused_out = fused_out + zero_expert_result + + # Clear memoization after use + self._memoized_topk_weights = None + self._memoized_topk_ids = None + + return fused_out diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dfcc601a1c530..402f0bf69ceaa 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -53,6 +53,8 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", + "ModelOptFp8PcPtLinearMethod", + "ModelOptFp8PbWoLinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", "HQQMarlinMethod", @@ -277,6 +279,7 @@ class LinearBase(CustomOp): self.params_dtype = params_dtype self.quant_config = quant_config self.prefix = prefix + self.allow_fp8_block_shape_mismatch = False if quant_config is None: self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod() else: @@ -475,6 +478,7 @@ class ColumnParallelLinear(LinearBase): disable_tp=disable_tp, ) + self._maybe_allow_fp8_block_shape_mismatch() self.gather_output = gather_output if output_sizes is None: @@ -509,6 +513,33 @@ class ColumnParallelLinear(LinearBase): self.register_parameter("bias", None) self.update_param_tp_status() + def _maybe_allow_fp8_block_shape_mismatch(self) -> None: + quant_config = getattr(self, "quant_config", None) + weight_block = getattr(quant_config, "weight_block_size", None) + if ( + weight_block is None + or len(weight_block) < 1 + or len(self.output_partition_sizes) <= 1 + ): + return + + try: + block_n = int(weight_block[0]) + except (ValueError, TypeError): + return + + if block_n <= 0: + return + + if any(size % block_n != 0 for size in self.output_partition_sizes): + self.allow_fp8_block_shape_mismatch = True + logger.debug( + "Allowing FP8 block shape mismatch for %s (block_n=%d, partitions=%s)", + getattr(self, "prefix", ""), + block_n, + self.output_partition_sizes, + ) + def 
weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): output_dim = getattr(param, "output_dim", None) @@ -906,9 +937,11 @@ class QKVParallelLinear(ColumnParallelLinear): *, return_bias: bool = True, disable_tp: bool = False, + v_head_size: int | None = None, ): self.hidden_size = hidden_size self.head_size = head_size + self.v_head_size = v_head_size if v_head_size is not None else head_size self.total_num_heads = total_num_heads if total_num_kv_heads is None: total_num_kv_heads = total_num_heads @@ -924,12 +957,14 @@ class QKVParallelLinear(ColumnParallelLinear): self.num_kv_head_replicas = 1 input_size = self.hidden_size output_size = ( - (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size - ) + self.num_heads * self.head_size + + self.num_kv_heads * self.head_size + + self.num_kv_heads * self.v_head_size + ) * tp_size self.output_sizes = [ self.num_heads * self.head_size * tp_size, # q_proj self.num_kv_heads * self.head_size * tp_size, # k_proj - self.num_kv_heads * self.head_size * tp_size, # v_proj + self.num_kv_heads * self.v_head_size * tp_size, # v_proj ] super().__init__( @@ -950,7 +985,8 @@ class QKVParallelLinear(ColumnParallelLinear): "q": 0, "k": self.num_heads * self.head_size, "v": (self.num_heads + self.num_kv_heads) * self.head_size, - "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size, + "total": (self.num_heads + self.num_kv_heads) * self.head_size + + self.num_kv_heads * self.v_head_size, } return shard_offset_mapping.get(loaded_shard_id) @@ -958,7 +994,7 @@ class QKVParallelLinear(ColumnParallelLinear): shard_size_mapping = { "q": self.num_heads * self.head_size, "k": self.num_kv_heads * self.head_size, - "v": self.num_kv_heads * self.head_size, + "v": self.num_kv_heads * self.v_head_size, } return shard_size_mapping.get(loaded_shard_id) @@ -985,7 +1021,7 @@ class QKVParallelLinear(ColumnParallelLinear): ( "v", (self.total_num_heads + self.total_num_kv_heads) * self.head_size, - 
self.total_num_kv_heads * self.head_size, + self.total_num_kv_heads * self.v_head_size, ), ] @@ -1110,7 +1146,7 @@ class QKVParallelLinear(ColumnParallelLinear): ( "v", (self.total_num_heads + self.total_num_kv_heads) * self.head_size, - self.total_num_kv_heads * self.head_size, + self.total_num_kv_heads * self.v_head_size, ), ] use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) @@ -1139,11 +1175,12 @@ class QKVParallelLinear(ColumnParallelLinear): "v": ( (self.total_num_heads + self.total_num_kv_heads) * self.head_size, - self.total_num_kv_heads * self.head_size, + self.total_num_kv_heads * self.v_head_size, ), "total": ( - (self.total_num_heads + 2 * self.total_num_kv_heads) - * self.head_size, + (self.total_num_heads + self.total_num_kv_heads) + * self.head_size + + self.total_num_kv_heads * self.v_head_size, 0, ), } @@ -1170,7 +1207,7 @@ class QKVParallelLinear(ColumnParallelLinear): shard_size = self.num_kv_heads * self.head_size elif loaded_shard_id == "v": shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size - shard_size = self.num_kv_heads * self.head_size + shard_size = self.num_kv_heads * self.v_head_size # Special case for Quantized Weights. # If quantized, we need to adjust the offset and size to account # for the packing. 
@@ -1199,10 +1236,11 @@ class QKVParallelLinear(ColumnParallelLinear): ), "v": ( (self.num_heads + self.num_kv_heads) * self.head_size, - self.num_kv_heads * self.head_size, + self.num_kv_heads * self.v_head_size, ), "total": ( - (self.num_heads + 2 * self.num_kv_heads) * self.head_size, + (self.num_heads + self.num_kv_heads) * self.head_size + + self.num_kv_heads * self.v_head_size, 0, ), } diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 314848721a80a..602d02d2f15a4 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -764,7 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert layer.activation == "silu", "Only SiLU activation is supported." - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 1fd959cb3423d..efe5677045e4b 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -500,7 +500,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fc359a3067a9c..f4038801c266b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -574,7 +574,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): e_score_correction_bias=layer.e_score_correction_bias, ) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1166,7 +1166,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1403,7 +1403,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1765,7 +1765,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): f"{layer.activation} not supported for Marlin MoE." ) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1991,7 +1991,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -2607,7 +2607,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet." 
) assert self.moe_quant_config is not None - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index ee99572f5f499..758a54c10605a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -61,7 +61,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): ) self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() - self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled() if self.weight_block_size is not None: assert not self.is_static_input_scheme diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 11097cf36f5ca..56b11b22f7ff5 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -142,7 +142,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ec3fc5ace17d8..d19b20798ed06 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from 
enum import Enum -from functools import partial from typing import TYPE_CHECKING, Any, Optional import torch @@ -33,8 +32,8 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, RoutingMethodType, fp8_w8a8_moe_quant_config, + fp8_w8a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.linear import ( LinearBase, @@ -51,7 +50,6 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( FlashinferMoeBackend, apply_flashinfer_per_tensor_scale_fp8, build_flashinfer_fp8_cutlass_moe_prepare_finalize, - flashinfer_cutlass_moe_fp8, get_flashinfer_moe_backend, register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights, @@ -97,7 +95,6 @@ from vllm.model_executor.parameter import ( ) from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.platforms import current_platform -from vllm.scalar_type import scalar_types from vllm.utils.deep_gemm import ( is_deep_gemm_e8m0_used, is_deep_gemm_supported, @@ -118,20 +115,21 @@ class Fp8MoeBackend(Enum): FLASHINFER_TRTLLM = 1 FLASHINFER_CUTLASS = 2 DEEPGEMM = 3 - CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4 - MARLIN = 5 - TRITON = 6 + MARLIN = 4 + TRITON = 5 def get_fp8_moe_backend( block_quant: bool, moe_parallel_config: FusedMoEParallelConfig, with_lora_support: bool, -) -> Fp8MoeBackend: +) -> Fp8MoeBackend | None: """ Select the primary FP8 MoE backend Note: Shape-specific fallbacks may still occur at runtime. """ + if current_platform.is_xpu(): + return None if with_lora_support: return Fp8MoeBackend.TRITON # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100. 
@@ -191,17 +189,6 @@ def get_fp8_moe_backend( logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local") return Fp8MoeBackend.DEEPGEMM - # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights - if ( - current_platform.is_cuda() - and current_platform.is_device_capability_family(100) - and block_quant - ): - logger.info_once( - "Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local" - ) - return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM - # default to Triton logger.info_once("Using Triton backend for FP8 MoE") return Fp8MoeBackend.TRITON @@ -306,6 +293,13 @@ class Fp8Config(QuantizationConfig): return UnquantizedLinearMethod() return XPUFp8LinearMethod(fp8_config) elif isinstance(layer, FusedMoE): + if is_layer_skipped( + prefix=prefix, + ignored_layers=self.ignored_layers, + fused_mapping=self.packed_modules_mapping, + ): + return UnquantizedFusedMoEMethod(layer.moe_config) + return XPUFp8MoEMethod(fp8_config, layer) elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) @@ -420,7 +414,7 @@ class Fp8LinearMethod(LinearMethodBase): if vllm_is_batch_invariant(): self.use_marlin = False - self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled() self.use_deep_gemm = is_deep_gemm_supported() self.weight_block_size = self.quant_config.weight_block_size @@ -734,27 +728,33 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) self.marlin_input_dtype = None - self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN self.flashinfer_moe_backend: FlashinferMoeBackend | None = None if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM: self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: self.flashinfer_moe_backend = FlashinferMoeBackend.CUTLASS - if self.block_quant: - assert self.weight_block_size == [128, 128], ( - f"Only support weight_block_size == [128, 128], 
" - f"got {self.weight_block_size}" + if self.block_quant and self.weight_block_size != [128, 128]: + raise NotImplementedError( + "FlashInfer CUTLASS FP8 MoE backend only supports block " + "size [128, 128]." + ) + if not self.block_quant: + if layer.renormalize or layer.custom_routing_function is not None: + raise NotImplementedError( + "FlashInfer CUTLASS FP8 MoE backend does not support custom routing " + f"function or renormalization, but got {layer.renormalize} and " + f"{layer.custom_routing_function}." + ) + if layer.scoring_func != "sigmoid": + raise NotImplementedError( + "FlashInfer CUTLASS FP8 MoE backend only supports " + f"'sigmoid' scoring function, but got {layer.scoring_func}." + ) + if layer.activation != "silu": + raise NotImplementedError( + "FlashInfer CUTLASS FP8 MoE backend only supports SiLU " + f"activation function, but got {layer.activation}." ) - self.flashinfer_moe_fn = partial( - flashinfer_cutlass_moe_fp8, - moe=self.moe, - use_deepseek_fp8_block_scale=self.block_quant, - ) - - self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM - self.allow_cutlass_block_scaled_grouped_gemm = ( - self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM - ) def create_weights( self, @@ -943,7 +943,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. 
- if self.allow_deep_gemm: + if self.fp8_backend == Fp8MoeBackend.DEEPGEMM: dg_w13_weight, dg_w13_weight_scale_inv = ( deepgemm_post_process_fp8_weight_block( wq=layer.w13_weight.data, @@ -1046,7 +1046,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight) layer.w13_weight.data = w13_weight.data - if self.use_marlin: + if self.fp8_backend == Fp8MoeBackend.MARLIN: prepare_moe_fp8_layer_for_marlin( layer, False, input_dtype=self.marlin_input_dtype ) @@ -1054,13 +1054,82 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w13_input_scale del layer.w2_input_scale + # NOTE(rob): this is a WIP refactor. We are first migrating + # all of the kernels in the TP case to use mk. Once this is + # done, then we will initialize the TP case and DP/EP case + # via the same code path (i.e. via maybe_init_modular_kernel). + # NOTE(rob): in progress migrating all into this format. + if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, + ) + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + FlashInferAllGatherMoEPrepareAndFinalize, + ) + + config = self.get_fused_moe_quant_config(layer) + assert config is not None + self.moe_quant_config = config + + self.kernel = mk.FusedMoEModularKernel( + FlashInferAllGatherMoEPrepareAndFinalize( + use_dp=(self.moe.dp_size > 1), + use_deepseek_fp8_block_scale=self.block_quant, + ), + FlashInferExperts( + out_dtype=torch.get_default_dtype(), + quant_config=self.moe_quant_config, + ep_rank=self.moe.ep_rank, + ep_size=self.moe.ep_size, + tp_rank=self.moe.tp_rank, + tp_size=self.moe.tp_size, + use_dp=(self.moe.dp_size > 1), + use_deepseek_fp8_block_scale=self.block_quant, + ), + ) + self.use_inplace = False + + elif self.fp8_backend in [ + Fp8MoeBackend.DEEPGEMM, + Fp8MoeBackend.TRITON, + Fp8MoeBackend.MARLIN, + ]: + from 
vllm.model_executor.layers.fused_moe import ( + TritonOrDeepGemmExperts, + ) + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + MarlinExperts, + ) + from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, + ) + + config = self.get_fused_moe_quant_config(layer) + assert config is not None + self.moe_quant_config = config + use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN + allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM + moe_kernel = ( + MarlinExperts(quant_config=self.moe_quant_config) + if use_marlin + else TritonOrDeepGemmExperts( + quant_config=self.moe_quant_config, + allow_deep_gemm=allow_deep_gemm, + ) + ) + + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), moe_kernel + ) + self.use_inplace = True + def maybe_make_prepare_finalize( self, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: if ( self.rocm_aiter_moe_enabled - or self.use_marlin + or self.fp8_backend == Fp8MoeBackend.MARLIN or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): return None @@ -1092,7 +1161,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): TritonOrDeepGemmExperts, ) - assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( + assert ( + self.fp8_backend != Fp8MoeBackend.MARLIN + ) and not self.rocm_aiter_moe_enabled, ( "Marlin and ROCm AITER are not supported with all2all yet." 
) @@ -1106,7 +1177,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): assert max_num_tokens_per_rank is not None experts_impl = ( - BatchedDeepGemmExperts if self.allow_deep_gemm else BatchedTritonExperts + BatchedDeepGemmExperts + if self.fp8_backend == Fp8MoeBackend.DEEPGEMM + else BatchedTritonExperts ) logger.debug( "%s(%s): max_tokens_per_rank=%s, block_size=%s, per_act_token=%s", @@ -1141,14 +1214,18 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) return TritonOrDeepGemmExperts( quant_config=self.moe_quant_config, - allow_deep_gemm=self.allow_deep_gemm, + allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM), ) def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - if self.use_marlin: - return None + if self.fp8_backend == Fp8MoeBackend.MARLIN: + return fp8_w8a16_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + block_shape=self.weight_block_size, + ) return fp8_w8a8_moe_quant_config( w1_scale=( @@ -1179,6 +1256,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + # TODO(rob): convert this to MK. if layer.enable_eplb: raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.") assert layer.activation == "silu", ( @@ -1231,18 +1309,17 @@ class Fp8MoEMethod(FusedMoEMethodBase): apply_router_weight_on_input=layer.apply_router_weight_on_input, ) - select_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) - topk_weights, topk_ids, zero_expert_result = select_result - if self.rocm_aiter_moe_enabled: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_fused_experts, ) + # TODO(rob): convert this to MK. 
result = rocm_aiter_fused_experts( x, layer.w13_weight, @@ -1254,80 +1331,21 @@ class Fp8MoEMethod(FusedMoEMethodBase): expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) - elif self.use_marlin: - assert layer.activation == "silu", ( - f"{layer.activation} not supported for Marlin MoE." - ) - result = fused_marlin_moe( + else: + result = self.kernel( x, layer.w13_weight, layer.w2_weight, - None, - None, - layer.w13_weight_scale, - layer.w2_weight_scale, - router_logits, topk_weights, topk_ids, - quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - global_num_experts=layer.global_num_experts, - expert_map=layer.expert_map, - input_dtype=self.marlin_input_dtype, - workspace=layer.workspace, - ) - elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert layer.activation == "silu", ( - f"Expected 'silu' activation but got {layer.activation}" - ) - if not self.block_quant: - assert ( - not layer.renormalize and layer.custom_routing_function is not None - ) - assert layer.scoring_func == "sigmoid", ( - f"Expected 'sigmoid' scoring func but got {layer.scoring_func}" - ) - # Delegate to CUTLASS FlashInfer path; function already bound with - # use_deepseek_fp8_block_scale for block-quant when applicable - result = self.flashinfer_moe_fn( - x, - layer, - topk_weights, - topk_ids, - inplace=False, + inplace=self.use_inplace, activation=layer.activation, global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, ) - else: - from vllm.model_executor.layers.fused_moe import fused_experts - result = fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=layer.activation, - global_num_experts=layer.global_num_experts, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - 
expert_map=layer.expert_map, - quant_config=self.moe_quant_config, - allow_deep_gemm=self.allow_deep_gemm, - allow_cutlass_block_scaled_grouped_gemm=( - self.allow_cutlass_block_scaled_grouped_gemm - ), - ) - - if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result class Fp8OnlineMoEMethod(Fp8MoEMethod): @@ -1471,7 +1489,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): replace_parameter(layer, "w2_weight", shuffled_w2) # Rushuffle weights for MARLIN if needed. - if self.use_marlin: + if self.fp8_backend == Fp8MoeBackend.MARLIN: prepare_moe_fp8_layer_for_marlin( layer, False, input_dtype=self.marlin_input_dtype ) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 9dd734f2fea6a..9600bb42295dc 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -639,7 +639,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): "fused GGUF MoE method." ) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 347c7b2008d12..d2dafca99a230 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -900,7 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert layer.activation == "silu", "Only SiLU activation is supported." 
- topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index a5db086fb4729..7994c838ad548 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -51,7 +51,7 @@ class QuantFP8(CustomOp): self.column_major_scales = column_major_scales self.use_ue8m0 = use_ue8m0 - self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter = rocm_aiter_ops.is_linear_fp8_enabled() self.is_group_quant = group_shape.is_per_group() if self.is_group_quant: diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index f33ee43727f19..9de2924ec71b1 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -6,13 +6,8 @@ from typing import Any, Optional import torch from packaging import version from torch.nn import Module -from torch.nn.parameter import Parameter from vllm._ipex_ops import ipex_ops as ops -from vllm.model_executor.layers.fused_moe import ( - FusedMoEMethodBase, - FusedMoeWeightScaleSupported, -) from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.linear import ( LinearBase, @@ -24,14 +19,14 @@ from vllm.model_executor.layers.quantization import ( QuantizationMethods, ) from vllm.model_executor.layers.quantization.awq import AWQLinearMethod -from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod +from vllm.model_executor.layers.quantization.fp8 import ( + Fp8Config, + Fp8LinearMethod, + Fp8OnlineMoEMethod, +) from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from 
vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - maybe_create_device_identity, -) -from vllm.model_executor.parameter import ModelWeightParameter -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform MIN_IPEX_VERSION = "2.6.0" @@ -309,44 +304,15 @@ class XPUFp8LinearMethod(Fp8LinearMethod): def __init__(self, quant_config: Fp8Config): super().__init__(quant_config) - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - maybe_create_device_identity() - - output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") - layer.logical_widths = output_partition_sizes - layer.input_size_per_partition = input_size_per_partition - layer.output_size_per_partition = output_size_per_partition - layer.orig_dtype = params_dtype - layer.weight_block_size = None - weight = ModelWeightParameter( - data=torch.empty( - output_size_per_partition, - input_size_per_partition, - dtype=params_dtype, - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight", weight) - def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return # If checkpoint not serialized fp8, quantize the weights. if not self.quant_config.is_checkpoint_fp8_serialized: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) # Update the layer with the new values. 
- layer.weight = Parameter(qweight, requires_grad=False) - layer.weight_scale = Parameter(weight_scale, requires_grad=False) + replace_parameter(layer, "weight", qweight.data) + replace_parameter(layer, "weight_scale", weight_scale.data) layer.input_scale = None def apply( @@ -363,69 +329,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod): return output -class XPUFp8MoEMethod(FusedMoEMethodBase): +class XPUFp8MoEMethod(Fp8OnlineMoEMethod): def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): - super().__init__(layer.moe_config) + super().__init__(quant_config, layer) self.quant_config = quant_config - def create_weights( - self, - layer: Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - layer.intermediate_size_per_partition = intermediate_size_per_partition - layer.hidden_size = hidden_size - layer.num_experts = num_experts - layer.orig_dtype = params_dtype - layer.weight_block_size = None - # WEIGHTS - w13_weight = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size, - dtype=params_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight", w13_weight) - set_weight_attrs(w13_weight, extra_weight_attrs) - - w2_weight = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition, - dtype=params_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight", w2_weight) - set_weight_attrs(w2_weight, extra_weight_attrs) - - # Allocate 2 scales for w1 and w3 respectively. - # They will be combined to a single scale after weight loading. 
- w13_weight_scale = torch.nn.Parameter( - torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False - ) - w2_weight_scale = torch.nn.Parameter( - torch.ones(num_experts, dtype=torch.float32), requires_grad=False - ) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} - ) - # INPUT_SCALES - layer.w13_input_scale = None - layer.w2_input_scale = None - def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return if not self.quant_config.is_checkpoint_fp8_serialized: fp8_dtype = current_platform.fp8_dtype() w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) @@ -448,8 +359,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase): w2_weight[expert, :, :], layer.w2_weight_scale[expert] = ( ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :]) ) - layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w2_weight", w2_weight) + import intel_extension_for_pytorch as ipex ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py index faaa45b861de7..eb14f9ec378c4 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py @@ -30,7 +30,7 @@ from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig class MarlinLinearKernel(MPLinearKernel): @classmethod def get_min_capability(cls) -> int: - return 80 + return 75 @classmethod def can_implement(cls, c: 
MPLinearLayerConfig) -> tuple[bool, str | None]: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index aa3937d4c03ff..afbefe1fedc18 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -55,6 +55,9 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( select_cutlass_fp8_gemm_impl, swap_w13_to_w31, ) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + W8A8BlockFp8LinearOp, +) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( get_marlin_input_dtype, ) @@ -72,9 +75,15 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, + cutlass_block_fp8_supported, requantize_with_max_scale, ) -from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter +from vllm.model_executor.parameter import ( + BlockQuantScaleParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) from vllm.scalar_type import scalar_types from vllm.utils.flashinfer import ( flashinfer_scaled_fp4_mm, @@ -88,7 +97,16 @@ if TYPE_CHECKING: logger = init_logger(__name__) -QUANT_ALGOS = ["FP8", "NVFP4"] +QUANT_ALGOS = [ + # FP8 (per-tensor weight + optional static activation scale). + "FP8", + # FP8 per-channel weight scale + per-token activation scale. + "FP8_PER_CHANNEL_PER_TOKEN", + # FP8 per-block weight-only (ModelOpt may emit this as lowercase). + "FP8_PB_WO", + # FP4 + "NVFP4", +] KV_CACHE_QUANT_ALGOS = ["FP8"] @@ -255,6 +273,9 @@ class ModelOptQuantConfigBase(QuantizationConfig): if not quant_method: raise ValueError("Missing 'quant_algo' in quantization config") + # Normalize quant_algo for robust matching (ModelOpt may emit lowercase). 
+ quant_method = str(quant_method).upper() + if kv_cache_quant_method is None: # No KV cache quantization, keep this branch just to have this comment pass @@ -263,6 +284,8 @@ class ModelOptQuantConfigBase(QuantizationConfig): f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_method)}" ) + else: + kv_cache_quant_method = kv_cache_quant_method.upper() if not isinstance(exclude_modules, list): raise ValueError( @@ -302,17 +325,34 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): def __init__( self, + quant_method: str, is_checkpoint_fp8_serialized: bool, kv_cache_quant_method: str | None, exclude_modules: list[str], ) -> None: super().__init__(exclude_modules) + self.quant_method = quant_method self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized self.kv_cache_quant_method = kv_cache_quant_method if is_checkpoint_fp8_serialized: logger.warning( - "Detected ModelOpt fp8 checkpoint. Please note that" - " the format is experimental and could change." + "Detected ModelOpt fp8 checkpoint (quant_algo=%s). Please note " + "that the format is experimental and could change.", + quant_method, + ) + + # Select LinearMethod implementation based on quant_algo. + if self.quant_method == "FP8": + self.LinearMethodCls = ModelOptFp8LinearMethod + elif self.quant_method == "FP8_PER_CHANNEL_PER_TOKEN": + self.LinearMethodCls = ModelOptFp8PcPtLinearMethod + elif self.quant_method == "FP8_PB_WO": + self.LinearMethodCls = ModelOptFp8PbWoLinearMethod + else: + raise ValueError( + "Unsupported ModelOpt FP8 quant_algo for vLLM: " + f"{self.quant_method}. Supported: FP8 / " + "FP8_PER_CHANNEL_PER_TOKEN / FP8_PB_WO." 
) def get_name(self) -> QuantizationMethods: @@ -346,13 +386,13 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): if "quantization" in hf_quant_cfg: quant_config = hf_quant_cfg["quantization"] if isinstance(quant_config, dict): - quant_algo = quant_config.get("quant_algo", "") - if "FP8" in quant_algo: + quant_algo = str(quant_config.get("quant_algo", "")) + if "FP8" in quant_algo.upper(): return "modelopt" else: # Check for compressed-tensors style config with specific quant_algo - quant_algo = hf_quant_cfg.get("quant_algo", "") - if isinstance(quant_algo, str) and "FP8" in quant_algo: + quant_algo = str(hf_quant_cfg.get("quant_algo", "")) + if "FP8" in quant_algo.upper(): return "modelopt" return None @@ -369,7 +409,12 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): ) -> "ModelOptFp8Config": is_checkpoint_fp8_serialized = "FP8" in quant_method - return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules) + return cls( + quant_method, + is_checkpoint_fp8_serialized, + kv_cache_quant_method, + exclude_modules, + ) class ModelOptFp8LinearMethod(LinearMethodBase): @@ -464,6 +509,203 @@ class ModelOptFp8LinearMethod(LinearMethodBase): ) +class ModelOptFp8PcPtLinearMethod(LinearMethodBase): + """Linear method for ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoints. 
+ + Expected checkpoint structure (per Linear): + - weight: fp8-e4m3fn, shape [out, in] + - weight_scale: fp32, shape [out] (per-output-channel) + - no input_scale (activations are dynamically quantized per-token) + """ + + def __init__(self, quant_config: ModelOptFp8Config) -> None: + self.quant_config = quant_config + self.fp8_linear = Fp8LinearOp( + act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + + if not self.quant_config.is_checkpoint_fp8_serialized: + raise ValueError( + "FP8_PER_CHANNEL_PER_TOKEN currently only supports " + "FP8-serialized checkpoints." + ) + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + weight_scale = ChannelQuantScaleParameter( + data=torch.empty(output_size_per_partition, dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + def process_weights_after_loading(self, layer: Module) -> None: + layer.weight = Parameter(layer.weight.t(), requires_grad=False) + layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> 
torch.Tensor: + return self.fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + bias=bias, + ) + + +class ModelOptFp8PbWoLinearMethod(LinearMethodBase): + """Linear method for ModelOpt FP8_PB_WO checkpoints. + + ModelOpt exports `weight_scale` as a 4D tensor: + [out_blk, 1, in_blk, 1] + where block size is typically 128 for both dims. + + vLLM executes it as FP8 GEMM with *dynamic per-token* activation quant. + """ + + _WEIGHT_BLOCK_SIZE: tuple[int, int] = (128, 128) + + def __init__(self, quant_config: ModelOptFp8Config) -> None: + self.quant_config = quant_config + block_n, block_k = self._WEIGHT_BLOCK_SIZE + self.weight_block_size = list(self._WEIGHT_BLOCK_SIZE) + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(block_n, block_k), + act_quant_group_shape=GroupShape(1, block_k), + cutlass_block_fp8_supported=cutlass_block_fp8_supported(), + use_aiter_and_is_supported=False, + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + + if not self.quant_config.is_checkpoint_fp8_serialized: + raise ValueError( + "FP8_PB_WO currently only supports FP8-serialized checkpoints." + ) + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + # Expose block size so the v2 weight loaders can translate offsets from + # element-space -> block-space for BlockQuantScaleParameter. 
+ layer.weight_block_size = self.weight_block_size + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + block_n, block_k = self._WEIGHT_BLOCK_SIZE + if output_size_per_partition % block_n != 0: + raise ValueError( + "ModelOpt FP8_PB_WO requires out_features divisible by " + f"{block_n}, got {output_size_per_partition}." + ) + if input_size_per_partition % block_k != 0: + raise ValueError( + "ModelOpt FP8_PB_WO requires in_features divisible by " + f"{block_k}, got {input_size_per_partition}." + ) + + out_blks = output_size_per_partition // block_n + in_blks = input_size_per_partition // block_k + + # Match ModelOpt's exported shape so weight loading works without a + # custom loader: [out_blk, 1, in_blk, 1] + weight_scale = BlockQuantScaleParameter( + data=torch.empty((out_blks, 1, in_blks, 1), dtype=torch.float32), + input_dim=2, + output_dim=0, + weight_loader=weight_loader, + ) + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + def process_weights_after_loading(self, layer: Module) -> None: + # Keep weight in [out, in] layout for W8A8BlockFp8LinearOp. + layer.weight = Parameter(layer.weight.data, requires_grad=False) + + scale = layer.weight_scale + if scale.dim() == 4: + # [out_blk, 1, in_blk, 1] -> [out_blk, in_blk] + scale = scale.squeeze(1).squeeze(-1) + elif scale.dim() != 2: + raise ValueError( + "Unexpected ModelOpt FP8_PB_WO weight_scale shape: " + f"{tuple(scale.shape)}." 
+ ) + + layer.weight_scale = Parameter(scale.contiguous(), requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.w8a8_block_fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + bias=bias, + ) + + class ModelOptFp8MoEMethod(FusedMoEMethodBase): """MoE method for ModelOpt FP8. Supports loading FP8 checkpoints with static weight scale and @@ -796,7 +1038,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) # Expert selection - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1599,7 +1841,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): x_routing, _ = x else: x_routing = x - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x_routing, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 4bedb951a33f5..513f6f7b21abc 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -370,7 +370,7 @@ class MoeWNA16Method(FusedMoEMethodBase): from vllm.model_executor.layers.fused_moe import fused_experts assert layer.activation == "silu", "Only SiLU activation is supported." 
- topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index c50753270b86e..40a2831881369 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -896,7 +896,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): raise NotImplementedError("EPLB is not supported for mxfp4") if self.mxfp4_backend == Mxfp4Backend.MARLIN: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -989,7 +989,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): ): from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 0b9b098afb1f6..81970480319ab 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -338,7 +338,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -530,7 +530,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -738,7 +738,7 @@ class 
QuarkOCP_MX_MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index b2ecb0b175f81..dce9c661ec332 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -359,7 +359,7 @@ class RTNMoEMethod(FusedMoEMethodBase): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index bdc3d1fc7232d..15ea9f7d60fff 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -31,6 +31,7 @@ from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import ( + DeepGemmQuantScaleFMT, fp8_gemm_nt, is_deep_gemm_e8m0_used, is_deep_gemm_supported, @@ -247,7 +248,6 @@ class W8A8BlockFp8LinearOp: self.act_quant_group_shape = act_quant_group_shape self.is_deep_gemm_supported = is_deep_gemm_supported() self.is_hopper = current_platform.is_device_capability(90) - self.is_blackwell = current_platform.is_device_capability_family(100) self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used() # Get the correct blockscale mul and input quant operations. 
@@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp: weight: torch.Tensor, weight_scale: torch.Tensor, ) -> torch.Tensor: - if self.use_deep_gemm_e8m0 and self.is_blackwell: + if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0: q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm( input_2d, group_size=self.act_quant_group_shape.col, @@ -1252,6 +1252,14 @@ def validate_fp8_block_shape( """Validate block quantization shapes for tensor parallelism.""" from vllm.distributed import get_tensor_model_parallel_world_size + if getattr(layer, "allow_fp8_block_shape_mismatch", False): + logger.debug( + "Skipping FP8 block shape validation for layer %s due to detected" + " mismatch allowance.", + getattr(layer, "prefix", ""), + ) + return + tp_size = getattr(layer, "tp_size", get_tensor_model_parallel_world_size()) block_n, block_k = block_size[0], block_size[1] diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 072b46f055210..3de2b6509e460 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -48,7 +48,7 @@ def query_marlin_supported_quant_types( -1 if capability_tuple is None else capability_tuple.to_int() ) - if device_capability < 80: + if device_capability < 75: return [] # - has_zp is True: return quant_types that has zero points @@ -594,9 +594,15 @@ def apply_awq_marlin_linear( a_scales = None if input_dtype == torch.int8: + assert quant_type == scalar_types.uint4, ( + "W8A8-INT8 is not supported by marlin kernel." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) a_scales = a_scales * input_global_scale elif input_dtype == torch.float8_e4m3fn: + assert quant_type == scalar_types.uint4, ( + "INT8 weight + FP8 activation is not supported." 
+ ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) output = ops.gptq_marlin_gemm( @@ -649,9 +655,15 @@ def apply_rtn_marlin_linear( a_scales = None if input_dtype == torch.int8: + assert quant_type == scalar_types.uint4b8, ( + "W8A8-INT8 is not supported by marlin kernel." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) a_scales = a_scales * input_global_scale elif input_dtype == torch.float8_e4m3fn: + assert quant_type == scalar_types.uint4b8, ( + "INT8 weight + FP8 activation is not supported." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) output = ops.gptq_marlin_gemm( diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index b94d5bbf36540..4d0a34c3be119 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -23,7 +23,7 @@ logger = init_logger(__name__) def is_fp4_marlin_supported(): - return current_platform.has_device_capability(80) + return current_platform.has_device_capability(75) def nvfp4_marlin_process_scales(marlin_scales): @@ -154,6 +154,12 @@ def prepare_fp4_layer_for_marlin( ) is_nvfp4 = hasattr(layer, "weight_scale_2") + if input_dtype is not None and input_dtype.itemsize == 1: + if is_nvfp4: + raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.") + elif input_dtype != torch.float8_e4m3fn: + raise RuntimeError("MXFP4 weight + INT8 activation is not supported.") + group_size = 16 if is_nvfp4 else 32 part_size_n = layer.output_size_per_partition @@ -231,6 +237,12 @@ def prepare_moe_fp4_layer_for_marlin( ) is_nvfp4 = hasattr(layer, "w13_weight_scale_2") + if input_dtype is not None and input_dtype.itemsize == 1: + if is_nvfp4: + raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.") + elif input_dtype != torch.float8_e4m3fn: + raise RuntimeError("MXFP4 
weight + INT8 activation is not supported.") + group_size = 16 if is_nvfp4 else 32 e = layer.num_experts diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index c67e4f437cf0c..4d2f2fd71ad36 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -11,7 +11,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales, - marlin_quant_input, should_use_atomic_add_reduce, ) from vllm.model_executor.utils import replace_parameter @@ -22,7 +21,7 @@ logger = init_logger(__name__) def is_fp8_marlin_supported(): - return current_platform.has_device_capability(80) + return current_platform.has_device_capability(75) def fp8_fused_exponent_bias_into_scales(scales): @@ -63,13 +62,11 @@ def apply_fp8_marlin_linear( inputs = reshaped_x a_scales = None if input_dtype is not None and input_dtype.itemsize == 1: - if input_dtype != torch.float8_e4m3fn: - raise RuntimeError("FP8 weight + INT8 activation is not supported.") - - inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn) + # inputs, a_scales = marlin_quant_input(inputs, torch.float8_e4m3fn) + raise RuntimeError("Marlin W8A8 is not supported.") output = ops.gptq_marlin_gemm( - a=reshaped_x, + a=inputs, c=None, b_q_weight=weight, b_bias=bias, @@ -102,6 +99,8 @@ def prepare_fp8_layer_for_marlin( "be used leveraging the Marlin kernel. This may degrade " "performance for compute-heavy workloads." 
) + if input_dtype is not None and input_dtype.itemsize == 1: + raise RuntimeError("Marlin W8A8 is not supported.") part_size_n = layer.output_size_per_partition part_size_k = layer.input_size_per_partition @@ -145,10 +144,20 @@ def prepare_fp8_layer_for_marlin( # marlin kernel only support channel-wise and group-wise quantization # we need to convert the scales if weight_block_size is None: + logical_widths = getattr(layer, "logical_widths", []) if scales.nelement() == 1: # tensor-wise quantization -> channel-wise quantization # (1, 1) =>(repeat)=> (1, size_n) scales = scales.view(1, 1).repeat_interleave(part_size_n, 1) + elif scales.nelement() == len(logical_widths): + # tensor-wise quantization with logical_widths -> + # channel-wise quantization + assert sum(logical_widths) == part_size_n, ( + f"Sum of logical_widths ({sum(logical_widths)}) must be equal " + f"to part_size_n ({part_size_n})" + ) + lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64) + scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1) elif scales.nelement() > 1 and scales.nelement() != part_size_n: assert part_size_n % scales.nelement() == 0 s_size = scales.nelement() @@ -199,6 +208,8 @@ def prepare_moe_fp8_layer_for_marlin( "be used leveraging the Marlin kernel. This may degrade " "performance for compute-heavy workloads." 
) + if input_dtype is not None and input_dtype.itemsize == 1: + raise RuntimeError("Marlin W8A8 is not supported.") e = layer.num_experts k = layer.hidden_size diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 50660c6ecc223..b86cd9f001d61 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -178,6 +178,37 @@ class ApplyRotaryEmb(CustomOp): output = output.to(origin_dtype) return output + def _pre_process( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Size, torch.dtype]: + origin_shape = x.shape + if len(origin_shape) == 3: + # x: [seq_len, num_heads, head_size] + x = x.unsqueeze(0) + + origin_dtype = x.dtype + if self.enable_fp32_compute: + x = x.float() + cos = cos.float() + sin = sin.float() + + return x, cos, sin, origin_shape, origin_dtype + + def _post_process( + self, + output: torch.Tensor, + origin_shape: torch.Size, + origin_dtype: torch.dtype, + ) -> torch.Tensor: + if len(origin_shape) == 3: + output = output.squeeze(0) + if self.enable_fp32_compute: + output = output.to(origin_dtype) + return output + def forward_native( self, x: torch.Tensor, @@ -197,16 +228,7 @@ class ApplyRotaryEmb(CustomOp): ) -> torch.Tensor: from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb - origin_dtype = x.dtype - if self.enable_fp32_compute: - x = x.float() - cos = cos.float() - sin = sin.float() - - origin_shape = x.shape - if len(origin_shape) == 3: - # x: [seq_len, num_heads, head_size] - x = x.unsqueeze(0) + x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin) """ Arguments of apply_rotary_emb() in vllm_flash_attn: @@ -218,10 +240,7 @@ class ApplyRotaryEmb(CustomOp): interleaved = not self.is_neox_style output = apply_rotary_emb(x, cos, sin, interleaved) - if len(origin_shape) == 3: - output = 
output.squeeze(0) - if self.enable_fp32_compute: - output = output.to(origin_dtype) + output = self._post_process(output, origin_shape, origin_dtype) return output def forward_hip( @@ -231,16 +250,7 @@ class ApplyRotaryEmb(CustomOp): sin: torch.Tensor, ) -> torch.Tensor: if self.apply_rotary_emb_flash_attn is not None: - origin_dtype = x.dtype - if self.enable_fp32_compute: - x = x.float() - cos = cos.float() - sin = sin.float() - - origin_shape = x.shape - if len(origin_shape) == 3: - # x: [seq_len, num_heads, head_size] - x = x.unsqueeze(0) + x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin) """ Arguments of apply_rotary() in flash_attn: @@ -254,10 +264,7 @@ class ApplyRotaryEmb(CustomOp): x, cos, sin, interleaved=interleaved ).type_as(x) - if len(origin_shape) == 3: - output = output.squeeze(0) - if self.enable_fp32_compute: - output = output.to(origin_dtype) + output = self._post_process(output, origin_shape, origin_dtype) else: # Falling back to PyTorch native implementation. 
output = self.forward_native(x, cos, sin) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 74b02e4c62583..08d7a851ac9ab 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -189,7 +189,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], ) convert_type = model_config.convert_type - if convert_type != "none" and supports_multimodal(model_cls): + if convert_type not in ["none", "mm_encoder_only"] and supports_multimodal( + model_cls + ): logger.debug_once("Detected conversion of Multi Modal model.") converted = try_create_mm_pooling_model_cls(model_cls) if converted is not None: @@ -200,6 +202,11 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], if convert_type == "none": pass + elif convert_type == "mm_encoder_only": + logger.debug_once("Converting to mm encoder only model.") + from vllm.model_executor.models.adapters import as_mm_encoder_only_model + + model_cls = as_mm_encoder_only_model(model_cls) elif convert_type == "embed": logger.debug_once("Converting to embedding model.") model_cls = as_embedding_model(model_cls) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 504de9fe10871..acf1e57a59a97 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -520,3 +520,64 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]): method = getattr(text_config, "method", None) assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported" return SEQ_CLS_LOAD_METHODS[method](model, weights) + + +def as_mm_encoder_only_model(cls: _T) -> _T: + """ + Subclass an existing vLLM vl model to support mm encoder only for + EPD encoder instances. + """ + if not hasattr(cls, "embed_multimodal"): + # Submodel case: return the original class. 
+ return cls + + if not hasattr(cls, "get_language_model_spec"): + raise TypeError(f"{cls} need to implement `get_language_model_spec` method.") + + lm_model_cls, lm_attr = cls.get_language_model_spec() + + if lm_model_cls is None or lm_attr is None: + raise TypeError( + f"{cls}.get_language_model_spec() must return (lm_model_cls, lm_attr)" + ) + + class DummyLM(nn.Module): + def __init__(self, *args, **kwargs): + self.make_empty_intermediate_tensors = None + + class ModelForMMEncoderOnly(cls): + def __init__( + self, + *, + vllm_config: "VllmConfig", + prefix: str = "", + **kwargs: Any, + ) -> None: + self.is_mm_encoder_only_model = True + origin_init = lm_model_cls.__init__ + try: + lm_model_cls.__init__ = DummyLM.__init__ + super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) + + if hasattr(self, lm_attr): + delattr(self, lm_attr) + finally: + lm_model_cls.__init__ = origin_init + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + from .utils import AutoWeightsLoader + + origin_init_ = AutoWeightsLoader.__init__ + + def _new_init_(self, *args, **kwargs): + origin_init_(self, *args, **kwargs) + self.skip_prefixes = (self.skip_prefixes or []) + [f"{lm_attr}."] + + try: + AutoWeightsLoader.__init__ = _new_init_ + result = super().load_weights(weights) + finally: + AutoWeightsLoader.__init__ = origin_init_ + return result + + return ModelForMMEncoderOnly # type: ignore diff --git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py index 98229c6d4ca1b..cf45fb9fe8370 100644 --- a/vllm/model_executor/models/bagel.py +++ b/vllm/model_executor/models/bagel.py @@ -487,7 +487,7 @@ class BagelForConditionalGeneration( # Split by image return tuple(vision_embeds) - def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: """Get multimodal embeddings from input.""" image_input = 
self._parse_and_validate_image_input(**kwargs) if image_input is None: diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index a3624b1cfa5f2..ccac8a6066429 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -401,7 +401,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): # of attention tokens that would fit mamba_page_size: # e.g. for mamba page size = 788kB # attn_1_token = 2kB -> fits ~394 tokens - # then round up to a mulitple of 256 -> 512 tokens + # then round up to a multiple of 256 -> 512 tokens # End result: # attn_block_size = 512 # mamba_block_size = 512 (aligned to a multiple of chunk_size) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 6670143cda250..22d43a4bae18a 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -878,8 +878,11 @@ class Indexer(nn.Module): ) q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1)) - q = torch.cat([q_pe.squeeze(0), q_nope], dim=-1) - k = torch.cat([k_pe.squeeze((0, 2)), k_nope], dim=-1) + # `rotary_emb` is shape-preserving; `q_pe` is already + # [num_tokens, n_head, rope_dim]. + q = torch.cat([q_pe, q_nope], dim=-1) + # `k_pe` is [num_tokens, 1, rope_dim] (MQA). + k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1) # we only quant q here since k quant is fused with cache insertion q = q.view(-1, self.head_dim) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 541d3b2beff83..6fb09be7c67f2 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -21,7 +21,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Inference-only GLM-4.5, GLM-4.6 model compatible with HuggingFace weights.""" +"""Inference-only GLM-4.5, GLM-4.6, GLM-4.7 model +compatible with HuggingFace weights.""" import typing from collections.abc import Callable, Iterable diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index cb99d57e8b8c7..67c65a44dcf7f 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -141,6 +141,14 @@ class SupportsMultiModal(Protocol): """ ... + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return None, None + @overload def embed_input_ids(self, input_ids: Tensor) -> Tensor: ... @@ -302,6 +310,10 @@ def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool: return getattr(model, "supports_encoder_tp_data", False) +def supports_mm_encoder_only(model: type[object] | object) -> bool: + return getattr(model, "is_mm_encoder_only_model", False) + + @overload def supports_multimodal_pruning( model: type[object], diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index c5441283f9711..774737387639b 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -46,7 +46,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, ZeroExpertFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -179,7 +179,7 @@ class FlashConfig(PretrainedConfig): self.intermediate_size = ( self.ffn_hidden_size if 
hasattr(self, "ffn_hidden_size") - else self.intermediate_size + else intermediate_size ) if hasattr(self, "moe_intermediate_size"): self.moe_intermediate_size = self.moe_intermediate_size @@ -280,10 +280,6 @@ class LongcatMoe(nn.Module): ): super().__init__() self.hidden_size = hidden_size - self.zero_expert_num = config.zero_expert_num - self.zero_expert_type = config.zero_expert_type - self.routed_scaling_factor = config.routed_scaling_factor - self.enable_eplb = enable_eplb # Gate always runs at half / full precision for now. self.rounter_params_dtype = params_dtype if config.router_dtype == "float32": @@ -291,25 +287,27 @@ class LongcatMoe(nn.Module): self.router = LongcatRouter( config=config, - zero_expert_num=self.zero_expert_num, + zero_expert_num=config.zero_expert_num, rounter_params_dtype=self.rounter_params_dtype, prefix=f"{prefix}.gate", ) - self.experts = FusedMoE( + assert config.zero_expert_num is not None + assert config.zero_expert_type is not None + self.experts = ZeroExpertFusedMoE( + zero_expert_num=config.zero_expert_num, + zero_expert_type=config.zero_expert_type, + router=self.router, num_experts=num_experts, top_k=top_k, hidden_size=hidden_size, intermediate_size=intermediate_size, reduce_results=True, params_dtype=params_dtype, - e_score_correction_bias=self.router.e_score_correction_bias, renormalize=False, quant_config=quant_config, prefix=f"{prefix}.experts", - zero_expert_num=self.zero_expert_num, - zero_expert_type=self.zero_expert_type, - enable_eplb=self.enable_eplb, + enable_eplb=enable_eplb, routed_scaling_factor=config.routed_scaling_factor, ) @@ -317,11 +315,34 @@ class LongcatMoe(nn.Module): num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - router_logits = self.router(hidden_states.to(self.rounter_params_dtype)) - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=router_logits + # Align to FusedMoE padded hidden size to avoid dim mismatch + 
padded_hidden = self.experts.hidden_size + if hidden_dim < padded_hidden: + hidden_states_padded = torch.nn.functional.pad( + hidden_states, + (0, padded_hidden - hidden_dim), + mode="constant", + value=0.0, + ) + else: + hidden_states_padded = hidden_states + + router_logits_full = self.router( + hidden_states_padded.to(self.rounter_params_dtype) ) + # ZeroExpertFusedMoE handles routing memoization and zero expert computation + # internally. Pass full router_logits (including zero experts) so that + # zero experts can be properly identified in routing. + final_hidden_states = self.experts( + hidden_states=hidden_states_padded, + router_logits=router_logits_full, # Full logits (includes zero experts) + ) + + # Crop back to original hidden dimension if padded earlier + if padded_hidden != hidden_dim: + final_hidden_states = final_hidden_states[..., :hidden_dim] + return final_hidden_states.view(num_tokens, hidden_dim) @@ -419,6 +440,7 @@ class FlashDecoderLayer(nn.Module): hidden_states = self.self_attn[0]( positions=positions, hidden_states=hidden_states, + llama_4_scaling=None, ) hidden_states, residual = self.post_attention_layernorm[0]( @@ -438,6 +460,7 @@ class FlashDecoderLayer(nn.Module): hidden_states = self.self_attn[1]( positions=positions, hidden_states=hidden_states, + llama_4_scaling=None, ) hidden_states, residual = self.post_attention_layernorm[1]( hidden_states, residual diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py new file mode 100644 index 0000000000000..12b486f001e03 --- /dev/null +++ b/vllm/model_executor/models/mimo_v2_flash.py @@ -0,0 +1,720 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable +from itertools import islice + +import torch +from torch import nn + +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention +from vllm.config import ( + 
CacheConfig, + VllmConfig, + get_current_vllm_config, + str_dtype_to_torch_dtype, +) +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import sequence_parallel_chunk +from vllm.sequence import IntermediateTensors + +from .interfaces import MixtureOfExperts, SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class MiMoV2MLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: QuantizationConfig | None = None, + reduce_results: bool = True, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + 
reduce_results=reduce_results, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {hidden_act}. Only silu is supported for now." + ) + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class MiMoV2MoE(nn.Module): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + is_nextn: bool = False, + ): + super().__init__() + + config = vllm_config.model_config.hf_text_config + parallel_config = vllm_config.parallel_config + quant_config = vllm_config.quant_config + + self.tp_size = get_tensor_model_parallel_world_size() + + self.ep_group = get_ep_group().device_group + self.ep_rank = get_ep_group().rank_in_group + self.ep_size = self.ep_group.size() + self.n_routed_experts = config.n_routed_experts + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + + if self.tp_size > config.n_routed_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {config.n_routed_experts}." + ) + + if config.hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now." 
+ ) + + vllm_config = get_current_vllm_config() + eplb_config = vllm_config.parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + + self.n_logical_experts = self.n_routed_experts + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = self.ep_rank * self.n_local_physical_experts + self.physical_expert_end = ( + self.physical_expert_start + self.n_local_physical_experts + ) + + dtype = getattr(config, "moe_router_dtype", "float32") + self.gate_dtype = str_dtype_to_torch_dtype(dtype) + self.gate = nn.Linear( + config.hidden_size, + config.n_routed_experts, + bias=False, + dtype=self.gate_dtype, + ) + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.n_routed_experts, dtype=self.gate_dtype) + ) + + self.experts = FusedMoE( + num_experts=self.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=True, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts", + e_score_correction_bias=self.gate.e_score_correction_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + scoring_func="sigmoid", + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + assert hidden_states.dim() <= 2, "MiMoV2MoE only supports 1D or 2D inputs" + is_input_1d = hidden_states.dim() == 1 + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) + + if self.gate_dtype is not None: + gate_input = 
hidden_states.to(self.gate_dtype) + else: + gate_input = hidden_states + router_logits = self.gate(gate_input) + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0 + ) + final_hidden_states = final_hidden_states[:num_tokens] + + return final_hidden_states.squeeze(0) if is_input_1d else final_hidden_states + + +class MiMoV2Attention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + v_head_dim: int | None = None, + sliding_window_size: int = -1, + attention_bias: bool = False, + add_swa_attention_sink_bias: bool = False, + layer_id: int = 0, + rope_theta: float = 1000000, + max_position_embeddings: int = 32768, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + partial_rotary_factor: float = 1.0, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + self.layer_id = layer_id + tp_size = get_tensor_model_parallel_world_size() + + self.total_num_heads = num_heads + self.num_heads = self.total_num_heads // tp_size + + self.total_num_kv_heads = num_kv_heads + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + + self.head_dim = head_dim + + self.v_head_dim = v_head_dim if v_head_dim is not None else head_dim + + self.q_size = self.num_heads * self.head_dim + self.k_size = self.num_kv_heads * self.head_dim + self.v_size = self.num_kv_heads * self.v_head_dim + + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=attention_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + v_head_size=self.v_head_dim, + ) + + self.o_proj = RowParallelLinear( + 
self.total_num_heads * self.v_head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=True, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + head_size=self.head_dim, + max_position=max_position_embeddings, + rope_parameters={ + "rope_type": "default", + "rope_theta": rope_theta, + "partial_rotary_factor": partial_rotary_factor, + }, + ) + + self.attention_sink_bias = ( + torch.nn.Parameter(torch.empty(self.num_heads), requires_grad=False) + if add_swa_attention_sink_bias + else None + ) + + sliding_window = sliding_window_size if sliding_window_size > -1 else None + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=sliding_window, + attn_type=AttentionType.DECODER, + prefix=f"{prefix}.attn", + sinks=self.attention_sink_bias, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + + v = v.view(-1, self.num_kv_heads, self.v_head_dim) + v = torch.nn.functional.pad(v, [0, self.head_dim - self.v_head_dim], value=0) + v = v.view(-1, self.num_kv_heads * self.head_dim) + + attn_output = self.attn(q, k, v) + + attn_output = attn_output.view(-1, self.num_heads, self.head_dim)[ + ..., : self.v_head_dim + ].reshape(-1, self.num_heads * self.v_head_dim) + + output, _ = self.o_proj(attn_output) + return output + + +class MiMoV2FlashDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_text_config + quant_config = vllm_config.quant_config + layer_id = extract_layer_index(prefix) + + self.hidden_size = config.hidden_size + self.config = config + self.layer_id = layer_id + + rope_theta = 
getattr(config, "rope_theta", 1000000) + max_position_embeddings = getattr(config, "max_position_embeddings", 32768) + + if self.is_compressed_softmax_layer(): + self.self_attn = MiMoV2Attention( + hidden_size=self.hidden_size, + num_heads=config.swa_num_attention_heads, + num_kv_heads=config.swa_num_key_value_heads, + head_dim=config.swa_head_dim, + v_head_dim=getattr(config, "swa_v_head_dim", None), + sliding_window_size=config.sliding_window_size, + attention_bias=config.attention_bias, + add_swa_attention_sink_bias=getattr( + config, "add_swa_attention_sink_bias", False + ), + layer_id=layer_id, + rope_theta=getattr(config, "swa_rope_theta", rope_theta), + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0), + prefix=f"{prefix}.self_attn", + ) + else: + self.self_attn = MiMoV2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + head_dim=config.head_dim, + v_head_dim=getattr(config, "v_head_dim", None), + sliding_window_size=-1, # normal attention + attention_bias=config.attention_bias, + layer_id=layer_id, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0), + prefix=f"{prefix}.self_attn", + ) + + self.is_layer_sparse = self.is_moe_layer(layer_id) + if self.is_layer_sparse: + self.mlp = MiMoV2MoE( + vllm_config=vllm_config, + prefix=f"{prefix}.mlp", + ) + else: + self.mlp = MiMoV2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.layernorm_epsilon + ) + + def forward( + self, + positions: 
torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + def is_moe_layer(self, layer_idx: int) -> bool: + return ( + hasattr(self.config, "moe_layer_freq") + and layer_idx >= 0 + and not isinstance(self.config.moe_layer_freq, int) + and self.config.moe_layer_freq[layer_idx] + ) + + def is_compressed_softmax_layer(self) -> bool: + return self.config.hybrid_layer_pattern[self.layer_id] == 1 + + +class MiMoV2Model(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config.get_text_config() + quant_config = vllm_config.quant_config + eplb_config = vllm_config.parallel_config.eplb_config + + self.config = config + self.quant_config = quant_config + self.vocab_size = config.vocab_size + self.num_redundant_experts = eplb_config.num_redundant_experts + self.v_scale = getattr(config, "attention_value_scale", None) + + if get_pp_group().is_first_rank or ( + config.tie_word_embeddings and get_pp_group().is_last_rank + ): + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MiMoV2FlashDecoderLayer( + vllm_config=vllm_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + + self.make_empty_intermediate_tensors = 
make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon) + else: + self.norm = PPMissingLayer() + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for idx, layer in enumerate( + islice(self.layers, self.start_layer, self.end_layer) + ): + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts, + num_redundant_experts=self.num_redundant_experts, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + 
("gate_up_proj", "up_proj", 1), + ] + + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: + continue + if "mtp" in name: + continue + + if self.quant_config is not None: + cache_scale_name = self.quant_config.get_cache_scale(name) + if cache_scale_name is not None and cache_scale_name in params_dict: + param = params_dict[cache_scale_name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + + kv_scale = loaded_weight + if kv_scale.dim() > 0 and kv_scale.numel() > 1: + kv_scale = kv_scale.view(-1)[0] + + weight_loader(param, kv_scale) + loaded_params.add(cache_scale_name) + continue + + expert_matched = False + for param_name, weight_name, expert_id, shard_id in expert_params_mapping: + if weight_name not in name: + continue + + name_rewritten = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name_rewritten, self): + continue + + if ( + name_rewritten.endswith(".bias") or name_rewritten.endswith("_bias") + ) and name_rewritten not in params_dict: + continue + + if name_rewritten not in params_dict: + continue + + param = params_dict[name_rewritten] + weight_loader = param.weight_loader + + weight_loader( + param, + loaded_weight, + name_rewritten, + shard_id=shard_id, + expert_id=expert_id, + ) + loaded_params.add(name_rewritten) + expert_matched = True + break + + if expert_matched: + continue + + stacked_matched = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name_rewritten = name.replace(weight_name, param_name) + + if ( + name_rewritten.endswith(".bias") + and name_rewritten not in params_dict 
+ ): + continue + + if is_pp_missing_parameter(name_rewritten, self): + continue + + if name_rewritten not in params_dict: + continue + + param = params_dict[name_rewritten] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + + if param_name == "qkv_proj" and shard_id == "v": + v_scale = ( + self.v_scale + if self.v_scale is not None + else getattr(self.config, "attention_value_scale", None) + ) + if v_scale is not None and ( + name.endswith("weight_scale_inv") or name.endswith(".bias") + ): + loaded_weight *= float(v_scale) + + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(name_rewritten) + + stacked_matched = True + break + + if stacked_matched: + continue + + if name.endswith(".bias") and name not in params_dict: + continue + + orig_name = name + mapped_name = maybe_remap_kv_scale_name(name, params_dict) + name = mapped_name if mapped_name is not None else orig_name + + if name not in params_dict: + continue + + param = params_dict[name] + + if "attention_sink_bias" in name: + total_heads = loaded_weight.shape[0] + heads_per_rank = total_heads // tp_size + head_start = tp_rank * heads_per_rank + narrow_weight = loaded_weight.narrow(0, head_start, heads_per_rank) + + param.data.copy_(narrow_weight) + loaded_params.add(name) + else: + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params + + +class MiMoV2FlashForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.config = config + self.quant_config = quant_config + self.model = MiMoV2Model( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + 
quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index b730ac0315893..0b44ff622f05b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -34,7 +34,7 @@ import einops import torch import torch.nn as nn import torch.nn.functional as F -from transformers import BatchFeature +from transformers import BatchFeature, Qwen2ForCausalLM from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl 
import ( Qwen2_5_VLConfig, @@ -1567,3 +1567,11 @@ class Qwen2_5_VLForConditionalGeneration( connector="visual.merger.", tower_model="visual.", ) + + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return Qwen2ForCausalLM, "language_model" diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 089129e443c01..5ca6b3d852ac3 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -323,7 +323,7 @@ class Qwen3Omni_VisionTransformer(nn.Module): hidden_size=self.hidden_size, ) - # vit pos embeding, TODO: spatial_patch_size vs patch_size + # vit pos embedding, TODO: spatial_patch_size vs patch_size if self.apply_vit_abs_pos_embed: self.pos_embed = nn.Embedding(self.num_grid_per_side**2, self.hidden_size) else: diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4838f68e06f70..fea73557f1e82 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -2090,3 +2090,11 @@ class Qwen3VLForConditionalGeneration( connector="visual.merger", tower_model="visual.", ) + + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return Qwen3LLMForCausalLM, "language_model" diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index d332f51152484..3ba61b52cfdf1 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -152,6 +152,7 @@ _TEXT_GENERATION_MODELS = { "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiMoForCausalLM": ("mimo", "MiMoForCausalLM"), + "MiMoV2FlashForCausalLM": 
("mimo_v2_flash", "MiMoV2FlashForCausalLM"), "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "NemotronHForCausalLM": ("nemotron_h", "NemotronHForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py index 9d77dee2810c3..fcf9a0d077abe 100644 --- a/vllm/model_executor/models/transformers/multimodal.py +++ b/vllm/model_executor/models/transformers/multimodal.py @@ -22,6 +22,7 @@ from typing import TYPE_CHECKING import torch from vllm.config.utils import getattr_iter +from vllm.logger import init_logger from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal from vllm.model_executor.models.utils import WeightsMapper from vllm.multimodal import MultiModalKwargsItems @@ -36,6 +37,7 @@ from vllm.multimodal.inputs import ( from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors if TYPE_CHECKING: @@ -52,6 +54,8 @@ DYNAMIC_ARG_DIMS = { "inputs_embeds": 0, } +logger = init_logger(__name__) + class MultiModalProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self): @@ -345,8 +349,29 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE): num_image_patches = kwargs.pop("num_image_patches") kwargs.pop("token_type_ids", None) # used only in `forward` + if pixel_values is not None: - vision_embeddings = self.model.get_image_features(pixel_values, **kwargs) + # ROCm: Force math SDP backend for vision encoder to avoid accuracy issues + # with flash_sdp and mem_efficient_sdp + if current_platform.is_rocm(): + # TODO: [ROCm] Fix accuracy issues with flash backend + logger.debug( + "ROCm platform detected. 
Forcing math SDP backend " + "for vision encoder. Currently ROCm platform has " + "accuracy issues with `flash_sdp` and " + "`mem_efficient_sdp` backends. See issue: " + "https://github.com/vllm-project/vllm/issues/30167" + ) + with torch.nn.attention.sdpa_kernel( + backends=[torch.nn.attention.SDPBackend.MATH] + ): + vision_embeddings = self.model.get_image_features( + pixel_values, **kwargs + ) + else: + vision_embeddings = self.model.get_image_features( + pixel_values, **kwargs + ) if isinstance(vision_embeddings, torch.Tensor): if vision_embeddings.ndim == 2: @@ -364,6 +389,11 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE): ] return vision_embeddings + else: + logger.debug( + "No pixel values or image embeddings provided for multimodal embedding." + ) + return None def get_mrope_input_positions( self, diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index a69afc3176cab..64c03f8d4da94 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -126,6 +126,30 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): return {} +def validate_embedding_ndim( + tensor: torch.Tensor, + modality: str, + index: int | None = None, +) -> None: + """Validate tensor ndim for multimodal embeddings. + + Single embeddings should be 2D (seq_len, hidden_size). + Batched embeddings should be 3D (batch, seq_len, hidden_size). + + Args: + tensor: The tensor to validate. + modality: The modality name for error messages (e.g., "image", "audio"). + index: Optional index for list items, included in error messages.
+ """ + if tensor.ndim < 2 or tensor.ndim > 3: + idx_str = f" [{index}]" if index is not None else "" + raise ValueError( + f"{modality.capitalize()} embedding{idx_str} must be 2D " + f"(seq_len, hidden_size) or 3D (batch, seq_len, hidden_size), " + f"got {tensor.ndim}D tensor with shape {tuple(tensor.shape)}" + ) + + class EmbeddingItems( ModalityDataItems[torch.Tensor | list[torch.Tensor], torch.Tensor] ): @@ -134,6 +158,63 @@ class EmbeddingItems( or a list of embedding tensors (one per item). """ + def __init__( + self, + data: torch.Tensor | list[torch.Tensor], + modality: str, + expected_hidden_size: int | None = None, + ) -> None: + super().__init__(data, modality) + + # Validate ndim first (before hidden_size which depends on correct ndim) + self._validate_ndim() + + # Validate hidden dimension if expected size is provided + if expected_hidden_size is not None: + self._validate_hidden_size(expected_hidden_size) + + def _validate_ndim(self) -> None: + """Validate that embedding tensors have correct ndim (2D or 3D).""" + if isinstance(self.data, torch.Tensor): + validate_embedding_ndim(self.data, self.modality) + else: + # List of tensors: each should be 2D (seq_len, hidden_size) + for idx, tensor in enumerate(self.data): + if tensor.ndim != 2: + raise ValueError( + f"{self.modality.capitalize()} embedding [{idx}] must be " + f"2D (seq_len, hidden_size), got {tensor.ndim}D tensor " + f"with shape {tuple(tensor.shape)}" + ) + + def _validate_hidden_size(self, expected_hidden_size: int) -> None: + """Validate that embedding hidden dimension matches expected size. + + This validates hidden dimensions to prevent vulnerabilities: Embeddings + with correct ndim but wrong hidden dimension could bypass initial + checks and cause crashes during model inference when dimensions don't match. 
+ """ + if isinstance(self.data, torch.Tensor): + # Batched tensor: shape is (batch, seq_len, hidden_size) + actual_hidden_size = self.data.shape[-1] + if actual_hidden_size != expected_hidden_size: + raise ValueError( + f"{self.modality.capitalize()} embedding hidden dimension " + f"mismatch: got {actual_hidden_size}, but model expects " + f"{expected_hidden_size}. Embedding shape: {tuple(self.data.shape)}" + ) + else: + # List of tensors: each has shape (seq_len, hidden_size) + for idx, tensor in enumerate(self.data): + actual_hidden_size = tensor.shape[-1] + if actual_hidden_size != expected_hidden_size: + raise ValueError( + f"{self.modality.capitalize()} embedding [{idx}] hidden " + f"dimension mismatch: got {actual_hidden_size}, but model " + f"expects {expected_hidden_size}. " + f"Embedding shape: {tuple(tensor.shape)}" + ) + def _unwrap( self, item: torch.Tensor | MediaWithBytes[torch.Tensor] ) -> torch.Tensor: @@ -228,8 +309,12 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): class AudioEmbeddingItems(EmbeddingItems): - def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None: - super().__init__(data, "audio") + def __init__( + self, + data: torch.Tensor | list[torch.Tensor], + expected_hidden_size: int | None = None, + ) -> None: + super().__init__(data, "audio", expected_hidden_size) class ImageSize(NamedTuple): @@ -256,8 +341,12 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): class ImageEmbeddingItems(EmbeddingItems): - def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None: - super().__init__(data, "image") + def __init__( + self, + data: torch.Tensor | list[torch.Tensor], + expected_hidden_size: int | None = None, + ) -> None: + super().__init__(data, "image", expected_hidden_size) class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): @@ -287,8 +376,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): class VideoEmbeddingItems(EmbeddingItems): - def __init__(self, data: 
torch.Tensor | list[torch.Tensor]) -> None: - super().__init__(data, "video") + def __init__( + self, + data: torch.Tensor | list[torch.Tensor], + expected_hidden_size: int | None = None, + ) -> None: + super().__init__(data, "video", expected_hidden_size) _D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) @@ -363,6 +456,10 @@ class MultiModalDataParser: Args: target_sr (float, optional): Enables automatic resampling of audio items to the model's expected sampling rate. + expected_hidden_size (int, optional): Expected hidden dimension for + embedding inputs. If provided, validates that user-supplied + embeddings have the correct hidden size to prevent crashes + during model inference. """ def __init__( @@ -371,6 +468,7 @@ class MultiModalDataParser: target_sr: float | None = None, audio_resample_method: Literal["librosa", "scipy"] = "librosa", video_needs_metadata: bool = False, + expected_hidden_size: int | None = None, ) -> None: super().__init__() @@ -379,6 +477,7 @@ class MultiModalDataParser: method=audio_resample_method, ) self.video_needs_metadata = video_needs_metadata + self.expected_hidden_size = expected_hidden_size @classmethod def is_embeddings( @@ -443,7 +542,7 @@ class MultiModalDataParser: return None if self.is_embeddings(data): - return AudioEmbeddingItems(data) + return AudioEmbeddingItems(data, self.expected_hidden_size) data_items: list[AudioItem] if ( @@ -481,7 +580,7 @@ class MultiModalDataParser: return None if self.is_embeddings(data): - return ImageEmbeddingItems(data) + return ImageEmbeddingItems(data, self.expected_hidden_size) if ( isinstance(data, (PILImage.Image, MediaWithBytes)) @@ -507,7 +606,7 @@ class MultiModalDataParser: return None if self.is_embeddings(data): - return VideoEmbeddingItems(data) + return VideoEmbeddingItems(data, self.expected_hidden_size) data_items: list[VideoItem] if ( diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0390773783961..3bbdab3b393c5 100644 --- 
a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1330,7 +1330,15 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser] that has additional subparsers. """ - return MultiModalDataParser() + # Get expected hidden size for embedding validation if mm_embeds enabled + # This validates hidden dimensions to prevent vulnerabilities: embeddings + # with correct ndim but wrong shape could cause crashes at inference time + mm_config = self.info.ctx.model_config.get_multimodal_config() + expected_hidden_size = None + if mm_config.enable_mm_embeds: + expected_hidden_size = self.info.ctx.model_config.get_inputs_embeds_size() + + return MultiModalDataParser(expected_hidden_size=expected_hidden_size) def validate_num_items( self, diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index c237f7cf887c1..b95287906c1fe 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Optional import torch import vllm.envs as envs +from vllm.attention.backends.abstract import AttentionType from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.torch_utils import cuda_device_count_stateless @@ -204,7 +205,7 @@ class RocmPlatform(Platform): assert block_size == 1, ( "Sparse MLA backend on ROCm only supports block size 1 for now." 
) - logger.info_once("Using Sparse MLA backend on V1 engine.") + logger.info_once("Using Sparse MLA backend.") return AttentionBackendEnum.ROCM_AITER_MLA_SPARSE.get_path() if attn_selector_config.use_mla: @@ -239,16 +240,16 @@ class RocmPlatform(Platform): return AttentionBackendEnum.FLEX_ATTENTION.get_path() if selected_backend == AttentionBackendEnum.TRITON_ATTN: - logger.info("Using Triton Attention backend on V1 engine.") + logger.info("Using Triton Attention backend.") return AttentionBackendEnum.TRITON_ATTN.get_path() if selected_backend == AttentionBackendEnum.ROCM_ATTN: - logger.info("Using Rocm Attention backend on V1 engine.") + logger.info("Using Rocm Attention backend.") return AttentionBackendEnum.ROCM_ATTN.get_path() if selected_backend == AttentionBackendEnum.ROCM_AITER_FA: if on_gfx9(): - logger.info("Using Aiter Flash Attention backend on V1 engine.") + logger.info("Using Aiter Flash Attention backend.") return AttentionBackendEnum.ROCM_AITER_FA.get_path() else: raise ValueError( @@ -257,25 +258,25 @@ class RocmPlatform(Platform): ) if selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN: - logger.info("Using Aiter Unified Attention backend on V1 engine.") + logger.info("Using Aiter Unified Attention backend.") return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path() # Handle automatic backend selection based on environment variables if selected_backend is None: # Priority 1: Check for AITER Unified Attention (must check before MHA) if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: - logger.info("Using Aiter Unified Attention backend on V1 engine.") + logger.info("Using Aiter Unified Attention backend.") return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path() # Priority 2: Check for AITER MHA (Flash Attention) # Only use if explicitly enabled (not just VLLM_ROCM_USE_AITER=1) if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9(): - logger.info("Using Aiter Flash Attention 
backend on V1 engine.") + logger.info("Using Aiter Flash Attention backend.") return AttentionBackendEnum.ROCM_AITER_FA.get_path() # Priority 3: Check for ROCM_ATTN (prefill-decode split) if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION: - logger.info("Using Rocm Attention backend on V1 engine.") + logger.info("Using Rocm Attention backend.") return AttentionBackendEnum.ROCM_ATTN.get_path() # Priority 4: Check for AITER enabled without specific flags @@ -285,11 +286,19 @@ class RocmPlatform(Platform): and on_gfx9() and envs.VLLM_ROCM_USE_AITER_MHA is not False ): - logger.info("Using Aiter Flash Attention backend on V1 engine.") + logger.info("Using Aiter Flash Attention backend.") return AttentionBackendEnum.ROCM_AITER_FA.get_path() + # Priority 5: If model is Encoder-only self-attention type + if ( + attn_selector_config.attn_type is not None + and attn_selector_config.attn_type == AttentionType.ENCODER_ONLY + ): + logger.info("Using FlexAttention backend.") + return AttentionBackendEnum.FLEX_ATTENTION.get_path() + # Default: Triton Unified Attention - logger.info("Using Triton Attention backend on V1 engine.") + logger.info("Using Triton Attention backend.") return AttentionBackendEnum.TRITON_ATTN.get_path() raise RuntimeError( @@ -324,14 +333,19 @@ class RocmPlatform(Platform): from vllm._aiter_ops import rocm_aiter_ops - if rocm_aiter_ops.is_mha_enabled(): - # Note: AITER FA is only supported for Qwen-VL models. - # TODO: Add support for other VL models in their model class. 
+ if rocm_aiter_ops.is_enabled(): + logger.info_once("Using AITER Flash Attention backend for ViT model.") return AttentionBackendEnum.ROCM_AITER_FA - if on_gfx9() and find_spec("flash_attn") is not None: + if ( + on_gfx9() + and find_spec("flash_attn") is not None + and (dtype == torch.float16 or dtype == torch.bfloat16) + ): + logger.info_once("Using Flash Attention backend for ViT model.") return AttentionBackendEnum.FLASH_ATTN + logger.info_once("Using Torch SDPA backend for ViT model.") return AttentionBackendEnum.TORCH_SDPA @classmethod @@ -394,7 +408,7 @@ class RocmPlatform(Platform): parallel_config = vllm_config.parallel_config is_eager_execution = compilation_config == CUDAGraphMode.NONE use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() - use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled() + use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enabled() if compilation_config.cudagraph_mode.has_full_cudagraphs(): # decode context parallel does not support full cudagraphs diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index de3d1296ec734..48a36b4c6634c 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -104,7 +104,7 @@ class MistralReasoningParser(BaseThinkingReasoningParser): # 3. Both BOT and EOT have been outputted. elif has_bot_token and has_eot_token: return input_ids[:bot_token_index] + input_ids[eot_token_index + 1 :] - # 4. Only EOT has been outputted => this should not have occured for a model + # 4. Only EOT has been outputted => this should not have occurred for a model # well prompted and trained. 
else: return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :] diff --git a/vllm/tool_parsers/__init__.py b/vllm/tool_parsers/__init__.py index 181d8bcba9553..ee92727e1c9a4 100644 --- a/vllm/tool_parsers/__init__.py +++ b/vllm/tool_parsers/__init__.py @@ -42,6 +42,10 @@ _TOOL_PARSERS_TO_REGISTER = { "glm4_moe_tool_parser", "Glm4MoeModelToolParser", ), + "glm47": ( + "glm47_moe_tool_parser", + "Glm47MoeModelToolParser", + ), "granite-20b-fc": ( "granite_20b_fc_tool_parser", "Granite20bFCToolParser", diff --git a/vllm/tool_parsers/glm47_moe_tool_parser.py b/vllm/tool_parsers/glm47_moe_tool_parser.py new file mode 100644 index 0000000000000..ae42a640d9413 --- /dev/null +++ b/vllm/tool_parsers/glm47_moe_tool_parser.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import regex as re + +from vllm.logger import init_logger +from vllm.tokenizers import TokenizerLike +from vllm.tool_parsers.glm4_moe_tool_parser import Glm4MoeModelToolParser + +logger = init_logger(__name__) + + +class Glm47MoeModelToolParser(Glm4MoeModelToolParser): + def __init__(self, tokenizer: TokenizerLike): + super().__init__(tokenizer) + self.func_detail_regex = re.compile( + r"(.*?)(.*?)?", re.DOTALL + ) + self.func_arg_regex = re.compile( + r"(.*?)(?:\\n|\s)*(.*?)", + re.DOTALL, + ) diff --git a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py index a1ab75f548bfc..67bd0e61620da 100644 --- a/vllm/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/tool_parsers/minimax_m2_tool_parser.py @@ -138,37 +138,167 @@ class MinimaxM2ToolParser(ToolParser): return name_str def _convert_param_value(self, value: str, param_type: str) -> Any: - """Convert parameter value to the correct type.""" + """Convert parameter value to the correct type (legacy single-type version).""" + return self._convert_param_value_with_types(value, [param_type]) + + def 
_extract_types_from_schema(self, schema: Any) -> list[str]: + """ + Extract all possible types from a JSON schema definition. + Handles anyOf, oneOf, allOf, type arrays, and enum fields. + + Args: + schema: The JSON schema definition for a parameter + + Returns: + List of type strings (e.g., ["string", "integer", "null"]) + """ + if schema is None: + return ["string"] + + if not isinstance(schema, dict): + return ["string"] + + types: set[str] = set() + + # Handle direct "type" field + if "type" in schema: + type_value = schema["type"] + if isinstance(type_value, str): + types.add(type_value) + elif isinstance(type_value, list): + for t in type_value: + if isinstance(t, str): + types.add(t) + + # Handle enum - infer types from enum values + if "enum" in schema and isinstance(schema["enum"], list) and schema["enum"]: + for value in schema["enum"]: + if value is None: + types.add("null") + elif isinstance(value, bool): + types.add("boolean") + elif isinstance(value, int): + types.add("integer") + elif isinstance(value, float): + types.add("number") + elif isinstance(value, str): + types.add("string") + elif isinstance(value, list): + types.add("array") + elif isinstance(value, dict): + types.add("object") + + # Handle anyOf, oneOf, allOf - recursively extract types + for choice_field in ("anyOf", "oneOf", "allOf"): + if choice_field in schema and isinstance(schema[choice_field], list): + for choice in schema[choice_field]: + extracted = self._extract_types_from_schema(choice) + types.update(extracted) + + # If no types found, default to string + if not types: + return ["string"] + + return list(types) + + def _convert_param_value_with_types( + self, value: str, param_types: list[str] + ) -> Any: + """ + Convert parameter value to the correct type based on a list of possible types. + Tries each type in order until one succeeds. 
+ + Args: + value: The string value to convert + param_types: List of possible type strings + + Returns: + The converted value + """ if value.lower() == "null": return None - param_type = param_type.lower() - if param_type in ["string", "str", "text"]: + # Normalize types + normalized_types = [t.lower() for t in param_types] + + # Try null first if it's in the list + if "null" in normalized_types or value.lower() in ("null", "none", "nil"): + return None + + # Try each type in order of preference (most specific first, string as fallback) + # Priority: integer > number > boolean > object > array > string + type_priority = [ + "integer", + "int", + "number", + "float", + "boolean", + "bool", + "object", + "array", + "string", + "str", + "text", + ] + + for param_type in type_priority: + if param_type not in normalized_types: + continue + + if param_type in ["string", "str", "text"]: + return value + elif param_type in ["integer", "int"]: + try: + return int(value) + except (ValueError, TypeError): + continue + elif param_type in ["number", "float"]: + try: + val = float(value) + return val if val != int(val) else int(val) + except (ValueError, TypeError): + continue + elif param_type in ["boolean", "bool"]: + lower_val = value.lower().strip() + if lower_val in ["true", "1", "yes", "on"]: + return True + elif lower_val in ["false", "0", "no", "off"]: + return False + continue + elif param_type in ["object", "array"]: + try: + return json.loads(value) + except json.JSONDecodeError: + continue + + # Fallback: try JSON parse, then return as string + try: + return json.loads(value) + except json.JSONDecodeError: return value - elif param_type in ["integer", "int"]: - try: - return int(value) - except (ValueError, TypeError): - return value - elif param_type in ["number", "float"]: - try: - val = float(value) - return val if val != int(val) else int(val) - except (ValueError, TypeError): - return value - elif param_type in ["boolean", "bool"]: - return value.lower() in 
["true", "1"] - elif param_type in ["object", "array"]: - try: - return json.loads(value) - except json.JSONDecodeError: - return value - else: - # Try JSON parse first, fallback to string - try: - return json.loads(value) - except json.JSONDecodeError: - return value + + def _get_param_types_from_config( + self, param_name: str, param_config: dict + ) -> list[str]: + """ + Get parameter types from parameter configuration. + Handles anyOf, oneOf, allOf, and direct type definitions. + + Args: + param_name: The name of the parameter + param_config: The properties dict from the tool schema + + Returns: + List of type strings + """ + if param_name not in param_config: + return ["string"] + + param_schema = param_config[param_name] + if not isinstance(param_schema, dict): + return ["string"] + + return self._extract_types_from_schema(param_schema) def _parse_single_invoke( self, invoke_str: str, tools: list | None @@ -207,17 +337,11 @@ class MinimaxM2ToolParser(ToolParser): if param_value.endswith("\n"): param_value = param_value[:-1] - # Get parameter type - param_type = "string" - if ( - param_name in param_config - and isinstance(param_config[param_name], dict) - and "type" in param_config[param_name] - ): - param_type = param_config[param_name]["type"] + # Get parameter types (supports anyOf/oneOf/allOf) + param_type = self._get_param_types_from_config(param_name, param_config) # Convert value - param_dict[param_name] = self._convert_param_value( + param_dict[param_name] = self._convert_param_value_with_types( param_value, param_type ) @@ -593,7 +717,7 @@ class MinimaxM2ToolParser(ToolParser): # Store raw value for later processing self.accumulated_params[self.current_param_name] = param_value - # Get parameter configuration for type conversion + # Get parameter configuration with anyOf support param_config = {} if self.streaming_request and self.streaming_request.tools: for tool in self.streaming_request.tools: @@ -610,17 +734,12 @@ class 
MinimaxM2ToolParser(ToolParser): param_config = params["properties"] break - # Get parameter type - param_type = "string" - if ( - self.current_param_name in param_config - and isinstance(param_config[self.current_param_name], dict) - and "type" in param_config[self.current_param_name] - ): - param_type = param_config[self.current_param_name]["type"] + # Get parameter types (supports anyOf/oneOf/allOf) + param_type = self._get_param_types_from_config( + self.current_param_name, param_config + ) - # Convert param value to appropriate type - converted_value = self._convert_param_value( + converted_value = self._convert_param_value_with_types( param_value, param_type ) diff --git a/vllm/tool_parsers/openai_tool_parser.py b/vllm/tool_parsers/openai_tool_parser.py index db92ea8982d70..da1a9c773f78f 100644 --- a/vllm/tool_parsers/openai_tool_parser.py +++ b/vllm/tool_parsers/openai_tool_parser.py @@ -79,6 +79,15 @@ class OpenAIToolParser(ToolParser): elif msg.channel == "commentary" and not msg.recipient: commentary_content = msg_text + # Extract partial content from the parser state if the generation was truncated + if parser.current_content: + if parser.current_channel == "final": + final_content = parser.current_content + elif ( + parser.current_channel == "commentary" and not parser.current_recipient + ): + commentary_content = parser.current_content + return ExtractedToolCallInformation( tools_called=len(tool_calls) > 0, tool_calls=tool_calls, diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 3d4f8449ad3b6..56c9ca361eaef 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -32,15 +32,34 @@ class DeepGemmQuantScaleFMT(Enum): # element contains 4 scale values. 
UE8M0 = 2 - @staticmethod - def from_oracle() -> "DeepGemmQuantScaleFMT": - if not is_deep_gemm_e8m0_used(): - return DeepGemmQuantScaleFMT.FLOAT32 - return ( - DeepGemmQuantScaleFMT.UE8M0 - if current_platform.is_device_capability_family(100) - else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0 + @classmethod + def init_oracle_cache(cls) -> None: + """Initialize the oracle decision and store it in the class cache""" + cached = getattr(cls, "_oracle_cache", None) + if cached is not None: + return + + use_e8m0 = ( + envs.VLLM_USE_DEEP_GEMM_E8M0 + and is_deep_gemm_supported() + and (_fp8_gemm_nt_impl is not None) ) + if not use_e8m0: + cls._oracle_cache = cls.FLOAT32 # type: ignore + return + + cls._oracle_cache = ( # type: ignore + cls.UE8M0 + if current_platform.is_device_capability_family(100) + else cls.FLOAT32_CEIL_UE8M0 + ) + + @classmethod + def from_oracle(cls) -> "DeepGemmQuantScaleFMT": + """Return the pre-initialized oracle decision""" + cached = getattr(cls, "_oracle_cache", None) + assert cached is not None, "DeepGemmQuantScaleFMT oracle cache not initialized" + return cached @functools.cache @@ -149,6 +168,7 @@ def _lazy_init() -> None: _transform_sf_into_required_layout_impl = getattr( _dg, "transform_sf_into_required_layout", None ) + DeepGemmQuantScaleFMT.init_oracle_cache() def get_num_sms() -> int: @@ -369,7 +389,7 @@ def should_use_deepgemm_for_fp8_linear( # Verify DeepGEMM N/K dims requirements # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul - # test inside kernels/quatization/test_block_fp8.py + # test inside kernels/quantization/test_block_fp8.py N_MULTIPLE = 64 K_MULTIPLE = 128 diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a6ee241c41151..1cbe4718f2e5c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -4,6 +4,7 @@ import asyncio import os import socket import time +import warnings from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from 
typing import Any, cast @@ -627,6 +628,9 @@ class AsyncLLM(EngineClient): The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. + + NOTE: truncate_prompt_tokens is deprecated in v0.14. + TODO: Remove truncate_prompt_tokens in v0.15. """ try: @@ -641,9 +645,19 @@ class AsyncLLM(EngineClient): if tokenization_kwargs is None: tokenization_kwargs = {} + + if truncate_prompt_tokens is not None: + warnings.warn( + "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` " + "is deprecated and will be removed in v0.15. " + "Please use `pooling_params.truncate_prompt_tokens` instead.", + DeprecationWarning, + stacklevel=2, + ) + _validate_truncation_size( self.model_config.max_model_len, - truncate_prompt_tokens, + pooling_params.truncate_prompt_tokens, tokenization_kwargs, ) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9e2571201a684..40c3e9a515e18 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -43,9 +43,11 @@ from vllm.v1.core.kv_cache_utils import ( from vllm.v1.core.sched.interface import SchedulerInterface from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.engine import ( + EngineCoreOutput, EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, + FinishReason, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, @@ -1055,9 +1057,14 @@ class EngineCoreProc(EngineCore): request_type = EngineCoreRequestType(bytes(type_frame.buffer)) # Deserialize the request data. 
+ request: Any if request_type == EngineCoreRequestType.ADD: - request = add_request_decoder.decode(data_frames) - request = self.preprocess_add_request(request) + req: EngineCoreRequest = add_request_decoder.decode(data_frames) + try: + request = self.preprocess_add_request(req) + except Exception: + self._handle_request_preproc_error(req) + continue else: request = generic_decoder.decode(data_frames) @@ -1141,6 +1148,30 @@ class EngineCoreProc(EngineCore): # Limit the number of buffers to reuse. reuse_buffers.append(buffer) + def _handle_request_preproc_error(self, request: EngineCoreRequest) -> None: + """Log and return a request-scoped error response for exceptions raised + from the add request preprocessing in the input socket processing thread. + """ + logger.exception( + "Unexpected error pre-processing request %s", request.request_id + ) + self.output_queue.put_nowait( + ( + request.client_index, + EngineCoreOutputs( + engine_index=self.engine_index, + finished_requests={request.request_id}, + outputs=[ + EngineCoreOutput( + request_id=request.request_id, + new_token_ids=[], + finish_reason=FinishReason.ERROR, + ) + ], + ), + ) + ) + class DPEngineCoreProc(EngineCoreProc): """ZMQ-wrapper for running EngineCore in background process diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 807db8275fbf5..cacbc805e84f8 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -20,6 +20,7 @@ import zmq import zmq.asyncio from vllm.config import VllmConfig +from vllm.envs import VLLM_ENGINE_READY_TIMEOUT_S from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask @@ -528,7 +529,9 @@ class MPClient(EngineCoreClient): identities = set(self.core_engines) sync_input_socket = zmq.Socket.shadow(self.input_socket) while identities: - if not sync_input_socket.poll(timeout=600_000): + if not sync_input_socket.poll( + timeout=VLLM_ENGINE_READY_TIMEOUT_S * 1000 # 
convert to ms + ): raise TimeoutError( "Timed out waiting for engines to send" "initial message on input socket." @@ -1340,7 +1343,9 @@ class DPLBAsyncMPClient(DPAsyncMPClient): # Wait for ready messages from new engines on the input socket sync_input_socket = zmq.Socket.shadow(self.input_socket) while new_engine_identities: - if not sync_input_socket.poll(timeout=600_000): + if not sync_input_socket.poll( + timeout=VLLM_ENGINE_READY_TIMEOUT_S * 1000 # convert to ms + ): raise TimeoutError( "Timed out waiting for new engines to send initial " "message on input socket." diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 50b91d8292ee8..f2338e9b4b7d0 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -119,8 +119,14 @@ class RejectionSampler(nn.Module): raw_target_logits = logits[target_logits_indices] # Use float32 for the target_logits. raw_target_logits = raw_target_logits.to(torch.float32) + target_logits = raw_target_logits + if not self.is_processed_logprobs_mode: + # Clone raw_target_logits before applying processors to preserve + # the original raw logits for logprobs computation, since + # apply_logits_processors modifies the tensor in-place. 
+ target_logits = target_logits.clone() target_logits = self.apply_logits_processors( - raw_target_logits, sampling_metadata, metadata + target_logits, sampling_metadata, metadata ) # [num_tokens, vocab_size] # NOTE(woosuk): `target_logits` can be updated in place inside the diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 33be4dccfc710..0a17923e89989 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -66,6 +66,7 @@ from vllm.model_executor.models.interfaces import ( SupportsXDRoPE, is_mixture_of_experts, supports_eagle3, + supports_mm_encoder_only, supports_mrope, supports_multimodal_pruning, supports_transcription, @@ -1641,7 +1642,10 @@ class GPUModelRunner( ) -> None: attn_group = self.attn_groups[kv_cache_gid][attn_gid] builder = attn_group.get_metadata_builder(ubid or 0) - cache_key = (kv_cache_groups[kv_cache_gid].kv_cache_spec, type(builder)) + kv_cache_spec = kv_cache_groups[kv_cache_gid].kv_cache_spec + if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs): + kv_cache_spec = kv_cache_spec.kv_cache_specs[attn_group.layer_names[0]] + cache_key = (kv_cache_spec, type(builder)) cascade_attn_prefix_len = ( cascade_attn_prefix_lens[kv_cache_gid][attn_gid] @@ -4064,6 +4068,11 @@ class GPUModelRunner( remove_lora: If False, dummy LoRAs are not destroyed after the run activate_lora: If False, dummy_run is performed without LoRAs. """ + if supports_mm_encoder_only(self.model): + # The current dummy run only covers LM execution, so we can skip it. + # mm encoder dummy run may need to add in the future. + return torch.tensor([]), torch.tensor([]) + assert ( cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() @@ -4341,6 +4350,11 @@ class GPUModelRunner( # The dummy hidden states may contain special values, # like `inf` or `nan`. # To avoid breaking the sampler, we use a random tensor here instead. 
+ + if supports_mm_encoder_only(self.model): + # MM Encoder only model no need to run sampler. + return torch.tensor([]) + hidden_states = torch.rand_like(hidden_states) logits = self.model.compute_logits(hidden_states) @@ -4469,6 +4483,10 @@ class GPUModelRunner( self, hidden_states: torch.Tensor, ) -> PoolerOutput: + if supports_mm_encoder_only(self.model): + # MM Encoder only model not need to run pooler. + return torch.tensor([]) + # Find the task that has the largest output for subsequent steps supported_pooling_tasks = self.get_supported_pooling_tasks() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index bc71351d2cc55..4747388e22b3d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -634,7 +634,12 @@ class Worker(WorkerBase): def profile(self, is_start: bool = True): if self.profiler is None: - raise RuntimeError("Profiling is not enabled.") + raise RuntimeError( + "Profiling is not enabled. Please set --profiler-config to enable " + "profiling. Example: " + "'--profiler-config.profiler=torch --profiler-config.torch_profiler_dir" + "=YOUR_DIR_PATH_TO_DUMP_TRACE'" + ) if is_start: self.profiler.start() else: