Merge branch 'main' into fix-rocm-encoder-only-backend

This commit is contained in:
TJian 2025-12-25 01:14:02 +09:00 committed by GitHub
commit 4ab91bcc01
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
278 changed files with 6770 additions and 3036 deletions

View File

@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on chartqa for vllm. # We can use this script to compute baseline accuracy on chartqa for vllm.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9 # pip install "lm-eval[api]>=0.4.9.2"
usage() { usage() {
echo`` echo``

View File

@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] # pip install "lm-eval[api]>=0.4.9.2"
usage() { usage() {
echo`` echo``

View File

@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] # pip install "lm-eval[api]>=0.4.9.2"
usage() { usage() {
echo`` echo``

View File

@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] # pip install "lm-eval[api]>=0.4.9.2"
usage() { usage() {
echo`` echo``

View File

@ -291,6 +291,7 @@ if __name__ == "__main__":
""" """
Arguments: Arguments:
--version <version> : version string for the current build (e.g., commit hash) --version <version> : version string for the current build (e.g., commit hash)
--wheel-dir <wheel_directory> : directory containing wheel files (defaults to `version`)
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files --output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
@ -318,6 +319,12 @@ if __name__ == "__main__":
required=True, required=True,
help="Directory to store generated index files", help="Directory to store generated index files",
) )
parser.add_argument(
"--wheel-dir",
type=str,
default=None,
help="Directory containing wheel files (default to be same as `version`)",
)
parser.add_argument( parser.add_argument(
"--alias-to-default", "--alias-to-default",
type=str, type=str,
@ -372,7 +379,7 @@ if __name__ == "__main__":
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
# keep only "official" files for a non-nightly version (specifed by cli args) # keep only "official" files for a non-nightly version (specified by cli args)
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
if PY_VERSION_RE.match(version): if PY_VERSION_RE.match(version):
# upload-wheels.sh ensures no "dev" is in args.version # upload-wheels.sh ensures no "dev" is in args.version
@ -384,9 +391,10 @@ if __name__ == "__main__":
print("Nightly version detected, keeping all wheel files.") print("Nightly version detected, keeping all wheel files.")
# Generate index and metadata, assuming wheels and indices are stored as: # Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/<wheel files> # s3://vllm-wheels/{wheel_dir}/<wheel files>
# s3://vllm-wheels/<anything>/<index files> # s3://vllm-wheels/<anything>/<index files>
wheel_base_dir = Path(output_dir).parent / version wheel_dir = args.wheel_dir or version
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
index_base_dir = Path(output_dir) index_base_dir = Path(output_dir)
generate_index_and_metadata( generate_index_and_metadata(

View File

@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"

View File

@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/" echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*" rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi fi

View File

@ -162,7 +162,10 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2) - label: Entrypoints Integration Test (API Server 2)
@ -219,6 +222,9 @@ steps:
- tests/v1/engine/test_engine_core_client.py - tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py - tests/distributed/test_symm_mem_allreduce.py
commands: commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
# test with torchrun tp=2 and external_dp=2 # test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2 # test with torchrun tp=2 and pp=2
@ -267,9 +273,10 @@ steps:
- vllm/v1/executor/uniproc_executor.py - vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py - vllm/v1/worker/gpu_worker.py
commands: commands:
# https://github.com/NVIDIA/nccl/issues/1838
#- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep # test with torchrun tp=2 and dp=4 with ep
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min - label: EPLB Algorithm Test # 5min
@ -349,7 +356,9 @@ steps:
- label: V1 Test e2e + engine # 65min - label: V1 Test e2e + engine # 65min
timeout_in_minutes: 90 timeout_in_minutes: 90
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -977,7 +986,10 @@ steps:
- export MIOPEN_DEBUG_CONV_GEMM=0 - export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 5min - label: Multi-Modal Accuracy Eval (Small Models) # 5min
@ -1254,13 +1266,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min - label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90 timeout_in_minutes: 90
@ -1286,6 +1298,9 @@ steps:
- tests/v1/shutdown - tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py - tests/v1/worker/test_worker_memory_snapshot.py
commands: commands:
# Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
# TODO: Remove when the bug is fixed in a future ROCm release
- export TORCH_NCCL_BLOCKING_WAIT=1
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@ -1339,7 +1354,9 @@ steps:
# end platform plugin tests # end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin - pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py # Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y - pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test # end io_processor plugins test
# begin stat_logger plugins test # begin stat_logger plugins test
@ -1508,7 +1525,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####

View File

@ -1109,13 +1109,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min - label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90 timeout_in_minutes: 90
@ -1334,7 +1334,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
@ -1359,6 +1359,7 @@ steps:
- vllm/ - vllm/
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy

View File

@ -145,7 +145,7 @@ steps:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200) - label: Distributed Tests (2 GPUs)(B200)
@ -171,7 +171,7 @@ steps:
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
commands: commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 
--dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs) - label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30 timeout_in_minutes: 30

1
.github/CODEOWNERS vendored
View File

@ -15,6 +15,7 @@
/vllm/lora @jeejeelee /vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang /vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC /vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson CMakeLists.txt @tlrmchlsmth @LucasWilkinson

View File

@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant(
random.seed(seed) random.seed(seed)
# Set environment variables # Set environment variables
os.environ["VLLM_ATTENTION_BACKEND"] = backend
if batch_invariant: if batch_invariant:
os.environ["VLLM_BATCH_INVARIANT"] = "1" os.environ["VLLM_BATCH_INVARIANT"] = "1"
else: else:
@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant(
max_model_len=max_model_len, max_model_len=max_model_len,
dtype="bfloat16", dtype="bfloat16",
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
attention_config={"backend": backend},
enable_prefix_caching=False, enable_prefix_caching=False,
) )
init_time = time.perf_counter() - start_init init_time = time.perf_counter() - start_init

View File

@ -0,0 +1,177 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.triton_utils import triton
from vllm.utils.flashinfer import flashinfer_fp4_quantize
# Fail at import time when the device lacks the capability this benchmark needs,
# instead of erroring part-way through a run.
if not current_platform.has_device_capability(100):
    raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")

# Maximum values of the two number formats; both feed compute_global_scale().
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max

# Benchmark providers keyed by the name shown in the report.
# Set "enabled" to False to leave a provider out of the comparison.
PROVIDER_CFGS = {
    "vllm": {"backend": "vllm", "enabled": True},
    "flashinfer": {"backend": "flashinfer", "enabled": True},
}

# Names of the providers that will actually be benchmarked, in declaration order.
_enabled = [name for name, cfg in PROVIDER_CFGS.items() if cfg["enabled"]]
def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
    """Return the global scale used for FP4 quantization of ``tensor``.

    The result is ``FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX`` divided by the largest
    absolute value found in ``tensor`` (taken as float32).
    """
    abs_max = tensor.abs().max().to(torch.float32)
    numerator = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX
    return numerator / abs_max
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="us (lower is better)",
        plot_name="NVFP4 Input Quantization Latency (us)",
        args={},
    )
)
def benchmark(batch_size, provider, N, K):
    """Measure NVFP4 input-quantization latency for one provider and shape.

    Args:
        batch_size: Number of rows (M) of the random bfloat16 activation.
        provider: Key into ``PROVIDER_CFGS`` selecting the implementation.
        N: Output dimension of the weight shape. Accepted so callers can pass
            the full shape; the quantization itself does not use it.
        K: Number of columns of the activation.

    Returns:
        ``(median, p20, p80)`` latency in microseconds, in the
        ``(y_mean, y_min, y_max)`` order that ``perf_report`` unpacks.

    Raises:
        ValueError: If the provider's backend is not a known implementation.
    """
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    # Random activation and its global scale, shared by both providers.
    a = torch.randn((M, K), device=device, dtype=dtype)
    a_global_scale = compute_global_scale(a)

    # Median first, then the lower and upper quantiles.
    quantiles = [0.5, 0.2, 0.8]

    backend = PROVIDER_CFGS[provider]["backend"]
    if backend == "vllm":
        # vLLM's FP4 quantization
        def run():
            return ops.scaled_fp4_quant(a, a_global_scale)

    elif backend == "flashinfer":
        # FlashInfer's FP4 quantization
        # Use is_sf_swizzled_layout=True to match vLLM's output format
        def run():
            return flashinfer_fp4_quantize(
                a, a_global_scale, is_sf_swizzled_layout=True
            )

    else:
        # Previously this fell through to an unbound-variable error below.
        raise ValueError(f"Unknown benchmark backend: {backend!r}")

    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
        run,
        quantiles=quantiles,
    )

    # Convert ms to us for better readability at small batch sizes.
    # Latency is reported directly (not inverted like a throughput metric),
    # so min/max keep their order: (mean, min, max).
    us_per_ms = 1000
    return ms * us_per_ms, min_ms * us_per_ms, max_ms * us_per_ms
def prepare_shapes(args):
    """Build the list of ``[K, N, model]`` shapes to benchmark.

    For every (model, tensor-parallel size) pair taken from ``args.models``
    and ``args.tp_sizes``, each weight shape registered for the model in
    ``WEIGHT_SHAPES`` is copied, its sharded dimension is floor-divided by
    the TP size, and the model name is appended as the last element.
    """
    shapes = []
    combos = itertools.product(args.models, args.tp_sizes)
    for model_name, tp in combos:
        # Deep copy so the shared WEIGHT_SHAPES table is never mutated.
        model_shapes = copy.deepcopy(WEIGHT_SHAPES[model_name])
        for dims, shard_axis in model_shapes:
            dims[shard_axis] = dims[shard_axis] // tp
            dims.append(model_name)
            shapes.append(dims)
    return shapes
def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
    """Compare vLLM and FlashInfer FP4 quantization for one ``(M, K)`` input.

    A random ``(M, K)`` tensor is quantized by both implementations with the
    same global scale, and the quantized outputs are checked with
    ``torch.testing.assert_close``. Only the quantized tensors are compared;
    the scale tensors are produced but not asserted on.

    Raises:
        AssertionError: If the two quantized outputs differ.
    """
    x = torch.randn((M, K), device=device, dtype=dtype)
    global_scale = compute_global_scale(x)

    # vLLM quantization
    vllm_fp4, vllm_scale = ops.scaled_fp4_quant(x, global_scale)

    # FlashInfer quantization (with swizzled layout to match vLLM's output)
    flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
        x, global_scale, is_sf_swizzled_layout=True
    )
    # Reinterpreted as float8_e4m3fn; currently not used in any comparison.
    flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)

    torch.testing.assert_close(vllm_fp4, flashinfer_fp4)
    print(f"M={M}, K={K}, dtype={dtype}: PASSED")
def test_accuracy():
    """Run the vLLM-vs-FlashInfer accuracy check over a small shape grid."""
    rule = "=" * 60
    print("\n" + rule)
    print("Running accuracy tests: vLLM vs FlashInfer")
    print(rule)

    batch_sizes = (1, 1024)
    hidden_dims = (4096,)

    # Batch size is the outer axis and hidden dimension the inner one.
    shape_grid = [(m, k) for m in batch_sizes for k in hidden_dims]
    for m, k in shape_grid:
        _test_accuracy_once(m, k, torch.bfloat16, "cuda")

    print("\nAll accuracy tests passed!")
if __name__ == "__main__":
    # Command-line entry point: optionally run the accuracy comparison, then
    # benchmark every selected model shape.
    cli = argparse.ArgumentParser(
        description="Benchmark NVFP4 quantization: vLLM vs FlashInfer"
    )
    cli.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
        choices=list(WEIGHT_SHAPES),
    )
    cli.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    cli.add_argument(
        "--save-path",
        type=str,
        default=None,
        help="Path to save benchmark results",
    )
    cli.add_argument(
        "--accuracy",
        action="store_true",
        help="Run accuracy tests",
    )
    args = cli.parse_args()

    # The accuracy pass is opt-in; the benchmark sweep below runs either way.
    if args.accuracy:
        test_accuracy()

    for k_dim, n_dim, model_name in prepare_shapes(args):
        print(f"\n{model_name}, N={n_dim} K={k_dim}")
        benchmark.run(
            print_data=True,
            save_path=args.save_path,
            N=n_dim,
            K=k_dim,
        )

    print("\nBenchmark finished!")

View File

@ -9,16 +9,6 @@
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
const torch::Tensor& block_mapping); const torch::Tensor& block_mapping);
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
std::vector<torch::Tensor> const& value_caches,
const torch::Tensor& block_mapping);
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping);
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping, torch::Tensor& slot_mapping,

View File

@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(
} // namespace vllm } // namespace vllm
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
std::vector<torch::Tensor> const& value_caches,
const torch::Tensor& block_mapping) {
int num_layers = key_caches.size();
TORCH_CHECK(num_layers == value_caches.size());
if (num_layers == 0) {
return;
}
torch::Device cache_device = key_caches[0].device();
TORCH_CHECK(cache_device.is_cuda());
// Create data structures for the kernel.
// Create an array of pointers to the key and value caches.
int64_t key_cache_ptrs[num_layers];
int64_t value_cache_ptrs[num_layers];
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
key_cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
value_cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
}
// block_mapping is a 2D tensor with shape (num_pairs, 2).
int num_pairs = block_mapping.size(0);
// Move the data structures to the GPU.
// NOTE: This synchronizes the CPU and GPU.
torch::Tensor key_cache_ptrs_tensor =
torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
.to(cache_device);
torch::Tensor value_cache_ptrs_tensor =
torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
.to(cache_device);
// Launch the kernel.
const int numel_per_block = key_caches[0][0].numel();
dim3 grid(num_layers, num_pairs);
dim3 block(std::min(1024, numel_per_block));
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
key_cache_ptrs_tensor.data_ptr<int64_t>(),
value_cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping.data_ptr<int64_t>(), numel_per_block);
}));
}
// copy blocks kernel for MLA (assumes a joint KV-cache)
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping) {
int num_layers = kv_caches.size();
if (num_layers == 0) {
return;
}
torch::Device cache_device = kv_caches[0].device();
TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
std::vector<int64_t> cache_ptrs(num_layers);
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
}
torch::Tensor cache_ptrs_tensor =
torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
.to(cache_device);
int num_pairs = block_mapping.size(0);
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
int mem_footprint_per_block = kv_caches[0].stride(0);
dim3 grid(num_layers, num_pairs);
dim3 block(std::min(1024, mem_footprint_per_block));
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
}));
}
namespace vllm { namespace vllm {
// Used to copy/convert one element // Used to copy/convert one element
@ -539,9 +451,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
for (int i = 0; i < VEC_SIZE; i++) { for (int i = 0; i < VEC_SIZE; i++) {
amax = fmaxf(amax, fabsf(float(k_val_ptr[i]))); amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
} }
#ifndef USE_ROCM
__syncwarp();
#endif
// Reduced amax // Reduced amax
for (int mask = 16; mask > 0; mask /= 2) { for (int mask = 16; mask > 0; mask /= 2) {
@ -551,9 +460,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask)); amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
#endif #endif
} }
#ifndef USE_ROCM
__syncwarp();
#endif
#if defined(__gfx942__) #if defined(__gfx942__)
float scale = fmaxf(amax, 1e-4) / 224.0f; float scale = fmaxf(amax, 1e-4) / 224.0f;
#else #else

View File

@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
#ifndef VLLM_NUMA_DISABLED #ifndef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) { std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str()); bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask != nullptr,
"Failed to parse CPU string: " + cpu_ids);
TORCH_CHECK(omp_cpu_mask->size > 0); TORCH_CHECK(omp_cpu_mask->size > 0);
std::vector<int> omp_cpu_ids; std::vector<int> omp_cpu_ids;
omp_cpu_ids.reserve(omp_cpu_mask->size); omp_cpu_ids.reserve(omp_cpu_mask->size);
@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
// Memory node binding // Memory node binding
if (numa_available() != -1) { if (numa_available() != -1) {
int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
std::set<int> node_ids; std::set<int> node_ids;
for (const auto& cpu_id : omp_cpu_ids) { for (const auto& cpu_id : omp_cpu_ids) {
int node_id = numa_node_of_cpu(cpu_id); int node_id = numa_node_of_cpu(cpu_id);
if (node_id != -1) { if (node_id != -1) {
node_ids.insert(node_id); node_ids.insert(node_id);
} }
if (node_id != mem_node_id) {
TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
". All CPUs should be on the same NUMA node for optimal "
"performance. Memory will be bound to NUMA node ",
mem_node_id, ".");
}
} }
// Concatenate all node_ids into a single comma-separated string // Concatenate all node_ids into a single comma-separated string
if (!node_ids.empty()) { if (!node_ids.empty()) {
@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
} }
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str()); bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
bitmask* src_mask = numa_get_membind(); bitmask* src_mask = numa_get_mems_allowed();
int pid = getpid(); int pid = getpid();
@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
std::to_string(errno)); std::to_string(errno));
} }
// restrict memory allocation node. // Restrict memory allocation to the selected NUMA node(s).
numa_set_membind(mask); // Enhances memory locality for the threads bound to those NUMA CPUs.
if (node_ids.size() > 1) {
errno = 0;
numa_set_interleave_mask(mask);
if (errno != 0) {
TORCH_WARN("numa_set_interleave_mask failed. errno: " +
std::to_string(errno));
} else {
TORCH_WARN(
"NUMA binding: Using INTERLEAVE policy for memory "
"allocation across multiple NUMA nodes (nodes: " +
node_ids_str +
"). Memory allocations will be "
"interleaved across the specified NUMA nodes.");
}
} else {
errno = 0;
numa_set_membind(mask);
if (errno != 0) {
TORCH_WARN("numa_set_membind failed. errno: " +
std::to_string(errno));
} else {
TORCH_WARN(
"NUMA binding: Using MEMBIND policy for memory "
"allocation on the NUMA nodes (" +
node_ids_str +
"). Memory allocations will be "
"strictly bound to these NUMA nodes.");
}
}
numa_set_strict(1); numa_set_strict(1);
numa_free_nodemask(mask); numa_free_nodemask(mask);
numa_free_nodemask(src_mask); numa_free_nodemask(src_mask);
} else { } else {
TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " + TORCH_WARN(
std::to_string(errno)); "numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
std::to_string(errno));
} }
} }
} }

View File

@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel(
void const* k_weight_void, // RMSNorm weights for key void const* k_weight_void, // RMSNorm weights for key
void const* cos_sin_cache_void, // Pre-computed cos/sin cache void const* cos_sin_cache_void, // Pre-computed cos/sin cache
int64_t const* position_ids, // Position IDs for RoPE int64_t const* position_ids, // Position IDs for RoPE
int const num_tokens // Number of tokens int const num_tokens, // Number of tokens
int const rotary_dim // Dimension for RoPE
) { ) {
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM) #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) || if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel(
// Calculate cache pointer for this position - similar to // Calculate cache pointer for this position - similar to
// pos_encoding_kernels.cu // pos_encoding_kernels.cu
T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim; T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim;
int const embed_dim = head_dim / 2; int const embed_dim = rotary_dim / 2;
T_cache const* cos_ptr = cache_ptr; T_cache const* cos_ptr = cache_ptr;
T_cache const* sin_ptr = cache_ptr + embed_dim; T_cache const* sin_ptr = cache_ptr + embed_dim;
int const rotary_lanes = rotary_dim / numElemsPerThread; // rotary range
if constexpr (interleave) { if (laneId < rotary_lanes) {
// Perform interleaving. Use pre-computed cos/sin values. if constexpr (interleave) {
// Perform interleaving. Use pre-computed cos/sin values.
#pragma unroll #pragma unroll
for (int i = 0; i < numElemsPerThread / 2; ++i) { for (int i = 0; i < numElemsPerThread / 2; ++i) {
int const idx0 = 2 * i; int const idx0 = 2 * i;
int const idx1 = 2 * i + 1; int const idx1 = 2 * i + 1;
// Global dimension index in the head
int const dim_idx = laneId * numElemsPerThread + idx0;
float const val0 = elements[idx0]; float const val0 = elements[idx0];
float const val1 = elements[idx1]; float const val1 = elements[idx1];
int const dim_idx = laneId * numElemsPerThread + idx0; int const half_dim = dim_idx / 2;
int const half_dim = dim_idx / 2; float const cos_val =
float const cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); float const sin_val =
float const sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
elements[idx0] = val0 * cos_val - val1 * sin_val; elements[idx0] = val0 * cos_val - val1 * sin_val;
elements[idx1] = val0 * sin_val + val1 * cos_val; elements[idx1] = val0 * sin_val + val1 * cos_val;
}
} else {
// Before data exchange with in warp, we need to sync.
__syncwarp();
// Get the data from the other half of the warp. Use pre-computed cos/sin
// values.
#pragma unroll
for (int i = 0; i < numElemsPerThread; i++) {
elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
if (laneId < 16) {
elements2[i] = -elements2[i];
} }
} else {
// Before data exchange with in warp, we need to sync.
__syncwarp();
int pairOffset = (rotary_dim / 2) / numElemsPerThread;
// Get the data from the other half of the warp. Use pre-computed
// cos/sin values.
#pragma unroll
for (int i = 0; i < numElemsPerThread; i++) {
elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset);
int dim_idx = laneId * numElemsPerThread + i; if (laneId < pairOffset) {
dim_idx = (dim_idx * 2) % head_dim; elements2[i] = -elements2[i];
int half_dim = dim_idx / 2; }
// Use pre-computed cos/sin from cache int dim_idx = laneId * numElemsPerThread + i;
float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
elements[i] = elements[i] * cos_val + elements2[i] * sin_val; dim_idx = (dim_idx * 2) % rotary_dim;
int half_dim = dim_idx / 2;
float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
}
// __shfl_xor_sync does not provide memfence. Need to sync again.
__syncwarp();
} }
// __shfl_xor_sync does not provide memfence. Need to sync again.
__syncwarp();
} }
// Store. // Store.
{ {
vec_T vec; vec_T vec;
@ -312,10 +316,10 @@ template <typename scalar_t_in, typename scalar_t_cache>
void launchFusedQKNormRope(void* qkv, int const num_tokens, void launchFusedQKNormRope(void* qkv, int const num_tokens,
int const num_heads_q, int const num_heads_k, int const num_heads_q, int const num_heads_k,
int const num_heads_v, int const head_dim, int const num_heads_v, int const head_dim,
float const eps, void const* q_weight, int const rotary_dim, float const eps,
void const* k_weight, void const* cos_sin_cache, void const* q_weight, void const* k_weight,
bool const interleave, int64_t const* position_ids, void const* cos_sin_cache, bool const interleave,
cudaStream_t stream) { int64_t const* position_ids, cudaStream_t stream) {
constexpr int blockSize = 256; constexpr int blockSize = 256;
int const warpsPerBlock = blockSize / 32; int const warpsPerBlock = blockSize / 32;
@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE> fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>( <<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens); k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
}); });
break; break;
case 128: case 128:
@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE> fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>( <<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens); k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
}); });
break; break;
case 256: case 256:
@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE> fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>( <<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens); k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
}); });
break; break;
default: default:
@ -392,8 +396,11 @@ void fused_qk_norm_rope(
"Query weights size must match head dimension"); "Query weights size must match head dimension");
TORCH_CHECK(k_weight.size(0) == head_dim, TORCH_CHECK(k_weight.size(0) == head_dim,
"Key weights size must match head dimension"); "Key weights size must match head dimension");
TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
"Cos/sin cache dimension must match head_dim"); TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even");
TORCH_CHECK(cos_sin_cache.size(1) <= head_dim,
"rotary_dim must be less than or equal to head_dim");
TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() && TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
qkv.scalar_type() == k_weight.scalar_type(), qkv.scalar_type() == k_weight.scalar_type(),
"qkv, q_weight and k_weight must have the same dtype"); "qkv, q_weight and k_weight must have the same dtype");
@ -419,7 +426,8 @@ void fused_qk_norm_rope(
qkv.data_ptr(), static_cast<int>(num_tokens), qkv.data_ptr(), static_cast<int>(num_tokens),
static_cast<int>(num_heads_q), static_cast<int>(num_heads_k), static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
static_cast<int>(num_heads_v), static_cast<int>(head_dim), static_cast<int>(num_heads_v), static_cast<int>(head_dim),
static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(), static_cast<int>(cos_sin_cache.size(1)), static_cast<float>(eps),
q_weight.data_ptr(), k_weight.data_ptr(),
cos_sin_cache.data_ptr(), !is_neox, cos_sin_cache.data_ptr(), !is_neox,
reinterpret_cast<int64_t const*>(position_ids.data_ptr()), reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
stream); stream);

View File

@ -74,6 +74,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched."); "Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
// Get the global scaling factor, which will be applied to the SF. // Get the global scaling factor, which will be applied to the SF.
// Note SFScale is the same as next GEMM's alpha, which is // Note SFScale is the same as next GEMM's alpha, which is
// (448.f / (Alpha_A / 6.f)). // (448.f / (Alpha_A / 6.f)).
@ -101,7 +104,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
auto sf_out = auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t, cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>( CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numCols, SFout); rowIdx, colIdx, numKTiles, SFout);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal, out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
sf_out); sf_out);

View File

@ -25,6 +25,7 @@
#include <cuda_fp8.h> #include <cuda_fp8.h>
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cuda_utils.h"
#include "nvfp4_utils.cuh" #include "nvfp4_utils.cuh"
#include "launch_bounds_utils.h" #include "launch_bounds_utils.h"
@ -44,6 +45,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched."); "Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
int tid = blockIdx.x * blockDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD; int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
@ -112,17 +116,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
// (448.f / (Alpha_A / 6.f)). // (448.f / (Alpha_A / 6.f)).
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
int factor = CVT_FP4_SF_VEC_SIZE * 4;
// The actual output_scales dim is computed from the padded numCols.
int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
uint32_t* SFout_in_expert = uint32_t* SFout_in_expert =
SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
auto sf_out = auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t, cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>( CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx_in_expert, colIdx, numCols, SFout_in_expert); rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out); out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
} }
@ -140,6 +140,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched."); "Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
extern __shared__ uint32_t shared_input_offsets[]; extern __shared__ uint32_t shared_input_offsets[];
// Load input offsets into shared memory. // Load input offsets into shared memory.
@ -202,16 +206,13 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
int factor = CVT_FP4_SF_VEC_SIZE * 4;
int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
uint32_t* SFout_in_expert = uint32_t* SFout_in_expert =
SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
auto sf_out = auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t, cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>( CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx_in_expert, colIdx, numCols, SFout_in_expert); rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out); out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
} }
@ -222,12 +223,8 @@ void quant_impl(void* output, void* output_scale, void* input,
void* input_global_scale, void* input_offset_by_experts, void* input_global_scale, void* input_offset_by_experts,
void* output_scale_offset_by_experts, int m_topk, int k, void* output_scale_offset_by_experts, int m_topk, int k,
int n_experts, cudaStream_t stream) { int n_experts, cudaStream_t stream) {
// TODO: this multiProcessorCount should be cached. int multiProcessorCount =
int device; get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
cudaGetDevice(&device);
int multiProcessorCount;
cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount,
device);
// Grid, Block size. // Grid, Block size.
// Each thread converts 8 values. // Each thread converts 8 values.

View File

@ -35,7 +35,13 @@ template <typename Int>
__host__ __device__ inline Int round_up(Int x, Int y) { __host__ __device__ inline Int round_up(Int x, Int y) {
static_assert(std::is_integral_v<Int>, static_assert(std::is_integral_v<Int>,
"round_up argument must be integral type"); "round_up argument must be integral type");
return (x + y - 1) / y * y; return ((x + y - 1) / y) * y;
}
// Compute effective rows for grid configuration with swizzled SF layouts.
inline int computeEffectiveRows(int m) {
constexpr int ROW_TILE = 128;
return round_up(m, ROW_TILE);
} }
// Use UE4M3 by default. // Use UE4M3 by default.
@ -49,81 +55,57 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched."); "Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
int sf_m = round_up<int>(numRows, 128); int sf_m = round_up<int>(numRows, 128);
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE; int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4; int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) { int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
// Each thread writes 4 uint32_t elements.
for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
col += blockDim.x * 4) {
SFout[row * sf_n_int + col] = 0x00;
}
}
// Get the global scaling factor, which will be applied to the SF. // Get the global scaling factor, which will be applied to the SF.
// Note SFScale is the same as next GEMM's alpha, which is // Note SFScale is the same as next GEMM's alpha, which is
// (448.f / (Alpha_A / 6.f)). // (448.f / (Alpha_A / 6.f)).
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0]; float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
// Input tensor row/col loops. // Iterate over all rows and cols including padded ones -
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { // ensures we visit every single scale factor address to initialize it.
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
for (int colIdx = threadIdx.x;
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
colIdx += blockDim.x) { colIdx += blockDim.x) {
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
PackedVec in_vec;
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
// Get the output tensor offset. // If we are outside valid rows OR outside valid columns -> Use Zeros
// Same as inOffset because 8 elements are packed into one uint32_t. if (rowIdx >= numRows || elem_idx >= numCols) {
int64_t outOffset = inOffset; memset(&in_vec, 0, sizeof(PackedVec));
auto& out_pos = out[outOffset];
} else {
// Valid Region: Load actual data
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
}
auto sf_out = auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t, cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>( CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numCols, SFout); rowIdx, colIdx, numKTiles, SFout);
out_pos = auto out_val =
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out); cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
// We do NOT write output for padding because the 'out' tensor is not
// padded.
if (rowIdx < numRows && elem_idx < numCols) {
// Same as inOffset because 8 elements are packed into one uint32_t.
out[inOffset] = out_val;
}
} }
} }
} }
template <typename T>
void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
int64_t* output, int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount, cudaStream_t stream) {
// Grid, Block size.
// Each thread converts 8 values.
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
// Get number of blocks per SM
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
// Launch the cvt kernel.
if (useUE8M0) {
cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
reinterpret_cast<uint32_t*>(SFOuput));
} else {
cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
reinterpret_cast<uint32_t*>(SFOuput));
}
}
// Instantiate the function.
template void invokeFP4Quantization(int m, int n, half const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount,
cudaStream_t stream);
template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount,
cudaStream_t stream);
} // namespace vllm } // namespace vllm
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
@ -147,13 +129,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
// We don't support e8m0 scales at this moment. // Grid, Block size. Each thread converts 8 values.
bool useUE8M0 = false; dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
int effectiveRows = vllm::computeEffectiveRows(m);
dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] { VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type; using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr()); auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, // NOTE: We don't support e8m0 scales at this moment.
sf_out, useUE8M0, multiProcessorCount, stream); vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
reinterpret_cast<uint32_t*>(sf_out));
}); });
} }

View File

@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
return b; return b;
} }
// Compute SF output offset for swizzled tensor core layout.
// SF layout: [numMTiles, numKTiles, 32, 4, 4]
// Caller must precompute: numKTiles = (numCols + 63) / 64
template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF> template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, __device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(
int numCols, int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) {
SFType* SFout) {
static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
CVT_FP4_NUM_THREADS_PER_SF == 2); CVT_FP4_NUM_THREADS_PER_SF == 2);
// One pair of threads write one SF to global memory. // One pair of threads write one SF to global memory.
// TODO: stage through smem for packed STG.32 // TODO: stage through smem for packed STG.32
// is it better than STG.8 from 4 threads ? // is it better than STG.8 from 4 threads ?
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) {
// SF vector index (16 elements share one SF in the K dimension). return nullptr;
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
int32_t mIdx = rowIdx;
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
// --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
int32_t mTileIdx = mIdx / (32 * 4);
// SF vector size 16.
int factor = CVT_FP4_SF_VEC_SIZE * 4;
int32_t numKTiles = (numCols + factor - 1) / factor;
int64_t mTileStride = numKTiles * 32 * 4 * 4;
int32_t kTileIdx = (kIdx / 4);
int64_t kTileStride = 32 * 4 * 4;
// M tile layout [32, 4] is column-major.
int32_t outerMIdx = (mIdx % 32);
int64_t outerMStride = 4 * 4;
int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
int64_t innerMStride = 4;
int32_t innerKIdx = (kIdx % 4);
int64_t innerKStride = 1;
// Compute the global offset.
int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
outerMIdx * outerMStride + innerMIdx * innerMStride +
innerKIdx * innerKStride;
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
} }
return nullptr;
// SF vector index (16 elements share one SF in the K dimension).
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
int32_t mIdx = rowIdx;
// Decompose indices using bitwise ops (all divisors are powers of 2).
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
int32_t mTileIdx = mIdx >> 7; // mIdx / 128
int32_t outerMIdx = mIdx & 31; // mIdx % 32
int32_t innerMIdx = (mIdx >> 5) & 3; // (mIdx / 32) % 4
int32_t kTileIdx = kIdx >> 2; // kIdx / 4
int32_t innerKIdx = kIdx & 3; // kIdx % 4
// Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 +
// outerMIdx * 16 + innerMIdx * 4 + innerKIdx
// Use bitwise OR for non-overlapping lower bits.
int64_t SFOffset = (static_cast<int64_t>(mTileIdx) * numKTiles + kTileIdx)
<< 9 |
(outerMIdx << 4) | (innerMIdx << 2) | innerKIdx;
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
} }
// Quantizes the provided PackedVec into the uint32_t output // Quantizes the provided PackedVec into the uint32_t output

View File

@ -685,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
"swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()"); "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks); cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
// Copy the cache blocks from src to dst.
cache_ops.def(
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
"Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
cache_ops.def(
"copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
// Reshape the key and value tensors and cache them. // Reshape the key and value tensors and cache them.
cache_ops.def( cache_ops.def(
"reshape_and_cache(Tensor key, Tensor value," "reshape_and_cache(Tensor key, Tensor value,"

View File

@ -183,7 +183,7 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_REGION_NAME=us-west-2
@ -201,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$USE_SCCACHE" = "1" ]; then \ if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \ echo "Installing sccache..." \
&& case "${TARGETPLATFORM}" in \
linux/arm64) SCCACHE_ARCH="aarch64" ;; \
linux/amd64) SCCACHE_ARCH="x86_64" ;; \
*) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac \
&& export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \ && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \ && tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ && sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \ && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \

View File

@ -1,5 +1,5 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="a272dfa8" ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="89075173" ARG PYTORCH_BRANCH="89075173"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"

View File

@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
RUN apt clean && apt-get update -y && \ RUN apt clean && apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \ apt-get install -y --no-install-recommends --fix-missing \
@ -47,6 +47,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \ pip install --no-cache-dir \
-r requirements/xpu.txt -r requirements/xpu.txt
# arctic-inference is built from source, which requires torch-xpu to be installed first.
# It is used for the suffix method of speculative decoding.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir arctic-inference==0.1.1
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
COPY . . COPY . .

View File

@ -2,4 +2,4 @@
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe. You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).

View File

@ -0,0 +1,5 @@
# llm-d
vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).

View File

@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
- [Helm](frameworks/helm.md) - [Helm](frameworks/helm.md)
- [InftyAI/llmaz](integrations/llmaz.md) - [InftyAI/llmaz](integrations/llmaz.md)
- [llm-d](integrations/llm-d.md)
- [KAITO](integrations/kaito.md) - [KAITO](integrations/kaito.md)
- [KServe](integrations/kserve.md) - [KServe](integrations/kserve.md)
- [Kthena](integrations/kthena.md) - [Kthena](integrations/kthena.md)

View File

@ -139,18 +139,18 @@ token data.
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
``` ```
<figure markdown="span"> <p align="center">
![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" } <img src="../assets/design/paged_attention/query.png" alt="query" width="70%" />
</figure> </p>
Each thread defines its own `q_ptr` which points to the assigned Each thread defines its own `q_ptr` which points to the assigned
query token data on global memory. For example, if `VEC_SIZE` is 4 query token data on global memory. For example, if `VEC_SIZE` is 4
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
total of 128 elements divided into 128 / 4 = 32 vecs. total of 128 elements divided into 128 / 4 = 32 vecs.
<figure markdown="span"> <p align="center">
![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" } <img src="../assets/design/paged_attention/q_vecs.png" alt="q_vecs" width="70%" />
</figure> </p>
```cpp ```cpp
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@ -187,9 +187,9 @@ key token at different iterations. As shown above, that `k_ptr`
points to key token data based on `k_cache` at assigned block, points to key token data based on `k_cache` at assigned block,
assigned head and assigned token. assigned head and assigned token.
<figure markdown="span"> <p align="center">
![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" } <img src="../assets/design/paged_attention/key.png" alt="key" width="70%" />
</figure> </p>
The diagram above illustrates the memory layout for key data. It The diagram above illustrates the memory layout for key data. It
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@ -202,9 +202,9 @@ iterations. Inside each rectangle, there are a total 32 vecs (128
elements for one token) that will be processed by 2 threads (one elements for one token) that will be processed by 2 threads (one
thread group) separately. thread group) separately.
<figure markdown="span"> <p align="center">
![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" } <img src="../assets/design/paged_attention/k_vecs.png" alt="k_vecs" width="70%" />
</figure> </p>
```cpp ```cpp
K_vec k_vecs[NUM_VECS_PER_THREAD] K_vec k_vecs[NUM_VECS_PER_THREAD]
@ -361,17 +361,17 @@ later steps. Now, it should store the normalized softmax result of
## Value ## Value
<figure markdown="span"> <p align="center">
![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" } <img src="../assets/design/paged_attention/value.png" alt="value" width="70%" />
</figure> </p>
<figure markdown="span"> <p align="center">
![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" } <img src="../assets/design/paged_attention/logits_vec.png" alt="logits_vec" width="50%" />
</figure> </p>
<figure markdown="span"> <p align="center">
![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" } <img src="../assets/design/paged_attention/v_vec.png" alt="v_vec" width="70%" />
</figure> </p>
Now we need to retrieve the value data and perform dot multiplication Now we need to retrieve the value data and perform dot multiplication
with `logits`. Unlike query and key, there is no thread group with `logits`. Unlike query and key, there is no thread group

View File

@ -64,7 +64,7 @@ th:not(:first-child) {
| [CP](../configuration/optimization.md#chunked-prefill) | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [CP](../configuration/optimization.md#chunked-prefill) | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | | [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |

View File

@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
Install `vllm` and `lm-evaluation-harness` for evaluation: Install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] pip install vllm "lm-eval[api]>=0.4.9.2"
``` ```
Load and run the model in `vllm`: Load and run the model in `vllm`:

View File

@ -18,7 +18,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] pip install vllm "lm-eval[api]>=0.4.9.2"
``` ```
## Quantization Process ## Quantization Process

View File

@ -23,7 +23,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] pip install vllm "lm-eval[api]>=0.4.9.2"
``` ```
## Quantization Process ## Quantization Process

View File

@ -8,6 +8,16 @@ We recommend installing the library with:
pip install nvidia-modelopt pip install nvidia-modelopt
``` ```
## Supported ModelOpt checkpoint formats
vLLM detects ModelOpt checkpoints via `hf_quant_config.json` and supports the
following `quantization.quant_algo` values:
- `FP8`: per-tensor weight scale (+ optional static activation scale).
- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization.
- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks).
- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`).
## Quantizing HuggingFace Models with PTQ ## Quantizing HuggingFace Models with PTQ
You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory. You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
@ -80,3 +90,24 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
if __name__ == "__main__": if __name__ == "__main__":
main() main()
``` ```
## Running the OpenAI-compatible server
To serve a local ModelOpt checkpoint via the OpenAI-compatible API:
```bash
vllm serve <path_to_exported_checkpoint> \
--quantization modelopt \
--host 0.0.0.0 --port 8000
```
## Testing (local checkpoints)
vLLM's ModelOpt unit tests are gated by local checkpoint paths and are skipped
by default in CI. To run the tests locally:
```bash
export VLLM_TEST_MODELOPT_FP8_PC_PT_MODEL_PATH=<path_to_fp8_pc_pt_checkpoint>
export VLLM_TEST_MODELOPT_FP8_PB_WO_MODEL_PATH=<path_to_fp8_pb_wo_checkpoint>
pytest -q tests/quantization/test_modelopt.py
```

View File

@ -17,6 +17,16 @@ The E4M3 format offers higher precision compared to E5M2. However, due to its sm
For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel). For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel).
### How FP8 KV Cache Works
The FP8 KV cache implementation follows this workflow:
1. **Storage**: Key and Value tensors are quantized to FP8 format using scaling factors before being stored in the KV cache
2. **Retrieval**: When needed for attention computation, cached KV tensors are dequantized back to higher precision (FP16/BF16)
3. **Attention**: The attention-value multiplication (softmax output × V) is performed using the dequantized higher-precision V tensor
This means the final attention computation operates on dequantized values, not FP8 tensors. The quantization reduces memory usage during storage but maintains computation accuracy by using higher precision during the actual attention operations.
### Performance Impact ### Performance Impact
The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either: The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either:

View File

@ -20,7 +20,7 @@ for more installation details.
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```bash
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] pip install vllm "lm-eval[api]>=0.4.9.2"
``` ```
## Quantization Process ## Quantization Process

View File

@ -28,3 +28,4 @@ The backends below live **outside** the main `vllm` repository and follow the
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> | | Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
| Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> | | Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> |
| Sophgo TPU | N/A, install from source | <https://github.com/sophgo/vllm-tpu> | | Sophgo TPU | N/A, install from source | <https://github.com/sophgo/vllm-tpu> |
| Apple Silicon (Metal) | N/A, install from source | <https://github.com/vllm-project/vllm-metal> |

View File

@ -4,6 +4,9 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
!!! tip "GPU-Accelerated Inference with vLLM-Metal"
For GPU-accelerated inference on Apple Silicon using Metal, check out [vllm-metal](https://github.com/vllm-project/vllm-metal), a community-maintained hardware plugin that uses MLX as the compute backend.
# --8<-- [end:installation] # --8<-- [end:installation]
# --8<-- [start:requirements] # --8<-- [start:requirements]

View File

@ -418,7 +418,7 @@ th {
| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | | ✅︎ | | `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | | ✅︎ |
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | | `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | | `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ | | `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
@ -490,6 +490,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | | `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | | `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | | `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | | `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | | `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
@ -543,8 +544,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | |
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ |
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | | `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | |
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | | `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | | `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | | `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | | `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
@ -562,6 +564,11 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
!!! note !!! note
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture. The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
!!! note
`nvidia/llama-nemotron-rerank-1b-v2` requires a specific prompt format to work correctly.
Examples: [offline_using_template.py](../../examples/pooling/score/offline_using_template.py), [online_using_template.py](../../examples/pooling/score/online_using_template.py)
!!! note !!! note
Load the official original `mxbai-rerank-v2` by using the following command. Load the official original `mxbai-rerank-v2` by using the following command.

View File

@ -16,7 +16,7 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
from langchain_community.llms import VLLM from langchain_community.llms import VLLM
llm = VLLM( llm = VLLM(
model="mosaicml/mpt-7b", model="Qwen/Qwen3-4B",
trust_remote_code=True, # mandatory for hf models trust_remote_code=True, # mandatory for hf models
max_new_tokens=128, max_new_tokens=128,
top_k=10, top_k=10,

View File

@ -669,6 +669,21 @@ You can find the documentation for cross encoder models at [sbert.net](https://w
Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py) Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
#### Score Template
Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)).
Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template.
Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter:
- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}`
- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}`
This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future.
Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja)
#### Single inference #### Single inference
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.

View File

@ -5,130 +5,91 @@ Usage:
Single node: Single node:
python examples/offline_inference/data_parallel.py \ python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \ --model="ibm-research/PowerMoE-3b" \
--dp-size=2 \ -dp=2 \
--tp-size=2 -tp=2
Multi-node: Multi-node:
Node 0 (assume the node has ip of 10.99.48.128): Node 0 (assume the node has ip of 10.99.48.128):
python examples/offline_inference/data_parallel.py \ python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \ --model="ibm-research/PowerMoE-3b" \
--dp-size=2 \ -dp=2 \
--tp-size=2 \ -tp=2 \
--node-size=2 \ --dp-num-nodes=2 \
--node-rank=0 \ --dp-node-rank=0 \
--master-addr=10.99.48.128 \ --dp-master-addr=10.99.48.128 \
--master-port=13345 --dp-master-port=13345
Node 1: Node 1:
python examples/offline_inference/data_parallel.py \ python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \ --model="ibm-research/PowerMoE-3b" \
--dp-size=2 \ -dp=2 \
--tp-size=2 \ -tp=2 \
--node-size=2 \ --dp-num-nodes=2 \
--node-rank=1 \ --dp-node-rank=1 \
--master-addr=10.99.48.128 \ --dp-master-addr=10.99.48.128 \
--master-port=13345 --dp-master-port=13345
""" """
import os import os
from time import sleep from time import sleep
from vllm import LLM, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port
def parse_args(): def create_parser():
import argparse parser = FlexibleArgumentParser(description="Data Parallel Inference")
parser = argparse.ArgumentParser(description="Data Parallel Inference") # Add all engine args
EngineArgs.add_cli_args(parser)
parser.set_defaults(
model="ibm-research/PowerMoE-3b",
enable_expert_parallel=True,
)
# Add DP-specific args (separate from engine args to avoid conflicts)
parser.add_argument( parser.add_argument(
"--model", "--dp-num-nodes",
type=int,
default=1,
help="Total number of nodes for data parallel.",
)
parser.add_argument(
"--dp-node-rank",
type=int,
default=0,
help="Rank of the current node for data parallel.",
)
parser.add_argument(
"--dp-master-addr",
type=str, type=str,
default="ibm-research/PowerMoE-3b", default="",
help="Model name or path", help="Master node IP address for DP coordination.",
)
parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
parser.add_argument(
"--node-size", type=int, default=1, help="Total number of nodes"
) )
parser.add_argument( parser.add_argument(
"--node-rank", type=int, default=0, help="Rank of the current node" "--dp-master-port",
)
parser.add_argument(
"--master-addr", type=str, default="", help="Master node IP address"
)
parser.add_argument("--master-port", type=int, default=0, help="Master node port")
parser.add_argument(
"--enforce-eager", action="store_true", help="Enforce eager mode execution."
)
parser.add_argument(
"--trust-remote-code", action="store_true", help="Trust remote code."
)
parser.add_argument(
"--max-num-seqs",
type=int, type=int,
default=64, default=0,
help=("Maximum number of sequences to be processed in a single iteration."), help="Master node port for DP coordination.",
)
parser.add_argument(
"--max-model-len",
type=int,
help=("Maximum number of tokens to be processed in a single iteration."),
) )
parser.add_argument( parser.add_argument(
"--timeout", "--timeout",
type=int, type=int,
default=300, default=300,
help=("Number of seconds before unresponsive process is killed."), help="Number of seconds before unresponsive process is killed.",
) )
parser.add_argument(
"--gpu-memory-utilization", return parser
type=float,
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
parser.add_argument(
"--enable-dbo",
action="store_true",
help=("Enable microbatched execution"),
)
parser.add_argument(
"--compilation-config",
type=int,
help=("Compilation optimization (O) mode 0-3."),
)
parser.add_argument(
"--quantization",
type=str,
)
parser.add_argument(
"--disable-expert-parallel",
dest="enable_expert_parallel",
action="store_false",
help="Disable expert parallel (default: enabled).",
)
parser.set_defaults(enable_expert_parallel=True)
return parser.parse_args()
def main( def main(
model,
dp_size, dp_size,
local_dp_rank, local_dp_rank,
global_dp_rank, global_dp_rank,
dp_master_ip, dp_master_ip,
dp_master_port, dp_master_port,
GPUs_per_dp_rank, engine_args,
enforce_eager,
enable_expert_parallel,
trust_remote_code,
max_num_seqs,
max_model_len,
compilation_config,
gpu_memory_utilization,
enable_dbo,
quantization,
): ):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@ -173,19 +134,7 @@ def main(
) )
# Create an LLM. # Create an LLM.
llm = LLM( llm = LLM(**engine_args)
model=model,
tensor_parallel_size=GPUs_per_dp_rank,
enforce_eager=enforce_eager,
enable_expert_parallel=enable_expert_parallel,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enable_dbo=enable_dbo,
quantization=quantization,
compilation_config=compilation_config,
)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
for i, output in enumerate(outputs): for i, output in enumerate(outputs):
@ -204,22 +153,29 @@ def main(
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() parser = create_parser()
args = vars(parser.parse_args())
dp_size = args.dp_size # Extract DP-specific args (pop to remove from engine_args)
tp_size = args.tp_size dp_size = args.pop("data_parallel_size")
node_size = args.node_size dp_num_nodes = args.pop("dp_num_nodes")
node_rank = args.node_rank dp_node_rank = args.pop("dp_node_rank")
dp_master_addr = args.pop("dp_master_addr")
dp_master_port = args.pop("dp_master_port")
timeout = args.pop("timeout")
if node_size == 1: # Remaining args are engine args
engine_args = args
if dp_num_nodes == 1:
dp_master_ip = "127.0.0.1" dp_master_ip = "127.0.0.1"
dp_master_port = get_open_port() dp_master_port_val = get_open_port()
else: else:
dp_master_ip = args.master_addr dp_master_ip = dp_master_addr
dp_master_port = args.master_port dp_master_port_val = dp_master_port
assert dp_size % node_size == 0, "dp_size should be divisible by node_size" assert dp_size % dp_num_nodes == 0, "dp_size should be divisible by dp_num_nodes"
dp_per_node = dp_size // node_size dp_per_node = dp_size // dp_num_nodes
from multiprocessing import Process from multiprocessing import Process
@ -230,34 +186,24 @@ if __name__ == "__main__":
procs = [] procs = []
for local_dp_rank, global_dp_rank in enumerate( for local_dp_rank, global_dp_rank in enumerate(
range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node) range(dp_node_rank * dp_per_node, (dp_node_rank + 1) * dp_per_node)
): ):
proc = Process( proc = Process(
target=main, target=main,
args=( args=(
args.model,
dp_size, dp_size,
local_dp_rank, local_dp_rank,
global_dp_rank, global_dp_rank,
dp_master_ip, dp_master_ip,
dp_master_port, dp_master_port_val,
tp_size, engine_args,
args.enforce_eager,
args.enable_expert_parallel,
args.trust_remote_code,
args.max_num_seqs,
args.max_model_len,
args.compilation_config,
args.gpu_memory_utilization,
args.enable_dbo,
args.quantization,
), ),
) )
proc.start() proc.start()
procs.append(proc) procs.append(proc)
exit_code = 0 exit_code = 0
for proc in procs: for proc in procs:
proc.join(timeout=args.timeout) proc.join(timeout=timeout)
if proc.exitcode is None: if proc.exitcode is None:
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
proc.kill() proc.kill()

View File

@ -38,6 +38,8 @@ Encoder engines should be launched with the following flags:
- `--max-num-batched-tokens=<large value>` **(default: 2048)** This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager. - `--max-num-batched-tokens=<large value>` **(default: 2048)** This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.
- `--convert "mm_encoder_only"` **(Optional)** - The language model is skipped during initialization to reduce device memory usage. **Models using this option must implement the `get_language_model_spec` interface.**
## Local media inputs ## Local media inputs
To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance: To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance:

View File

@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from pathlib import Path
from vllm import LLM
# Offline example: score one query against several documents with a reranker
# model, using a custom score template.
model_name = "nvidia/llama-nemotron-rerank-1b-v2"

# The template lives in the "template" directory next to this script. The
# encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
template_path = Path(__file__).parent / "template" / "nemotron-rerank.jinja"
chat_template = template_path.read_text(encoding="utf-8")

llm = LLM(model=model_name, runner="pooling", trust_remote_code=True)

query = "how much protein should a female eat?"
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
    "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
]

# One score is produced per (query, document) pair; print them between rulers.
outputs = llm.score(query, documents, chat_template=chat_template)
print("-" * 30)
print([output.outputs.score for output in outputs])
print("-" * 30)

View File

@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example of using the rerank API with template.
run:
vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja
"""
import json
import requests
# Rerank endpoint of a server listening on localhost, port 8000.
url = "http://127.0.0.1:8000/rerank"

# The request body is sent as JSON and a JSON response is requested.
headers = {"accept": "application/json", "Content-Type": "application/json"}

# A single query that is reranked against each of the candidate documents.
query = "how much protein should a female eat?"
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
    "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
]

# Request payload: the model to use plus the query and documents defined above.
data = {
    "model": "nvidia/llama-nemotron-rerank-1b-v2",
    "query": query,
    "documents": documents,
}
def main():
    """POST the rerank request and print the result.

    Sends the module-level ``data`` payload to ``url`` with ``headers``. On
    HTTP 200 the JSON response is pretty-printed; for any other status the
    status code and the raw response body are printed instead.
    """
    # Without a timeout, requests waits indefinitely when the server is not
    # reachable or stops responding, so the example would appear to hang.
    response = requests.post(url, headers=headers, json=data, timeout=60)

    # Check the response
    if response.status_code == 200:
        print("Request successful!")
        print(json.dumps(response.json(), indent=2))
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,3 @@
question:{{ (messages | selectattr("role", "eq", "query") | first).content }}
passage:{{ (messages | selectattr("role", "eq", "document") | first).content }}

View File

@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test
num2words # required for smolvlm test num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test lm-eval[api]>=0.4.9.2 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test mteb>=1.38.11, <2 # required for mteb test
transformers==4.57.3 transformers==4.57.3
tokenizers==0.22.0 tokenizers==0.22.0

View File

@ -58,7 +58,7 @@ schemathesis==3.39.15
# OpenAI schema test # OpenAI schema test
# Evaluation and benchmarking # Evaluation and benchmarking
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d lm-eval[api]>=0.4.9.2
jiwer==4.0.0 jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test

View File

@ -34,8 +34,7 @@ num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api]>=0.4.9.2 # required for model evaluation test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.3 transformers==4.57.3
tokenizers==0.22.0 tokenizers==0.22.0

View File

@ -441,7 +441,7 @@ lightning-utilities==0.14.3
# torchmetrics # torchmetrics
llvmlite==0.44.0 llvmlite==0.44.0
# via numba # via numba
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d lm-eval==0.4.9.2
# via -r requirements/test.in # via -r requirements/test.in
lxml==5.3.0 lxml==5.3.0
# via # via

152
setup.py
View File

@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
sys.platform, sys.platform,
) )
VLLM_TARGET_DEVICE = "empty" VLLM_TARGET_DEVICE = "empty"
elif ( elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
sys.platform.startswith("linux") if torch.version.hip is not None:
and torch.version.cuda is None VLLM_TARGET_DEVICE = "rocm"
and os.getenv("VLLM_TARGET_DEVICE") is None logger.info("Auto-detected ROCm")
and torch.version.hip is None elif torch.version.cuda is not None:
): VLLM_TARGET_DEVICE = "cuda"
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set, logger.info("Auto-detected CUDA")
# fallback to cpu else:
VLLM_TARGET_DEVICE = "cpu" VLLM_TARGET_DEVICE = "cpu"
def is_sccache_available() -> bool: def is_sccache_available() -> bool:
@ -108,20 +108,26 @@ class cmake_build_ext(build_ext):
num_jobs = os.cpu_count() num_jobs = os.cpu_count()
nvcc_threads = None nvcc_threads = None
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"): if _is_cuda() and CUDA_HOME is not None:
# `nvcc_threads` is either the value of the NVCC_THREADS try:
# environment variable (if defined) or 1. nvcc_version = get_nvcc_cuda_version()
# when it is set, we reduce `num_jobs` to avoid if nvcc_version >= Version("11.2"):
# overloading the system. # `nvcc_threads` is either the value of the NVCC_THREADS
nvcc_threads = envs.NVCC_THREADS # environment variable (if defined) or 1.
if nvcc_threads is not None: # when it is set, we reduce `num_jobs` to avoid
nvcc_threads = int(nvcc_threads) # overloading the system.
logger.info( nvcc_threads = envs.NVCC_THREADS
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads if nvcc_threads is not None:
) nvcc_threads = int(nvcc_threads)
else: logger.info(
nvcc_threads = 1 "Using NVCC_THREADS=%d as the number of nvcc threads.",
num_jobs = max(1, num_jobs // nvcc_threads) nvcc_threads,
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
except Exception as e:
logger.warning("Failed to get NVCC version: %s", e)
return num_jobs, nvcc_threads return num_jobs, nvcc_threads
@ -199,9 +205,9 @@ class cmake_build_ext(build_ext):
# Default build tool to whatever cmake picks. # Default build tool to whatever cmake picks.
build_tool = [] build_tool = []
# Make sure we use the nvcc from CUDA_HOME # Make sure we use the nvcc from CUDA_HOME
if _is_cuda(): if _is_cuda() and CUDA_HOME is not None:
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"] cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
elif _is_hip(): elif _is_hip() and ROCM_HOME is not None:
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"] cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
other_cmake_args = os.environ.get("CMAKE_ARGS") other_cmake_args = os.environ.get("CMAKE_ARGS")
@ -339,6 +345,89 @@ class precompiled_wheel_utils:
wheels = json.loads(resp.read().decode("utf-8")) wheels = json.loads(resp.read().decode("utf-8"))
return wheels, repo_url return wheels, repo_url
@staticmethod
def is_rocm_system() -> bool:
    """Report whether this machine looks like a ROCm system.

    Cheap filesystem/environment probes are tried first, in this order: a
    non-empty ``ROCM_PATH`` environment variable, an ``/opt/rocm`` directory,
    a ``rocminfo`` executable on ``PATH``. Only when none of them matches is
    torch consulted; if torch cannot be imported the answer is ``False``.
    """
    rocm_marker_found = (
        bool(os.getenv("ROCM_PATH"))
        or os.path.isdir("/opt/rocm")
        or which("rocminfo") is not None
    )
    if rocm_marker_found:
        return True

    # Last resort: ask an installed torch build whether it targets HIP.
    try:
        import torch

        return torch.version.hip is not None
    except ImportError:
        return False
@staticmethod
def find_local_rocm_wheel() -> str | None:
"""Search for a local vllm wheel in common locations."""
import glob
for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
wheels = glob.glob(pattern)
if wheels:
return sorted(wheels)[-1]
return None
@staticmethod
def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
    """Fetch a wheel URL for this machine from a PyPI-style simple index.

    Downloads ``<index_url>/<package>/``, collects every ``<a href=...>``
    whose target is a ``.whl`` file, and returns the last listed link that
    contains the current machine architecture (``platform.machine()``).

    Args:
        index_url: Base URL of the simple index; a trailing slash is optional.
        package: Project name whose index page is fetched.

    Returns:
        The wheel URL. Links that do not start with ``http`` are resolved
        against the index page URL. Any ``#...`` fragment on the link is kept.

    Raises:
        ValueError: If no listed wheel mentions the current architecture.
    """
    import platform
    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from urllib.request import urlopen

    arch = platform.machine()

    class WheelLinkParser(HTMLParser):
        """Collect the href of every anchor that points at a wheel file."""

        def __init__(self):
            super().__init__()
            self.wheels = []

        def handle_starttag(self, tag, attrs):
            if tag != "a":
                return
            for name, value in attrs:
                # A bare `href` attribute is reported with value None.
                if name != "href" or not value:
                    continue
                # Simple-index links may carry a "#sha256=..." fragment (or a
                # query string), so test the target without them; checking
                # the raw href would reject every such link.
                target = value.split("#", 1)[0].split("?", 1)[0]
                if target.endswith(".whl"):
                    self.wheels.append(value)

    simple_url = f"{index_url.rstrip('/')}/{package}/"
    print(f"Fetching wheel list from {simple_url}")
    with urlopen(simple_url) as resp:
        html = resp.read().decode("utf-8")

    parser = WheelLinkParser()
    parser.feed(html)

    # Walk from the end of the page so the last listed match is preferred.
    for wheel in reversed(parser.wheels):
        if arch in wheel:
            if wheel.startswith("http"):
                return wheel
            return urljoin(simple_url, wheel)

    raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
@staticmethod
def determine_wheel_url_rocm() -> tuple[str, str | None]:
    """Choose the precompiled wheel to use on ROCm.

    A locally available wheel takes precedence. Otherwise the wheel is looked
    up on the index named by ``VLLM_ROCM_WHEEL_INDEX`` (defaulting to AMD's
    index).

    Returns:
        ``(wheel, download_filename)``. For a local wheel the second item is
        ``None``; for an index wheel it is the last path segment of the URL
        with any ``#...`` fragment removed.
    """
    utils = precompiled_wheel_utils

    local_wheel = utils.find_local_rocm_wheel()
    if local_wheel is None:
        # Nothing local: fall back to the (overridable) remote index.
        index_url = os.getenv(
            "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
        )
        print(f"Fetching ROCm precompiled wheel from {index_url}")
        wheel_url = utils.fetch_wheel_from_pypi_index(index_url)
        last_segment = wheel_url.rpartition("/")[2]
        download_filename = last_segment.partition("#")[0]
        print(f"Using ROCm precompiled wheel: {wheel_url}")
        return wheel_url, download_filename

    print(f"Found local ROCm wheel: {local_wheel}")
    return local_wheel, None
@staticmethod @staticmethod
def determine_wheel_url() -> tuple[str, str | None]: def determine_wheel_url() -> tuple[str, str | None]:
""" """
@ -359,6 +448,11 @@ class precompiled_wheel_utils:
print(f"Using user-specified precompiled wheel location: {wheel_location}") print(f"Using user-specified precompiled wheel location: {wheel_location}")
return wheel_location, None return wheel_location, None
else: else:
# ROCm: use local wheel or AMD's PyPI index
# TODO: When we have ROCm nightly wheels, we can update this logic.
if precompiled_wheel_utils.is_rocm_system():
return precompiled_wheel_utils.determine_wheel_url_rocm()
import platform import platform
arch = platform.machine() arch = platform.machine()
@ -465,6 +559,8 @@ class precompiled_wheel_utils:
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so", "vllm/cumem_allocator.abi3.so",
# ROCm-specific libraries
"vllm/_rocm_C.abi3.so",
] ]
flash_attn_regex = re.compile( flash_attn_regex = re.compile(
@ -601,6 +697,8 @@ def get_rocm_version():
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21 # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
try: try:
if ROCM_HOME is None:
return None
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so" librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
if not librocm_core_file.is_file(): if not librocm_core_file.is_file():
return None return None
@ -745,7 +843,9 @@ if _is_hip():
if _is_cuda(): if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): if envs.VLLM_USE_PRECOMPILED or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
):
# FA3 requires CUDA 12.3 or later # FA3 requires CUDA 12.3 or later
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# Optional since this doesn't get built (produce an .so file) when # Optional since this doesn't get built (produce an .so file) when

View File

@ -557,7 +557,8 @@ def test_rms_group_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties

View File

@ -77,6 +77,7 @@ def test_dynamic_shapes_compilation(
"evaluate_guards": evaluate_guards, "evaluate_guards": evaluate_guards,
}, },
}, },
max_model_len=1024,
) )
output = model.generate(prompt) output = model.generate(prompt)

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import pytest import pytest
import torch import torch
@ -53,37 +52,61 @@ class TestModel(torch.nn.Module):
hidden_size: int, hidden_size: int,
eps: float, eps: float,
group_shape: GroupShape, group_shape: GroupShape,
cuda_force_torch: bool, use_aiter: bool = False,
cuda_force_torch: bool = False,
use_aiter_quant_op: bool = True,
*args, *args,
**kwargs, **kwargs,
): ):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.use_aiter = use_aiter
self.use_aiter_quant_op = use_aiter_quant_op
self.cuda_force_torch = cuda_force_torch self.cuda_force_torch = cuda_force_torch
self.group_shape = group_shape
self.enable_quant_fp8_custom_op = None # Will be set later if applicable
self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)] self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)]
if group_shape.is_per_group():
self.wscale = [ # Setup quantization scale descriptor
torch.rand( static = group_shape == GroupShape.PER_TENSOR and not use_aiter
(hidden_size // group_shape[1], hidden_size // group_shape[1]),
dtype=torch.float32,
)
for _ in range(3)
]
else:
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
static = group_shape == GroupShape.PER_TENSOR
quant_scale = ScaleDesc(torch.float32, static, group_shape) quant_scale = ScaleDesc(torch.float32, static, group_shape)
self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True) self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
# Setup scales
if static: if static:
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
else: else:
self.scale = [None for _ in range(3)] self.scale = [None for _ in range(3)]
# Setup weights
self.w = [ self.w = [
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3) torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3)
] ]
if not group_shape.is_per_group(): if not group_shape.is_per_group() or use_aiter:
self.w = [self.w[0].t() for _ in range(3)] self.w = [self.w[0].t() for _ in range(3)]
# Setup weight scales
if group_shape.is_per_group(): if group_shape.is_per_group():
scale_size = (
(hidden_size + 128 - 1) // 128
if use_aiter
else hidden_size // group_shape[1]
)
wscale_shape: tuple[int, ...] = (scale_size, scale_size)
else:
wscale_shape = (1,)
self.wscale = [torch.rand(wscale_shape, dtype=torch.float32) for _ in range(3)]
# Setup FP8 linear operation
is_per_group = group_shape.is_per_group()
if is_per_group and use_aiter:
self.fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(128, 128),
act_quant_group_shape=group_shape,
use_aiter_and_is_supported=use_aiter_quant_op,
)
# AITER blockwise doesn't use enable_quant_fp8_custom_op
elif is_per_group:
self.fp8_linear = W8A8BlockFp8LinearOp( self.fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(group_shape[1], group_shape[1]), weight_group_shape=GroupShape(group_shape[1], group_shape[1]),
act_quant_group_shape=group_shape, act_quant_group_shape=group_shape,
@ -91,6 +114,13 @@ class TestModel(torch.nn.Module):
use_aiter_and_is_supported=False, use_aiter_and_is_supported=False,
) )
self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled() self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled()
elif use_aiter:
self.fp8_linear = Fp8LinearOp(
act_quant_static=False,
act_quant_group_shape=group_shape,
)
self.fp8_linear.quant_fp8.use_aiter = use_aiter_quant_op
self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
else: else:
with override_cutlass_fp8_supported(not cuda_force_torch): with override_cutlass_fp8_supported(not cuda_force_torch):
self.fp8_linear = Fp8LinearOp( self.fp8_linear = Fp8LinearOp(
@ -100,7 +130,6 @@ class TestModel(torch.nn.Module):
self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled() self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
self.enable_rms_norm_custom_op = self.norm[0].enabled() self.enable_rms_norm_custom_op = self.norm[0].enabled()
self.group_shape = group_shape
def forward(self, x): def forward(self, x):
# avoid having graph input be an arg to a pattern directly # avoid having graph input be an arg to a pattern directly
@ -126,19 +155,49 @@ class TestModel(torch.nn.Module):
y4, resid = self.norm[3](x4, resid) # use resid here y4, resid = self.norm[3](x4, resid) # use resid here
return y4 return y4
def ops_in_model_before(self):
    """Return the quantization op expected in the graph before fusion.

    The choice depends on whether the AITER path is used, whether the group
    shape is per-group, the platform's FP8 flavor, and whether the quant
    custom op is enabled.
    """
    if self.use_aiter:
        if self.group_shape.is_per_group():
            # Per-group quantization: the op differs by FP8 flavor.
            if current_platform.is_fp8_fnuz():
                return [rocm_aiter_ops.get_group_quant_op()]
            return [torch.ops.vllm.triton_per_token_group_quant_fp8.default]
        if self.use_aiter_quant_op:
            return [rocm_aiter_ops.get_per_token_quant_op()]
        return [QUANT_OPS[self.quant_key]]

    if self.enable_quant_fp8_custom_op:
        return [QUANT_OPS[self.quant_key]]
    # No custom quant op: look for the reciprocal op instead.
    return [torch.ops.aten.reciprocal]
def ops_in_model_after(self): def ops_in_model_after(self):
if self.use_aiter and self.group_shape.is_per_group():
from vllm.compilation.rocm_aiter_fusion import (
AiterFusedAddRMSFp8GroupQuantPattern,
AiterRMSFp8GroupQuantPattern,
)
return [
AiterFusedAddRMSFp8GroupQuantPattern.FUSED_OP,
AiterRMSFp8GroupQuantPattern.FUSED_OP,
]
if self.use_aiter:
from vllm.compilation.rocm_aiter_fusion import (
AiterFusedAddRMSNormDynamicQuantPattern,
AiterRMSNormDynamicQuantPattern,
)
return [
AiterFusedAddRMSNormDynamicQuantPattern.FUSED_OP,
AiterRMSNormDynamicQuantPattern.FUSED_OP,
]
return [ return [
FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)], FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)],
FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)], FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)],
] ]
def ops_in_model_before(self):
return (
[QUANT_OPS[self.quant_key]]
if self.enable_quant_fp8_custom_op
else [torch.ops.aten.reciprocal]
)
def ops_in_model_before_partial(self): def ops_in_model_before_partial(self):
return ( return (
[RMS_OP, RMS_ADD_OP] [RMS_OP, RMS_ADD_OP]
@ -155,67 +214,45 @@ GROUP_SHAPES = [
] ]
class TestRmsnormGroupFp8QuantModel(torch.nn.Module): def _run_fusion_test(
def __init__(self, hidden_size: int, eps: float, **kwargs): model,
super().__init__() fusion_pass,
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( vllm_config,
weight_group_shape=GroupShape(128, 128), dtype,
act_quant_group_shape=GroupShape(1, 128), hidden_size,
cutlass_block_fp8_supported=False, num_tokens,
use_aiter_and_is_supported=True, ):
) """Helper function for common fusion test logic.
self.w = [
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
for _ in range(3)
]
scale_hidden_size = (hidden_size + 128 - 1) // 128 Must be called within vllm_config context.
self.wscale = [ """
torch.rand((scale_hidden_size, scale_hidden_size), dtype=torch.float32) noop_pass = NoOpEliminationPass(vllm_config)
for _ in range(3) cleanup_pass = PostCleanupPass(vllm_config)
]
self.norm_weight = [torch.ones(hidden_size) for _ in range(4)] backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
self.eps = eps backend2 = TestBackend(noop_pass, cleanup_pass)
def forward(self, x): x = torch.rand(num_tokens, hidden_size)
# avoid having graph input be an arg to a pattern directly torch._dynamo.mark_dynamic(x, 0)
x = resid = torch.relu(x)
y = rocm_aiter_ops.rms_norm(x, self.norm_weight[0], self.eps)
x2 = self.w8a8_block_fp8_linear.apply(y, self.w[0], self.wscale[0]) model_fused = torch.compile(model, backend=backend)
# make sure resid is used for replacement to work result_fused = model_fused(x)
y2, resid = rocm_aiter_ops.rms_norm2d_with_add(
x2, resid, self.norm_weight[1], self.eps
)
x3 = self.w8a8_block_fp8_linear.apply(y2, self.w[1], self.wscale[1]) model_unfused = torch.compile(model, backend=backend2)
result_unfused = model_unfused(x)
y3, resid = rocm_aiter_ops.rms_norm2d_with_add( if dtype == torch.float16:
x3, resid, self.norm_weight[2], self.eps ATOL, RTOL = (2e-3, 2e-3)
) else:
ATOL, RTOL = (1e-2, 1e-2)
x4 = self.w8a8_block_fp8_linear.apply(y3, self.w[2], self.wscale[2]) torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
y4, resid = rocm_aiter_ops.rms_norm2d_with_add( assert fusion_pass.matched_count == 3
x4, resid, self.norm_weight[3], self.eps backend.check_before_ops(model.ops_in_model_before())
) backend.check_after_ops(model.ops_in_model_after())
return y4
def ops_in_model_before(self): return backend, backend2
return [
torch.ops.vllm.rocm_aiter_rms_norm,
torch.ops.vllm.rocm_aiter_group_fp8_quant,
]
def ops_in_model_before_partial(self):
return []
def ops_in_model_after(self):
return [
torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant,
torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant,
]
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@ -223,11 +260,8 @@ class TestRmsnormGroupFp8QuantModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [257]) @pytest.mark.parametrize("num_tokens", [257])
@pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("group_shape", GROUP_SHAPES) @pytest.mark.parametrize("group_shape", GROUP_SHAPES)
@pytest.mark.parametrize( @pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
"model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op", @pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False])
list(itertools.product([TestModel], [True, False], [True, False]))
+ [(TestRmsnormGroupFp8QuantModel, False, False)],
)
# cuda_force_torch used to test torch code path on platforms that # cuda_force_torch used to test torch code path on platforms that
# cutlass_fp8_supported() == True. # cutlass_fp8_supported() == True.
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -242,23 +276,13 @@ def test_fusion_rmsnorm_quant(
num_tokens, num_tokens,
eps, eps,
group_shape, group_shape,
model_class,
enable_rms_norm_custom_op, enable_rms_norm_custom_op,
enable_quant_fp8_custom_op, enable_quant_fp8_custom_op,
cuda_force_torch, cuda_force_torch,
): ):
if model_class is TestRmsnormGroupFp8QuantModel and not IS_AITER_FOUND:
pytest.skip("AITER is not supported on this GPU.")
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
maybe_create_device_identity() # needed for certain non-cutlass fp8 paths
if not enable_quant_fp8_custom_op and group_shape.is_per_group(): if not enable_quant_fp8_custom_op and group_shape.is_per_group():
pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization") pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization")
# Skip test for 64-bit group shape when running with cutlass or deepgemm
if group_shape == GroupShape(1, 64) and ( if group_shape == GroupShape(1, 64) and (
cutlass_block_fp8_supported() or is_deep_gemm_supported() cutlass_block_fp8_supported() or is_deep_gemm_supported()
): ):
@ -269,6 +293,7 @@ def test_fusion_rmsnorm_quant(
custom_ops.append("+rms_norm") custom_ops.append("+rms_norm")
if enable_quant_fp8_custom_op: if enable_quant_fp8_custom_op:
custom_ops.append("+quant_fp8") custom_ops.append("+quant_fp8")
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=ModelConfig(dtype=dtype), model_config=ModelConfig(dtype=dtype),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
@ -279,60 +304,97 @@ def test_fusion_rmsnorm_quant(
), ),
), ),
) )
with vllm.config.set_current_vllm_config(vllm_config): with vllm.config.set_current_vllm_config(vllm_config):
# Reshape pass is needed for the fusion pass to work # Setup device before model creation
noop_pass = NoOpEliminationPass(vllm_config) torch.set_default_device("cuda")
if model_class is TestRmsnormGroupFp8QuantModel: torch.set_default_dtype(dtype)
from vllm.compilation.rocm_aiter_fusion import ( torch.manual_seed(1)
RocmAiterRMSNormFp8GroupQuantFusionPass, maybe_create_device_identity()
)
fusion_pass = RocmAiterRMSNormFp8GroupQuantFusionPass(vllm_config) fusion_pass = RMSNormQuantFusionPass(vllm_config)
else: model = TestModel(
fusion_pass = RMSNormQuantFusionPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
backend2 = TestBackend(noop_pass, cleanup_pass)
model = model_class(
hidden_size=hidden_size, hidden_size=hidden_size,
eps=eps, eps=eps,
group_shape=group_shape, group_shape=group_shape,
use_aiter=False,
cuda_force_torch=cuda_force_torch, cuda_force_torch=cuda_force_torch,
) )
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size)
torch._dynamo.mark_dynamic(x, 0)
model_fused = torch.compile(model, backend=backend) backend, _ = _run_fusion_test(
result_fused = model_fused(x) model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens
)
model_unfused = torch.compile(model, backend=backend2)
result_unfused = model_unfused(x)
if dtype == torch.float16:
ATOL, RTOL = (2e-3, 2e-3)
else:
ATOL, RTOL = (1e-2, 1e-2)
torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
assert fusion_pass.matched_count == 3
backend.check_before_ops(model.ops_in_model_before())
backend.check_before_ops( backend.check_before_ops(
model.ops_in_model_before_partial(), fully_replaced=False model.ops_in_model_before_partial(), fully_replaced=False
) )
backend.check_after_ops(model.ops_in_model_after())
# If RMSNorm custom op is disabled (native/torch impl used), # If RMSNorm custom op is disabled (native/torch impl used),
# there's a risk that the fused add doesn't get included in the # there's a risk that the fused add doesn't get included in the
# replacement and only the rms part gets fused with quant. # replacement and only the rms part gets fused with quant.
# Hence, we check only 2 add nodes are left (final fused rmsnorm add). # Hence, we check only 2 add nodes are left (final fused rmsnorm add).
if ( if not enable_rms_norm_custom_op:
not enable_rms_norm_custom_op
and model_class is not TestRmsnormGroupFp8QuantModel
):
n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g)) n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
# 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each) # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
assert n_add_nodes(backend.graph_pre_pass) == 7 assert n_add_nodes(backend.graph_pre_pass) == 7
assert n_add_nodes(backend.graph_post_pass) == 2 assert n_add_nodes(backend.graph_post_pass) == 2
# (group_shape, use_aiter_quant_op) pairs fed to the
# "group_shape, use_aiter_quant_op" parametrization of
# test_aiter_fusion_rmsnorm_quant. Per-token quantization is listed with the
# flag both on and off; the (1, 128) group shape is only listed with it on.
GROUP_SHAPE_QUANT_OPS_MATCHS = [
    (GroupShape.PER_TOKEN, True),
    (GroupShape.PER_TOKEN, False),
    (GroupShape(1, 128), True),
]
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("hidden_size", [256])
@pytest.mark.parametrize("num_tokens", [257])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize(
    "group_shape, use_aiter_quant_op", GROUP_SHAPE_QUANT_OPS_MATCHS
)
@pytest.mark.skipif(
    (not current_platform.is_rocm() or not IS_AITER_FOUND),
    reason="Only test on ROCm with aiter package installed",
)
def test_aiter_fusion_rmsnorm_quant(
    dtype: torch.dtype,
    hidden_size: int,
    num_tokens: int,
    eps: float,
    group_shape: GroupShape,
    use_aiter_quant_op: bool,
    monkeypatch: pytest.MonkeyPatch,
):
    """Run the shared fusion check with the ROCm AITER RMSNorm fusion pass.

    Builds a config with the ``rms_norm`` and ``quant_fp8`` custom ops enabled
    and norm/quant fusion turned on, creates ``TestModel`` with
    ``use_aiter=True``, and hands it to ``_run_fusion_test`` together with a
    ``RocmAiterRMSNormFusionPass``. Skipped unless running on ROCm with the
    aiter package available.
    """
    vllm_config = VllmConfig(
        model_config=ModelConfig(dtype=dtype),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            custom_ops=["+rms_norm", "+quant_fp8"],
            pass_config=PassConfig(fuse_norm_quant=True, eliminate_noops=True),
        ),
    )

    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
        # Imported lazily: the module is only needed when this test actually
        # runs, i.e. after the ROCm/aiter skip condition has been evaluated.
        from vllm.compilation.rocm_aiter_fusion import RocmAiterRMSNormFusionPass

        # The env var is scoped to the monkeypatch context and undone on exit.
        m.setenv("VLLM_ROCM_USE_AITER", "1")
        # NOTE(review): presumably makes rocm_aiter_ops pick up the env var
        # set above - confirm against its implementation.
        rocm_aiter_ops.refresh_env_variables()

        # Set up device, dtype and seed before the model is created.
        torch.set_default_device("cuda")
        torch.set_default_dtype(dtype)
        torch.manual_seed(1)
        maybe_create_device_identity()

        fusion_pass = RocmAiterRMSNormFusionPass(vllm_config)
        model = TestModel(
            hidden_size=hidden_size,
            eps=eps,
            group_shape=group_shape,
            use_aiter=True,
            use_aiter_quant_op=use_aiter_quant_op,
        )

        _run_fusion_test(
            model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens
        )

View File

@ -410,7 +410,7 @@ class HfRunner:
# don't put this import at the top level # don't put this import at the top level
# it will call torch.cuda.device_count() # it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401 from transformers import AutoProcessor
self.processor = AutoProcessor.from_pretrained( self.processor = AutoProcessor.from_pretrained(
model_name, model_name,

View File

@ -511,6 +511,16 @@ def test_human_readable_model_len():
args = parser.parse_args(["--max-model-len", "10.2123451234567t"]) args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
assert args.max_model_len == 10212345123456 assert args.max_model_len == 10212345123456
# Special value -1 for auto-fit to GPU memory
args = parser.parse_args(["--max-model-len", "-1"])
assert args.max_model_len == -1
# 'auto' is an alias for -1
args = parser.parse_args(["--max-model-len", "auto"])
assert args.max_model_len == -1
args = parser.parse_args(["--max-model-len", "AUTO"])
assert args.max_model_len == -1
# Invalid (do not allow decimals with binary multipliers) # Invalid (do not allow decimals with binary multipliers)
for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]: for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
with pytest.raises(ArgumentError): with pytest.raises(ArgumentError):

View File

@ -5,6 +5,30 @@ import pytest
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
def add_attention_backend(server_args, attention_config):
"""Append attention backend CLI arg if specified.
Args:
server_args: List of server arguments to extend in-place.
attention_config: Dict with 'backend' key, or None.
"""
if attention_config and "backend" in attention_config:
server_args.extend(["--attention-backend", attention_config["backend"]])
@pytest.fixture(scope="module")
def rocm_aiter_fa_attention() -> dict[str, str] | None:
    """Return attention config for transcription/translation tests on ROCm.

    On ROCm, audio tests require ROCM_AITER_FA attention backend.

    Returns:
        ``{"backend": "ROCM_AITER_FA"}`` when the current platform is ROCm,
        otherwise ``None``.
    """
    # Function-scope import, as in the original block.
    from vllm.platforms import current_platform

    if current_platform.is_rocm():
        return {"backend": "ROCM_AITER_FA"}
    return None
@pytest.fixture @pytest.fixture
def mary_had_lamb(): def mary_had_lamb():
path = AudioAsset("mary_had_lamb").get_local_path() path = AudioAsset("mary_had_lamb").get_local_path()

View File

@ -15,7 +15,7 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): # noqa: F811 def server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -8,7 +8,7 @@ import pytest
import pytest_asyncio import pytest_asyncio
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.multimodal.utils import encode_audio_base64, fetch_audio from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
} }
@pytest.fixture(scope="session")
def url_encoded_audio() -> dict[str, str]:
    """Map each URL in ``TEST_AUDIO_URLS`` to its URL-encoded audio.

    Every audio URL is fetched with ``fetch_audio`` and the result is passed
    straight to ``encode_audio_url``. The fixture is session-scoped, so this
    runs once per test session.
    """
    # NOTE(review): the encoded value is presumably a data URL usable in place
    # of the remote URL - confirm against encode_audio_url.
    return {
        audio_url: encode_audio_url(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
    }
def dummy_messages_from_audio_url( def dummy_messages_from_audio_url(
audio_urls: str | list[str], audio_urls: str | list[str],
content_text: str = "What's happening in this audio?", content_text: str = "What's happening in this audio?",
@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
audio_url: str, audio_url: str,
base64_encoded_audio: dict[str, str], url_encoded_audio: dict[str, str],
): ):
messages = dummy_messages_from_audio_url( messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
)
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -313,7 +319,7 @@ async def test_chat_streaming_input_audio(
"format": "wav", "format": "wav",
}, },
}, },
{"type": "text", "text": "What's happening in this audio?"}, {"type": "text", "text": "What's a short title for this audio?"},
], ],
} }
] ]

View File

@ -28,7 +28,7 @@ def zephyr_lora_files():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(zephyr_lora_files): # noqa: F811 def server(zephyr_lora_files):
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
{"role": "system", "content": "you are a helpful assistant"}, {"role": "system", "content": "you are a helpful assistant"},
{"role": "user", "content": "what is 1+1?"}, {"role": "user", "content": "what is 1+1?"},
] ]
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=5,
logprobs=True, logprobs=True,
top_logprobs=5, top_logprobs=5,
) )
@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47 completion_tokens=5, prompt_tokens=37, total_tokens=42
) )
message = choice.message message = choice.message
assert message.content is not None and len(message.content) >= 10 assert message.content is not None and len(message.content) >= 5
assert message.role == "assistant" assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content}) messages.append({"role": "assistant", "content": message.content})
@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=5,
) )
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0

View File

@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): # noqa: F811 def server():
args = [ args = [
"--max-model-len", "--max-model-len",
"8192", "8192",

View File

@ -125,7 +125,7 @@ messages = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): # noqa: F811 def server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
@ -212,7 +212,7 @@ async def test_function_tool_use(
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def k2_server(): # noqa: F811 def k2_server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def multimodal_server(): # noqa: F811 def multimodal_server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def chat_server_with_force_include_usage(request): # noqa: F811 def chat_server_with_force_include_usage(request):
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",

View File

@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): # noqa: F811 def server():
args = [ args = [
"--max-model-len", "--max-model-len",
"2048", "2048",

View File

@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
"2", "2",
*passed_params, *passed_params,
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib import importlib
import importlib.util
import json import json
import time import time
@ -503,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str): async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create( # Code interpreter may need more time for container init + code execution
timeout_value = client.timeout * 3
client_with_timeout = client.with_options(timeout=timeout_value)
response = await client_with_timeout.responses.create(
model=model_name, model=model_name,
# TODO: Ideally should be able to set max tool calls # TODO: Ideally should be able to set max tool calls
# to prevent multi-turn, but it is not currently supported # to prevent multi-turn, but it is not currently supported
@ -867,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=3)
async def test_function_call_with_previous_input_messages( async def test_function_call_with_previous_input_messages(
client: OpenAI, model_name: str client: OpenAI, model_name: str
): ):
@ -986,3 +992,23 @@ async def test_function_call_with_previous_input_messages(
assert ( assert (
"aquarius" in output_text or "otter" in output_text or "tuesday" in output_text "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
) )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
    """Check that a length-truncated chat completion still returns content.

    Sends a single user message with ``max_tokens=250`` and
    ``temperature=0.0``, then asserts that the first choice finished with
    ``finish_reason == "length"`` and that its message content is neither
    ``None`` nor empty.
    """
    response = await client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": "What is the role of AI in medicine?"}],
        temperature=0.0,
        max_tokens=250,
    )

    choice = response.choices[0]
    assert choice.finish_reason == "length", (
        f"Expected finish_reason='length', got {choice.finish_reason}"
    )
    assert choice.message.content is not None, (
        "Content should not be None when truncated"
    )
    assert len(choice.message.content) > 0, "Content should not be empty"

View File

@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files):
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server_fixture(request, default_server_args): # noqa: F811 def server_fixture(request, default_server_args):
use_server_flag = request.param use_server_flag = request.param
if use_server_flag: if use_server_flag:
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"] args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]

View File

@ -955,7 +955,6 @@ class TestServingChatWithHarmony:
input_messages, input_messages,
[ [
{"role": "system"}, {"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]}, {"role": "user", "content": messages[0]["content"]},
], ],
) )
@ -983,7 +982,6 @@ class TestServingChatWithHarmony:
input_messages_2, input_messages_2,
[ [
{"role": "system"}, {"role": "system"},
{"role": "developer"},
{"role": "user"}, {"role": "user"},
# The analysis message should be dropped on subsequent inputs because # The analysis message should be dropped on subsequent inputs because
# of the subsequent assistant message to the final channel. # of the subsequent assistant message to the final channel.
@ -1043,7 +1041,7 @@ class TestServingChatWithHarmony:
) )
# Test the Harmony messages for the second turn's input # Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages( verify_harmony_messages(
input_messages_2, input_messages_2,
@ -1124,7 +1122,7 @@ class TestServingChatWithHarmony:
) )
# Test the Harmony messages for the second turn's input # Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages( verify_harmony_messages(
input_messages_2, input_messages_2,
@ -1205,7 +1203,7 @@ class TestServingChatWithHarmony:
) )
# Test the Harmony messages for the second turn's input # Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages( verify_harmony_messages(
input_messages_2, input_messages_2,
@ -1255,7 +1253,7 @@ class TestServingChatWithHarmony:
) )
# Test the Harmony messages for the third turn's input # Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3) input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
verify_harmony_messages( verify_harmony_messages(
input_messages_3, input_messages_3,
@ -1318,7 +1316,7 @@ class TestServingChatWithHarmony:
) )
# Test the Harmony messages for the fourth turn's input # Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4) input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
verify_harmony_messages( verify_harmony_messages(
input_messages_4, input_messages_4,
@ -1374,7 +1372,6 @@ class TestServingChatWithHarmony:
input_messages, input_messages,
[ [
{"role": "system"}, {"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]}, {"role": "user", "content": messages[0]["content"]},
# The reasoning that would have resulted in an analysis message is # The reasoning that would have resulted in an analysis message is
# dropped because of a later assistant message to the final channel. # dropped because of a later assistant message to the final channel.
@ -1406,7 +1403,6 @@ class TestServingChatWithHarmony:
input_messages, input_messages,
[ [
{"role": "system"}, {"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]}, {"role": "user", "content": messages[0]["content"]},
{ {
"role": "assistant", "role": "assistant",
@ -1436,7 +1432,6 @@ class TestServingChatWithHarmony:
input_messages, input_messages,
[ [
{"role": "system"}, {"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]}, {"role": "user", "content": messages[0]["content"]},
{ {
"role": "assistant", "role": "assistant",

View File

@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
add_generation_prompt=True, add_generation_prompt=True,
enable_thinking=False, # default with Qwen3 enable_thinking=False, # default with Qwen3
) )
for ignore_eos in [True, False]: for ignore_eos in [True, False]:
payload = { payload = {
"model": MODEL_NAME, "model": MODEL_NAME,
@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
} }
generate_resp = await client.post(GEN_ENDPOINT, json=payload) generate_resp = await client.post(GEN_ENDPOINT, json=payload)
generate_data = generate_resp.json() generate_data = generate_resp.json()
generate_res = tokenizer.decode( gen_token_ids = generate_data["choices"][0]["token_ids"]
generate_data["choices"][0]["token_ids"], skip_special_tokens=True generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)
)
payload = { payload = {
"model": MODEL_NAME, "model": MODEL_NAME,
@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
"temperature": 0.0, "temperature": 0.0,
"stream": False, "stream": False,
"ignore_eos": ignore_eos, "ignore_eos": ignore_eos,
"chat_template_kwargs": dict(enable_thinking=False), "chat_template_kwargs": {"enable_thinking": False},
} }
completions_resp = await client.post("/v1/chat/completions", json=payload) completions_resp = await client.post("/v1/chat/completions", json=payload)
completions_data = completions_resp.json() completions_data = completions_resp.json()
completions_res = completions_data["choices"][0]["message"]["content"] completions_res = completions_data["choices"][0]["message"]["content"]
if ignore_eos:
# When ignoring EOS, only compare up to the first EOS token
# Post-EOS generation is undefined and may differ
eos_tokens = {
tokenizer.eos_token_id,
*tokenizer.additional_special_tokens_ids,
}
# Find first EOS in generated tokens
eos_pos = None
for i, tid in enumerate(gen_token_ids):
if tid in eos_tokens:
eos_pos = i
break
if eos_pos is not None:
gen_token_ids_truncated = gen_token_ids[:eos_pos]
generate_res = tokenizer.decode(
gen_token_ids_truncated, skip_special_tokens=True
)
# Truncate completions_res to same length for comparison
completions_res = completions_res[: len(generate_res)]
assert generate_res == completions_res assert generate_res == completions_res

View File

@ -9,10 +9,16 @@ import time
import openai import openai
import pytest import pytest
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
# GPU initialization might take longer
_IS_ROCM = current_platform.is_rocm()
_SERVER_STARTUP_TIMEOUT = 120
_PROCESS_EXIT_TIMEOUT = 15
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_shutdown_on_engine_failure(): async def test_shutdown_on_engine_failure():
@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure():
"2", "2",
"--disable-frontend-multiprocessing", "--disable-frontend-multiprocessing",
], ],
stdout=subprocess.PIPE, # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
stderr=subprocess.PIPE, # stdout/stderr pipes are enabled during ROCm GPU initialization.
text=True, stdout=None if _IS_ROCM else subprocess.PIPE,
stderr=None if _IS_ROCM else subprocess.PIPE,
text=None if _IS_ROCM else True,
preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN), preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
) )
@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure():
) )
# Poll until server is ready # Poll until server is ready
while time.time() - start_time < 30: while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
try: try:
await client.completions.create( await client.completions.create(
model=MODEL_NAME, prompt="Hello", max_tokens=1 model=MODEL_NAME, prompt="Hello", max_tokens=1
@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure():
except Exception: except Exception:
time.sleep(0.5) time.sleep(0.5)
if proc.poll() is not None: if proc.poll() is not None:
stdout, stderr = proc.communicate(timeout=1) if _IS_ROCM:
pytest.fail( pytest.fail(f"Server died during startup: {proc.returncode}")
f"Server died during startup. stdout: {stdout}, stderr: {stderr}" else:
) stdout, stderr = proc.communicate(timeout=1)
pytest.fail(
f"Server died during startup. "
f"stdout: {stdout}, stderr: {stderr}"
)
else: else:
proc.terminate() proc.terminate()
proc.wait(timeout=5) proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
pytest.fail("Server failed to start in 30 seconds") pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")
# Kill server to simulate crash # Kill server to simulate crash
proc.terminate() proc.terminate()
@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure():
model=MODEL_NAME, prompt="This should fail", max_tokens=1 model=MODEL_NAME, prompt="This should fail", max_tokens=1
) )
return_code = proc.wait(timeout=5) return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
assert return_code is not None assert return_code is not None

View File

@ -7,6 +7,7 @@ import json
import pytest import pytest
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .conftest import add_attention_backend
MISTRAL_FORMAT_ARGS = [ MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "--tokenizer_mode",
@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"]) @pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name): async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
server_args = ["--enforce-eager"] server_args = ["--enforce-eager"]
if model_name.startswith("mistralai"): if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS server_args += MISTRAL_FORMAT_ARGS
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb): async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (transcribe) requests can pass LoRA through to generate.""" """Ensure STT (transcribe) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
model_name = "ibm-granite/granite-speech-3.3-2b" model_name = "ibm-granite/granite-speech-3.3-2b"
lora_model_name = "speech" lora_model_name = "speech"
server_args = [ server_args = [
@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
"--lora-modules", "--lora-modules",
f"{lora_model_name}={model_name}", f"{lora_model_name}={model_name}",
"--max-model-len", "--max-model-len",
"2048", "512" if current_platform.is_rocm() else "2048",
"--max-num-seqs", "--max-num-seqs",
"1", "1",
] ]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo): async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
# Gemma accuracy on some of the audio samples we use is particularly bad, # Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately. # hence we use a different one here. WER is evaluated separately.
model_name = "google/gemma-3n-E2B-it" model_name = "google/gemma-3n-E2B-it"
server_args = ["--enforce-eager"] server_args = ["--enforce-eager"]
add_attention_backend(server_args, rocm_aiter_fa_attention)
with RemoteOpenAIServer( with RemoteOpenAIServer(
model_name, server_args, max_wait_seconds=480 model_name, server_args, max_wait_seconds=480
) as remote_server: ) as remote_server:

View File

@ -14,16 +14,26 @@ import pytest_asyncio
import soundfile as sf import soundfile as sf
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .conftest import add_attention_backend
SERVER_ARGS = ["--enforce-eager"] SERVER_ARGS = ["--enforce-eager"]
def _get_server_args(attention_config):
    """Build the server CLI args for a test run.

    Starts from a fresh copy of the module-level ``SERVER_ARGS`` (so the
    shared list is never mutated) and lets ``add_attention_backend`` extend
    that copy according to ``attention_config``.
    """
    server_args = list(SERVER_ARGS)
    add_attention_backend(server_args, attention_config)
    return server_args
@pytest.fixture( @pytest.fixture(
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"] scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
) )
def server(request): def server(request, rocm_aiter_fa_attention):
# Parametrize over model name # Parametrize over model name
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server: with RemoteOpenAIServer(
request.param, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
yield remote_server, request.param yield remote_server, request.param
@ -35,10 +45,12 @@ async def client_and_model(server):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_non_asr_model(foscolo): async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
# text to text model # text to text model
model_name = "JackFram/llama-68m" model_name = "JackFram/llama-68m"
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: with RemoteOpenAIServer(
model_name, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
res = await client.audio.translations.create( res = await client.audio.translations.create(
model=model_name, file=foscolo, temperature=0.0 model=model_name, file=foscolo, temperature=0.0
@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb): async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (translate) requests can pass LoRA through to generate.""" """Ensure STT (translate) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
# NOTE - careful to call this test before the module scoped server # NOTE - careful to call this test before the module scoped server
# fixture, otherwise it'll OOMkill the CI # fixture, otherwise it'll OOMkill the CI
model_name = "ibm-granite/granite-speech-3.3-2b" model_name = "ibm-granite/granite-speech-3.3-2b"
@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
"--lora-modules", "--lora-modules",
f"{lora_model_name}={model_name}", f"{lora_model_name}={model_name}",
"--max-model-len", "--max-model-len",
"2048", "512" if current_platform.is_rocm() else "2048",
"--max-num-seqs", "--max-num-seqs",
"1", "1",
] ]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()

View File

@ -7,7 +7,8 @@ import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from vllm.multimodal.utils import encode_video_base64, fetch_video from vllm.multimodal.utils import encode_video_url, fetch_video
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
@ -37,7 +38,16 @@ def server():
json.dumps({"video": MAXIMUM_VIDEOS}), json.dumps({"video": MAXIMUM_VIDEOS}),
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: # ROCm: Increase timeouts to handle potential network delays and slower
# video processing when downloading multiple videos from external sources
env_overrides = {}
if current_platform.is_rocm():
env_overrides = {
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
yield remote_server yield remote_server
@ -48,9 +58,9 @@ async def client(server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_video() -> dict[str, str]: def url_encoded_video() -> dict[str, str]:
return { return {
video_url: encode_video_base64(fetch_video(video_url)[0]) video_url: encode_video_url(fetch_video(video_url)[0])
for video_url in TEST_VIDEO_URLS for video_url in TEST_VIDEO_URLS
} }
@ -175,11 +185,9 @@ async def test_single_chat_session_video_base64encoded(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
video_url: str, video_url: str,
base64_encoded_video: dict[str, str], url_encoded_video: dict[str, str],
): ):
messages = dummy_messages_from_video_url( messages = dummy_messages_from_video_url(url_encoded_video[video_url])
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
)
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -223,11 +231,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
video_url: str, video_url: str,
base64_encoded_video: dict[str, str], url_encoded_video: dict[str, str],
): ):
messages = dummy_messages_from_video_url( messages = dummy_messages_from_video_url(url_encoded_video[video_url])
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
)
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
@ -291,6 +297,11 @@ async def test_chat_streaming_video(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))] "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
) )
@pytest.mark.flaky(
reruns=2,
reruns_delay=5,
condition=current_platform.is_rocm(),
)
async def test_multi_video_input( async def test_multi_video_input(
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
): ):

View File

@ -9,7 +9,8 @@ import pytest_asyncio
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
@ -35,7 +36,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
], ],
[ [
"The image shows a Venn diagram with three over", "The image shows a Venn diagram with three over",
"The image shows a colorful Venn diagram with", "The image displays a Venn diagram with three over",
], ],
[ [
"This image displays a gradient of colors ranging from", "This image displays a gradient of colors ranging from",
@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
], ],
] ]
EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [
# MultiHeadAttention attn_backend: FLASH_ATTN
# with Triton Attention backend
[
"The image shows a wooden boardwalk leading through a",
"The image shows a wooden boardwalk extending into a",
],
[
"The image shows two parrots perched on",
"The image shows two birds perched on a cur",
],
[
"The image shows a Venn diagram with three over",
"The image contains a Venn diagram with three over",
],
[
"This image displays a gradient of colors ranging from",
"This image displays a gradient of colors transitioning from",
],
]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
@ -59,7 +81,16 @@ def server():
json.dumps({"image": MAXIMUM_IMAGES}), json.dumps({"image": MAXIMUM_IMAGES}),
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: # ROCm: Increase timeouts to handle potential network delays and slower
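# image processing when downloading multiple images from external sources.
# NOTE(review): VLLM_VIDEO_FETCH_TIMEOUT below looks copied from the video
# tests; an image-fetch timeout is presumably what was intended - confirm.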
env_overrides = {}
if current_platform.is_rocm():
env_overrides = {
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
yield remote_server yield remote_server
@ -70,11 +101,9 @@ async def client(server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]: def url_encoded_image(local_asset_server) -> dict[str, str]:
return { return {
image_asset: encode_image_base64( image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
local_asset_server.get_image_asset(image_asset)
)
for image_asset in TEST_IMAGE_ASSETS for image_asset in TEST_IMAGE_ASSETS
} }
@ -234,11 +263,11 @@ async def test_single_chat_session_image_base64encoded(
model_name: str, model_name: str,
raw_image_url: str, raw_image_url: str,
image_url: str, image_url: str,
base64_encoded_image: dict[str, str], url_encoded_image: dict[str, str],
): ):
content_text = "What's in this image?" content_text = "What's in this image?"
messages = dummy_messages_from_image_url( messages = dummy_messages_from_image_url(
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", url_encoded_image[raw_image_url],
content_text, content_text,
) )
@ -288,15 +317,20 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
image_idx: int, image_idx: int,
base64_encoded_image: dict[str, str], url_encoded_image: dict[str, str],
): ):
# ROCm: Switch expected results based on platform
from vllm.platforms import current_platform
# NOTE: This test also validates that we pass MM data through beam search # NOTE: This test also validates that we pass MM data through beam search
raw_image_url = TEST_IMAGE_ASSETS[image_idx] raw_image_url = TEST_IMAGE_ASSETS[image_idx]
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = dummy_messages_from_image_url( if current_platform.is_rocm():
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx]
) else:
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,

View File

@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
] ]
@pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
) )

View File

@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [ PROMPTS = [
@ -35,6 +30,12 @@ TOKEN_IDS = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm(): def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM( llm = LLM(
@ -44,6 +45,7 @@ def llm():
gpu_memory_utilization=0.75, gpu_memory_utilization=0.75,
enforce_eager=True, enforce_eager=True,
seed=0, seed=0,
attention_config=attention_config,
) )
yield weakref.proxy(llm) yield weakref.proxy(llm)

View File

@ -9,11 +9,6 @@ import pytest_asyncio
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
max_model_len = 128 max_model_len = 128
@ -44,6 +39,10 @@ def server():
str(max_model_len), str(max_model_len),
] ]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server

View File

@ -0,0 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling embed tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
    """Force the math SDP kernel on ROCm for this test directory.

    Runs as part of pytest's collection hook. Neither ``config`` nor
    ``items`` is inspected; the hook is only used as a point at which to
    flip the global torch SDP switches. On non-ROCm platforms it is a no-op.
    """
    if current_platform.is_rocm():
        # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
        # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        sdp_backend = torch.backends.cuda
        sdp_backend.enable_flash_sdp(False)
        sdp_backend.enable_mem_efficient_sdp(False)
        sdp_backend.enable_math_sdp(True)

        # Surface the override in the test report so it is not silent.
        message = (
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues"
        )
        warnings.warn(message, UserWarning, stacklevel=1)

View File

@ -4,7 +4,7 @@ import os
import pytest import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
MTEB_EMBED_TASKS, MTEB_EMBED_TASKS,
MTEB_EMBED_TOL, MTEB_EMBED_TOL,
OpenAIClientMtebEncoder, OpenAIClientMtebEncoder,
@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "intfloat/e5-small" MODEL_NAME = "intfloat/e5-small"
@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
def server(): def server():
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"] args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server

View File

@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
prompts = ["The chef prepared a delicious meal."] prompts = ["The chef prepared a delicious meal."]
@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm(): def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM( llm = LLM(
@ -32,6 +33,7 @@ def llm():
gpu_memory_utilization=0.75, gpu_memory_utilization=0.75,
enforce_eager=True, enforce_eager=True,
seed=0, seed=0,
attention_config=attention_config,
) )
yield weakref.proxy(llm) yield weakref.proxy(llm)

View File

@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
decode_pooling_output, decode_pooling_output,
) )
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
DTYPE = "bfloat16" DTYPE = "bfloat16"
if current_platform.is_rocm():
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
@ -53,6 +57,10 @@ def server():
DUMMY_CHAT_TEMPLATE, DUMMY_CHAT_TEMPLATE,
] ]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server

View File

@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [ MODELS = [
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
EmbedModelInfo( EmbedModelInfo(
@ -62,6 +57,10 @@ def server(model_info, dtype: str):
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}'] ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
) )
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model_info.name, args) as remote_server: with RemoteOpenAIServer(model_info.name, args) as remote_server:
yield remote_server yield remote_server

View File

@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
def _generate_random_text(word_count: int) -> str: def _generate_random_text(word_count: int) -> str:
"""Generate random text with approximately the specified word count.""" """Generate random text with approximately the specified word count."""
@ -228,6 +223,10 @@ def server_with_chunked_processing():
"0.8", "0.8",
] ]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server

View File

@ -10,7 +10,7 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.multimodal.utils import fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2 MAXIMUM_IMAGES = 2
@ -48,14 +48,6 @@ def server():
yield remote_server yield remote_server
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
def get_hf_prompt_tokens(model_name, content, image_url): def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained( processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4 model_name, trust_remote_code=True, num_crops=4

View File

@ -4,7 +4,7 @@ import os
import pytest import pytest
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.language.pooling_mteb_test.mteb_score_utils import (
MTEB_RERANK_LANGS, MTEB_RERANK_LANGS,
MTEB_RERANK_TASKS, MTEB_RERANK_TASKS,
MTEB_RERANK_TOL, MTEB_RERANK_TOL,
@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2" MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@ -30,6 +25,10 @@ st_main_score = 0.33457
def server(): def server():
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"] args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server

View File

@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm(): def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
# enable garbage collection # enable garbage collection
llm = LLM( llm = LLM(
@ -30,6 +31,7 @@ def llm():
gpu_memory_utilization=0.75, gpu_memory_utilization=0.75,
enforce_eager=True, enforce_eager=True,
seed=0, seed=0,
attention_config=attention_config,
) )
yield weakref.proxy(llm) yield weakref.proxy(llm)

View File

@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse from vllm.entrypoints.pooling.score.protocol import RerankResponse
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "BAAI/bge-reranker-base" MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16" DTYPE = "bfloat16"
@ -24,6 +19,10 @@ DTYPE = "bfloat16"
def server(): def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server

View File

@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.platforms import current_platform from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [ MODELS = [
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True}, {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False}, {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
@ -44,6 +39,10 @@ def model(request):
def server(model: dict[str, Any]): def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model["name"], args) as remote_server: with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server yield remote_server

View File

@ -0,0 +1,351 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
from vllm.entrypoints.score_utils import get_score_prompt
from vllm.inputs import TokensPrompt
from vllm.tokenizers import get_tokenizer
# A cross-encoder model for testing
CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
def assert_prompt_tokenization_consistent(
    tokenizer, full_prompt, engine_prompt, add_special_tokens=True
):
    """Assert that ``engine_prompt`` carries the tokenization of ``full_prompt``.

    ``tokenizer`` is called as ``tokenizer(full_prompt, add_special_tokens=...)``
    and its ``"input_ids"`` entry is compared for equality against
    ``engine_prompt["prompt_token_ids"]``.

    Raises:
        AssertionError: if the two ID sequences differ; the message lists both.
    """
    encoding = tokenizer(full_prompt, add_special_tokens=add_special_tokens)
    expected_ids = encoding["input_ids"]
    actual_ids = engine_prompt["prompt_token_ids"]

    if actual_ids == expected_ids:
        return
    raise AssertionError(
        f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}"
    )
@pytest.fixture(scope="module")
def cross_encoder_model_config():
    """Module-scoped ``ModelConfig`` for the cross-encoder, built with the pooling runner."""
    model_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    return model_config
@pytest.fixture(scope="module")
def cross_encoder_tokenizer(cross_encoder_model_config):
    """Module-scoped tokenizer for the cross-encoder model.

    ``trust_remote_code`` is taken from the cross-encoder model config
    fixture rather than being hard-coded here.
    """
    trust_remote_code = cross_encoder_model_config.trust_remote_code
    return get_tokenizer(CROSS_ENCODER_MODEL_ID, trust_remote_code=trust_remote_code)
@pytest.fixture(scope="module")
def llm_reranker_model_config():
    """Model config emulating an LLM-as-reranker setup (pad token disabled).

    Uses the same checkpoint as the cross-encoder fixtures; only the
    ``use_pad_token`` flag on the HF config is changed.
    """
    reranker_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
    # ``use_pad_token`` is exposed as a property backed by ``hf_config``, so
    # the override (default is True) has to be written on ``hf_config``.
    hf_config = reranker_config.hf_config
    hf_config.use_pad_token = False
    return reranker_config
@pytest.fixture
def tokenization_kwargs():
    """Default tokenization kwargs shared by the tests; a new dict per test."""
    return dict(add_special_tokens=True, return_tensors=None)
@pytest.fixture
def mock_model_with_score_template():
    """Return a stand-in model class that advertises score-template support.

    The class is created anew on every fixture call, so its
    ``post_process_called`` list starts empty for each test.
    """

    class MockModelWithScoreTemplate:
        supports_score_template = True
        # Every prompt handed to ``post_process_tokens`` is recorded here.
        post_process_called: list[TokensPrompt] = []

        @staticmethod
        def post_process_tokens(prompt: TokensPrompt) -> None:
            recorded = MockModelWithScoreTemplate.post_process_called
            recorded.append(prompt)

        @staticmethod
        def get_score_template(p1: str, p2: str) -> str:
            return f"[QUERY]{p1}[SEP][DOC]{p2}"

    return MockModelWithScoreTemplate
@pytest.fixture
def mock_model_no_score_template():
    """Return a stand-in model class that reports no score-template support."""

    class MockModelNoScoreTemplate:
        # The only attribute this mock defines.
        supports_score_template = False

    mock_cls = MockModelNoScoreTemplate
    return mock_cls
class TestGetScorePrompt:
"""Tests for the get_score_prompt function."""
def test_tokenization_kwargs_passed_through(
    self,
    llm_reranker_model_config,
    cross_encoder_tokenizer,
):
    """Check that caller-supplied tokenization kwargs take effect.

    Truncation with ``max_length=20`` is requested, so the engine prompt
    may hold at most 20 token IDs, and those IDs are expected to be a
    prefix of the untruncated encoding of the returned prompt text.
    """
    max_length = 20
    # Custom kwargs for this test only: truncation switched on.
    truncating_kwargs = dict(
        add_special_tokens=True,
        return_tensors=None,
        truncation=True,
        max_length=max_length,
    )

    full_prompt, engine_prompt = get_score_prompt(
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        truncating_kwargs,
        "Query text",
        "Document text",
    )

    assert isinstance(full_prompt, str)
    assert "prompt_token_ids" in engine_prompt
    # Truncation caps how many tokens are handed to the engine.
    assert len(engine_prompt["prompt_token_ids"]) <= max_length

    # Re-encode without truncation and compare the leading IDs.
    untruncated = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)
    full_ids = untruncated["input_ids"]
    actual_ids = engine_prompt["prompt_token_ids"]
    assert full_ids[: len(actual_ids)] == actual_ids, (
        f"Token IDs are not a prefix of full encoding.\n"
        f"Full IDs: {full_ids}\n"
        f"Actual IDs: {actual_ids}"
    )
def test_model_supports_score_template(
    self,
    cross_encoder_model_config,
    cross_encoder_tokenizer,
    tokenization_kwargs,
    mock_model_with_score_template,
):
    """Model-provided score template is used when no ``score_template`` is passed.

    ``get_model_cls`` is patched to return the mock class, whose
    ``get_score_template`` output is the expected prompt text.
    """
    model_cls_patch = patch(
        "vllm.model_executor.model_loader.get_model_cls",
        return_value=mock_model_with_score_template,
    )
    with model_cls_patch:
        full_prompt, engine_prompt = get_score_prompt(
            cross_encoder_model_config,
            cross_encoder_tokenizer,
            tokenization_kwargs,
            "query text",
            "document text",
        )

    assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
    assert "prompt_token_ids" in engine_prompt
    assert len(engine_prompt["prompt_token_ids"]) > 0
    assert_prompt_tokenization_consistent(
        cross_encoder_tokenizer, full_prompt, engine_prompt
    )
def test_model_supports_score_template_but_custom_template_provided(
self,
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_with_score_template,
):
"""Test when model supports score template but custom template is provided."""
template = (
'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
)
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_with_score_template,
),
):
full_prompt, engine_prompt = get_score_prompt(
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
"query",
"doc",
score_template=template, # Providing a template
)
assert "prompt_token_ids" in engine_prompt
assert full_prompt == "TEMPLATE_USED query doc"
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)
def test_not_using_default_template(
self,
llm_reranker_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_no_score_template,
):
# FIXME: For now, we only apply a template when one is explicitly provided.
# We cannot rely on the tokenizer's chat template because many models
# inherit junk templates from their base LLM, which breaks both the models
# and the tests that use them.
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_no_score_template,
),
patch(
"vllm.entrypoints.score_utils.apply_hf_chat_template",
return_value="test querytest doc",
),
):
full_prompt, engine_prompt = get_score_prompt(
llm_reranker_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
"test query",
"test doc",
)
assert full_prompt == "test querytest doc"
assert "prompt_token_ids" in engine_prompt
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)
def test_fallback_with_pad_token(
self,
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_no_score_template,
):
"""Test fallback path when ChatTemplateResolutionError
and use_pad_token=True."""
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_no_score_template,
),
patch(
"vllm.entrypoints.score_utils.apply_hf_chat_template",
side_effect=ChatTemplateResolutionError("No template"),
),
):
full_prompt, engine_prompt = get_score_prompt(
cross_encoder_model_config, # use_pad_token=True
cross_encoder_tokenizer,
tokenization_kwargs,
"query",
"document",
)
assert "prompt_token_ids" in engine_prompt
# Should have token_type_ids from text_pair encoding
assert "token_type_ids" in engine_prompt
assert "query" in full_prompt
assert "document" in full_prompt
assert full_prompt != "querydocument"
assert (
engine_prompt["prompt_token_ids"]
== cross_encoder_tokenizer(
"query", text_pair="document", add_special_tokens=True
)["input_ids"]
)
# FIXME(?): add_special_tokens=False is needed because in this case
# full_prompt is obtained by decoding the tokenized prompt, which includes
# special tokens and we would get duplicated special tokens otherwise.
# This is inconsistent with other cases.
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer,
full_prompt,
engine_prompt,
add_special_tokens=False,
)
def test_fallback_without_pad_token(
self,
llm_reranker_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_no_score_template,
):
"""Test fallback path when ChatTemplateResolutionError
and use_pad_token=False."""
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_no_score_template,
),
patch(
"vllm.entrypoints.score_utils.apply_hf_chat_template",
side_effect=ChatTemplateResolutionError("No template"),
),
):
full_prompt, engine_prompt = get_score_prompt(
llm_reranker_model_config, # use_pad_token=False
cross_encoder_tokenizer,
tokenization_kwargs,
"query",
"document",
)
assert full_prompt == "querydocument"
assert "prompt_token_ids" in engine_prompt
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)
def test_post_process_tokens_called(
self,
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_with_score_template,
):
"""Test that post_process_tokens is called on the engine prompt."""
# Reset the call tracker
mock_model_with_score_template.post_process_called.clear()
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_with_score_template,
),
patch(
"vllm.entrypoints.score_utils.apply_hf_chat_template",
side_effect=ChatTemplateResolutionError("No template"),
),
):
full_prompt, engine_prompt = get_score_prompt(
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
"query",
"doc",
)
# post_process_tokens should have been called once
assert len(mock_model_with_score_template.post_process_called) == 1
assert mock_model_with_score_template.post_process_called[0] is engine_prompt
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)

View File

@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import (
) )
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import ( from vllm.multimodal.utils import (
encode_audio_base64, encode_audio_url,
encode_image_base64, encode_image_url,
encode_video_base64, encode_video_url,
) )
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer from vllm.tokenizers.mistral import MistralTokenizer
@ -141,22 +141,19 @@ def mistral_model_config():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def image_url(): def image_url():
image = ImageAsset("cherry_blossom") image = ImageAsset("cherry_blossom")
base64 = encode_image_base64(image.pil_image) return encode_image_url(image.pil_image)
return f"data:image/jpeg;base64,{base64}"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def video_url(): def video_url():
video = VideoAsset("baby_reading", 1) video = VideoAsset("baby_reading", 1)
base64 = encode_video_base64(video.np_ndarrays) return encode_video_url(video.np_ndarrays)
return f"data:video/jpeg;base64,{base64}"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def audio_url(): def audio_url():
audio = AudioAsset("mary_had_lamb") audio = AudioAsset("mary_had_lamb")
base64 = encode_audio_base64(*audio.audio_and_sample_rate) return encode_audio_url(*audio.audio_and_sample_rate)
return f"data:audio/ogg;base64,{base64}"
def _assert_mm_data_is_image_input( def _assert_mm_data_is_image_input(

View File

@ -0,0 +1,11 @@
model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
accuracy_threshold: 0.85
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--async-scheduling
env:
VLLM_USE_FLASHINFER_MOE_FP8: "1"

View File

@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-30B-A3B-NVFP4.yaml Qwen3-30B-A3B-NVFP4.yaml
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
Qwen3-Next-FP8-EP2.yaml

View File

@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename):
print(f"Number of questions: {eval_config['num_questions']}") print(f"Number of questions: {eval_config['num_questions']}")
print(f"Number of few-shot examples: {eval_config['num_fewshot']}") print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
print(f"Server args: {' '.join(server_args)}") print(f"Server args: {' '.join(server_args)}")
print(f"Environment variables: {env_dict}")
# Launch server and run evaluation # Launch server and run evaluation
with RemoteOpenAIServer( with RemoteOpenAIServer(

View File

@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"]
RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"] RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks(
    kv_cache_factory,
    num_mappings: int,
    num_layers: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    kv_cache_dtype: str,
    device: str,
) -> None:
    """Compare ``ops.copy_blocks`` with a per-block ``copy_`` reference.

    Each randomly chosen source block is copied to two distinct destination
    blocks. The kernel result must match clones of the caches that were
    updated block by block in Python.
    """
    if kv_cache_dtype == "fp8" and head_size % 16 != 0:
        pytest.skip()

    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    torch.cuda.set_device(device)

    # Two destinations per source, all drawn from blocks that are not sources.
    assert 2 * num_mappings <= num_blocks
    sources = random.sample(range(num_blocks), num_mappings)
    free_blocks = list(set(range(num_blocks)) - set(sources))
    destinations = random.sample(free_blocks, 2 * num_mappings)
    block_mapping: list[tuple[int, int]] = [
        (src, dst)
        for idx, src in enumerate(sources)
        for dst in (destinations[2 * idx], destinations[2 * idx + 1])
    ]

    # Build the KV caches and keep untouched clones for the reference result.
    key_caches, value_caches = kv_cache_factory(
        num_blocks,
        block_size,
        num_layers,
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        seed,
        device,
    )
    cloned_key_caches = [cache.clone() for cache in key_caches]
    cloned_value_caches = [cache.clone() for cache in value_caches]

    # Run the kernel under test (opcheck only for the first head size).
    mapping_pairs = torch.tensor(block_mapping, dtype=torch.int64, device=device)
    block_mapping_tensor = mapping_pairs.view(-1, 2)
    opcheck(
        torch.ops._C_cache_ops.copy_blocks,
        (key_caches, value_caches, block_mapping_tensor),
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
        cond=(head_size == HEAD_SIZES[0]),
    )
    ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)

    # Reference: copy each mapped block on the clones, keys then values.
    for src, dst in block_mapping:
        for reference in (*cloned_key_caches, *cloned_value_caches):
            reference[dst].copy_(reference[src])

    # Kernel output must match the reference for every layer.
    for actual, expected in zip(key_caches, cloned_key_caches):
        torch.testing.assert_close(actual, expected)
    for actual, expected in zip(value_caches, cloned_value_caches):
        torch.testing.assert_close(actual, expected)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("head_size", HEAD_SIZES)
@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla(
torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1) torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks_mla(
    kv_lora_rank: int,
    qk_rope_head_dim: int,
    block_size: int,
    num_blocks: int,
    num_layers: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
    kv_cache_dtype: str,
) -> None:
    """Compare ``ops.copy_blocks_mla`` with a per-block ``copy_`` reference.

    One MLA cache is created and filled per layer. Each sampled source
    block is copied to two distinct destination blocks.
    """
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    torch.cuda.set_device(device)

    # Each cache entry is kv_lora_rank + qk_rope_head_dim wide.
    entry_size = kv_lora_rank + qk_rope_head_dim

    kv_caches = []
    for _layer in range(num_layers):
        layer_cache = _create_mla_cache(
            num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
        )
        _fill_mla_cache(layer_cache, kv_cache_dtype=kv_cache_dtype)
        kv_caches.append(layer_cache)

    ref_caches = [cache.clone() for cache in kv_caches]

    # At most two sources, each mapped to two non-source destinations.
    num_mappings = min(2, num_blocks // 2)
    sources = random.sample(range(num_blocks), num_mappings)
    free_blocks = list(set(range(num_blocks)) - set(sources))
    destinations = random.sample(free_blocks, 2 * num_mappings)
    block_mapping = [
        (src, dst)
        for idx, src in enumerate(sources)
        for dst in (destinations[2 * idx], destinations[2 * idx + 1])
    ]
    mapping_pairs = torch.tensor(block_mapping, dtype=torch.int64, device=device)
    block_mapping_tensor = mapping_pairs.view(-1, 2)

    # Reference result, computed on the clones before the kernel runs.
    for src, dst in block_mapping:
        for reference in ref_caches:
            reference[dst].copy_(reference[src])

    opcheck(
        torch.ops._C_cache_ops.copy_blocks_mla,
        (kv_caches, block_mapping_tensor),
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
    )
    ops.copy_blocks_mla(kv_caches, block_mapping_tensor)

    for actual, expected in zip(kv_caches, ref_caches):
        torch.testing.assert_close(actual, expected)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) @pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) @pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) @pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)

View File

@ -13,6 +13,7 @@ DTYPES = [torch.bfloat16, torch.float16]
IS_NEOX = [True, False] IS_NEOX = [True, False]
EPS_VALUES = [1e-5, 1e-6] EPS_VALUES = [1e-5, 1e-6]
SEEDS = [13] SEEDS = [13]
PARTIAL_ROPE = [True, False]
CUDA_DEVICES = ["cuda:0"] CUDA_DEVICES = ["cuda:0"]
@ -52,6 +53,7 @@ def _apply_qk_norm_rope(
@pytest.mark.parametrize("is_neox", IS_NEOX) @pytest.mark.parametrize("is_neox", IS_NEOX)
@pytest.mark.parametrize("eps", EPS_VALUES) @pytest.mark.parametrize("eps", EPS_VALUES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
@torch.inference_mode() @torch.inference_mode()
def test_fused_qk_norm_rope_matches_reference( def test_fused_qk_norm_rope_matches_reference(
device: str, device: str,
@ -59,6 +61,7 @@ def test_fused_qk_norm_rope_matches_reference(
is_neox: bool, is_neox: bool,
eps: float, eps: float,
seed: int, seed: int,
rotary_ratio: float,
): ):
torch.set_default_device(device) torch.set_default_device(device)
current_platform.seed_everything(seed) current_platform.seed_everything(seed)
@ -76,10 +79,10 @@ def test_fused_qk_norm_rope_matches_reference(
k_norm.weight.data.normal_(mean=1.0, std=0.1) k_norm.weight.data.normal_(mean=1.0, std=0.1)
q_weight = q_norm.weight.data q_weight = q_norm.weight.data
k_weight = k_norm.weight.data k_weight = k_norm.weight.data
rotary_dim = int(head_dim * rotary_ratio)
rope = RotaryEmbedding( rope = RotaryEmbedding(
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim, rotary_dim=rotary_dim,
max_position_embeddings=4096, max_position_embeddings=4096,
base=10000.0, base=10000.0,
is_neox_style=is_neox, is_neox_style=is_neox,

View File

@ -258,16 +258,16 @@ class Config:
f"{self.fe_supported_types()}." f"{self.fe_supported_types()}."
) )
# Check block quanization support # Check block quantization support
is_block_quatized = self.quant_block_shape is not None is_block_quantized = self.quant_block_shape is not None
if is_block_quatized and self.quant_dtype is None: if is_block_quantized and self.quant_dtype is None:
return False, "No block quantization support." return False, "No block quantization support."
if is_block_quatized and not self.is_block_quant_supported(): if is_block_quantized and not self.is_block_quant_supported():
return False, "Mismatched block quantization support." return False, "Mismatched block quantization support."
# deep_gemm only works with block-quantized # deep_gemm only works with block-quantized
if self.needs_deep_gemm() and not is_block_quatized: if self.needs_deep_gemm() and not is_block_quantized:
return False, "Needs DeepGEMM but not block quantized." return False, "Needs DeepGEMM but not block quantized."
# Check dependencies (turn into asserts?) # Check dependencies (turn into asserts?)

View File

@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
from vllm.model_executor.models.mixtral import MixtralMoE from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType, scalar_types
from vllm.v1.worker.workspace import init_workspace_manager
NUM_EXPERTS = [8, 64, 192] NUM_EXPERTS = [8, 64, 192]
EP_SIZE = [1, 4] EP_SIZE = [1, 4]
@ -487,6 +488,7 @@ def test_mixtral_moe(
monkeypatch.setenv("MASTER_ADDR", "localhost") monkeypatch.setenv("MASTER_ADDR", "localhost")
monkeypatch.setenv("MASTER_PORT", "12345") monkeypatch.setenv("MASTER_PORT", "12345")
init_distributed_environment() init_distributed_environment()
init_workspace_manager(torch.cuda.current_device())
# Instantiate our and huggingface's MoE blocks # Instantiate our and huggingface's MoE blocks
vllm_config.compilation_config.static_forward_context = dict() vllm_config.compilation_config.static_forward_context = dict()
@ -533,6 +535,11 @@ def test_mixtral_moe(
torch.cuda.synchronize() torch.cuda.synchronize()
torch.cuda.empty_cache() torch.cuda.empty_cache()
# FIXME (zyongye) fix this after we move self.kernel
# assignment in FusedMoE.__init__
vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
# Run forward passes for both MoE blocks # Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(hf_inputs) hf_states, _ = hf_moe.forward(hf_inputs)
vllm_states = vllm_moe.forward(vllm_inputs) vllm_states = vllm_moe.forward(vllm_inputs)

View File

@ -1,11 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
import mteb import mteb
import numpy as np import numpy as np
import requests
import torch import torch
from mteb.models import ModelMeta from mteb.models import ModelMeta
from mteb.types import Array from mteb.types import Array
@ -14,7 +11,6 @@ from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs import tests.ci_envs as ci_envs
from tests.models.utils import ( from tests.models.utils import (
EmbedModelInfo, EmbedModelInfo,
RerankModelInfo,
check_embeddings_close, check_embeddings_close,
get_vllm_extra_kwargs, get_vllm_extra_kwargs,
) )
@ -27,10 +23,6 @@ from tests.models.utils import (
MTEB_EMBED_TASKS = ["STS12"] MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4 MTEB_EMBED_TOL = 1e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3
_empty_model_meta = ModelMeta( _empty_model_meta = ModelMeta(
loader=None, loader=None,
@ -54,29 +46,9 @@ _empty_model_meta = ModelMeta(
) )
class VllmMtebEncoder(mteb.EncoderProtocol): class MtebEmbedMixin(mteb.EncoderProtocol):
mteb_model_meta = _empty_model_meta mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
def similarity( def similarity(
self, self,
embeddings1: np.ndarray, embeddings1: np.ndarray,
@ -102,31 +74,29 @@ class VllmMtebEncoder(mteb.EncoderProtocol):
return sim return sim
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): class VllmMtebEncoder(MtebEmbedMixin):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model): def __init__(self, vllm_model):
self.llm = vllm_model self.llm = vllm_model
self.rng = np.random.default_rng(seed=42) self.rng = np.random.default_rng(seed=42)
def predict( def encode(
self, self,
inputs1: DataLoader[mteb.types.BatchedInput], inputs: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args, *args,
**kwargs, **kwargs,
) -> np.ndarray: ) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]] # Hoping to discover potential scheduling
corpus = [text for batch in inputs2 for text in batch["text"]] # issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
outputs = self.llm.score( r = self.rng.permutation(len(sentences))
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False sentences = [sentences[i] for i in r]
) outputs = self.llm.embed(sentences, use_tqdm=False)
scores = np.array(outputs) embeds = np.array(outputs)
return scores embeds = embeds[np.argsort(r)]
return embeds
class OpenAIClientMtebEncoder(VllmMtebEncoder): class OpenAIClientMtebEncoder(MtebEmbedMixin):
def __init__(self, model_name: str, client): def __init__(self, model_name: str, client):
self.model_name = model_name self.model_name = model_name
self.client = client self.client = client
@ -153,58 +123,6 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder):
return embeds return embeds
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
    """MTEB cross-encoder that scores each pair through an HTTP endpoint.

    Every (query, document) pair is sent as its own POST request to ``url``.
    """

    mteb_model_meta = _empty_model_meta

    def __init__(self, model_name: str, url):
        self.model_name = model_name
        self.url = url
        self.rng = np.random.default_rng(seed=42)

    def predict(
        self,
        inputs1: DataLoader[mteb.types.BatchedInput],
        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Return one score per aligned (query, document) pair."""
        # Flatten the batched "text" fields of both loaders.
        queries = []
        for batch in inputs1:
            queries.extend(batch["text"])
        documents = []
        for batch in inputs2:
            documents.extend(batch["text"])

        pair_scores = [
            self.get_score(query, document)
            for query, document in zip(queries, documents)
        ]
        return np.array(pair_scores)

    def get_score(self, query, corpus):
        """POST one text pair and return the first ``score`` in the reply."""
        payload = {
            "model": self.model_name,
            "text_1": query,
            "text_2": corpus,
            "truncate_prompt_tokens": -1,
        }
        reply = requests.post(self.url, json=payload).json()
        return reply["data"][0]["score"]
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
    """Variant that sends a query/documents payload and reads a rerank reply."""

    def get_score(self, query, corpus):
        """POST a single-document rerank request and return its relevance."""
        payload = {
            "model": self.model_name,
            "query": query,
            "documents": [corpus],
            "truncate_prompt_tokens": -1,
        }
        reply = requests.post(self.url, json=payload).json()
        return reply["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks): def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks) tasks = mteb.get_tasks(tasks=tasks)
results = mteb.evaluate( results = mteb.evaluate(
@ -243,12 +161,21 @@ def mteb_test_embed_models(
if model_info.architecture: if model_info.architecture:
assert model_info.architecture in model_config.architectures assert model_info.architecture in model_config.architectures
# Confirm whether vllm uses the correct default_pooling_type, which # Confirm whether the important configs in model_config are correct.
# relates to whether chunked prefill and prefix caching are enabled if model_info.pooling_type is not None:
assert ( assert model_config.pooler_config.pooling_type == model_info.pooling_type
model_config._model_info.default_pooling_type if model_info.attn_type is not None:
== model_info.default_pooling_type assert model_config.attn_type == model_info.attn_type
) if model_info.is_prefix_caching_supported is not None:
assert (
model_config.is_prefix_caching_supported
== model_info.is_prefix_caching_supported
)
if model_info.is_chunked_prefill_supported is not None:
assert (
model_config.is_chunked_prefill_supported
== model_info.is_chunked_prefill_supported
)
vllm_main_score = run_mteb_embed_task( vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
@ -299,117 +226,3 @@ def mteb_test_embed_models(
# We are not concerned that the vllm mteb results are better # We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing. # than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
    """Run a two-stage MTEB rerank evaluation and return the main score.

    Stage one evaluates the ``bm25s`` model and writes its predictions to a
    temporary folder. Stage two converts each task to a reranking task over
    the top 10 predictions and evaluates ``cross_encoder`` on it.
    """
    with tempfile.TemporaryDirectory() as prediction_folder:
        first_stage_model = mteb.get_model("bm25s")
        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
            tasks=tasks, languages=languages, eval_splits=["test"]
        )

        # Stage 1: the predictions land in prediction_folder.
        mteb.evaluate(
            first_stage_model,
            mteb_tasks,
            prediction_folder=prediction_folder,
            show_progress_bar=False,
            # don't save results for test runs
            cache=None,
            overwrite_strategy="always",
        )

        # Stage 2: rerank the top-10 first-stage hits with the cross encoder.
        second_stage_tasks = [
            task.convert_to_reranking(prediction_folder, top_k=10)
            for task in mteb_tasks
        ]
        results = mteb.evaluate(
            cross_encoder,
            second_stage_tasks,
            show_progress_bar=False,
            cache=None,
        )
        return results[0].scores["test"][0]["main_score"]
def mteb_test_rerank_models_hf(
    hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
):
    """Return the MTEB rerank score and parameter dtype of the HF model.

    ``hf_model_callback``, when given, is called with the loaded model
    before the evaluation runs.
    """
    runner = hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype)
    with runner as hf_model:
        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        score = run_mteb_rerank(
            hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
        )
        # dtype of the first parameter of the wrapped model
        first_param = next(hf_model.model.model.parameters())
        return score, first_param.dtype
def mteb_test_rerank_models(
    hf_runner,
    vllm_runner,
    model_info: RerankModelInfo,
    vllm_extra_kwargs=None,
    hf_model_callback=None,
    vllm_mteb_encoder=VllmMtebCrossEncoder,
    atol=MTEB_RERANK_TOL,
):
    """Check that vLLM's MTEB rerank score is not worse than the reference.

    The reference is ``model_info.mteb_score`` when set. Otherwise it is
    computed with the HF runner. The check is one-sided: vLLM may score
    higher than the reference, but not lower by ``atol`` or more.
    """
    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

    # NOTE(review): the source view lost its indentation; the vLLM context is
    # assumed to end once the score and dtypes are captured - confirm.
    with vllm_runner(
        model_info.name,
        runner="pooling",
        max_model_len=None,
        max_num_seqs=8,
        **vllm_extra_kwargs,
    ) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

        # Score API is only enabled for num_labels == 1
        assert model_config.hf_config.num_labels == 1

        # Confirm whether vllm uses the correct default_pooling_type, which
        # relates to whether chunked prefill and prefix caching are enabled
        actual_pooling_type = model_config._model_info.default_pooling_type
        assert actual_pooling_type == model_info.default_pooling_type

        vllm_main_score = run_mteb_rerank(
            vllm_mteb_encoder(vllm_model),
            tasks=MTEB_RERANK_TASKS,
            languages=MTEB_RERANK_LANGS,
        )
        vllm_dtype = model_config.dtype
        head_dtype = model_config.head_dtype

    # Accelerate mteb test by setting
    # SentenceTransformers mteb score to a constant
    if model_info.mteb_score is not None:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"
    else:
        st_main_score, st_dtype = mteb_test_rerank_models_hf(
            hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
        )

    print("Model:", model_info.name)
    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < atol

Some files were not shown because too many files have changed in this diff Show More