Merge branch 'main' into autotune-custom-tag

This commit is contained in:
Karim Roukoz 2025-12-23 23:28:49 -05:00 committed by GitHub
commit a8a91797f7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
218 changed files with 5811 additions and 3214 deletions

View File

@ -291,6 +291,7 @@ if __name__ == "__main__":
"""
Arguments:
--version <version> : version string for the current build (e.g., commit hash)
--wheel-dir <wheel_directory> : directory containing wheel files (default to be same as `version`)
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
@ -318,6 +319,12 @@ if __name__ == "__main__":
required=True,
help="Directory to store generated index files",
)
parser.add_argument(
"--wheel-dir",
type=str,
default=None,
help="Directory containing wheel files (default to be same as `version`)",
)
parser.add_argument(
"--alias-to-default",
type=str,
@ -372,7 +379,7 @@ if __name__ == "__main__":
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
# keep only "official" files for a non-nightly version (specifed by cli args)
# keep only "official" files for a non-nightly version (specified by cli args)
PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$")
if PY_VERSION_RE.match(version):
# upload-wheels.sh ensures no "dev" is in args.version
@ -384,9 +391,10 @@ if __name__ == "__main__":
print("Nightly version detected, keeping all wheel files.")
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/<wheel files>
# s3://vllm-wheels/{wheel_dir}/<wheel files>
# s3://vllm-wheels/<anything>/<index files>
wheel_base_dir = Path(output_dir).parent / version
wheel_dir = args.wheel_dir or version
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
index_base_dir = Path(output_dir)
generate_index_and_metadata(

View File

@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then
echo "Re-generating indices for /$pure_version/"
rm -rf "$INDICES_OUTPUT_DIR/*"
mkdir -p "$INDICES_OUTPUT_DIR"
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
# wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
fi

View File

@ -162,7 +162,10 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
- pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2)
@ -349,7 +352,9 @@ steps:
- label: V1 Test e2e + engine # 65min
timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
agent_pool: mi325_8
# grade: Blocking
source_file_dependencies:
- vllm/
@ -977,7 +982,10 @@ steps:
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 5min
@ -1254,13 +1262,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
@ -1339,7 +1347,9 @@ steps:
# end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
@ -1508,7 +1518,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####

View File

@ -1109,13 +1109,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
@ -1334,7 +1334,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
@ -1359,6 +1359,7 @@ steps:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- nvidia-smi
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy

View File

@ -145,7 +145,7 @@ steps:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
@ -171,7 +171,7 @@ steps:
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30

1
.github/CODEOWNERS vendored
View File

@ -15,6 +15,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson

View File

@ -799,24 +799,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
"if you intend on running FP8 quantized MoE models on Blackwell.")
else()
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# Machete kernels

View File

@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant(
random.seed(seed)
# Set environment variables
os.environ["VLLM_ATTENTION_BACKEND"] = backend
if batch_invariant:
os.environ["VLLM_BATCH_INVARIANT"] = "1"
else:
@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant(
max_model_len=max_model_len,
dtype="bfloat16",
tensor_parallel_size=tp_size,
attention_config={"backend": backend},
enable_prefix_caching=False,
)
init_time = time.perf_counter() - start_init

View File

@ -0,0 +1,177 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.triton_utils import triton
from vllm.utils.flashinfer import flashinfer_fp4_quantize
if not current_platform.has_device_capability(100):
raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
PROVIDER_CFGS = {
"vllm": dict(backend="vllm", enabled=True),
"flashinfer": dict(backend="flashinfer", enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor:
"""Compute global scale for FP4 quantization."""
amax = torch.abs(tensor).max().to(torch.float32)
return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="us (lower is better)",
plot_name="NVFP4 Input Quantization Latency (us)",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
# Create input tensor
a = torch.randn((M, K), device=device, dtype=dtype)
# Compute global scale for activation
a_global_scale = compute_global_scale(a)
quantiles = [0.5, 0.2, 0.8]
cfg = PROVIDER_CFGS[provider]
if cfg["backend"] == "vllm":
# vLLM's FP4 quantization
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: ops.scaled_fp4_quant(a, a_global_scale),
quantiles=quantiles,
)
elif cfg["backend"] == "flashinfer":
# FlashInfer's FP4 quantization
# Use is_sf_swizzled_layout=True to match vLLM's output format
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: flashinfer_fp4_quantize(
a, a_global_scale, is_sf_swizzled_layout=True
),
quantiles=quantiles,
)
# Convert ms to us for better readability at small batch sizes
to_us = lambda t_ms: t_ms * 1000
return to_us(ms), to_us(max_ms), to_us(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str):
"""Test accuracy between vLLM and FlashInfer FP4 quantization."""
# Create input tensor
a = torch.randn((M, K), device=device, dtype=dtype)
# Compute global scale
a_global_scale = compute_global_scale(a)
# vLLM quantization
vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale)
# FlashInfer quantization (with swizzled layout to match vLLM's output)
flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize(
a, a_global_scale, is_sf_swizzled_layout=True
)
flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn)
# Compare outputs
torch.testing.assert_close(
vllm_fp4,
flashinfer_fp4,
)
print(f"M={M}, K={K}, dtype={dtype}: PASSED")
def test_accuracy():
"""Run accuracy tests across various shapes."""
print("\n" + "=" * 60)
print("Running accuracy tests: vLLM vs FlashInfer")
print("=" * 60)
device = "cuda"
dtype = torch.bfloat16
# Test various batch sizes and hidden dimensions
Ms = [1, 1024]
Ks = [4096]
for M in Ms:
for K in Ks:
_test_accuracy_once(M, K, dtype, device)
print("\nAll accuracy tests passed!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark NVFP4 quantization: vLLM vs FlashInfer"
)
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
parser.add_argument(
"--save-path",
type=str,
default=None,
help="Path to save benchmark results",
)
parser.add_argument(
"--accuracy",
action="store_true",
help="Run accuracy tests",
)
args = parser.parse_args()
if args.accuracy:
test_accuracy()
for K, N, model in prepare_shapes(args):
print(f"\n{model}, N={N} K={K}")
benchmark.run(
print_data=True,
save_path=args.save_path,
N=N,
K=K,
)
print("\nBenchmark finished!")

View File

@ -9,16 +9,6 @@
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
const torch::Tensor& block_mapping);
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
std::vector<torch::Tensor> const& value_caches,
const torch::Tensor& block_mapping);
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping);
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping,

View File

@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(
} // namespace vllm
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
std::vector<torch::Tensor> const& value_caches,
const torch::Tensor& block_mapping) {
int num_layers = key_caches.size();
TORCH_CHECK(num_layers == value_caches.size());
if (num_layers == 0) {
return;
}
torch::Device cache_device = key_caches[0].device();
TORCH_CHECK(cache_device.is_cuda());
// Create data structures for the kernel.
// Create an array of pointers to the key and value caches.
int64_t key_cache_ptrs[num_layers];
int64_t value_cache_ptrs[num_layers];
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
key_cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
value_cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
}
// block_mapping is a 2D tensor with shape (num_pairs, 2).
int num_pairs = block_mapping.size(0);
// Move the data structures to the GPU.
// NOTE: This synchronizes the CPU and GPU.
torch::Tensor key_cache_ptrs_tensor =
torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
.to(cache_device);
torch::Tensor value_cache_ptrs_tensor =
torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
.to(cache_device);
// Launch the kernel.
const int numel_per_block = key_caches[0][0].numel();
dim3 grid(num_layers, num_pairs);
dim3 block(std::min(1024, numel_per_block));
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
key_cache_ptrs_tensor.data_ptr<int64_t>(),
value_cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping.data_ptr<int64_t>(), numel_per_block);
}));
}
// copy blocks kernel for MLA (assumes a joint KV-cache)
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping) {
int num_layers = kv_caches.size();
if (num_layers == 0) {
return;
}
torch::Device cache_device = kv_caches[0].device();
TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
std::vector<int64_t> cache_ptrs(num_layers);
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
}
torch::Tensor cache_ptrs_tensor =
torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
.to(cache_device);
int num_pairs = block_mapping.size(0);
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
int mem_footprint_per_block = kv_caches[0].stride(0);
dim3 grid(num_layers, num_pairs);
dim3 block(std::min(1024, mem_footprint_per_block));
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
}));
}
namespace vllm {
// Used to copy/convert one element

View File

@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel(
void const* k_weight_void, // RMSNorm weights for key
void const* cos_sin_cache_void, // Pre-computed cos/sin cache
int64_t const* position_ids, // Position IDs for RoPE
int const num_tokens // Number of tokens
int const num_tokens, // Number of tokens
int const rotary_dim // Dimension for RoPE
) {
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
if constexpr ((std::is_same_v<scalar_t_in, c10::BFloat16>) ||
@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel(
// Calculate cache pointer for this position - similar to
// pos_encoding_kernels.cu
T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim;
int const embed_dim = head_dim / 2;
T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim;
int const embed_dim = rotary_dim / 2;
T_cache const* cos_ptr = cache_ptr;
T_cache const* sin_ptr = cache_ptr + embed_dim;
if constexpr (interleave) {
// Perform interleaving. Use pre-computed cos/sin values.
int const rotary_lanes = rotary_dim / numElemsPerThread; // rotary range
if (laneId < rotary_lanes) {
if constexpr (interleave) {
// Perform interleaving. Use pre-computed cos/sin values.
#pragma unroll
for (int i = 0; i < numElemsPerThread / 2; ++i) {
int const idx0 = 2 * i;
int const idx1 = 2 * i + 1;
for (int i = 0; i < numElemsPerThread / 2; ++i) {
int const idx0 = 2 * i;
int const idx1 = 2 * i + 1;
// Global dimension index in the head
int const dim_idx = laneId * numElemsPerThread + idx0;
float const val0 = elements[idx0];
float const val1 = elements[idx1];
float const val0 = elements[idx0];
float const val1 = elements[idx1];
int const dim_idx = laneId * numElemsPerThread + idx0;
int const half_dim = dim_idx / 2;
float const cos_val =
CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float const sin_val =
CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
int const half_dim = dim_idx / 2;
float const cos_val =
CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float const sin_val =
CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
elements[idx0] = val0 * cos_val - val1 * sin_val;
elements[idx1] = val0 * sin_val + val1 * cos_val;
}
} else {
// Before data exchange with in warp, we need to sync.
__syncwarp();
// Get the data from the other half of the warp. Use pre-computed cos/sin
// values.
#pragma unroll
for (int i = 0; i < numElemsPerThread; i++) {
elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16);
if (laneId < 16) {
elements2[i] = -elements2[i];
elements[idx0] = val0 * cos_val - val1 * sin_val;
elements[idx1] = val0 * sin_val + val1 * cos_val;
}
} else {
// Before data exchange with in warp, we need to sync.
__syncwarp();
int pairOffset = (rotary_dim / 2) / numElemsPerThread;
// Get the data from the other half of the warp. Use pre-computed
// cos/sin values.
#pragma unroll
for (int i = 0; i < numElemsPerThread; i++) {
elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset);
int dim_idx = laneId * numElemsPerThread + i;
dim_idx = (dim_idx * 2) % head_dim;
int half_dim = dim_idx / 2;
// Use pre-computed cos/sin from cache
float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
if (laneId < pairOffset) {
elements2[i] = -elements2[i];
}
int dim_idx = laneId * numElemsPerThread + i;
elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
dim_idx = (dim_idx * 2) % rotary_dim;
int half_dim = dim_idx / 2;
float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim));
float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim));
elements[i] = elements[i] * cos_val + elements2[i] * sin_val;
}
// __shfl_xor_sync does not provide memfence. Need to sync again.
__syncwarp();
}
// __shfl_xor_sync does not provide memfence. Need to sync again.
__syncwarp();
}
// Store.
{
vec_T vec;
@ -312,10 +316,10 @@ template <typename scalar_t_in, typename scalar_t_cache>
void launchFusedQKNormRope(void* qkv, int const num_tokens,
int const num_heads_q, int const num_heads_k,
int const num_heads_v, int const head_dim,
float const eps, void const* q_weight,
void const* k_weight, void const* cos_sin_cache,
bool const interleave, int64_t const* position_ids,
cudaStream_t stream) {
int const rotary_dim, float const eps,
void const* q_weight, void const* k_weight,
void const* cos_sin_cache, bool const interleave,
int64_t const* position_ids, cudaStream_t stream) {
constexpr int blockSize = 256;
int const warpsPerBlock = blockSize / 32;
@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 64, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens);
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
});
break;
case 128:
@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 128, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens);
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
});
break;
case 256:
@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens,
fusedQKNormRopeKernel<scalar_t_in, scalar_t_cache, 256, INTERLEAVE>
<<<gridDim, blockDim, 0, stream>>>(
qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight,
k_weight, cos_sin_cache, position_ids, num_tokens);
k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim);
});
break;
default:
@ -392,8 +396,11 @@ void fused_qk_norm_rope(
"Query weights size must match head dimension");
TORCH_CHECK(k_weight.size(0) == head_dim,
"Key weights size must match head dimension");
TORCH_CHECK(cos_sin_cache.size(1) == head_dim,
"Cos/sin cache dimension must match head_dim");
TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even");
TORCH_CHECK(cos_sin_cache.size(1) <= head_dim,
"rotary_dim must be less than or equal to head_dim");
TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() &&
qkv.scalar_type() == k_weight.scalar_type(),
"qkv, q_weight and k_weight must have the same dtype");
@ -419,7 +426,8 @@ void fused_qk_norm_rope(
qkv.data_ptr(), static_cast<int>(num_tokens),
static_cast<int>(num_heads_q), static_cast<int>(num_heads_k),
static_cast<int>(num_heads_v), static_cast<int>(head_dim),
static_cast<float>(eps), q_weight.data_ptr(), k_weight.data_ptr(),
static_cast<int>(cos_sin_cache.size(1)), static_cast<float>(eps),
q_weight.data_ptr(), k_weight.data_ptr(),
cos_sin_cache.data_ptr(), !is_neox,
reinterpret_cast<int64_t const*>(position_ids.data_ptr()),
stream);

View File

@ -74,6 +74,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
// Get the global scaling factor, which will be applied to the SF.
// Note SFScale is the same as next GEMM's alpha, which is
// (448.f / (Alpha_A / 6.f)).
@ -101,7 +104,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numCols, SFout);
rowIdx, colIdx, numKTiles, SFout);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(out_silu_mul, SFScaleVal,
sf_out);

View File

@ -25,6 +25,7 @@
#include <cuda_fp8.h>
#include "dispatch_utils.h"
#include "cuda_utils.h"
#include "nvfp4_utils.cuh"
#include "launch_bounds_utils.h"
@ -44,6 +45,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
@ -112,17 +116,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
// (448.f / (Alpha_A / 6.f)).
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
int factor = CVT_FP4_SF_VEC_SIZE * 4;
// The actual output_scales dim is computed from the padded numCols.
int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
uint32_t* SFout_in_expert =
SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
}
@ -140,6 +140,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
extern __shared__ uint32_t shared_input_offsets[];
// Load input offsets into shared memory.
@ -202,16 +206,13 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
int factor = CVT_FP4_SF_VEC_SIZE * 4;
int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
uint32_t* SFout_in_expert =
SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
SFout + output_scale_offset_by_experts[expert_idx] * numKTiles;
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert);
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
}
@ -222,12 +223,8 @@ void quant_impl(void* output, void* output_scale, void* input,
void* input_global_scale, void* input_offset_by_experts,
void* output_scale_offset_by_experts, int m_topk, int k,
int n_experts, cudaStream_t stream) {
// TODO: this multiProcessorCount should be cached.
int device;
cudaGetDevice(&device);
int multiProcessorCount;
cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount,
device);
int multiProcessorCount =
get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
// Grid, Block size.
// Each thread converts 8 values.

View File

@ -35,7 +35,13 @@ template <typename Int>
__host__ __device__ inline Int round_up(Int x, Int y) {
static_assert(std::is_integral_v<Int>,
"round_up argument must be integral type");
return (x + y - 1) / y * y;
return ((x + y - 1) / y) * y;
}
// Compute effective rows for grid configuration with swizzled SF layouts.
inline int computeEffectiveRows(int m) {
constexpr int ROW_TILE = 128;
return round_up(m, ROW_TILE);
}
// Use UE4M3 by default.
@ -49,81 +55,57 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
"Vec size is not matched.");
// Precompute SF layout parameter (constant for entire kernel).
int32_t const numKTiles = (numCols + 63) / 64;
int sf_m = round_up<int>(numRows, 128);
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
// Each thread writes 4 uint32_t elements.
for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
col += blockDim.x * 4) {
SFout[row * sf_n_int + col] = 0x00;
}
}
int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
// Get the global scaling factor, which will be applied to the SF.
// Note SFScale is the same as next GEMM's alpha, which is
// (448.f / (Alpha_A / 6.f)).
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
// Input tensor row/col loops.
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
// Iterate over all rows and cols including padded ones -
// ensures we visit every single scale factor address to initialize it.
for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
for (int colIdx = threadIdx.x;
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
colIdx += blockDim.x) {
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
PackedVec in_vec;
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
// Get the output tensor offset.
// Same as inOffset because 8 elements are packed into one uint32_t.
int64_t outOffset = inOffset;
auto& out_pos = out[outOffset];
// If we are outside valid rows OR outside valid columns -> Use Zeros
if (rowIdx >= numRows || elem_idx >= numCols) {
memset(&in_vec, 0, sizeof(PackedVec));
} else {
// Valid Region: Load actual data
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
}
auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numCols, SFout);
rowIdx, colIdx, numKTiles, SFout);
out_pos =
auto out_val =
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
// We do NOT write output for padding because the 'out' tensor is not
// padded.
if (rowIdx < numRows && elem_idx < numCols) {
// Same as inOffset because 8 elements are packed into one uint32_t.
out[inOffset] = out_val;
}
}
}
}
template <typename T>
void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
int64_t* output, int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount, cudaStream_t stream) {
// Grid, Block size.
// Each thread converts 8 values.
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
// Get number of blocks per SM
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
// Launch the cvt kernel.
if (useUE8M0) {
cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
reinterpret_cast<uint32_t*>(SFOuput));
} else {
cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
reinterpret_cast<uint32_t*>(SFOuput));
}
}
// Instantiate the function.
template void invokeFP4Quantization(int m, int n, half const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount,
cudaStream_t stream);
template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
float const* SFScale, int64_t* output,
int32_t* SFOuput, bool useUE8M0,
int multiProcessorCount,
cudaStream_t stream);
} // namespace vllm
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
@ -147,13 +129,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
// We don't support e8m0 scales at this moment.
bool useUE8M0 = false;
// Grid, Block size. Each thread converts 8 values.
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
int effectiveRows = vllm::computeEffectiveRows(m);
dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM));
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr,
sf_out, useUE8M0, multiProcessorCount, stream);
// NOTE: We don't support e8m0 scales at this moment.
vllm::cvt_fp16_to_fp4<cuda_type, false><<<grid, block, 0, stream>>>(
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
reinterpret_cast<uint32_t*>(sf_out));
});
}
}

View File

@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
return b;
}
// Compute SF output offset for swizzled tensor core layout.
// SF layout: [numMTiles, numKTiles, 32, 4, 4]
// Caller must precompute: numKTiles = (numCols + 63) / 64
template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
int numCols,
SFType* SFout) {
__device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(
int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) {
static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
CVT_FP4_NUM_THREADS_PER_SF == 2);
// One pair of threads write one SF to global memory.
// TODO: stage through smem for packed STG.32
// is it better than STG.8 from 4 threads ?
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
// SF vector index (16 elements share one SF in the K dimension).
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
int32_t mIdx = rowIdx;
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
// --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
int32_t mTileIdx = mIdx / (32 * 4);
// SF vector size 16.
int factor = CVT_FP4_SF_VEC_SIZE * 4;
int32_t numKTiles = (numCols + factor - 1) / factor;
int64_t mTileStride = numKTiles * 32 * 4 * 4;
int32_t kTileIdx = (kIdx / 4);
int64_t kTileStride = 32 * 4 * 4;
// M tile layout [32, 4] is column-major.
int32_t outerMIdx = (mIdx % 32);
int64_t outerMStride = 4 * 4;
int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
int64_t innerMStride = 4;
int32_t innerKIdx = (kIdx % 4);
int64_t innerKStride = 1;
// Compute the global offset.
int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
outerMIdx * outerMStride + innerMIdx * innerMStride +
innerKIdx * innerKStride;
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) {
return nullptr;
}
return nullptr;
// SF vector index (16 elements share one SF in the K dimension).
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
int32_t mIdx = rowIdx;
// Decompose indices using bitwise ops (all divisors are powers of 2).
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
int32_t mTileIdx = mIdx >> 7; // mIdx / 128
int32_t outerMIdx = mIdx & 31; // mIdx % 32
int32_t innerMIdx = (mIdx >> 5) & 3; // (mIdx / 32) % 4
int32_t kTileIdx = kIdx >> 2; // kIdx / 4
int32_t innerKIdx = kIdx & 3; // kIdx % 4
// Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 +
// outerMIdx * 16 + innerMIdx * 4 + innerKIdx
// Use bitwise OR for non-overlapping lower bits.
int64_t SFOffset = (static_cast<int64_t>(mTileIdx) * numKTiles + kTileIdx)
<< 9 |
(outerMIdx << 4) | (innerMIdx << 2) | innerKIdx;
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
}
// Quantizes the provided PackedVec into the uint32_t output

View File

@ -1,373 +0,0 @@
#include "core/registration.h"
#include <torch/all.h>
#include <cutlass/arch/arch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/group_array_problem_shape.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gett.hpp"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include <cassert>
using namespace cute;
template <typename ElementAB, typename ElementC, typename ElementAccumulator,
typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
__global__ void get_ggemm_starts(
int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets,
ElementC** out_offsets, ElementAccumulator** a_scale_offsets,
ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int,
ElementAB* b_base_as_int, ElementC* out_base_as_int,
ElementAccumulator* a_scale_base_as_int,
ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int,
LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) {
int expert_id = threadIdx.x;
if (expert_id >= gridDim.x * blockDim.x) {
return;
}
int m = problem_sizes[expert_id * 3];
int n = problem_sizes[expert_id * 3 + 1];
int k = problem_sizes[expert_id * 3 + 2];
int32_t expert_offset = expert_offsets[expert_id];
int a_stride = expert_offset * k;
int b_stride = expert_id * k * n;
int a_scale_stride = expert_offset * k / 128;
int b_scale_stride = expert_id * k * n / 128 / 128;
a_offsets[expert_id] = a_base_as_int + a_stride;
b_offsets[expert_id] = b_base_as_int + b_stride;
out_offsets[expert_id] = out_base_as_int + expert_offset * n;
a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride;
b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride;
LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
*layout_sfa_ptr =
ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
*layout_sfb_ptr =
ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
}
#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \
ScaleConfig) \
else if (out_tensors.dtype() == TENSOR_C_TYPE) { \
get_ggemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA, \
LayoutSFB, ScaleConfig><<<1, num_experts, 0, stream>>>( \
static_cast<int32_t*>(expert_offsets.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()), \
static_cast<C_TYPE**>(out_ptrs.data_ptr()), \
static_cast<float**>(a_scales_ptrs.data_ptr()), \
static_cast<float**>(b_scales_ptrs.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()), \
static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()), \
static_cast<C_TYPE*>(out_tensors.data_ptr()), \
static_cast<float*>(a_scales.data_ptr()), \
static_cast<float*>(b_scales.data_ptr()), \
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()), \
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()), \
static_cast<int*>(problem_sizes.data_ptr())); \
}
template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
void run_get_ggemm_starts(
torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs,
torch::Tensor& b_ptrs, torch::Tensor& out_ptrs,
torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs,
torch::Tensor const& a_tensors, torch::Tensor const& b_tensors,
torch::Tensor out_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& layout_sfa,
torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) {
TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
int num_experts = (int)expert_offsets.size(0);
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
if (false) {
}
__CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA,
LayoutSFB, ScaleConfig)
__CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA,
LayoutSFB, ScaleConfig)
else {
TORCH_CHECK(false, "Unsupported output tensor type");
}
}
template <typename OutType, typename ScheduleConfig, typename LayoutD>
void run_blockwise_scaled_group_mm(
torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs,
const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs,
const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a,
const torch::Tensor& stride_b, const torch::Tensor& stride_c,
const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
// Types
using ElementA = cutlass::float_e4m3_t;
using ElementB = cutlass::float_e4m3_t;
using ElementC = OutType;
using ElementD = ElementC;
using ElementAccumulator = float;
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = LayoutD;
// Alignments
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
using ArchTag = cutlass::arch::Sm100;
using OperatorClass = cutlass::arch::OpClassTensorOp;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape,
typename ScheduleConfig::ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*,
AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass, ElementA,
cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
AlignmentA, ElementB,
cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape,
typename ScheduleConfig::ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
typename ScheduleConfig::KernelSchedule>::CollectiveOp;
using GemmKernel =
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
CollectiveEpilogue, void>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
int num_experts = (int)expert_offsets.size(0);
Gemm gemm_op;
// Mainloop Arguments
typename GemmKernel::MainloopArguments mainloop_args{
static_cast<const ElementA**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(stride_a.data_ptr()),
static_cast<const ElementB**>(b_ptrs.data_ptr()),
static_cast<StrideB*>(stride_b.data_ptr()),
static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(
layout_sfa.data_ptr()),
static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(
layout_sfb.data_ptr())};
int device_id = a_ptrs.device().index();
static const cutlass::KernelHardwareInfo hw_info{
device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
device_id)};
// Epilogue Arguments
typename GemmKernel::EpilogueArguments epilogue_args{
{}, // epilogue.thread
nullptr,
static_cast<StrideC*>(stride_c.data_ptr()),
static_cast<ElementD**>(out_ptrs.data_ptr()),
static_cast<StrideC*>(stride_c.data_ptr())};
UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
// Gemm Arguments
typename GemmKernel::Arguments args{
cutlass::gemm::GemmUniversalMode::kGrouped,
{num_experts, problem_sizes_as_shapes, nullptr},
mainloop_args,
epilogue_args,
hw_info};
at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()};
const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM");
size_t workspace_size = gemm_op.get_workspace_size(args);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device());
auto workspace = torch::empty(workspace_size, workspace_options);
auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
status = gemm_op.run(stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void blockwise_scaled_group_mm_dispatch_shape(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
struct MmaConfig {
using ElementA = cutlass::float_e4m3_t;
using KernelSchedule =
cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
using LayoutC = cutlass::layout::RowMajor;
using MmaTileShape = Shape<_128, _128, _128>;
using ClusterShape = Shape<_1, _1, _1>;
};
int num_experts = (int)expert_offsets.size(0);
auto a_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto b_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto out_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto a_scales_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto b_scales_ptrs = torch::empty(
{num_experts},
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto layout_sfa = torch::empty(
{num_experts, 5},
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
auto layout_sfb = torch::empty(
{num_experts, 5},
torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
auto stride_a = torch::full(
{num_experts}, a.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto stride_b = torch::full(
{num_experts}, a.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
auto stride_c = torch::full(
{num_experts}, output.size(1),
torch::TensorOptions().dtype(torch::kInt64).device(a.device()));
torch::TensorOptions options_int =
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
run_get_ggemm_starts<typename MmaConfig::LayoutSFA,
typename MmaConfig::LayoutSFB,
typename MmaConfig::ScaleConfig>(
expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a,
b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes);
run_blockwise_scaled_group_mm<OutType, MmaConfig,
typename MmaConfig::LayoutC>(
out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a,
stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes,
expert_offsets);
}
void cutlass_blockwise_scaled_grouped_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& scales_a, const torch::Tensor& scales_b,
const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) {
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
TORCH_CHECK(problem_sizes.size(1) == 3,
"problem_sizes must have shape (num_experts, 3)");
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
"Number of experts in problem_sizes must match expert_offsets");
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
"problem_sizes must be int32");
TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn,
"a must be kFloat8_e4m3fn");
TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn,
"b must be kFloat8_e4m3fn");
TORCH_CHECK(output.scalar_type() == torch::kBFloat16 ||
output.scalar_type() == torch::kHalf,
"output must be bfloat16 or half");
TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32,
"scales_a must be float32");
TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32,
"scales_b must be float32");
TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32,
"expert_offsets must be int32");
TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
TORCH_CHECK(problem_sizes.size(1) == 3,
"problem_sizes must have shape (num_experts, 3)");
TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0),
"Number of experts in problem_sizes must match expert_offsets");
TORCH_CHECK(problem_sizes.dtype() == torch::kInt32,
"problem_sizes must be int32");
TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100
if (output.scalar_type() == torch::kBFloat16) {
blockwise_scaled_group_mm_dispatch_shape<cutlass::bfloat16_t>(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
} else if (output.scalar_type() == torch::kFloat16) {
blockwise_scaled_group_mm_dispatch_shape<cutlass::half_t>(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets);
} else {
TORCH_CHECK(false, "Unsupported output tensor type");
}
#endif
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("cutlass_blockwise_scaled_grouped_mm",
&cutlass_blockwise_scaled_grouped_mm);
}

View File

@ -416,13 +416,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor alpha) -> ()");
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
// cutlass blockwise scaledgroup GEMM
ops.def(
"cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
"Tensor scales_a, Tensor scales_b, "
"Tensor problem_sizes, Tensor expert_offsets) -> ()");
// conditionally compiled so impl registration is in source file
// cutlass nvfp4 block scaled group GEMM
ops.def(
"cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"
@ -692,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
"swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
// Copy the cache blocks from src to dst.
cache_ops.def(
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
"Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
cache_ops.def(
"copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
// Reshape the key and value tensors and cache them.
cache_ops.def(
"reshape_and_cache(Tensor key, Tensor value,"

View File

@ -130,6 +130,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& uv pip install --system *.whl
ARG COMMON_WORKDIR
ARG BASE_IMAGE
# Copy over the benchmark scripts as well
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
@ -144,4 +145,9 @@ ENV SAFETENSORS_FAST_GPU=1
# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1
# Workaround for ROCm profiler limits
RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
CMD ["/bin/bash"]

View File

@ -1,15 +1,15 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_BRANCH="89075173"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_BRANCH="v0.24.1"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="59bd8ff2"
ARG AITER_BRANCH="6af8b687"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
@ -162,4 +162,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics
add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
RUN apt clean && apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
@ -47,6 +47,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \
-r requirements/xpu.txt
# arctic-inference is built from source which needs torch-xpu properly installed
# used for suffix method speculative decoding
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir arctic-inference==0.1.1
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
COPY . .

View File

@ -2,4 +2,4 @@
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).

View File

@ -0,0 +1,5 @@
# llm-d
vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).

View File

@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
- [Helm](frameworks/helm.md)
- [InftyAI/llmaz](integrations/llmaz.md)
- [llm-d](integrations/llm-d.md)
- [KAITO](integrations/kaito.md)
- [KServe](integrations/kserve.md)
- [Kthena](integrations/kthena.md)

View File

@ -139,18 +139,18 @@ token data.
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
```
<figure markdown="span">
![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/query.png" alt="query" width="70%" />
</p>
Each thread defines its own `q_ptr` which points to the assigned
query token data on global memory. For example, if `VEC_SIZE` is 4
and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
total of 128 elements divided into 128 / 4 = 32 vecs.
<figure markdown="span">
![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/q_vecs.png" alt="q_vecs" width="70%" />
</p>
```cpp
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
@ -187,9 +187,9 @@ key token at different iterations. As shown above, that `k_ptr`
points to key token data based on `k_cache` at assigned block,
assigned head and assigned token.
<figure markdown="span">
![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/key.png" alt="key" width="70%" />
</p>
The diagram above illustrates the memory layout for key data. It
assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
@ -202,9 +202,9 @@ iterations. Inside each rectangle, there are a total 32 vecs (128
elements for one token) that will be processed by 2 threads (one
thread group) separately.
<figure markdown="span">
![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/k_vecs.png" alt="k_vecs" width="70%" />
</p>
```cpp
K_vec k_vecs[NUM_VECS_PER_THREAD]
@ -361,17 +361,17 @@ later steps. Now, it should store the normalized softmax result of
## Value
<figure markdown="span">
![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/value.png" alt="value" width="70%" />
</p>
<figure markdown="span">
![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/logits_vec.png" alt="logits_vec" width="50%" />
</p>
<figure markdown="span">
![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" }
</figure>
<p align="center">
<img src="../assets/design/paged_attention/v_vec.png" alt="v_vec" width="70%" />
</p>
Now we need to retrieve the value data and perform dot multiplication
with `logits`. Unlike query and key, there is no thread group

View File

@ -64,7 +64,7 @@ th:not(:first-child) {
| [CP](../configuration/optimization.md#chunked-prefill) | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) |
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |

View File

@ -8,6 +8,16 @@ We recommend installing the library with:
pip install nvidia-modelopt
```
## Supported ModelOpt checkpoint formats
vLLM detects ModelOpt checkpoints via `hf_quant_config.json` and supports the
following `quantization.quant_algo` values:
- `FP8`: per-tensor weight scale (+ optional static activation scale).
- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization.
- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks).
- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`).
## Quantizing HuggingFace Models with PTQ
You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory.
@ -80,3 +90,24 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
if __name__ == "__main__":
main()
```
## Running the OpenAI-compatible server
To serve a local ModelOpt checkpoint via the OpenAI-compatible API:
```bash
vllm serve <path_to_exported_checkpoint> \
--quantization modelopt \
--host 0.0.0.0 --port 8000
```
## Testing (local checkpoints)
vLLM's ModelOpt unit tests are gated by local checkpoint paths and are skipped
by default in CI. To run the tests locally:
```bash
export VLLM_TEST_MODELOPT_FP8_PC_PT_MODEL_PATH=<path_to_fp8_pc_pt_checkpoint>
export VLLM_TEST_MODELOPT_FP8_PB_WO_MODEL_PATH=<path_to_fp8_pb_wo_checkpoint>
pytest -q tests/quantization/test_modelopt.py
```

View File

@ -17,6 +17,16 @@ The E4M3 format offers higher precision compared to E5M2. However, due to its sm
For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel).
### How FP8 KV Cache Works
The FP8 KV cache implementation follows this workflow:
1. **Storage**: Key and Value tensors are quantized to FP8 format using scaling factors before being stored in the KV cache
2. **Retrieval**: When needed for attention computation, cached KV tensors are dequantized back to higher precision (FP16/BF16)
3. **Attention**: The attention-value multiplication (softmax output × V) is performed using the dequantized higher-precision V tensor
This means the final attention computation operates on dequantized values, not FP8 tensors. The quantization reduces memory usage during storage but maintains computation accuracy by using higher precision during the actual attention operations.
### Performance Impact
The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either:

View File

@ -352,10 +352,17 @@ Supported models:
* `zai-org/GLM-4.5`
* `zai-org/GLM-4.5-Air`
* `zai-org/GLM-4.6`
* `zai-org/GLM-4.6-Air`
Flags: `--tool-call-parser glm45`
### GLM-4.7 Models (`glm47`)
Supported models:
* `zai-org/GLM-4.7`
Flags: `--tool-call-parser glm47`
### Qwen3-Coder Models (`qwen3_xml`)
Supported models:

View File

@ -28,3 +28,4 @@ The backends below live **outside** the main `vllm` repository and follow the
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
| Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> |
| Sophgo TPU | N/A, install from source | <https://github.com/sophgo/vllm-tpu> |
| Apple Silicon (Metal) | N/A, install from source | <https://github.com/vllm-project/vllm-metal> |

View File

@ -4,6 +4,9 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
!!! tip "GPU-Accelerated Inference with vLLM-Metal"
For GPU-accelerated inference on Apple Silicon using Metal, check out [vllm-metal](https://github.com/vllm-project/vllm-metal), a community-maintained hardware plugin that uses MLX as the compute backend.
# --8<-- [end:installation]
# --8<-- [start:requirements]

View File

@ -387,7 +387,7 @@ th {
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ |
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
@ -418,7 +418,7 @@ th {
| `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | | ✅︎ |
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ |
| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ |
| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ |
@ -490,6 +490,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
@ -543,8 +544,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | |
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ |
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | |
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
| `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
@ -562,6 +564,11 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
!!! note
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
!!! note
`nvidia/llama-nemotron-rerank-1b-v2` require a specific prompt format to work correctly.
Examples : [offline_using_template.py](../../examples/pooling/score/offline_using_template.py) [online_using_template.py](../../examples/pooling/score/online_using_template.py)
!!! note
Load the official original `mxbai-rerank-v2` by using the following command.

View File

@ -16,7 +16,7 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
from langchain_community.llms import VLLM
llm = VLLM(
model="mosaicml/mpt-7b",
model="Qwen/Qwen3-4B",
trust_remote_code=True, # mandatory for hf models
max_new_tokens=128,
top_k=10,

View File

@ -669,6 +669,21 @@ You can find the documentation for cross encoder models at [sbert.net](https://w
Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
#### Score Template
Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)).
Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template.
Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter:
- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}`
- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}`
This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future.
Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja)
#### Single inference
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.

View File

@ -5,130 +5,91 @@ Usage:
Single node:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2
-dp=2 \
-tp=2
Multi-node:
Node 0 (assume the node has ip of 10.99.48.128):
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
--node-rank=0 \
--master-addr=10.99.48.128 \
--master-port=13345
-dp=2 \
-tp=2 \
--dp-num-nodes=2 \
--dp-node-rank=0 \
--dp-master-addr=10.99.48.128 \
--dp-master-port=13345
Node 1:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
--node-rank=1 \
--master-addr=10.99.48.128 \
--master-port=13345
-dp=2 \
-tp=2 \
--dp-num-nodes=2 \
--dp-node-rank=1 \
--dp-master-addr=10.99.48.128 \
--dp-master-port=13345
"""
import os
from time import sleep
from vllm import LLM, SamplingParams
from vllm import LLM, EngineArgs, SamplingParams
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import get_open_port
def parse_args():
import argparse
def create_parser():
parser = FlexibleArgumentParser(description="Data Parallel Inference")
parser = argparse.ArgumentParser(description="Data Parallel Inference")
# Add all engine args
EngineArgs.add_cli_args(parser)
parser.set_defaults(
model="ibm-research/PowerMoE-3b",
enable_expert_parallel=True,
)
# Add DP-specific args (separate from engine args to avoid conflicts)
parser.add_argument(
"--model",
"--dp-num-nodes",
type=int,
default=1,
help="Total number of nodes for data parallel.",
)
parser.add_argument(
"--dp-node-rank",
type=int,
default=0,
help="Rank of the current node for data parallel.",
)
parser.add_argument(
"--dp-master-addr",
type=str,
default="ibm-research/PowerMoE-3b",
help="Model name or path",
)
parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
parser.add_argument(
"--node-size", type=int, default=1, help="Total number of nodes"
default="",
help="Master node IP address for DP coordination.",
)
parser.add_argument(
"--node-rank", type=int, default=0, help="Rank of the current node"
)
parser.add_argument(
"--master-addr", type=str, default="", help="Master node IP address"
)
parser.add_argument("--master-port", type=int, default=0, help="Master node port")
parser.add_argument(
"--enforce-eager", action="store_true", help="Enforce eager mode execution."
)
parser.add_argument(
"--trust-remote-code", action="store_true", help="Trust remote code."
)
parser.add_argument(
"--max-num-seqs",
"--dp-master-port",
type=int,
default=64,
help=("Maximum number of sequences to be processed in a single iteration."),
)
parser.add_argument(
"--max-model-len",
type=int,
help=("Maximum number of tokens to be processed in a single iteration."),
default=0,
help="Master node port for DP coordination.",
)
parser.add_argument(
"--timeout",
type=int,
default=300,
help=("Number of seconds before unresponsive process is killed."),
help="Number of seconds before unresponsive process is killed.",
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
parser.add_argument(
"--enable-dbo",
action="store_true",
help=("Enable microbatched execution"),
)
parser.add_argument(
"--compilation-config",
type=int,
help=("Compilation optimization (O) mode 0-3."),
)
parser.add_argument(
"--quantization",
type=str,
)
parser.add_argument(
"--disable-expert-parallel",
dest="enable_expert_parallel",
action="store_false",
help="Disable expert parallel (default: enabled).",
)
parser.set_defaults(enable_expert_parallel=True)
return parser.parse_args()
return parser
def main(
model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
GPUs_per_dp_rank,
enforce_eager,
enable_expert_parallel,
trust_remote_code,
max_num_seqs,
max_model_len,
compilation_config,
gpu_memory_utilization,
enable_dbo,
quantization,
engine_args,
):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@ -173,19 +134,7 @@ def main(
)
# Create an LLM.
llm = LLM(
model=model,
tensor_parallel_size=GPUs_per_dp_rank,
enforce_eager=enforce_eager,
enable_expert_parallel=enable_expert_parallel,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enable_dbo=enable_dbo,
quantization=quantization,
compilation_config=compilation_config,
)
llm = LLM(**engine_args)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for i, output in enumerate(outputs):
@ -204,22 +153,29 @@ def main(
if __name__ == "__main__":
args = parse_args()
parser = create_parser()
args = vars(parser.parse_args())
dp_size = args.dp_size
tp_size = args.tp_size
node_size = args.node_size
node_rank = args.node_rank
# Extract DP-specific args (pop to remove from engine_args)
dp_size = args.pop("data_parallel_size")
dp_num_nodes = args.pop("dp_num_nodes")
dp_node_rank = args.pop("dp_node_rank")
dp_master_addr = args.pop("dp_master_addr")
dp_master_port = args.pop("dp_master_port")
timeout = args.pop("timeout")
if node_size == 1:
# Remaining args are engine args
engine_args = args
if dp_num_nodes == 1:
dp_master_ip = "127.0.0.1"
dp_master_port = get_open_port()
dp_master_port_val = get_open_port()
else:
dp_master_ip = args.master_addr
dp_master_port = args.master_port
dp_master_ip = dp_master_addr
dp_master_port_val = dp_master_port
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
dp_per_node = dp_size // node_size
assert dp_size % dp_num_nodes == 0, "dp_size should be divisible by dp_num_nodes"
dp_per_node = dp_size // dp_num_nodes
from multiprocessing import Process
@ -230,34 +186,24 @@ if __name__ == "__main__":
procs = []
for local_dp_rank, global_dp_rank in enumerate(
range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
range(dp_node_rank * dp_per_node, (dp_node_rank + 1) * dp_per_node)
):
proc = Process(
target=main,
args=(
args.model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
tp_size,
args.enforce_eager,
args.enable_expert_parallel,
args.trust_remote_code,
args.max_num_seqs,
args.max_model_len,
args.compilation_config,
args.gpu_memory_utilization,
args.enable_dbo,
args.quantization,
dp_master_port_val,
engine_args,
),
)
proc.start()
procs.append(proc)
exit_code = 0
for proc in procs:
proc.join(timeout=args.timeout)
proc.join(timeout=timeout)
if proc.exitcode is None:
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
proc.kill()

View File

@ -38,6 +38,8 @@ Encoder engines should be launched with the following flags:
- `--max-num-batched-tokens=<large value>` **(default: 2048)** This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager.
- `--convert "mm_encoder_only"` **(Optional)** - The language model is skipped during initialization to reduce device memory usage. **Models using this option must implement the `get_language_model_spec` interface.**
## Local media inputs
To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance:

View File

@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from pathlib import Path
from vllm import LLM
model_name = "nvidia/llama-nemotron-rerank-1b-v2"
# Path to template file
template_path = Path(__file__).parent / "template" / "nemotron-rerank.jinja"
chat_template = template_path.read_text()
llm = LLM(model=model_name, runner="pooling", trust_remote_code=True)
query = "how much protein should a female eat?"
documents = [
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
"Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
]
outputs = llm.score(query, documents, chat_template=chat_template)
print("-" * 30)
print([output.outputs.score for output in outputs])
print("-" * 30)

View File

@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example of using the rerank API with template.
run:
vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja
"""
import json
import requests
url = "http://127.0.0.1:8000/rerank"
headers = {"accept": "application/json", "Content-Type": "application/json"}
query = "how much protein should a female eat?"
documents = [
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
"Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
]
data = {
"model": "nvidia/llama-nemotron-rerank-1b-v2",
"query": query,
"documents": documents,
}
def main():
response = requests.post(url, headers=headers, json=data)
# Check the response
if response.status_code == 200:
print("Request successful!")
print(json.dumps(response.json(), indent=2))
else:
print(f"Request failed with status code: {response.status_code}")
print(response.text)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,3 @@
question:{{ (messages | selectattr("role", "eq", "query") | first).content }}
passage:{{ (messages | selectattr("role", "eq", "document") | first).content }}

View File

@ -557,7 +557,8 @@ def test_rms_group_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig(
# Testing properties

View File

@ -77,6 +77,7 @@ def test_dynamic_shapes_compilation(
"evaluate_guards": evaluate_guards,
},
},
max_model_len=1024,
)
output = model.generate(prompt)

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import pytest
import torch
@ -53,37 +52,61 @@ class TestModel(torch.nn.Module):
hidden_size: int,
eps: float,
group_shape: GroupShape,
cuda_force_torch: bool,
use_aiter: bool = False,
cuda_force_torch: bool = False,
use_aiter_quant_op: bool = True,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.use_aiter = use_aiter
self.use_aiter_quant_op = use_aiter_quant_op
self.cuda_force_torch = cuda_force_torch
self.group_shape = group_shape
self.enable_quant_fp8_custom_op = None # Will be set later if applicable
self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)]
if group_shape.is_per_group():
self.wscale = [
torch.rand(
(hidden_size // group_shape[1], hidden_size // group_shape[1]),
dtype=torch.float32,
)
for _ in range(3)
]
else:
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
static = group_shape == GroupShape.PER_TENSOR
# Setup quantization scale descriptor
static = group_shape == GroupShape.PER_TENSOR and not use_aiter
quant_scale = ScaleDesc(torch.float32, static, group_shape)
self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
# Setup scales
if static:
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
else:
self.scale = [None for _ in range(3)]
# Setup weights
self.w = [
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3)
]
if not group_shape.is_per_group():
if not group_shape.is_per_group() or use_aiter:
self.w = [self.w[0].t() for _ in range(3)]
# Setup weight scales
if group_shape.is_per_group():
scale_size = (
(hidden_size + 128 - 1) // 128
if use_aiter
else hidden_size // group_shape[1]
)
wscale_shape: tuple[int, ...] = (scale_size, scale_size)
else:
wscale_shape = (1,)
self.wscale = [torch.rand(wscale_shape, dtype=torch.float32) for _ in range(3)]
# Setup FP8 linear operation
is_per_group = group_shape.is_per_group()
if is_per_group and use_aiter:
self.fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(128, 128),
act_quant_group_shape=group_shape,
use_aiter_and_is_supported=use_aiter_quant_op,
)
# AITER blockwise doesn't use enable_quant_fp8_custom_op
elif is_per_group:
self.fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(group_shape[1], group_shape[1]),
act_quant_group_shape=group_shape,
@ -91,6 +114,13 @@ class TestModel(torch.nn.Module):
use_aiter_and_is_supported=False,
)
self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled()
elif use_aiter:
self.fp8_linear = Fp8LinearOp(
act_quant_static=False,
act_quant_group_shape=group_shape,
)
self.fp8_linear.quant_fp8.use_aiter = use_aiter_quant_op
self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
else:
with override_cutlass_fp8_supported(not cuda_force_torch):
self.fp8_linear = Fp8LinearOp(
@ -100,7 +130,6 @@ class TestModel(torch.nn.Module):
self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
self.enable_rms_norm_custom_op = self.norm[0].enabled()
self.group_shape = group_shape
def forward(self, x):
# avoid having graph input be an arg to a pattern directly
@ -126,19 +155,49 @@ class TestModel(torch.nn.Module):
y4, resid = self.norm[3](x4, resid) # use resid here
return y4
def ops_in_model_before(self):
if (
self.use_aiter
and self.group_shape.is_per_group()
and current_platform.is_fp8_fnuz()
):
return [rocm_aiter_ops.get_group_quant_op()]
if self.use_aiter and self.group_shape.is_per_group():
return [torch.ops.vllm.triton_per_token_group_quant_fp8.default]
if self.use_aiter and self.use_aiter_quant_op:
return [rocm_aiter_ops.get_per_token_quant_op()]
if self.use_aiter:
return [QUANT_OPS[self.quant_key]]
if self.enable_quant_fp8_custom_op:
return [QUANT_OPS[self.quant_key]]
return [torch.ops.aten.reciprocal]
def ops_in_model_after(self):
if self.use_aiter and self.group_shape.is_per_group():
from vllm.compilation.rocm_aiter_fusion import (
AiterFusedAddRMSFp8GroupQuantPattern,
AiterRMSFp8GroupQuantPattern,
)
return [
AiterFusedAddRMSFp8GroupQuantPattern.FUSED_OP,
AiterRMSFp8GroupQuantPattern.FUSED_OP,
]
if self.use_aiter:
from vllm.compilation.rocm_aiter_fusion import (
AiterFusedAddRMSNormDynamicQuantPattern,
AiterRMSNormDynamicQuantPattern,
)
return [
AiterFusedAddRMSNormDynamicQuantPattern.FUSED_OP,
AiterRMSNormDynamicQuantPattern.FUSED_OP,
]
return [
FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)],
FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)],
]
def ops_in_model_before(self):
return (
[QUANT_OPS[self.quant_key]]
if self.enable_quant_fp8_custom_op
else [torch.ops.aten.reciprocal]
)
def ops_in_model_before_partial(self):
return (
[RMS_OP, RMS_ADD_OP]
@ -155,67 +214,45 @@ GROUP_SHAPES = [
]
class TestRmsnormGroupFp8QuantModel(torch.nn.Module):
def __init__(self, hidden_size: int, eps: float, **kwargs):
super().__init__()
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(128, 128),
act_quant_group_shape=GroupShape(1, 128),
cutlass_block_fp8_supported=False,
use_aiter_and_is_supported=True,
)
self.w = [
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
for _ in range(3)
]
def _run_fusion_test(
model,
fusion_pass,
vllm_config,
dtype,
hidden_size,
num_tokens,
):
"""Helper function for common fusion test logic.
scale_hidden_size = (hidden_size + 128 - 1) // 128
self.wscale = [
torch.rand((scale_hidden_size, scale_hidden_size), dtype=torch.float32)
for _ in range(3)
]
Must be called within vllm_config context.
"""
noop_pass = NoOpEliminationPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
self.norm_weight = [torch.ones(hidden_size) for _ in range(4)]
self.eps = eps
backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
backend2 = TestBackend(noop_pass, cleanup_pass)
def forward(self, x):
# avoid having graph input be an arg to a pattern directly
x = resid = torch.relu(x)
y = rocm_aiter_ops.rms_norm(x, self.norm_weight[0], self.eps)
x = torch.rand(num_tokens, hidden_size)
torch._dynamo.mark_dynamic(x, 0)
x2 = self.w8a8_block_fp8_linear.apply(y, self.w[0], self.wscale[0])
# make sure resid is used for replacement to work
y2, resid = rocm_aiter_ops.rms_norm2d_with_add(
x2, resid, self.norm_weight[1], self.eps
)
model_fused = torch.compile(model, backend=backend)
result_fused = model_fused(x)
x3 = self.w8a8_block_fp8_linear.apply(y2, self.w[1], self.wscale[1])
model_unfused = torch.compile(model, backend=backend2)
result_unfused = model_unfused(x)
y3, resid = rocm_aiter_ops.rms_norm2d_with_add(
x3, resid, self.norm_weight[2], self.eps
)
if dtype == torch.float16:
ATOL, RTOL = (2e-3, 2e-3)
else:
ATOL, RTOL = (1e-2, 1e-2)
x4 = self.w8a8_block_fp8_linear.apply(y3, self.w[2], self.wscale[2])
torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
y4, resid = rocm_aiter_ops.rms_norm2d_with_add(
x4, resid, self.norm_weight[3], self.eps
)
return y4
assert fusion_pass.matched_count == 3
backend.check_before_ops(model.ops_in_model_before())
backend.check_after_ops(model.ops_in_model_after())
def ops_in_model_before(self):
return [
torch.ops.vllm.rocm_aiter_rms_norm,
torch.ops.vllm.rocm_aiter_group_fp8_quant,
]
def ops_in_model_before_partial(self):
return []
def ops_in_model_after(self):
return [
torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant,
torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant,
]
return backend, backend2
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@ -223,11 +260,8 @@ class TestRmsnormGroupFp8QuantModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [257])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("group_shape", GROUP_SHAPES)
@pytest.mark.parametrize(
"model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op",
list(itertools.product([TestModel], [True, False], [True, False]))
+ [(TestRmsnormGroupFp8QuantModel, False, False)],
)
@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
@pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False])
# cuda_force_torch used to test torch code path on platforms that
# cutlass_fp8_supported() == True.
@pytest.mark.parametrize(
@ -242,23 +276,13 @@ def test_fusion_rmsnorm_quant(
num_tokens,
eps,
group_shape,
model_class,
enable_rms_norm_custom_op,
enable_quant_fp8_custom_op,
cuda_force_torch,
):
if model_class is TestRmsnormGroupFp8QuantModel and not IS_AITER_FOUND:
pytest.skip("AITER is not supported on this GPU.")
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
maybe_create_device_identity() # needed for certain non-cutlass fp8 paths
if not enable_quant_fp8_custom_op and group_shape.is_per_group():
pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization")
# Skip test for 64-bit group shape when running with cutlass or deepgemm
if group_shape == GroupShape(1, 64) and (
cutlass_block_fp8_supported() or is_deep_gemm_supported()
):
@ -269,6 +293,7 @@ def test_fusion_rmsnorm_quant(
custom_ops.append("+rms_norm")
if enable_quant_fp8_custom_op:
custom_ops.append("+quant_fp8")
vllm_config = VllmConfig(
model_config=ModelConfig(dtype=dtype),
compilation_config=CompilationConfig(
@ -279,60 +304,97 @@ def test_fusion_rmsnorm_quant(
),
),
)
with vllm.config.set_current_vllm_config(vllm_config):
# Reshape pass is needed for the fusion pass to work
noop_pass = NoOpEliminationPass(vllm_config)
if model_class is TestRmsnormGroupFp8QuantModel:
from vllm.compilation.rocm_aiter_fusion import (
RocmAiterRMSNormFp8GroupQuantFusionPass,
)
# Setup device before model creation
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
maybe_create_device_identity()
fusion_pass = RocmAiterRMSNormFp8GroupQuantFusionPass(vllm_config)
else:
fusion_pass = RMSNormQuantFusionPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
backend2 = TestBackend(noop_pass, cleanup_pass)
model = model_class(
fusion_pass = RMSNormQuantFusionPass(vllm_config)
model = TestModel(
hidden_size=hidden_size,
eps=eps,
group_shape=group_shape,
use_aiter=False,
cuda_force_torch=cuda_force_torch,
)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size)
torch._dynamo.mark_dynamic(x, 0)
model_fused = torch.compile(model, backend=backend)
result_fused = model_fused(x)
model_unfused = torch.compile(model, backend=backend2)
result_unfused = model_unfused(x)
if dtype == torch.float16:
ATOL, RTOL = (2e-3, 2e-3)
else:
ATOL, RTOL = (1e-2, 1e-2)
torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
assert fusion_pass.matched_count == 3
backend.check_before_ops(model.ops_in_model_before())
backend, _ = _run_fusion_test(
model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens
)
backend.check_before_ops(
model.ops_in_model_before_partial(), fully_replaced=False
)
backend.check_after_ops(model.ops_in_model_after())
# If RMSNorm custom op is disabled (native/torch impl used),
# there's a risk that the fused add doesn't get included in the
# replacement and only the rms part gets fused with quant.
# Hence, we check only 2 add nodes are left (final fused rmsnorm add).
if (
not enable_rms_norm_custom_op
and model_class is not TestRmsnormGroupFp8QuantModel
):
if not enable_rms_norm_custom_op:
n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
# 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
assert n_add_nodes(backend.graph_pre_pass) == 7
assert n_add_nodes(backend.graph_post_pass) == 2
GROUP_SHAPE_QUANT_OPS_MATCHS = [
(GroupShape.PER_TOKEN, True),
(GroupShape.PER_TOKEN, False),
(GroupShape(1, 128), True),
]
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("hidden_size", [256])
@pytest.mark.parametrize("num_tokens", [257])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize(
"group_shape, use_aiter_quant_op", GROUP_SHAPE_QUANT_OPS_MATCHS
)
@pytest.mark.skipif(
(not current_platform.is_rocm() or not IS_AITER_FOUND),
reason="Only test on ROCm with aiter package installed",
)
def test_aiter_fusion_rmsnorm_quant(
dtype: torch.dtype,
hidden_size: int,
num_tokens: int,
eps: float,
group_shape: GroupShape,
use_aiter_quant_op: bool,
monkeypatch: pytest.MonkeyPatch,
):
vllm_config = VllmConfig(
model_config=ModelConfig(dtype=dtype),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+rms_norm", "+quant_fp8"],
pass_config=PassConfig(fuse_norm_quant=True, eliminate_noops=True),
),
)
with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
from vllm.compilation.rocm_aiter_fusion import RocmAiterRMSNormFusionPass
m.setenv("VLLM_ROCM_USE_AITER", "1")
rocm_aiter_ops.refresh_env_variables()
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
maybe_create_device_identity()
fusion_pass = RocmAiterRMSNormFusionPass(vllm_config)
model = TestModel(
hidden_size=hidden_size,
eps=eps,
group_shape=group_shape,
use_aiter=True,
use_aiter_quant_op=use_aiter_quant_op,
)
_run_fusion_test(
model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens
)

View File

@ -8,7 +8,7 @@ import pytest
import pytest_asyncio
from vllm.assets.audio import AudioAsset
from vllm.multimodal.utils import encode_audio_base64, fetch_audio
from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
from ...utils import RemoteOpenAIServer
@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
}
@pytest.fixture(scope="session")
def url_encoded_audio() -> dict[str, str]:
return {
audio_url: encode_audio_url(*fetch_audio(audio_url))
for audio_url in TEST_AUDIO_URLS
}
def dummy_messages_from_audio_url(
audio_urls: str | list[str],
content_text: str = "What's happening in this audio?",
@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
client: openai.AsyncOpenAI,
model_name: str,
audio_url: str,
base64_encoded_audio: dict[str, str],
url_encoded_audio: dict[str, str],
):
messages = dummy_messages_from_audio_url(
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
)
messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])
# test single completion
chat_completion = await client.chat.completions.create(
@ -313,7 +319,7 @@ async def test_chat_streaming_input_audio(
"format": "wav",
},
},
{"type": "text", "text": "What's happening in this audio?"},
{"type": "text", "text": "What's a short title for this audio?"},
],
}
]

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.util
import json
import time
@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages(
assert (
"aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
response = await client.chat.completions.create(
model=model_name,
messages=[{"role": "user", "content": "What is the role of AI in medicine?"}],
temperature=0.0,
max_tokens=250,
)
choice = response.choices[0]
assert choice.finish_reason == "length", (
f"Expected finish_reason='length', got {choice.finish_reason}"
)
assert choice.message.content is not None, (
"Content should not be None when truncated"
)
assert len(choice.message.content) > 0, "Content should not be empty"

View File

@ -955,7 +955,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
],
)
@ -983,7 +982,6 @@ class TestServingChatWithHarmony:
input_messages_2,
[
{"role": "system"},
{"role": "developer"},
{"role": "user"},
# The analysis message should be dropped on subsequent inputs because
# of the subsequent assistant message to the final channel.
@ -1043,7 +1041,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1124,7 +1122,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1205,7 +1203,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1255,7 +1253,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
verify_harmony_messages(
input_messages_3,
@ -1318,7 +1316,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
verify_harmony_messages(
input_messages_4,
@ -1374,7 +1372,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
# The reasoning that would have resulted in an analysis message is
# dropped because of a later assistant message to the final channel.
@ -1406,7 +1403,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
{
"role": "assistant",
@ -1436,7 +1432,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
{
"role": "assistant",

View File

@ -7,7 +7,7 @@ import openai
import pytest
import pytest_asyncio
from vllm.multimodal.utils import encode_video_base64, fetch_video
from vllm.multimodal.utils import encode_video_url, fetch_video
from ...utils import RemoteOpenAIServer
@ -48,9 +48,9 @@ async def client(server):
@pytest.fixture(scope="session")
def base64_encoded_video() -> dict[str, str]:
def url_encoded_video() -> dict[str, str]:
return {
video_url: encode_video_base64(fetch_video(video_url)[0])
video_url: encode_video_url(fetch_video(video_url)[0])
for video_url in TEST_VIDEO_URLS
}
@ -175,11 +175,9 @@ async def test_single_chat_session_video_base64encoded(
client: openai.AsyncOpenAI,
model_name: str,
video_url: str,
base64_encoded_video: dict[str, str],
url_encoded_video: dict[str, str],
):
messages = dummy_messages_from_video_url(
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
)
messages = dummy_messages_from_video_url(url_encoded_video[video_url])
# test single completion
chat_completion = await client.chat.completions.create(
@ -223,11 +221,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
client: openai.AsyncOpenAI,
model_name: str,
video_url: str,
base64_encoded_video: dict[str, str],
url_encoded_video: dict[str, str],
):
messages = dummy_messages_from_video_url(
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
)
messages = dummy_messages_from_video_url(url_encoded_video[video_url])
chat_completion = await client.chat.completions.create(
model=model_name,

View File

@ -9,7 +9,7 @@ import pytest_asyncio
from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
from vllm.multimodal.utils import encode_image_url, fetch_image
from ...utils import RemoteOpenAIServer
@ -35,7 +35,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
],
[
"The image shows a Venn diagram with three over",
"The image shows a colorful Venn diagram with",
"The image displays a Venn diagram with three over",
],
[
"This image displays a gradient of colors ranging from",
@ -70,11 +70,9 @@ async def client(server):
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
def url_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_asset: encode_image_base64(
local_asset_server.get_image_asset(image_asset)
)
image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
for image_asset in TEST_IMAGE_ASSETS
}
@ -234,11 +232,11 @@ async def test_single_chat_session_image_base64encoded(
model_name: str,
raw_image_url: str,
image_url: str,
base64_encoded_image: dict[str, str],
url_encoded_image: dict[str, str],
):
content_text = "What's in this image?"
messages = dummy_messages_from_image_url(
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
url_encoded_image[raw_image_url],
content_text,
)
@ -288,15 +286,13 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI,
model_name: str,
image_idx: int,
base64_encoded_image: dict[str, str],
url_encoded_image: dict[str, str],
):
# NOTE: This test also validates that we pass MM data through beam search
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = dummy_messages_from_image_url(
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
)
messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
chat_completion = await client.chat.completions.create(
model=model_name,

View File

@ -10,7 +10,7 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image
from vllm.multimodal.utils import fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2
@ -48,14 +48,6 @@ def server():
yield remote_server
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4

View File

@ -0,0 +1,352 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
from vllm.entrypoints.score_utils import get_score_prompt
from vllm.inputs import TokensPrompt
from vllm.tokenizers import get_tokenizer
# A cross-encoder model for testing
CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
def assert_prompt_tokenization_consistent(
tokenizer, full_prompt, engine_prompt, add_special_tokens=True
):
"""Verify that engine_prompt token_ids match tokenizing full_prompt."""
expected_ids = tokenizer(full_prompt, add_special_tokens=add_special_tokens)[
"input_ids"
]
actual_ids = engine_prompt["prompt_token_ids"]
assert actual_ids == expected_ids, (
f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}"
)
@pytest.fixture(scope="module")
def cross_encoder_model_config():
return ModelConfig(
CROSS_ENCODER_MODEL_ID,
runner="pooling",
)
@pytest.fixture(scope="module")
def cross_encoder_tokenizer(cross_encoder_model_config):
return get_tokenizer(
CROSS_ENCODER_MODEL_ID,
trust_remote_code=cross_encoder_model_config.trust_remote_code,
)
@pytest.fixture(scope="module")
def llm_reranker_model_config():
"""Model config for LLM-as-reranker style (no pad token)."""
config = ModelConfig(
CROSS_ENCODER_MODEL_ID,
runner="pooling",
)
# use_pad_token is a property that reads from hf_config,
# so we set it there to override the default (True)
config.hf_config.use_pad_token = False
return config
@pytest.fixture
def tokenization_kwargs():
"""Common tokenization kwargs used across tests."""
return {"add_special_tokens": True, "return_tensors": None}
@pytest.fixture
def mock_model_with_score_template():
"""Mock model class that supports score template and tracks post_process calls."""
class MockModelWithScoreTemplate:
supports_score_template = True
post_process_called: list[TokensPrompt] = []
@staticmethod
def get_score_template(p1: str, p2: str) -> str:
return f"[QUERY]{p1}[SEP][DOC]{p2}"
@staticmethod
def post_process_tokens(prompt: TokensPrompt) -> None:
MockModelWithScoreTemplate.post_process_called.append(prompt)
return MockModelWithScoreTemplate
@pytest.fixture
def mock_model_no_score_template():
"""Mock model class that does not support score template."""
class MockModelNoScoreTemplate:
supports_score_template = False
return MockModelNoScoreTemplate
class TestGetScorePrompt:
"""Tests for the get_score_prompt function."""
def test_tokenization_kwargs_passed_through(
self,
llm_reranker_model_config,
cross_encoder_tokenizer,
):
"""Test that tokenization kwargs are properly passed through."""
data_1 = "Query text"
data_2 = "Document text"
# Test with truncation - custom kwargs for this test
custom_tokenization_kwargs = {
"add_special_tokens": True,
"return_tensors": None,
"truncation": True,
"max_length": 20,
}
full_prompt, engine_prompt = get_score_prompt(
llm_reranker_model_config,
cross_encoder_tokenizer,
custom_tokenization_kwargs,
data_1,
data_2,
)
assert isinstance(full_prompt, str)
assert "prompt_token_ids" in engine_prompt
# With max_length=20 and truncation, should not exceed this
assert len(engine_prompt["prompt_token_ids"]) <= 20
# Since truncation was applied, token_ids should be a prefix of full encoding
full_ids = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)[
"input_ids"
]
actual_ids = engine_prompt["prompt_token_ids"]
assert full_ids[: len(actual_ids)] == actual_ids, (
f"Token IDs are not a prefix of full encoding.\n"
f"Full IDs: {full_ids}\n"
f"Actual IDs: {actual_ids}"
)
def test_model_supports_score_template(
self,
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_with_score_template,
):
"""Test when model supports score template (no score_template arg)."""
with patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_with_score_template,
):
full_prompt, engine_prompt = get_score_prompt(
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
"query text",
"document text",
)
assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
assert "prompt_token_ids" in engine_prompt
assert len(engine_prompt["prompt_token_ids"]) > 0
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)
def test_model_supports_score_template_but_custom_template_provided(
self,
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_with_score_template,
):
"""Test when model supports score template but custom template is provided."""
template = (
'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
)
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_with_score_template,
),
):
full_prompt, engine_prompt = get_score_prompt(
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
"query",
"doc",
score_template=template, # Providing a template
)
assert "prompt_token_ids" in engine_prompt
assert full_prompt == "TEMPLATE_USED query doc"
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)
def test_not_using_default_template(
self,
llm_reranker_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_no_score_template,
):
# FIXME: Models implementing SupportsScoreTemplate must use their custom
# template implementation by default to preserve existing functionality.
# Attempting to use tokenizer_config.json templates would most likely break
# these models, as often they just inherit the template from the original LLM.
# CLI --chat-template overrides are still supported.
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_no_score_template,
),
patch(
"vllm.entrypoints.score_utils.apply_hf_chat_template",
return_value="test querytest doc",
),
):
full_prompt, engine_prompt = get_score_prompt(
llm_reranker_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
"test query",
"test doc",
)
assert full_prompt == "test querytest doc"
assert "prompt_token_ids" in engine_prompt
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)
def test_fallback_with_pad_token(
self,
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_no_score_template,
):
"""Test fallback path when ChatTemplateResolutionError
and use_pad_token=True."""
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_no_score_template,
),
patch(
"vllm.entrypoints.score_utils.apply_hf_chat_template",
side_effect=ChatTemplateResolutionError("No template"),
),
):
full_prompt, engine_prompt = get_score_prompt(
cross_encoder_model_config, # use_pad_token=True
cross_encoder_tokenizer,
tokenization_kwargs,
"query",
"document",
)
assert "prompt_token_ids" in engine_prompt
# Should have token_type_ids from text_pair encoding
assert "token_type_ids" in engine_prompt
assert "query" in full_prompt
assert "document" in full_prompt
assert full_prompt != "querydocument"
assert (
engine_prompt["prompt_token_ids"]
== cross_encoder_tokenizer(
"query", text_pair="document", add_special_tokens=True
)["input_ids"]
)
# FIXME(?): add_special_tokens=False is needed because in this case
# full_prompt is obtained by decoding the tokenized prompt, which includes
# special tokens and we would get duplicated special tokens otherwise.
# This is inconsistent with other cases.
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer,
full_prompt,
engine_prompt,
add_special_tokens=False,
)
def test_fallback_without_pad_token(
self,
llm_reranker_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_no_score_template,
):
"""Test fallback path when ChatTemplateResolutionError
and use_pad_token=False."""
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_no_score_template,
),
patch(
"vllm.entrypoints.score_utils.apply_hf_chat_template",
side_effect=ChatTemplateResolutionError("No template"),
),
):
full_prompt, engine_prompt = get_score_prompt(
llm_reranker_model_config, # use_pad_token=False
cross_encoder_tokenizer,
tokenization_kwargs,
"query",
"document",
)
assert full_prompt == "querydocument"
assert "prompt_token_ids" in engine_prompt
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)
def test_post_process_tokens_called(
self,
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
mock_model_with_score_template,
):
"""Test that post_process_tokens is called on the engine prompt."""
# Reset the call tracker
mock_model_with_score_template.post_process_called.clear()
with (
patch(
"vllm.model_executor.model_loader.get_model_cls",
return_value=mock_model_with_score_template,
),
patch(
"vllm.entrypoints.score_utils.apply_hf_chat_template",
side_effect=ChatTemplateResolutionError("No template"),
),
):
full_prompt, engine_prompt = get_score_prompt(
cross_encoder_model_config,
cross_encoder_tokenizer,
tokenization_kwargs,
"query",
"doc",
)
# post_process_tokens should have been called once
assert len(mock_model_with_score_template.post_process_called) == 1
assert mock_model_with_score_template.post_process_called[0] is engine_prompt
assert_prompt_tokenization_consistent(
cross_encoder_tokenizer, full_prompt, engine_prompt
)

View File

@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import (
)
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import (
encode_audio_base64,
encode_image_base64,
encode_video_base64,
encode_audio_url,
encode_image_url,
encode_video_url,
)
from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
@ -141,22 +141,19 @@ def mistral_model_config():
@pytest.fixture(scope="module")
def image_url():
image = ImageAsset("cherry_blossom")
base64 = encode_image_base64(image.pil_image)
return f"data:image/jpeg;base64,{base64}"
return encode_image_url(image.pil_image)
@pytest.fixture(scope="module")
def video_url():
video = VideoAsset("baby_reading", 1)
base64 = encode_video_base64(video.np_ndarrays)
return f"data:video/jpeg;base64,{base64}"
return encode_video_url(video.np_ndarrays)
@pytest.fixture(scope="module")
def audio_url():
audio = AudioAsset("mary_had_lamb")
base64 = encode_audio_base64(*audio.audio_and_sample_rate)
return f"data:audio/ogg;base64,{base64}"
return encode_audio_url(*audio.audio_and_sample_rate)
def _assert_mm_data_is_image_input(

View File

@ -0,0 +1,11 @@
model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
accuracy_threshold: 0.85
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--async-scheduling
env:
VLLM_USE_FLASHINFER_MOE_FP8: "1"

View File

@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-30B-A3B-NVFP4.yaml
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
Qwen3-Next-FP8-EP2.yaml

View File

@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename):
print(f"Number of questions: {eval_config['num_questions']}")
print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
print(f"Server args: {' '.join(server_args)}")
print(f"Environment variables: {env_dict}")
# Launch server and run evaluation
with RemoteOpenAIServer(

View File

@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"]
RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks(
kv_cache_factory,
num_mappings: int,
num_layers: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
seed: int,
kv_cache_dtype: str,
device: str,
) -> None:
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
current_platform.seed_everything(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)
# Generate random block mappings where each source block is mapped to two
# destination blocks.
assert 2 * num_mappings <= num_blocks
src_blocks = random.sample(range(num_blocks), num_mappings)
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
block_mapping: list[tuple[int, int]] = []
for i in range(num_mappings):
src = src_blocks[i]
dst1 = dst_blocks[2 * i]
dst2 = dst_blocks[2 * i + 1]
block_mapping.append((src, dst1))
block_mapping.append((src, dst2))
# Create the KV caches.
key_caches, value_caches = kv_cache_factory(
num_blocks,
block_size,
num_layers,
num_heads,
head_size,
kv_cache_dtype,
dtype,
seed,
device,
)
# Clone the KV caches.
cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
# Call the copy blocks kernel.
block_mapping_tensor = torch.tensor(
block_mapping, dtype=torch.int64, device=device
).view(-1, 2)
opcheck(
torch.ops._C_cache_ops.copy_blocks,
(key_caches, value_caches, block_mapping_tensor),
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
cond=(head_size == HEAD_SIZES[0]),
)
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
# Run the reference implementation.
for src, dst in block_mapping:
for cloned_key_cache in cloned_key_caches:
cloned_key_cache[dst].copy_(cloned_key_cache[src])
for cloned_value_cache in cloned_value_caches:
cloned_value_cache[dst].copy_(cloned_value_cache[src])
# Compare the results.
for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
torch.testing.assert_close(key_cache, cloned_key_cache)
for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
torch.testing.assert_close(value_cache, cloned_value_cache)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla(
torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks_mla(
kv_lora_rank: int,
qk_rope_head_dim: int,
block_size: int,
num_blocks: int,
num_layers: int,
dtype: torch.dtype,
seed: int,
device: str,
kv_cache_dtype: str,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)
entry_size = kv_lora_rank + qk_rope_head_dim
kv_caches = []
for _ in range(num_layers):
kv_cache = _create_mla_cache(
num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
)
_fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
kv_caches.append(kv_cache)
ref_caches = [kv_cache.clone() for kv_cache in kv_caches]
num_mappings = min(2, num_blocks // 2)
src_blocks = random.sample(range(num_blocks), num_mappings)
remaining = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remaining, 2 * num_mappings)
block_mapping = []
for i in range(num_mappings):
src = src_blocks[i]
dst1 = dst_blocks[2 * i]
dst2 = dst_blocks[2 * i + 1]
block_mapping.append((src, dst1))
block_mapping.append((src, dst2))
block_mapping_tensor = torch.tensor(
block_mapping, dtype=torch.int64, device=device
).view(-1, 2)
for src, dst in block_mapping:
for ref_cache in ref_caches:
ref_cache[dst].copy_(ref_cache[src])
opcheck(
torch.ops._C_cache_ops.copy_blocks_mla,
(kv_caches, block_mapping_tensor),
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
)
ops.copy_blocks_mla(kv_caches, block_mapping_tensor)
for kv_cache, ref_cache in zip(kv_caches, ref_caches):
torch.testing.assert_close(kv_cache, ref_cache)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)

View File

@ -13,6 +13,7 @@ DTYPES = [torch.bfloat16, torch.float16]
IS_NEOX = [True, False]
EPS_VALUES = [1e-5, 1e-6]
SEEDS = [13]
PARTIAL_ROPE = [True, False]
CUDA_DEVICES = ["cuda:0"]
@ -52,6 +53,7 @@ def _apply_qk_norm_rope(
@pytest.mark.parametrize("is_neox", IS_NEOX)
@pytest.mark.parametrize("eps", EPS_VALUES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
@torch.inference_mode()
def test_fused_qk_norm_rope_matches_reference(
device: str,
@ -59,6 +61,7 @@ def test_fused_qk_norm_rope_matches_reference(
is_neox: bool,
eps: float,
seed: int,
rotary_ratio: float,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
@ -76,10 +79,10 @@ def test_fused_qk_norm_rope_matches_reference(
k_norm.weight.data.normal_(mean=1.0, std=0.1)
q_weight = q_norm.weight.data
k_weight = k_norm.weight.data
rotary_dim = int(head_dim * rotary_ratio)
rope = RotaryEmbedding(
head_size=head_dim,
rotary_dim=head_dim,
rotary_dim=rotary_dim,
max_position_embeddings=4096,
base=10000.0,
is_neox_style=is_neox,

View File

@ -258,16 +258,16 @@ class Config:
f"{self.fe_supported_types()}."
)
# Check block quanization support
is_block_quatized = self.quant_block_shape is not None
if is_block_quatized and self.quant_dtype is None:
# Check block quantization support
is_block_quantized = self.quant_block_shape is not None
if is_block_quantized and self.quant_dtype is None:
return False, "No block quantization support."
if is_block_quatized and not self.is_block_quant_supported():
if is_block_quantized and not self.is_block_quant_supported():
return False, "Mismatched block quantization support."
# deep_gemm only works with block-quantized
if self.needs_deep_gemm() and not is_block_quatized:
if self.needs_deep_gemm() and not is_block_quantized:
return False, "Needs DeepGEMM but not block quantized."
# Check dependencies (turn into asserts?)

View File

@ -1,92 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# DeepGEMM Style Cutlass Grouped GEMM Test
# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py
import random
import pytest
import torch
from tests.kernels.moe.utils import per_token_cast_to_fp8
from tests.kernels.utils import baseline_scaled_mm
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.deep_gemm import per_block_cast_to_fp8
from vllm.utils.math_utils import cdiv
@pytest.mark.parametrize(
"num_groups, expected_m_per_group, k, n",
[
(4, 8192, 7168, 4096),
(4, 8192, 2048, 7168),
(8, 4096, 7168, 4096),
(8, 4096, 2048, 7168),
(32, 1024, 7168, 4096),
(32, 1024, 2048, 7168),
],
)
@pytest.mark.parametrize("out_dtype", [torch.float16])
@pytest.mark.skipif(
(lambda x: x is None or x.to_int() != 100)(
current_platform.get_device_capability()
),
reason="Block Scaled Grouped GEMM is only supported on SM100.",
)
def test_cutlass_grouped_gemm(
num_groups: int,
expected_m_per_group: int,
k: int,
n: int,
out_dtype: torch.dtype,
):
device = "cuda"
alignment = 128
group_ms = [
int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups)
]
m = sum([cdiv(m, alignment) * alignment for m in group_ms])
x = torch.randn((m, k), device=device, dtype=out_dtype)
y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype)
out = torch.empty((m, n), device=device, dtype=out_dtype)
ref_out = torch.randn((m, n), device=device, dtype=out_dtype)
ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m]
pb_size = []
for i in range(num_groups):
pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k])
problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32)
expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32)
x_fp8 = per_token_cast_to_fp8(x)
y_fp8 = (
torch.empty_like(y, dtype=torch.float8_e4m3fn),
torch.empty(
(num_groups, cdiv(n, 128), k // 128), device=device, dtype=torch.float
),
)
for i in range(num_groups):
y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128])
for i in range(num_groups):
a = x_fp8[0][ep_offset[i] : ep_offset[i + 1]]
a_scale = x_fp8[1][ep_offset[i] : ep_offset[i + 1]]
b = y_fp8[0][i].t()
b_scale = y_fp8[1][i].t()
baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype)
ref_out[ep_offset[i] : ep_offset[i + 1]] = baseline
ops.cutlass_blockwise_scaled_grouped_mm(
out,
x_fp8[0],
y_fp8[0],
x_fp8[1],
y_fp8[1],
problem_sizes,
expert_offsets[:-1],
)
torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3)

View File

@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
from vllm.v1.worker.workspace import init_workspace_manager
NUM_EXPERTS = [8, 64, 192]
EP_SIZE = [1, 4]
@ -487,6 +488,7 @@ def test_mixtral_moe(
monkeypatch.setenv("MASTER_ADDR", "localhost")
monkeypatch.setenv("MASTER_PORT", "12345")
init_distributed_environment()
init_workspace_manager(torch.cuda.current_device())
# Instantiate our and huggingface's MoE blocks
vllm_config.compilation_config.static_forward_context = dict()
@ -533,6 +535,11 @@ def test_mixtral_moe(
torch.cuda.synchronize()
torch.cuda.empty_cache()
# FIXME (zyongye) fix this after we move self.kernel
# assignment in FusedMoE.__init__
vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
# Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(hf_inputs)
vllm_states = vllm_moe.forward(vllm_inputs)

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from pathlib import Path
import mteb
import numpy as np
@ -19,6 +20,11 @@ from tests.models.utils import (
get_vllm_extra_kwargs,
)
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
@ -102,30 +108,6 @@ class VllmMtebEncoder(mteb.EncoderProtocol):
return sim
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
return scores
class OpenAIClientMtebEncoder(VllmMtebEncoder):
def __init__(self, model_name: str, client):
self.model_name = model_name
@ -153,6 +135,35 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder):
return embeds
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
self.chat_template: str | None = getattr(vllm_model, "chat_template", None)
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
outputs = self.llm.score(
queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False,
chat_template=self.chat_template,
)
scores = np.array(outputs)
return scores
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
@ -387,6 +398,11 @@ def mteb_test_rerank_models(
== model_info.default_pooling_type
)
chat_template: str | None = None
if model_info.chat_template_name is not None:
chat_template = (template_home / model_info.chat_template_name).read_text()
vllm_model.chat_template = chat_template
vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,

View File

@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import (
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
LASTPoolingEmbedModelInfo(
"nvidia/llama-nemotron-embed-1b-v2",
architecture="LlamaBidirectionalModel",
mteb_score=0.689164662128673,
)
]
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
"nvidia/llama-nemotron-rerank-1b-v2",
architecture="LlamaBidirectionalForSequenceClassification",
chat_template_name="nemotron-rerank.jinja",
mteb_score=0.33994,
),
]
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items):
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

View File

@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = {
max_model_len=8192,
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
num_logprobs=10 if current_platform.is_rocm() else 5,
),
"intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],

View File

@ -8,7 +8,7 @@ from PIL.Image import Image
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
@ -31,10 +31,7 @@ def test_keye_vl(
question: str,
):
images = [asset.pil_image for asset in image_assets]
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
]
image_urls = [encode_image_url(image) for image in images]
engine_args = EngineArgs(
model=MODEL_NAME,

View File

@ -15,7 +15,7 @@ from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
"""Build Dots.OCR specific prompt with OCR instructions."""
# Use only stop_sign image for Dots.OCR
image = images[0] # Already filtered to stop_sign
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
image_url = encode_image_url(image)
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
messages = [
@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
config["model_name"], trust_remote_code=True
)
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
image_urls = [encode_image_url(img) for img in images]
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
def build_ovis_prompt(images, config):
"""Build Ovis2.5 specific prompt with custom format."""
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
image_urls = [encode_image_url(img) for img in images]
placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)

View File

@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.message.content == "In the first audio clip, you hear a brief"
assert choice.finish_reason == "length"

View File

@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import pytest
from vllm.platforms import current_platform
@pytest.fixture
def siglip_attention_config():
"""Return attention config for SigLIP tests on ROCm.
On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if current_platform.is_rocm():
return {"backend": "FLEX_ATTENTION"}
return None

View File

@ -138,7 +138,7 @@ def create_batched_mm_kwargs(
)
# TODO(Isotr0py): Don't initalize model during test
# TODO(Isotr0py): Don't initialize model during test
@contextmanager
def initialize_dummy_model(
model_cls: type[nn.Module],

View File

@ -215,7 +215,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True,
),
"CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"),
"DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"),
# FIXME: databricks/dbrx-instruct has been deleted
"DbrxForCausalLM": _HfExamplesInfo(
"databricks/dbrx-instruct", is_available_online=False
),
"DeciLMForCausalLM": _HfExamplesInfo(
"nvidia/Llama-3_3-Nemotron-Super-49B-v1",
trust_remote_code=True,
@ -366,7 +369,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{"tiny": "TitanML/tiny-mixtral"},
),
"MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
# FIXME: mosaicml/mpt-7b has been deleted
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b", is_available_online=False),
"NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
"NemotronHForCausalLM": _HfExamplesInfo(
"nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True
@ -484,6 +488,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
),
"JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
"LlamaBidirectionalModel": _HfExamplesInfo(
"nvidia/llama-nemotron-embed-1b-v2", trust_remote_code=True
),
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
"ModernBertModel": _HfExamplesInfo(
"Alibaba-NLP/gte-modernbert-base", trust_remote_code=True
@ -550,6 +557,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
trust_remote_code=True,
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
),
"LlamaBidirectionalForSequenceClassification": _HfExamplesInfo(
"nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True
),
"ModernBertForSequenceClassification": _HfExamplesInfo(
"Alibaba-NLP/gte-reranker-modernbert-base"
),
@ -850,6 +860,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# disable this temporarily until we support HF format
is_available_online=False,
),
"VoxtralStreamingGeneration": _HfExamplesInfo(
"<place-holder>",
# disable this temporarily until we support HF format
is_available_online=False,
),
# [Encoder-decoder]
"WhisperForConditionalGeneration": _HfExamplesInfo(
"openai/whisper-large-v3-turbo",

View File

@ -38,7 +38,7 @@ def test_inference(
max_num_seqs=32,
default_torch_num_threads=1,
) as vllm_model:
vllm_output = vllm_model.llm.encode(prompt)
vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
assert torch.equal(
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
)

View File

@ -399,6 +399,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
@dataclass
class RerankModelInfo(ModelInfo):
mteb_score: float | None = None
chat_template_name: str | None = None
@dataclass

View File

@ -83,7 +83,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
with vllm_runner(model_path, enforce_eager=True) as llm:
@ -161,7 +161,7 @@ def test_compressed_tensors_w8a8_logprobs(
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
@ -231,7 +231,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:

View File

@ -15,6 +15,7 @@ from vllm.model_executor.layers.quantization.fp8 import (
Fp8Config,
Fp8KVCacheMethod,
Fp8LinearMethod,
Fp8MoeBackend,
Fp8MoEMethod,
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@ -216,7 +217,7 @@ def test_scaled_fp8_quant(dtype) -> None:
ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
# Reference dynamic quantizaton
# Reference dynamic quantization
y = quantize_ref(x, inv_scale)
torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
@ -324,7 +325,10 @@ def test_fp8_reloading(
weight_loader=default_weight_loader,
)
# Fp8LinearMethod uses use_marlin
# Fp8MoEMethod uses fp8_backend
method.use_marlin = use_marlin
method.fp8_backend = Fp8MoeBackend.MARLIN if use_marlin else None
# capture weights format during loading
original_metadata = [

View File

@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`.
"""
import os
from typing import NoReturn
import pytest
import torch
@ -19,6 +20,28 @@ def enable_pickle(monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
def _skip(msg: str) -> NoReturn:
pytest.skip(msg)
raise RuntimeError(msg)
def _snapshot_download_or_skip(model_id: str) -> str:
try:
from huggingface_hub import snapshot_download
except Exception as e: # pragma: no cover
_skip(f"huggingface_hub is required to download {model_id}: {e}")
try:
return snapshot_download(
repo_id=model_id,
repo_type="model",
# These checkpoints are already small; download full repo for simplicity.
allow_patterns=["*"],
)
except Exception as e:
_skip(f"Failed to download {model_id} from the HF Hub: {e}")
@pytest.mark.skipif(
not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.",
@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output
print(f"ModelOpt FP8 output: {output}")
@pytest.mark.skipif(
not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
"""Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup."""
model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
model_path = _snapshot_download_or_skip(model_id)
with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
o_proj = layer.self_attn.o_proj
gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptFp8PcPtLinearMethod,
)
assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod)
assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod)
assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod)
assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod)
assert qkv_proj.weight.dtype == torch.float8_e4m3fn
assert o_proj.weight.dtype == torch.float8_e4m3fn
assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
assert down_proj.weight.dtype == torch.float8_e4m3fn
# Per-channel scales; activations are dynamically scaled per token.
assert hasattr(qkv_proj, "weight_scale")
assert qkv_proj.weight_scale.dtype == torch.float32
assert qkv_proj.weight_scale.dim() == 1
assert not hasattr(qkv_proj, "input_scale")
assert hasattr(o_proj, "weight_scale")
assert o_proj.weight_scale.dtype == torch.float32
assert o_proj.weight_scale.dim() == 1
assert not hasattr(o_proj, "input_scale")
assert hasattr(gate_up_proj, "weight_scale")
assert gate_up_proj.weight_scale.dtype == torch.float32
assert gate_up_proj.weight_scale.dim() == 1
assert not hasattr(gate_up_proj, "input_scale")
assert hasattr(down_proj, "weight_scale")
assert down_proj.weight_scale.dtype == torch.float32
assert down_proj.weight_scale.dim() == 1
assert not hasattr(down_proj, "input_scale")
llm.apply_model(check_model)
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output
print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}")
@pytest.mark.skipif(
not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
"""Test ModelOpt FP8_PB_WO checkpoint setup."""
model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
model_path = _snapshot_download_or_skip(model_id)
with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
o_proj = layer.self_attn.o_proj
gate_up_proj = layer.mlp.gate_up_proj
down_proj = layer.mlp.down_proj
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptFp8PbWoLinearMethod,
)
assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod)
assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod)
assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod)
assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod)
assert qkv_proj.weight.dtype == torch.float8_e4m3fn
assert o_proj.weight.dtype == torch.float8_e4m3fn
assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
assert down_proj.weight.dtype == torch.float8_e4m3fn
# Block scales; should be materialized as a 2D [out_blk, in_blk] tensor.
assert hasattr(qkv_proj, "weight_scale")
assert qkv_proj.weight_scale.dtype == torch.float32
assert qkv_proj.weight_scale.dim() == 2
assert hasattr(o_proj, "weight_scale")
assert o_proj.weight_scale.dtype == torch.float32
assert o_proj.weight_scale.dim() == 2
assert hasattr(gate_up_proj, "weight_scale")
assert gate_up_proj.weight_scale.dtype == torch.float32
assert gate_up_proj.weight_scale.dim() == 2
assert hasattr(down_proj, "weight_scale")
assert down_proj.weight_scale.dtype == torch.float32
assert down_proj.weight_scale.dim() == 2
llm.apply_model(check_model)
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output
print(f"ModelOpt FP8_PB_WO output: {output}")

View File

@ -18,25 +18,37 @@ for i in {1..5}; do
echo "Checking metadata.json URL (attempt $i)..."
if curl --fail "$meta_json_url" > metadata.json; then
echo "INFO: metadata.json URL is valid."
# check whether it is valid json by python
# check whether it is valid json by python (printed to stdout)
if python3 -m json.tool metadata.json; then
echo "INFO: metadata.json is valid JSON. Proceeding with the test."
echo "INFO: metadata.json is valid JSON. Proceeding with the check."
# check whether there is an object in the json matching:
# "package_name": "vllm", and "platform_tag" matches the current architecture
# see `determine_wheel_url` in setup.py for more details
if python3 -c "import platform as p,json as j,sys as s; d = j.load(open('metadata.json')); \
s.exit(int(not any(o.get('package_name') == 'vllm' and p.machine() in o.get('platform_tag') \
for o in d)))" 2>/dev/null; then
echo "INFO: metadata.json contains a pre-compiled wheel for the current architecture."
break
else
echo "WARN: metadata.json does not have a pre-compiled wheel for the current architecture."
fi
else
echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!"
echo "INFO: metadata.json content:"
cat metadata.json
exit 1
fi
break
fi
# failure handling
# failure handling & retry logic
if [ $i -eq 5 ]; then
echo "ERROR: metadata.json URL is still not valid after 5 attempts."
echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists."
echo "ERROR: metadata is still not available after 5 attempts."
echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit is available."
echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes."
echo " NOTE: If it fails, please report in #sig-ci channel."
exit 1
else
echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..."
sleep 180
echo "WARNING: metadata is not available. Retrying after 5 minutes..."
sleep 300
fi
done

View File

@ -0,0 +1,169 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.attention.backends.abstract import (
AttentionBackend,
AttentionImpl,
)
from vllm.attention.backends.registry import (
AttentionBackendEnum,
MambaAttentionBackendEnum,
register_backend,
)
class CustomAttentionImpl(AttentionImpl):
"""Mock custom attention implementation for testing."""
def __init__(self, *args, **kwargs):
super().__init__()
def forward(self, *args, **kwargs):
"""Mock forward pass."""
pass
class CustomAttentionBackend(AttentionBackend):
"""Mock custom attention backend for testing."""
@staticmethod
def get_name():
return "CUSTOM"
@staticmethod
def get_impl_cls():
return CustomAttentionImpl
@staticmethod
def get_builder_cls():
"""Mock builder class."""
return None
@staticmethod
def get_required_kv_cache_layout():
"""Mock KV cache layout."""
return None
class CustomMambaAttentionImpl(AttentionImpl):
"""Mock custom mamba attention implementation for testing."""
def __init__(self, *args, **kwargs):
super().__init__()
def forward(self, *args, **kwargs):
"""Mock forward pass."""
pass
class CustomMambaAttentionBackend(AttentionBackend):
"""Mock custom mamba attention backend for testing."""
@staticmethod
def get_name():
return "CUSTOM_MAMBA"
@staticmethod
def get_impl_cls():
return CustomMambaAttentionImpl
@staticmethod
def get_builder_cls():
"""Mock builder class."""
return None
@staticmethod
def get_required_kv_cache_layout():
"""Mock KV cache layout."""
return None
def test_custom_is_not_alias_of_any_backend():
# Get all members of AttentionBackendEnum
all_backends = list(AttentionBackendEnum)
# Find any aliases of CUSTOM
aliases = []
for backend in all_backends:
if backend.name != "CUSTOM" and backend is AttentionBackendEnum.CUSTOM:
aliases.append(backend.name)
# CUSTOM should not be an alias of any other backend
assert len(aliases) == 0, (
f"BUG! CUSTOM is an alias of: {', '.join(aliases)}!\n"
f"CUSTOM.value = {repr(AttentionBackendEnum.CUSTOM.value)}\n"
f"This happens when CUSTOM has the same value as another backend.\n"
f"When you register to CUSTOM, you're actually registering to {aliases[0]}!\n"
f"All backend values:\n"
+ "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends)
)
# Verify CUSTOM has its own unique identity
assert AttentionBackendEnum.CUSTOM.name == "CUSTOM", (
f"CUSTOM.name should be 'CUSTOM', but got '{AttentionBackendEnum.CUSTOM.name}'"
)
def test_register_custom_backend_with_class_path():
# Register with explicit class path
register_backend(
backend=AttentionBackendEnum.CUSTOM,
class_path="tests.test_attention_backend_registry.CustomAttentionBackend",
is_mamba=False,
)
# Check that CUSTOM backend is registered
assert AttentionBackendEnum.CUSTOM.is_overridden(), (
"CUSTOM should be overridden after registration"
)
# Get the registered class path
class_path = AttentionBackendEnum.CUSTOM.get_path()
assert class_path == "tests.test_attention_backend_registry.CustomAttentionBackend"
# Get the backend class
backend_cls = AttentionBackendEnum.CUSTOM.get_class()
assert backend_cls.get_name() == "CUSTOM"
assert backend_cls.get_impl_cls() == CustomAttentionImpl
def test_mamba_custom_is_not_alias_of_any_backend():
# Get all mamba backends
all_backends = list(MambaAttentionBackendEnum)
# Find any aliases of CUSTOM
aliases = []
for backend in all_backends:
if backend.name != "CUSTOM" and backend is MambaAttentionBackendEnum.CUSTOM:
aliases.append(backend.name)
# CUSTOM should not be an alias of any other backend
assert len(aliases) == 0, (
f"BUG! MambaAttentionBackendEnum.CUSTOM is an alias of: {', '.join(aliases)}!\n"
f"CUSTOM.value = {repr(MambaAttentionBackendEnum.CUSTOM.value)}\n"
f"All mamba backend values:\n"
+ "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends)
)
def test_register_custom_mamba_backend_with_class_path():
# Register with explicit class path
register_backend(
backend=MambaAttentionBackendEnum.CUSTOM,
class_path="tests.test_attention_backend_registry.CustomMambaAttentionBackend",
is_mamba=True,
)
# Check that the backend is registered
assert MambaAttentionBackendEnum.CUSTOM.is_overridden()
# Get the registered class path
class_path = MambaAttentionBackendEnum.CUSTOM.get_path()
assert (
class_path
== "tests.test_attention_backend_registry.CustomMambaAttentionBackend"
)
# Get the backend class
backend_cls = MambaAttentionBackendEnum.CUSTOM.get_class()
assert backend_cls.get_name() == "CUSTOM_MAMBA"
assert backend_cls.get_impl_cls() == CustomMambaAttentionImpl

View File

@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device):
envs.environment_variables[env_name] = lambda s=strategy: s
# Test the select_experts method
topk_weights, topk_ids, _ = fused_moe.select_experts(
topk_weights, topk_ids = fused_moe.select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
)

View File

@ -38,7 +38,8 @@ TOKENIZERS = [
"EleutherAI/gpt-j-6b",
"EleutherAI/pythia-70m",
"bigscience/bloom-560m",
"mosaicml/mpt-7b",
# FIXME: mosaicml/mpt-7b has been deleted
# "mosaicml/mpt-7b",
"tiiuae/falcon-7b",
"meta-llama/Llama-3.2-1B-Instruct",
"codellama/CodeLlama-7b-hf",

View File

@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer(
"single_tool_add",
"single_tool_weather",
"multiple_tool_calls",
"complex",
"wrong_json",
],
argnames=["model_output", "expected_tool_calls", "expected_content"],
argvalues=[
@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer(
],
None,
),
(
# Complex
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""", # noqa: E501
[
ToolCall(
function=FunctionCall(
name="bash",
arguments=json.dumps(
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
)[:-2],
)
)
],
"hi{hi",
),
(
# Wrong json
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
[
ToolCall(
function=FunctionCall(
name="bash",
arguments=json.dumps(
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
),
)
)
],
"hi{hi",
),
],
)
def test_extract_tool_calls(
@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming(
),
(
# Complex
"""[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
[
ToolCall(
function=FunctionCall(
@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming(
)
)
],
"",
"hi{hi",
),
],
)

View File

@ -106,6 +106,7 @@ class RemoteOpenAIServer:
env.update(env_dict)
serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
print(f"Environment variables: {env}")
self.proc: subprocess.Popen = subprocess.Popen(
serve_cmd,
env=env,

View File

@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int):
assert len(manager.block_pool.cached_block_hash_to_block) == 0
def test_null_parent_block_hash():
block_size = 1
num_cached_blocks = 2
num_full_blocks = 4
pool = BlockPool(
num_gpu_blocks=8,
enable_caching=True,
hash_block_size=block_size,
enable_kv_cache_events=True,
)
req = make_request(
"req_null_parent",
prompt_token_ids=[10, 11, 12, 13],
block_size=block_size,
hash_fn=sha256,
)
assert len(req.block_hashes) == num_full_blocks
# Physical parent is `null_block` (no hash), while the logical parent hash
# still exists in `request.block_hashes[num_cached_blocks - 1]`.
assert pool.null_block.block_hash is None
new_blocks = pool.get_new_blocks(num_full_blocks - 1)
blocks = [
new_blocks[: num_cached_blocks - 1],
pool.null_block, # physical parent
*new_blocks[num_cached_blocks - 1 :],
]
pool.cache_full_blocks(
request=req,
blocks=blocks,
num_cached_blocks=num_cached_blocks,
num_full_blocks=num_full_blocks,
block_size=block_size,
kv_cache_group_id=0,
)
events = pool.take_events()
assert len(events) == 1
event = events[0]
assert isinstance(event, BlockStored)
expected_parent = kv_cache_utils.maybe_convert_block_hash(
req.block_hashes[num_cached_blocks - 1]
)
assert event.parent_block_hash == expected_parent
assert event.parent_block_hash is not None
expected_new_hashes = [
kv_cache_utils.maybe_convert_block_hash(h)
for h in req.block_hashes[num_cached_blocks:num_full_blocks]
]
assert event.block_hashes == expected_new_hashes
# Ensure we didn't accidentally assign a hash to the null block.
assert pool.null_block.block_hash is None
# Sanity check: newly cached physical blocks should have hashes assigned.
assert blocks[num_cached_blocks].block_hash is not None
assert blocks[num_full_blocks - 1].block_hash is not None
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
def test_kv_cache_events_with_lora(blocks_to_cache: int):
"""Test BlockStored events contain correct lora_id when using LoRA requests."""

View File

@ -31,7 +31,7 @@ import openai
import requests
from vllm.assets.image import ImageAsset
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
MAX_OUTPUT_LEN = 256
@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image;base64,{encode_image_base64(image_1)}"
},
"image_url": {"url": encode_image_url(image_1)},
},
{"type": "text", "text": "What's in this image?"},
],
@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image;base64,{encode_image_base64(image_2)}"
},
"image_url": {"url": encode_image_url(image_2)},
},
{
"type": "image_url",

View File

@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
# Use multi-abort to abort multiple requests at once
abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
await engine.abort(abort_request_ids)
await engine.abort(abort_request_ids, internal=False)
# Wait for all tasks to complete
results = await asyncio.gather(*tasks, return_exceptions=True)
@ -609,7 +609,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
await asyncio.sleep(0.5)
# Abort the request
await engine.abort(request_id)
await engine.abort(request_id, internal=False)
# Wait for generation to complete and return final output
final_output = await generated

View File

@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "I am Gyoubu Masataka Oniwa"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
_REQUEST_COUNTER = 0
def make_request() -> EngineCoreRequest:
global _REQUEST_COUNTER
_REQUEST_COUNTER += 1
request_id = f"request-{_REQUEST_COUNTER}"
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
request_id=request_id,
external_req_id=f"{request_id}-{uuid.uuid4()}",
prompt_token_ids=PROMPT_TOKENS,
mm_features=None,
sampling_params=SamplingParams(),

View File

@ -2,12 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import importlib
import os
import signal
import time
import uuid
from dataclasses import dataclass
from threading import Thread
from types import SimpleNamespace
from typing import Any
from unittest.mock import MagicMock
@ -24,7 +26,11 @@ from vllm.usage.usage_lib import UsageContext
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient
from vllm.v1.engine.core_client import (
AsyncMPClient,
EngineCoreClient,
SyncMPClient,
)
from vllm.v1.engine.utils import CoreEngineProcManager
from vllm.v1.executor.abstract import Executor
@ -39,6 +45,8 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "Hello my name is Robert and I love quantization kernels"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
_REQUEST_COUNTER = 0
def make_request(
params: SamplingParams, prompt_tokens_ids: list[int] | None = None
@ -46,8 +54,12 @@ def make_request(
if not prompt_tokens_ids:
prompt_tokens_ids = PROMPT_TOKENS
global _REQUEST_COUNTER
_REQUEST_COUNTER += 1
request_id = f"request-{_REQUEST_COUNTER}"
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
request_id=request_id,
external_req_id=f"{request_id}-{uuid.uuid4()}",
prompt_token_ids=prompt_tokens_ids,
mm_features=None,
sampling_params=params,
@ -60,6 +72,91 @@ def make_request(
)
def _reload_envs_module():
import vllm.envs as envs_mod
cache_clear = getattr(getattr(envs_mod, "__getattr__", None), "cache_clear", None)
if cache_clear is not None:
cache_clear()
return importlib.reload(envs_mod)
def _reload_core_client_module():
module = importlib.import_module("vllm.v1.engine.core_client")
return importlib.reload(module)
def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
timeout_value = 654
monkeypatch.setenv("VLLM_ENGINE_READY_TIMEOUT_S", str(timeout_value))
# Ensure that the environment variable is loaded if caching is enabled
_reload_envs_module()
core_client_mod = _reload_core_client_module()
poll_timeouts: list[int] = []
class ShadowSocket:
def poll(self, timeout: int) -> int:
# Capture the timeout value for each poll call
poll_timeouts.append(timeout)
return 1
def recv_multipart(self):
return (b"\x00\x00", b"ready")
class DummySocket:
def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):
if track:
return SimpleNamespace(done=True)
def recv_multipart(self, *, copy: bool = False):
return (b"", b"")
def close(self, *, linger: int = 0):
pass
def bind(self, _address):
pass
def connect(self, _address):
pass
def setsockopt(self, *_args, **_kwargs):
pass
monkeypatch.setattr(core_client_mod.zmq.Socket, "shadow", lambda *_: ShadowSocket())
monkeypatch.setattr(
core_client_mod, "make_zmq_socket", lambda *_, **__: DummySocket()
)
parallel_config = SimpleNamespace(
data_parallel_size=1,
data_parallel_rank=0,
data_parallel_size_local=1,
data_parallel_rank_local=None,
data_parallel_hybrid_lb=False,
data_parallel_external_lb=False,
)
vllm_config = SimpleNamespace(parallel_config=parallel_config)
client = core_client_mod.MPClient(
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=object,
log_stats=False,
client_addresses={
"input_address": "inproc://input",
"output_address": "inproc://output",
},
)
try:
# timeout_value is in seconds, but poll receives milliseconds
assert poll_timeouts == [timeout_value * 1000]
finally:
client.shutdown()
def loop_until_done(client: EngineCoreClient, outputs: dict):
while True:
engine_core_outputs = client.get_output().outputs

View File

@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
params = SamplingParams(skip_special_tokens=True)
request = EngineCoreRequest(
request_id="test",
external_req_id="test-ext",
prompt_token_ids=prompt_token_ids,
mm_features=None,
sampling_params=params,

View File

@ -58,12 +58,12 @@ def test_incremental_detokenization(
output_processor = OutputProcessor(
dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
)
engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
# Make N requests.
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
request_id=f"request-{idx}-int",
external_req_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -83,6 +83,11 @@ def test_incremental_detokenization(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
@ -438,15 +443,6 @@ def test_logprobs_processor(
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
if num_sample_logprobs is None
else dummy_test_vectors.generation_logprobs,
prompt_logprobs_raw=None
if num_prompt_logprobs is None
else dummy_test_vectors.prompt_logprobs,
)
# Make N requests.
request_id_list = [
@ -454,7 +450,8 @@ def test_logprobs_processor(
]
requests = [
EngineCoreRequest(
request_id=request_id_list[idx],
request_id=request_id_list[idx] + "-int",
external_req_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -476,6 +473,17 @@ def test_logprobs_processor(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
if num_sample_logprobs is None
else dummy_test_vectors.generation_logprobs,
prompt_logprobs_raw=None
if num_prompt_logprobs is None
else dummy_test_vectors.prompt_logprobs,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
@ -621,19 +629,12 @@ def test_stop_token(
]
prompt_string = dummy_test_vectors.prompt_strings[0]
prompt_tokens = dummy_test_vectors.prompt_tokens[0]
engine_core = MockEngineCore(
tokens_list=[generation_tokens],
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
prompt_logprobs_raw=None,
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids,
ignore_eos=ignore_eos,
)
# Make request.
request_id = "request-0"
request = EngineCoreRequest(
request_id=request_id,
external_req_id=request_id + "-ext",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=eos_token_id,
@ -655,6 +656,16 @@ def test_stop_token(
pooling_params=None,
)
engine_core = MockEngineCore(
tokens_list=[generation_tokens],
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
prompt_logprobs_raw=None,
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids,
ignore_eos=ignore_eos,
request_ids=[request.request_id],
)
# Add request to the detokenizer.
output_processor.add_request(request, prompt_string)
@ -720,13 +731,6 @@ def test_stop_string(
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs
else None,
prompt_logprobs_raw=None,
)
# Make N requests.
request_id_list = [
@ -734,7 +738,8 @@ def test_stop_string(
]
requests = [
EngineCoreRequest(
request_id=request_id_list[idx],
request_id=request_id_list[idx] + "-int",
external_req_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -756,6 +761,15 @@ def test_stop_string(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs
else None,
prompt_logprobs_raw=None,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
@ -813,9 +827,12 @@ def test_stop_string(
for idx, (ref_gen_str, stop_str) in enumerate(
zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
):
# Request should be aborted.
# Request should be aborted (check internal ID in abort list).
internal_request_id = f"request-{idx}-int"
assert internal_request_id in aborted
# Use external ID for collecting outputs
request_id = f"request-{idx}"
assert request_id in aborted
# Collected values that were generated.
gen_str = gen_strings[request_id]
@ -848,13 +865,13 @@ def test_stop_string(
def test_iteration_stats(dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()
# Make N requests.
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
external_req_id=f"request-{idx}-ext",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -868,6 +885,11 @@ def test_iteration_stats(dummy_test_vectors):
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add all requests except one to the OutputProcessor.
num_active = len(dummy_test_vectors.generation_tokens) - 1
for request in requests[:num_active]:
@ -922,7 +944,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
output_processor = OutputProcessor(
dummy_test_vectors.tokenizer, log_stats=log_stats
)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()
# Create LoRA requests
@ -936,7 +957,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
lora_assignments = [lora1, lora2, None]
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
request_id=f"request-{idx}-int",
external_req_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -950,6 +972,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
engine_core = MockEngineCore(
dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add all requests to the OutputProcessor
for request in requests:
output_processor.add_request(request, None)
@ -1015,9 +1042,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-0 as finished (it uses lora-1)
# Find and mark request-0-int as finished (it uses lora-1)
for output in outputs.outputs:
if output.request_id == "request-0":
if output.request_id == "request-0-int":
output.finish_reason = FinishReason.LENGTH
break
@ -1040,9 +1067,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-1 as finished (it uses lora-2)
# Find and mark request-1-int as finished (it uses lora-2)
for output in outputs.outputs:
if output.request_id == "request-1":
if output.request_id == "request-1-int":
output.finish_reason = FinishReason.LENGTH
break
@ -1064,9 +1091,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
)
# Find and mark request-2 as finished (it has no LoRA)
# Find and mark request-2-int as finished (it has no LoRA)
for output in outputs.outputs:
if output.request_id == "request-2":
if output.request_id == "request-2-int":
output.finish_reason = FinishReason.LENGTH
break
@ -1107,7 +1134,9 @@ async def test_request_output_collector():
for idx in range(NUM_REQS)
]
collector = RequestOutputCollector(RequestOutputKind.DELTA)
collector = RequestOutputCollector(
RequestOutputKind.DELTA, request_id="my-request-id-int"
)
# CASE 1: Put then get.
outputs = make_outputs()
@ -1163,7 +1192,9 @@ async def test_request_output_collector():
@pytest.mark.asyncio
async def test_cumulative_output_collector_n():
"""Test collector correctly handles multiple outputs by index."""
collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE)
collector = RequestOutputCollector(
RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
)
outputs = [
RequestOutput(
request_id="my-request-id",
@ -1242,11 +1273,13 @@ async def test_cumulative_output_collector_n():
@pytest.mark.parametrize("runner", ["generate", "pooling"])
def test_abort_requests(runner: str, dummy_test_vectors):
@pytest.mark.parametrize("abort_by", ["internal", "external"])
def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
external_req_id=f"external-{idx}",
prompt_token_ids=prompt_tokens,
mm_features=None,
eos_token_id=None,
@ -1265,8 +1298,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
output_kind = request.sampling_params.output_kind
else:
output_kind = request.pooling_params.output_kind
queue = RequestOutputCollector(output_kind=output_kind)
queue = RequestOutputCollector(
output_kind=output_kind, request_id=request.request_id
)
output_processor.add_request(request, None, queue=queue)
for request in requests:
output_processor.abort_requests([request.request_id])
if abort_by == "internal":
output_processor.abort_requests([request.request_id], internal=True)
else:
output_processor.abort_requests([request.external_req_id], internal=False)

View File

@ -4,11 +4,12 @@
from vllm import SamplingParams
from vllm.outputs import CompletionOutput
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.parallel_sampling import ParentRequest
def test_parent_request_to_output_stream() -> None:
parent_request = ParentRequest("parent_id", SamplingParams(n=2))
parent_request = ParentRequest(make_request(SamplingParams(n=2)))
parent_request.child_requests = {"child_id_0", "child_id_1"}
output_0 = CompletionOutput(
index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
)
# Request not finished
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
# output_1 finished
output_1.finish_reason = "ended"
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
# Finished output_1 had already returned, DO NOT returned again
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_0 finished
output_0.finish_reason = "ended"
assert ("parent_id", [output_0], True) == parent_request.get_outputs(
"child_id_0", output_0
)
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
# Finished output_0 had already returned, DO NOT returned again
assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
def test_parent_request_to_output_final_only() -> None:
parent_request = ParentRequest(
"parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
)
parent_request.child_requests = {"child_id_0", "child_id_1"}
output_0 = CompletionOutput(
@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
)
# Request not finished, return nothing
assert parent_request.get_outputs("child_id_0", output_0) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_1 finished, but outputs won't be returned until all child requests finished
output_1.finish_reason = "ended"
assert parent_request.get_outputs("child_id_0", output_0) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
# output_0 finished, as all child requests finished, the output would be returned
output_0.finish_reason = "ended"
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
assert ([output_0, output_1], True) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
assert ([output_0, output_1], True) == parent_request.get_outputs(
"child_id_1", output_1
)
def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
return EngineCoreRequest(
request_id="parent_id",
external_req_id="ext_parent_id",
prompt_token_ids=None,
mm_features=None,
sampling_params=sampling_params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None,
)

View File

@ -5,6 +5,7 @@ import pytest
import torch.cuda
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
@ -14,6 +15,11 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
"""Test that preprocessing errors are handled gracefully."""
if current_platform.is_rocm():
pytest.skip(
"Skipped on ROCm: this test only works with 'fork', but ROCm uses 'spawn'."
)
assert not torch.cuda.is_initialized(), (
"fork needs to be used for the engine "
"core process and this isn't possible if cuda is already initialized"

View File

@ -6,6 +6,7 @@ import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.multimodal import MultiModalUUIDDict
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor
@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
)
captured: dict[str, object] = {}
captured: dict[str, MultiModalUUIDDict] = {}
def fake_preprocess(
prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
)
# Expect request-id-based overrides are passed through
assert captured["mm_uuids"] == {
"image": [f"{request_id}-image-0", f"{request_id}-image-1"],
"video": [f"{request_id}-video-0"],
}
mm_uuids = captured["mm_uuids"]
assert set(mm_uuids.keys()) == {"image", "video"}
assert len(mm_uuids["image"]) == 2
assert len(mm_uuids["video"]) == 1
assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][0].endswith("-0")
assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][1].endswith("-1")
assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
"video"
][0].endswith("-0")

View File

@ -343,6 +343,7 @@ class MockEngineCore:
eos_token_id: int | None = None,
stop_token_ids: list[int] | None = None,
ignore_eos: bool = False,
request_ids: list[str] | None = None,
) -> None:
self.num_requests = len(tokens_list)
self.tokens_list = tokens_list
@ -355,6 +356,11 @@ class MockEngineCore:
self.eos_token_id = eos_token_id
self.stop_token_ids = stop_token_ids
self.ignore_eos = ignore_eos
self.request_ids = (
request_ids
if request_ids is not None
else [f"request-{i}" for i in range(self.num_requests)]
)
def get_outputs(self) -> list[EngineCoreOutput]:
do_logprobs = self.do_logprobs
@ -386,7 +392,7 @@ class MockEngineCore:
prompt_logprobs = None
new_token_id = token_ids[token_idx]
output = EngineCoreOutput(
request_id=f"request-{req_idx}",
request_id=self.request_ids[req_idx],
new_token_ids=[new_token_id],
new_logprobs=logprobs,
new_prompt_logprobs_tensors=prompt_logprobs,

View File

@ -8,7 +8,7 @@ import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
@ -52,9 +52,9 @@ async def client(image_server):
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
def url_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
image_url: encode_image_url(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
client: openai.AsyncOpenAI,
model_name: str,
raw_image_url: str,
base64_encoded_image: dict[str, str],
url_encoded_image: dict[str, str],
):
content_text = "What's in this image?"
messages = [
@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
"content": [
{
"type": "input_image",
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501
"image_url": url_encoded_image[raw_image_url],
"detail": "auto",
},
{"type": "input_text", "text": content_text},

View File

@ -9,7 +9,7 @@ from PIL import Image
from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.config import KVTransferConfig
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
from vllm.platforms import current_platform
MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
@ -74,7 +74,7 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
placeholders = [
{
"type": "image_url",
"image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"},
"image_url": {"url": encode_image_url(image_pil)},
}
for image_pil in image_urls
]

View File

@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
has_kv_transfer_group,
)
from vllm.forward_context import ForwardContext
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.platforms.interface import Platform
from vllm.sampling_params import SamplingParams
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
from vllm.v1.request import RequestStatus
@ -1265,6 +1268,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
run_test_and_cleanup()
class RequestIdMapper:
"""Helper class to map external request IDs to internal request IDs."""
def __init__(self, output_processor: OutputProcessor):
self.req_id_mapping: dict[str, str] = {}
self.original_add_request = output_processor.add_request
output_processor.add_request = self._add_request
def _add_request(self, request: EngineCoreRequest, *args, **kwargs):
self.req_id_mapping[request.external_req_id] = request.request_id
return self.original_add_request(request, *args, **kwargs)
def __call__(self, external_req_id: str) -> str:
return self.req_id_mapping[external_req_id]
def _run_abort_timeout_test(llm: LLM, timeout: int):
"""Helper function to run the abort timeout test logic."""
remote_prefill_opts = {
@ -1286,24 +1305,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
0
].req_to_blocks
id_mapper = RequestIdMapper(llm.llm_engine.output_processor)
def req_id(outputs: list[RequestOutput]) -> str:
assert len(outputs) == 1
return id_mapper(outputs[0].request_id)
padding = "Just making this request a little longer so that we're sure "
"we're not hitting the small-request lower bound beneath which we don't "
"actually trigger the whole kv transfer, but rather just recompute the "
"blocks on D."
_ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
req0_id = req_id(
llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
)
# Request finished but not freed
assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks
assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks
# Some other request, 0 still not freed
_ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
assert "0" in req_to_blocks
assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks
req1_id = req_id(
llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
)
assert req0_id in req_to_blocks
assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks
# Wait for timeout and trigger another scheduler loop
time.sleep(timeout)
_ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
# Request-0 times out and is cleared!
assert "0" not in req_to_blocks
assert req0_id not in req_to_blocks
# Need to shutdown the background thread to release NIXL side channel port
llm.llm_engine.engine_core.shutdown()

View File

@ -306,10 +306,16 @@ def test_prepare_inputs_padded():
proposer = _create_proposer("eagle", num_speculative_tokens)
output_metadata, token_indices_to_sample = proposer.prepare_inputs_padded(
common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count
output_metadata, token_indices_to_sample, num_rejected_tokens_gpu = (
proposer.prepare_inputs_padded(
common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count
)
)
# Verify num_rejected_tokens_gpu is calculated correctly
expected_num_rejected = torch.tensor([1, 0, 2], dtype=torch.int32, device=device)
assert torch.equal(num_rejected_tokens_gpu, expected_num_rejected)
assert output_metadata.max_query_len == 3
assert torch.equal(output_metadata.query_start_loc, expected_query_start_loc)
assert torch.equal(token_indices_to_sample, expected_token_indices_to_sample)

View File

@ -4,7 +4,7 @@
import openai
import pytest
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
from vllm.platforms import current_platform
from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
@ -12,11 +12,9 @@ from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
def url_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_asset: encode_image_base64(
local_asset_server.get_image_asset(image_asset)
)
image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
for image_asset in TEST_IMAGE_ASSETS
}
@ -24,19 +22,16 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
@pytest.mark.asyncio
@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
@pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"])
async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]):
async def test_basic_vision(model_name: str, url_encoded_image: dict[str, str]):
pytest.skip("Skip this test until it's fixed.")
def whats_in_this_image_msg(b64):
def whats_in_this_image_msg(url):
return [
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{b64}"},
},
{"type": "image_url", "image_url": {"url": url}},
],
}
]
@ -63,14 +58,14 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str
# Other requests now should be much faster
for image_url in TEST_IMAGE_ASSETS:
image_base64 = base64_encoded_image[image_url]
chat_completion_from_base64 = await client.chat.completions.create(
image_url = url_encoded_image[image_url]
chat_completion_from_url = await client.chat.completions.create(
model=model_name,
messages=whats_in_this_image_msg(image_base64),
messages=whats_in_this_image_msg(image_url),
max_completion_tokens=24,
temperature=0.0,
)
result = chat_completion_from_base64
result = chat_completion_from_url
assert result
choice = result.choices[0]
assert choice.finish_reason == "length"

View File

@ -4,6 +4,7 @@ import functools
from collections.abc import Callable
import torch
from torch._ops import OpOverload
import vllm.envs as envs
from vllm.platforms import current_platform
@ -433,16 +434,16 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_impl(
from aiter import rmsnorm2d_fwd_with_add
residual_out = torch.empty_like(residual)
output = torch.empty_like(x)
out = torch.empty_like(x)
rmsnorm2d_fwd_with_add(
output, # output
out, # output
x, # input
residual, # residual input
residual_out, # residual output
weight,
variance_epsilon,
)
return output, residual_out
return out, residual_out
def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
@ -451,7 +452,84 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
weight: torch.Tensor,
variance_epsilon: float,
) -> tuple[torch.Tensor, torch.Tensor]:
return torch.empty_like(x), torch.empty_like(residual)
residual_out = torch.empty_like(residual)
out = torch.empty_like(x)
return out, residual_out
def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl(
x: torch.Tensor,
residual: torch.Tensor,
weight: torch.Tensor,
epsilon: float,
quant_dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
import aiter as rocm_aiter
assert quant_dtype in [torch.int8, _FP8_DTYPE]
y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
residual_out = torch.empty_like(x)
rocm_aiter.rmsnorm2d_fwd_with_add_dynamicquant(
out,
x,
residual,
residual_out,
y_scale,
weight,
epsilon,
use_model_sensitive_rmsnorm=0,
)
return out, residual_out, y_scale
def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake(
x: torch.Tensor,
residual: torch.Tensor,
weight: torch.Tensor,
epsilon: float,
quant_dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
residual_out = torch.empty_like(x)
return out, residual_out, y_scale
def _rocm_aiter_rmsnorm_fused_dynamic_quant_impl(
x: torch.Tensor,
weight: torch.Tensor,
epsilon: float,
quant_dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor]:
import aiter as rocm_aiter
assert quant_dtype in [torch.int8, _FP8_DTYPE]
y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
rocm_aiter.rmsnorm2d_fwd_with_dynamicquant(
out, x, y_scale, weight, epsilon, use_model_sensitive_rmsnorm=0
)
return out, y_scale
def _rocm_aiter_rmsnorm_fused_dynamic_quant_fake(
x: torch.Tensor,
weight: torch.Tensor,
epsilon: float,
quant_dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor]:
y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
return out, y_scale
def _rocm_aiter_per_tensor_quant_impl(
@ -527,7 +605,11 @@ def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl(
dtype_quant=AITER_FP8_DTYPE,
res1=residual,
)
return (x_quant, x_quant_scales, res)
return (
x_quant,
res,
x_quant_scales,
)
def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake(
@ -541,8 +623,8 @@ def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake(
scale_shape = (M, (N + group_size - 1) // group_size)
return (
torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device),
torch.empty(scale_shape, dtype=torch.float32, device=x.device),
torch.empty_like(residual, device=residual.device),
torch.empty(scale_shape, dtype=torch.float32, device=x.device),
)
@ -761,7 +843,7 @@ class rocm_aiter_ops:
@classmethod
@if_aiter_supported
def is_linear_fp8_enaled(cls) -> bool:
def is_linear_fp8_enabled(cls) -> bool:
return cls.is_linear_enabled()
@classmethod
@ -901,6 +983,20 @@ class rocm_aiter_ops:
dispatch_key=current_platform.dispatch_key,
)
direct_register_custom_op(
op_name="rocm_aiter_rmsnorm_fused_dynamic_quant",
op_func=_rocm_aiter_rmsnorm_fused_dynamic_quant_impl,
fake_impl=_rocm_aiter_rmsnorm_fused_dynamic_quant_fake,
dispatch_key=current_platform.dispatch_key,
)
direct_register_custom_op(
op_name="rocm_aiter_rmsnorm_fused_add_dynamic_quant",
op_func=_rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl,
fake_impl=_rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake,
dispatch_key=current_platform.dispatch_key,
)
direct_register_custom_op(
op_name="rocm_aiter_rmsnorm_fp8_group_quant",
op_func=_rocm_aiter_rmsnorm_fp8_group_quant_impl,
@ -936,13 +1032,54 @@ class rocm_aiter_ops:
direct_register_custom_op(
op_name="rocm_aiter_per_token_quant",
op_func=_rocm_aiter_per_token_quant_impl,
mutates_args=["scale"],
fake_impl=_rocm_aiter_per_token_quant_fake,
dispatch_key=current_platform.dispatch_key,
)
_OPS_REGISTERED = True
@staticmethod
def get_rmsnorm_fused_add_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default
@staticmethod
def get_rmsnorm_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rms_norm.default
@staticmethod
def get_rmsnorm_fused_add_dynamic_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm_fused_add_dynamic_quant.default
@staticmethod
def get_rmsnorm_fused_dynamic_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm_fused_dynamic_quant.default
@staticmethod
def get_rmsnorm_group_fused_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default
@staticmethod
def get_rmsnorm_group_add_fused_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default
@staticmethod
def get_per_token_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_per_token_quant.default
@staticmethod
def get_group_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_group_fp8_quant.default
@staticmethod
def get_act_mul_fused_fp8_group_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default
@staticmethod
def rms_norm(
x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
) -> torch.Tensor:
return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
@staticmethod
def rms_norm2d_with_add(
x: torch.Tensor,
@ -954,12 +1091,6 @@ class rocm_aiter_ops:
x, residual, weight, variance_epsilon
)
@staticmethod
def rms_norm(
x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
) -> torch.Tensor:
return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
@staticmethod
def gemm_a8w8(
A: torch.Tensor,

View File

@ -788,20 +788,6 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool:
return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability)
def cutlass_blockwise_scaled_grouped_mm(
output: torch.Tensor,
a: torch.Tensor,
b: torch.Tensor,
scales_a: torch.Tensor,
scales_b: torch.Tensor,
problem_sizes: torch.Tensor,
expert_offsets: torch.Tensor,
):
torch.ops._C.cutlass_blockwise_scaled_grouped_mm(
output, a, b, scales_a, scales_b, problem_sizes, expert_offsets
)
def cutlass_scaled_fp4_mm(
a: torch.Tensor,
b: torch.Tensor,
@ -2342,18 +2328,6 @@ def concat_and_cache_mla(
)
def copy_blocks(
key_caches: list[torch.Tensor],
value_caches: list[torch.Tensor],
block_mapping: torch.Tensor,
) -> None:
torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None:
torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
def swap_blocks(
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
) -> None:

View File

@ -383,18 +383,6 @@ class ipex_ops:
)
return None
@staticmethod
def copy_blocks(
key_caches: list[torch.Tensor],
value_caches: list[torch.Tensor],
block_mapping: torch.Tensor,
) -> None:
torch.xpu.copy_blocks( # type: ignore
key_caches,
value_caches,
block_mapping,
)
@staticmethod
def swap_blocks(
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor

View File

@ -77,7 +77,8 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
)
CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend"
# Placeholder for third-party/custom backends - must be registered before use
CUSTOM = ""
# set to None to avoid alias with other backend, whose value is an empty string
CUSTOM = None
def get_path(self, include_classname: bool = True) -> str:
"""Get the class path for this backend (respects overrides).
@ -139,7 +140,8 @@ class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend"
GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend"
# Placeholder for third-party/custom backends - must be registered before use
CUSTOM = ""
# set to None to avoid alias with other backend, whose value is an empty string
CUSTOM = None
def get_path(self, include_classname: bool = True) -> str:
"""Get the class path for this backend (respects overrides).

View File

@ -136,7 +136,7 @@ class MMEncoderAttention(CustomOp):
cu_seqlens=cu_seqlens,
)
if is_reshaped:
output = output.view(bsz, q_len, -1)
output = output.reshape(bsz, q_len, -1)
return output
def _forward_fa(
@ -174,7 +174,7 @@ class MMEncoderAttention(CustomOp):
fa_version=self._fa_version,
)
if is_reshaped:
output = output.view(bsz, q_len, -1)
output = output.reshape(bsz, q_len, -1)
return output
def forward_native(

Some files were not shown because too many files have changed in this diff Show More