diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index d0965fbd56405..1794df9479e55 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -291,6 +291,7 @@ if __name__ == "__main__": """ Arguments: --version : version string for the current build (e.g., commit hash) + --wheel-dir : directory containing wheel files (default to be same as `version`) --current-objects : path to JSON file containing current S3 objects listing in this version directory --output-dir : directory to store generated index files --alias-to-default : (optional) alias variant name for the default variant @@ -318,6 +319,12 @@ if __name__ == "__main__": required=True, help="Directory to store generated index files", ) + parser.add_argument( + "--wheel-dir", + type=str, + default=None, + help="Directory containing wheel files (default to be same as `version`)", + ) parser.add_argument( "--alias-to-default", type=str, @@ -372,7 +379,7 @@ if __name__ == "__main__": print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") - # keep only "official" files for a non-nightly version (specifed by cli args) + # keep only "official" files for a non-nightly version (specified by cli args) PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") if PY_VERSION_RE.match(version): # upload-wheels.sh ensures no "dev" is in args.version @@ -384,9 +391,10 @@ if __name__ == "__main__": print("Nightly version detected, keeping all wheel files.") # Generate index and metadata, assuming wheels and indices are stored as: - # s3://vllm-wheels/{version}/ + # s3://vllm-wheels/{wheel_dir}/ # s3://vllm-wheels// - wheel_base_dir = Path(output_dir).parent / version + wheel_dir = args.wheel_dir or version + wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/") index_base_dir = Path(output_dir) generate_index_and_metadata( diff --git 
a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 3a218a4bb2e6d..1af7f476ae74b 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then echo "Re-generating indices for /$pure_version/" rm -rf "$INDICES_OUTPUT_DIR/*" mkdir -p "$INDICES_OUTPUT_DIR" - $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg + # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path + $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" fi diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 9a770869b1d17..400c744852ab6 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -162,7 +162,10 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. 
+ # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration Test (API Server 2) @@ -349,7 +352,9 @@ steps: - label: V1 Test e2e + engine # 65min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 + # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. + # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + agent_pool: mi325_8 # grade: Blocking source_file_dependencies: - vllm/ @@ -977,7 +982,10 @@ steps: - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. + # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - label: Multi-Modal Accuracy Eval (Small Models) # 5min @@ -1254,13 +1262,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 
--dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1339,7 +1347,9 @@ steps: # end platform plugin tests # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. + # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y # end io_processor plugins test # begin stat_logger plugins test @@ -1508,7 +1518,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f644504a5b937..7b664c4fa15fe 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1109,13 +1109,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d 
--rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1334,7 +1334,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - 
CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### @@ -1359,6 +1359,7 @@ steps: - vllm/ - .buildkite/scripts/run-prime-rl-test.sh commands: + - nvidia-smi - bash .buildkite/scripts/run-prime-rl-test.sh - label: DeepSeek V2-Lite Accuracy diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 52d57c99fcfb5..65a981a9d6d00 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -145,7 +145,7 @@ steps: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - label: Distributed Tests (2 GPUs)(B200) @@ -171,7 +171,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 
public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager 
--trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d6447649cd89a..4d7a366f05e37 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,6 +15,7 @@ /vllm/lora @jeejeelee /vllm/reasoning @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang +/vllm/tool_parsers @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/distributed/kv_transfer @NickLucche @ApostaC CMakeLists.txt @tlrmchlsmth @LucasWilkinson diff --git a/CMakeLists.txt b/CMakeLists.txt index a14496e035d9a..c46fb18d7bfef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -799,24 +799,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") - message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}") - else() - if (NOT 
${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is " - "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " - "if you intend on running FP8 quantized MoE models on Blackwell.") - else() - message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found " - "in CUDA target architectures") - endif() - endif() # # Machete kernels diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py index b5c16c42de467..7473a41e51406 100755 --- a/benchmarks/benchmark_batch_invariance.py +++ b/benchmarks/benchmark_batch_invariance.py @@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant( random.seed(seed) # Set environment variables - os.environ["VLLM_ATTENTION_BACKEND"] = backend if batch_invariant: os.environ["VLLM_BATCH_INVARIANT"] = "1" else: @@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant( max_model_len=max_model_len, dtype="bfloat16", tensor_parallel_size=tp_size, + attention_config={"backend": backend}, enable_prefix_caching=False, ) init_time = time.perf_counter() - start_init diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/bench_nvfp4_quant.py new file mode 100644 index 0000000000000..7517376535925 --- /dev/null +++ b/benchmarks/kernels/bench_nvfp4_quant.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types +from vllm.triton_utils import triton +from vllm.utils.flashinfer import flashinfer_fp4_quantize + +if not current_platform.has_device_capability(100): + raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)") + 
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +PROVIDER_CFGS = { + "vllm": dict(backend="vllm", enabled=True), + "flashinfer": dict(backend="flashinfer", enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor: + """Compute global scale for FP4 quantization.""" + amax = torch.abs(tensor).max().to(torch.float32) + return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="us (lower is better)", + plot_name="NVFP4 Input Quantization Latency (us)", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensor + a = torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale for activation + a_global_scale = compute_global_scale(a) + + quantiles = [0.5, 0.2, 0.8] + + cfg = PROVIDER_CFGS[provider] + + if cfg["backend"] == "vllm": + # vLLM's FP4 quantization + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: ops.scaled_fp4_quant(a, a_global_scale), + quantiles=quantiles, + ) + elif cfg["backend"] == "flashinfer": + # FlashInfer's FP4 quantization + # Use is_sf_swizzled_layout=True to match vLLM's output format + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ), + quantiles=quantiles, + ) + + # Convert ms to us for better readability at small batch sizes + to_us = lambda t_ms: t_ms * 1000 + return to_us(ms), to_us(max_ms), to_us(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in 
copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str): + """Test accuracy between vLLM and FlashInfer FP4 quantization.""" + # Create input tensor + a = torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale + a_global_scale = compute_global_scale(a) + + # vLLM quantization + vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale) + + # FlashInfer quantization (with swizzled layout to match vLLM's output) + flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ) + flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn) + + # Compare outputs + torch.testing.assert_close( + vllm_fp4, + flashinfer_fp4, + ) + print(f"M={M}, K={K}, dtype={dtype}: PASSED") + + +def test_accuracy(): + """Run accuracy tests across various shapes.""" + print("\n" + "=" * 60) + print("Running accuracy tests: vLLM vs FlashInfer") + print("=" * 60) + + device = "cuda" + dtype = torch.bfloat16 + + # Test various batch sizes and hidden dimensions + Ms = [1, 1024] + Ks = [4096] + + for M in Ms: + for K in Ks: + _test_accuracy_once(M, K, dtype, device) + + print("\nAll accuracy tests passed!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark NVFP4 quantization: vLLM vs FlashInfer" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + parser.add_argument( + "--save-path", + type=str, + default=None, + help="Path to save benchmark results", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="Run accuracy tests", + ) + args = parser.parse_args() + + if args.accuracy: + test_accuracy() + + for K, N, model in prepare_shapes(args): + 
print(f"\n{model}, N={N} K={K}") + benchmark.run( + print_data=True, + save_path=args.save_path, + N=N, + K=K, + ) + + print("\nBenchmark finished!") diff --git a/csrc/cache.h b/csrc/cache.h index cbe44c09eb624..42ccb589683a9 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -9,16 +9,6 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, const torch::Tensor& block_mapping); -// Note: the key_caches and value_caches vectors are constant but -// not the Tensors they contain. The vectors need to be const refs -// in order to satisfy pytorch's C++ operator registration code. -void copy_blocks(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& block_mapping); - -void copy_blocks_mla(std::vector const& kv_caches, - const torch::Tensor& block_mapping); - void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index f11c5f24c12ec..cf26ae544deaa 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel( } // namespace vllm -// Note: the key_caches and value_caches vectors are constant but -// not the Tensors they contain. The vectors need to be const refs -// in order to satisfy pytorch's C++ operator registration code. -void copy_blocks(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& block_mapping) { - int num_layers = key_caches.size(); - TORCH_CHECK(num_layers == value_caches.size()); - if (num_layers == 0) { - return; - } - torch::Device cache_device = key_caches[0].device(); - TORCH_CHECK(cache_device.is_cuda()); - - // Create data structures for the kernel. - // Create an array of pointers to the key and value caches. 
- int64_t key_cache_ptrs[num_layers]; - int64_t value_cache_ptrs[num_layers]; - for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { - key_cache_ptrs[layer_idx] = - reinterpret_cast(key_caches[layer_idx].data_ptr()); - value_cache_ptrs[layer_idx] = - reinterpret_cast(value_caches[layer_idx].data_ptr()); - } - - // block_mapping is a 2D tensor with shape (num_pairs, 2). - int num_pairs = block_mapping.size(0); - - // Move the data structures to the GPU. - // NOTE: This synchronizes the CPU and GPU. - torch::Tensor key_cache_ptrs_tensor = - torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64) - .to(cache_device); - torch::Tensor value_cache_ptrs_tensor = - torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64) - .to(cache_device); - - // Launch the kernel. - const int numel_per_block = key_caches[0][0].numel(); - dim3 grid(num_layers, num_pairs); - dim3 block(std::min(1024, numel_per_block)); - const at::cuda::OptionalCUDAGuard device_guard(cache_device); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { - vllm::copy_blocks_kernel<<>>( - key_cache_ptrs_tensor.data_ptr(), - value_cache_ptrs_tensor.data_ptr(), - block_mapping.data_ptr(), numel_per_block); - })); -} - -// copy blocks kernel for MLA (assumes a joint KV-cache) -void copy_blocks_mla(std::vector const& kv_caches, - const torch::Tensor& block_mapping) { - int num_layers = kv_caches.size(); - if (num_layers == 0) { - return; - } - torch::Device cache_device = kv_caches[0].device(); - TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA"); - - std::vector cache_ptrs(num_layers); - for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { - cache_ptrs[layer_idx] = - reinterpret_cast(kv_caches[layer_idx].data_ptr()); - } - torch::Tensor cache_ptrs_tensor = - torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64) - .to(cache_device); - - int 
num_pairs = block_mapping.size(0); - // We use the stride instead of numel in case the cache is padded for memory - // alignment reasons, we assume the blocks data (inclusive of any padding) - // is contiguous in memory - int mem_footprint_per_block = kv_caches[0].stride(0); - dim3 grid(num_layers, num_pairs); - dim3 block(std::min(1024, mem_footprint_per_block)); - const at::cuda::OptionalCUDAGuard device_guard(cache_device); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] { - vllm::copy_blocks_mla_kernel<<>>( - cache_ptrs_tensor.data_ptr(), - block_mapping.data_ptr(), mem_footprint_per_block); - })); -} - namespace vllm { // Used to copy/convert one element diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu index baff8363162ef..a51e1a347e1d4 100644 --- a/csrc/fused_qknorm_rope_kernel.cu +++ b/csrc/fused_qknorm_rope_kernel.cu @@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel( void const* k_weight_void, // RMSNorm weights for key void const* cos_sin_cache_void, // Pre-computed cos/sin cache int64_t const* position_ids, // Position IDs for RoPE - int const num_tokens // Number of tokens + int const num_tokens, // Number of tokens + int const rotary_dim // Dimension for RoPE ) { #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM) if constexpr ((std::is_same_v) || @@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel( // Calculate cache pointer for this position - similar to // pos_encoding_kernels.cu - T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim; - int const embed_dim = head_dim / 2; + T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim; + int const embed_dim = rotary_dim / 2; T_cache const* cos_ptr = cache_ptr; T_cache const* sin_ptr = cache_ptr + embed_dim; - - if constexpr (interleave) { - // Perform interleaving. Use pre-computed cos/sin values. 
+ int const rotary_lanes = rotary_dim / numElemsPerThread; // rotary range + if (laneId < rotary_lanes) { + if constexpr (interleave) { + // Perform interleaving. Use pre-computed cos/sin values. #pragma unroll - for (int i = 0; i < numElemsPerThread / 2; ++i) { - int const idx0 = 2 * i; - int const idx1 = 2 * i + 1; + for (int i = 0; i < numElemsPerThread / 2; ++i) { + int const idx0 = 2 * i; + int const idx1 = 2 * i + 1; + // Global dimension index in the head + int const dim_idx = laneId * numElemsPerThread + idx0; - float const val0 = elements[idx0]; - float const val1 = elements[idx1]; + float const val0 = elements[idx0]; + float const val1 = elements[idx1]; - int const dim_idx = laneId * numElemsPerThread + idx0; - int const half_dim = dim_idx / 2; - float const cos_val = - CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); - float const sin_val = - CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + int const half_dim = dim_idx / 2; + float const cos_val = + CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); + float const sin_val = + CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); - elements[idx0] = val0 * cos_val - val1 * sin_val; - elements[idx1] = val0 * sin_val + val1 * cos_val; - } - } else { - // Before data exchange with in warp, we need to sync. - __syncwarp(); - // Get the data from the other half of the warp. Use pre-computed cos/sin - // values. -#pragma unroll - for (int i = 0; i < numElemsPerThread; i++) { - elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16); - if (laneId < 16) { - elements2[i] = -elements2[i]; + elements[idx0] = val0 * cos_val - val1 * sin_val; + elements[idx1] = val0 * sin_val + val1 * cos_val; } + } else { + // Before data exchange with in warp, we need to sync. + __syncwarp(); + int pairOffset = (rotary_dim / 2) / numElemsPerThread; + // Get the data from the other half of the warp. Use pre-computed + // cos/sin values. 
+#pragma unroll + for (int i = 0; i < numElemsPerThread; i++) { + elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset); - int dim_idx = laneId * numElemsPerThread + i; - dim_idx = (dim_idx * 2) % head_dim; - int half_dim = dim_idx / 2; - // Use pre-computed cos/sin from cache - float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); - float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + if (laneId < pairOffset) { + elements2[i] = -elements2[i]; + } + int dim_idx = laneId * numElemsPerThread + i; - elements[i] = elements[i] * cos_val + elements2[i] * sin_val; + dim_idx = (dim_idx * 2) % rotary_dim; + int half_dim = dim_idx / 2; + float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); + float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + + elements[i] = elements[i] * cos_val + elements2[i] * sin_val; + } + // __shfl_xor_sync does not provide memfence. Need to sync again. + __syncwarp(); } - // __shfl_xor_sync does not provide memfence. Need to sync again. - __syncwarp(); } - // Store. 
{ vec_T vec; @@ -312,10 +316,10 @@ template void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_q, int const num_heads_k, int const num_heads_v, int const head_dim, - float const eps, void const* q_weight, - void const* k_weight, void const* cos_sin_cache, - bool const interleave, int64_t const* position_ids, - cudaStream_t stream) { + int const rotary_dim, float const eps, + void const* q_weight, void const* k_weight, + void const* cos_sin_cache, bool const interleave, + int64_t const* position_ids, cudaStream_t stream) { constexpr int blockSize = 256; int const warpsPerBlock = blockSize / 32; @@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; case 128: @@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; case 256: @@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; default: @@ -392,8 +396,11 @@ void fused_qk_norm_rope( "Query weights size must match head dimension"); TORCH_CHECK(k_weight.size(0) == head_dim, "Key weights size must match head dimension"); - TORCH_CHECK(cos_sin_cache.size(1) == head_dim, - "Cos/sin cache dimension must match head_dim"); + + TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even"); + TORCH_CHECK(cos_sin_cache.size(1) <= head_dim, + "rotary_dim must be less than or 
equal to head_dim"); + TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() && qkv.scalar_type() == k_weight.scalar_type(), "qkv, q_weight and k_weight must have the same dtype"); @@ -419,7 +426,8 @@ void fused_qk_norm_rope( qkv.data_ptr(), static_cast(num_tokens), static_cast(num_heads_q), static_cast(num_heads_k), static_cast(num_heads_v), static_cast(head_dim), - static_cast(eps), q_weight.data_ptr(), k_weight.data_ptr(), + static_cast(cos_sin_cache.size(1)), static_cast(eps), + q_weight.data_ptr(), k_weight.data_ptr(), cos_sin_cache.data_ptr(), !is_neox, reinterpret_cast(position_ids.data_ptr()), stream); diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 7539f836ecf37..e0438556dfe5c 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -74,6 +74,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + // Get the global scaling factor, which will be applied to the SF. // Note SFScale is the same as next GEMM's alpha, which is // (448.f / (Alpha_A / 6.f)). 
@@ -101,7 +104,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx, colIdx, numCols, SFout); + rowIdx, colIdx, numKTiles, SFout); out_pos = cvt_warp_fp16_to_fp4(out_silu_mul, SFScaleVal, sf_out); diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index 82c53c2375a31..20191a9bc6160 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -25,6 +25,7 @@ #include #include "dispatch_utils.h" +#include "cuda_utils.h" #include "nvfp4_utils.cuh" #include "launch_bounds_utils.h" @@ -44,6 +45,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + int tid = blockIdx.x * blockDim.x + threadIdx.x; int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD; @@ -112,17 +116,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) // (448.f / (Alpha_A / 6.f)). float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - int factor = CVT_FP4_SF_VEC_SIZE * 4; - // The actual output_scales dim is computed from the padded numCols. 
- int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + SFout + output_scale_offset_by_experts[expert_idx] * numKTiles; auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert); out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } @@ -140,6 +140,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + extern __shared__ uint32_t shared_input_offsets[]; // Load input offsets into shared memory. @@ -202,16 +206,13 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + SFout + output_scale_offset_by_experts[expert_idx] * numKTiles; auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert); out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } @@ -222,12 +223,8 @@ void quant_impl(void* output, void* output_scale, void* input, void* input_global_scale, void* input_offset_by_experts, void* output_scale_offset_by_experts, int m_topk, int k, int n_experts, cudaStream_t stream) { - // TODO: this multiProcessorCount should be cached. 
- int device; - cudaGetDevice(&device); - int multiProcessorCount; - cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, - device); + int multiProcessorCount = + get_device_attribute(cudaDevAttrMultiProcessorCount, -1); // Grid, Block size. // Each thread converts 8 values. diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 6d69852bb4e4f..8e38deeb6607f 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -35,7 +35,13 @@ template __host__ __device__ inline Int round_up(Int x, Int y) { static_assert(std::is_integral_v, "round_up argument must be integral type"); - return (x + y - 1) / y * y; + return ((x + y - 1) / y) * y; +} + +// Compute effective rows for grid configuration with swizzled SF layouts. +inline int computeEffectiveRows(int m) { + constexpr int ROW_TILE = 128; + return round_up(m, ROW_TILE); } // Use UE4M3 by default. @@ -49,81 +55,57 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + int sf_m = round_up(numRows, 128); int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE; int sf_n_int = round_up(sf_n_unpadded, 4) / 4; - for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) { - // Each thread writes 4 uint32_t elements. - for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int; - col += blockDim.x * 4) { - SFout[row * sf_n_int + col] = 0x00; - } - } + int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE; // Get the global scaling factor, which will be applied to the SF. // Note SFScale is the same as next GEMM's alpha, which is // (448.f / (Alpha_A / 6.f)). float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0]; - // Input tensor row/col loops. 
- for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { - for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; + // Iterate over all rows and cols including padded ones - + // ensures we visit every single scale factor address to initialize it. + for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) { + for (int colIdx = threadIdx.x; + colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD; colIdx += blockDim.x) { + int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD; + + PackedVec in_vec; int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; - PackedVec in_vec = reinterpret_cast(in)[inOffset]; - // Get the output tensor offset. - // Same as inOffset because 8 elements are packed into one uint32_t. - int64_t outOffset = inOffset; - auto& out_pos = out[outOffset]; + + // If we are outside valid rows OR outside valid columns -> Use Zeros + if (rowIdx >= numRows || elem_idx >= numCols) { + memset(&in_vec, 0, sizeof(PackedVec)); + + } else { + // Valid Region: Load actual data + in_vec = reinterpret_cast(in)[inOffset]; + } auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx, colIdx, numCols, SFout); + rowIdx, colIdx, numKTiles, SFout); - out_pos = + auto out_val = cvt_warp_fp16_to_fp4(in_vec, global_scale, sf_out); + + // We do NOT write output for padding because the 'out' tensor is not + // padded. + if (rowIdx < numRows && elem_idx < numCols) { + // Same as inOffset because 8 elements are packed into one uint32_t. + out[inOffset] = out_val; + } } } } -template -void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, - int64_t* output, int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, cudaStream_t stream) { - // Grid, Block size. - // Each thread converts 8 values. 
- dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); - // Get number of blocks per SM - int const numBlocksPerSM = - vllm_runtime_blocks_per_sm(static_cast(block.x)); - dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); - - // Launch the cvt kernel. - if (useUE8M0) { - cvt_fp16_to_fp4<<>>( - m, n, input, SFScale, reinterpret_cast(output), - reinterpret_cast(SFOuput)); - } else { - cvt_fp16_to_fp4<<>>( - m, n, input, SFScale, reinterpret_cast(output), - reinterpret_cast(SFOuput)); - } -} - -// Instantiate the function. -template void invokeFP4Quantization(int m, int n, half const* input, - float const* SFScale, int64_t* output, - int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, - cudaStream_t stream); - -template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input, - float const* SFScale, int64_t* output, - int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, - cudaStream_t stream); - } // namespace vllm void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, @@ -147,13 +129,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); - // We don't support e8m0 scales at this moment. - bool useUE8M0 = false; + // Grid, Block size. Each thread converts 8 values. + dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); + int effectiveRows = vllm::computeEffectiveRows(m); + dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM)); VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] { using cuda_type = vllm::CUDATypeConverter::Type; auto input_ptr = static_cast(input.data_ptr()); - vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, - sf_out, useUE8M0, multiProcessorCount, stream); + // NOTE: We don't support e8m0 scales at this moment. 
+ vllm::cvt_fp16_to_fp4<<>>( + m, n, input_ptr, input_sf_ptr, reinterpret_cast(output_ptr), + reinterpret_cast(sf_out)); }); -} +} \ No newline at end of file diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh index 48e4959de9793..4c91af85e1514 100644 --- a/csrc/quantization/fp4/nvfp4_utils.cuh +++ b/csrc/quantization/fp4/nvfp4_utils.cuh @@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) { return b; } +// Compute SF output offset for swizzled tensor core layout. +// SF layout: [numMTiles, numKTiles, 32, 4, 4] +// Caller must precompute: numKTiles = (numCols + 63) / 64 template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { +__device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset( + int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) { static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2); // One pair of threads write one SF to global memory. // TODO: stage through smem for packed STG.32 // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. 
- int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) { + return nullptr; } - return nullptr; + + // SF vector index (16 elements share one SF in the K dimension). + int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // Decompose indices using bitwise ops (all divisors are powers of 2). + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + int32_t mTileIdx = mIdx >> 7; // mIdx / 128 + int32_t outerMIdx = mIdx & 31; // mIdx % 32 + int32_t innerMIdx = (mIdx >> 5) & 3; // (mIdx / 32) % 4 + int32_t kTileIdx = kIdx >> 2; // kIdx / 4 + int32_t innerKIdx = kIdx & 3; // kIdx % 4 + + // Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 + + // outerMIdx * 16 + innerMIdx * 4 + innerKIdx + // Use bitwise OR for non-overlapping lower bits. 
+ int64_t SFOffset = (static_cast(mTileIdx) * numKTiles + kTileIdx) + << 9 | + (outerMIdx << 4) | (innerMIdx << 2) | innerKIdx; + + return reinterpret_cast(SFout) + SFOffset; } // Quantizes the provided PackedVec into the uint32_t output diff --git a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu deleted file mode 100644 index 6c8f6309ef43f..0000000000000 --- a/csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu +++ /dev/null @@ -1,373 +0,0 @@ -#include "core/registration.h" - -#include -#include - -#include -#include -#include - -#include "cute/tensor.hpp" -#include "cutlass/tensor_ref.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/thread/linear_combination.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/gemm/group_array_problem_shape.hpp" -#include "cutlass/gemm/collective/collective_builder.hpp" -#include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/kernel/gemm_universal.hpp" - -#include "cutlass/util/command_line.h" -#include "cutlass/util/distribution.h" -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/tensor_view_io.h" -#include "cutlass/util/reference/device/gemm.h" -#include "cutlass/util/reference/device/tensor_compare.h" -#include "cutlass/util/reference/host/tensor_fill.h" -#include "cutlass/util/reference/host/gett.hpp" -#include "cutlass/util/reference/host/tensor_norm.h" -#include "cutlass/util/reference/host/tensor_compare.h" -#include - -using namespace cute; - -template -__global__ void get_ggemm_starts( - int32_t* expert_offsets, ElementAB** a_offsets, ElementAB** b_offsets, - ElementC** out_offsets, ElementAccumulator** a_scale_offsets, - ElementAccumulator** b_scale_offsets, ElementAB* a_base_as_int, - ElementAB* b_base_as_int, 
ElementC* out_base_as_int, - ElementAccumulator* a_scale_base_as_int, - ElementAccumulator* b_scale_base_as_int, LayoutSFA* layout_sfa_base_as_int, - LayoutSFB* layout_sfb_base_as_int, int* problem_sizes) { - int expert_id = threadIdx.x; - - if (expert_id >= gridDim.x * blockDim.x) { - return; - } - - int m = problem_sizes[expert_id * 3]; - int n = problem_sizes[expert_id * 3 + 1]; - int k = problem_sizes[expert_id * 3 + 2]; - - int32_t expert_offset = expert_offsets[expert_id]; - int a_stride = expert_offset * k; - int b_stride = expert_id * k * n; - int a_scale_stride = expert_offset * k / 128; - int b_scale_stride = expert_id * k * n / 128 / 128; - - a_offsets[expert_id] = a_base_as_int + a_stride; - b_offsets[expert_id] = b_base_as_int + b_stride; - out_offsets[expert_id] = out_base_as_int + expert_offset * n; - a_scale_offsets[expert_id] = a_scale_base_as_int + a_scale_stride; - b_scale_offsets[expert_id] = b_scale_base_as_int + b_scale_stride; - - LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id; - LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id; - - *layout_sfa_ptr = - ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1)); - *layout_sfb_ptr = - ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1)); -} - -#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, \ - ScaleConfig) \ - else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ - get_ggemm_starts<<<1, num_experts, 0, stream>>>( \ - static_cast(expert_offsets.data_ptr()), \ - static_cast(a_ptrs.data_ptr()), \ - static_cast(b_ptrs.data_ptr()), \ - static_cast(out_ptrs.data_ptr()), \ - static_cast(a_scales_ptrs.data_ptr()), \ - static_cast(b_scales_ptrs.data_ptr()), \ - static_cast(a_tensors.data_ptr()), \ - static_cast(b_tensors.data_ptr()), \ - static_cast(out_tensors.data_ptr()), \ - static_cast(a_scales.data_ptr()), \ - static_cast(b_scales.data_ptr()), \ - reinterpret_cast(layout_sfa.data_ptr()), \ - 
reinterpret_cast(layout_sfb.data_ptr()), \ - static_cast(problem_sizes.data_ptr())); \ - } - -template -void run_get_ggemm_starts( - torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, - torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, - torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, - torch::Tensor const& a_tensors, torch::Tensor const& b_tensors, - torch::Tensor out_tensors, torch::Tensor const& a_scales, - torch::Tensor const& b_scales, torch::Tensor const& layout_sfa, - torch::Tensor const& layout_sfb, torch::Tensor const& problem_sizes) { - TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0); - TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0); - - int num_experts = (int)expert_offsets.size(0); - auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); - - if (false) { - } - __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA, - LayoutSFB, ScaleConfig) - __CALL_GET_STARTS_KERNEL(torch::kFloat16, cutlass::half_t, LayoutSFA, - LayoutSFB, ScaleConfig) - else { - TORCH_CHECK(false, "Unsupported output tensor type"); - } -} - -template -void run_blockwise_scaled_group_mm( - torch::Tensor& out_ptrs, const torch::Tensor& a_ptrs, - const torch::Tensor& b_ptrs, const torch::Tensor& a_scales_ptrs, - const torch::Tensor& b_scales_ptrs, const torch::Tensor& stride_a, - const torch::Tensor& stride_b, const torch::Tensor& stride_c, - const torch::Tensor& layout_sfa, const torch::Tensor& layout_sfb, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - using ProblemShape = cutlass::gemm::GroupProblemShape>; - - // Types - using ElementA = cutlass::float_e4m3_t; - using ElementB = cutlass::float_e4m3_t; - using ElementC = 
OutType; - using ElementD = ElementC; - using ElementAccumulator = float; - using LayoutA = cutlass::layout::RowMajor; - using LayoutB = cutlass::layout::ColumnMajor; - using LayoutC = LayoutD; - - // Alignments - static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; - static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - static constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; - - using ArchTag = cutlass::arch::Sm100; - using OperatorClass = cutlass::arch::OpClassTensorOp; - - using CollectiveEpilogue = - typename cutlass::epilogue::collective::CollectiveBuilder< - ArchTag, OperatorClass, typename ScheduleConfig::MmaTileShape, - typename ScheduleConfig::ClusterShape, - cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, - ElementAccumulator, void, LayoutC*, AlignmentC, ElementD, LayoutC*, - AlignmentC, typename ScheduleConfig::EpilogueSchedule>::CollectiveOp; - - using CollectiveMainloop = - typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, OperatorClass, ElementA, - cute::tuple, - AlignmentA, ElementB, - cute::tuple, - AlignmentB, ElementAccumulator, typename ScheduleConfig::MmaTileShape, - typename ScheduleConfig::ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - typename ScheduleConfig::KernelSchedule>::CollectiveOp; - - using GemmKernel = - cutlass::gemm::kernel::GemmUniversal; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - using StrideA = typename Gemm::GemmKernel::InternalStrideA; - using StrideB = typename Gemm::GemmKernel::InternalStrideB; - using StrideC = typename Gemm::GemmKernel::InternalStrideC; - using StrideD = typename Gemm::GemmKernel::InternalStrideD; - - using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape; - int num_experts = (int)expert_offsets.size(0); - - Gemm gemm_op; - - // Mainloop Arguments - typename GemmKernel::MainloopArguments mainloop_args{ - 
static_cast(a_ptrs.data_ptr()), - static_cast(stride_a.data_ptr()), - static_cast(b_ptrs.data_ptr()), - static_cast(stride_b.data_ptr()), - static_cast(a_scales_ptrs.data_ptr()), - reinterpret_cast( - layout_sfa.data_ptr()), - static_cast(b_scales_ptrs.data_ptr()), - reinterpret_cast( - layout_sfb.data_ptr())}; - - int device_id = a_ptrs.device().index(); - static const cutlass::KernelHardwareInfo hw_info{ - device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count( - device_id)}; - - // Epilogue Arguments - typename GemmKernel::EpilogueArguments epilogue_args{ - {}, // epilogue.thread - nullptr, - static_cast(stride_c.data_ptr()), - static_cast(out_ptrs.data_ptr()), - static_cast(stride_c.data_ptr())}; - - UnderlyingProblemShape* problem_sizes_as_shapes = - static_cast(problem_sizes.data_ptr()); - - // Gemm Arguments - typename GemmKernel::Arguments args{ - cutlass::gemm::GemmUniversalMode::kGrouped, - {num_experts, problem_sizes_as_shapes, nullptr}, - mainloop_args, - epilogue_args, - hw_info}; - - at::cuda::CUDAGuard device_guard{(char)a_ptrs.device().index()}; - const cudaStream_t stream = - at::cuda::getCurrentCUDAStream(a_ptrs.get_device()); - - auto can_implement_status = gemm_op.can_implement(args); - TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, - "Failed to implement GEMM"); - - size_t workspace_size = gemm_op.get_workspace_size(args); - auto const workspace_options = - torch::TensorOptions().dtype(torch::kUInt8).device(a_ptrs.device()); - auto workspace = torch::empty(workspace_size, workspace_options); - - auto status = gemm_op.initialize(args, workspace.data_ptr(), stream); - TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM"); - - status = gemm_op.run(stream); - TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM"); -} - -template -void blockwise_scaled_group_mm_dispatch_shape( - torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, - const torch::Tensor& 
scales_a, const torch::Tensor& scales_b, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - struct MmaConfig { - using ElementA = cutlass::float_e4m3_t; - using KernelSchedule = - cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100; - using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm; - using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig< - 1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>; - using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); - using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); - using LayoutC = cutlass::layout::RowMajor; - using MmaTileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_1, _1, _1>; - }; - - int num_experts = (int)expert_offsets.size(0); - - auto a_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto b_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto out_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto a_scales_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto b_scales_ptrs = torch::empty( - {num_experts}, - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - - auto layout_sfa = torch::empty( - {num_experts, 5}, - torch::TensorOptions().dtype(torch::kInt32).device(a.device())); - auto layout_sfb = torch::empty( - {num_experts, 5}, - torch::TensorOptions().dtype(torch::kInt32).device(a.device())); - - auto stride_a = torch::full( - {num_experts}, a.size(1), - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto stride_b = torch::full( - {num_experts}, a.size(1), - torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - auto stride_c = torch::full( - {num_experts}, output.size(1), - 
torch::TensorOptions().dtype(torch::kInt64).device(a.device())); - - torch::TensorOptions options_int = - torch::TensorOptions().dtype(torch::kInt64).device(a.device()); - - run_get_ggemm_starts( - expert_offsets, a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, a, - b, output, scales_a, scales_b, layout_sfa, layout_sfb, problem_sizes); - - run_blockwise_scaled_group_mm( - out_ptrs, a_ptrs, b_ptrs, a_scales_ptrs, b_scales_ptrs, stride_a, - stride_b, stride_c, layout_sfa, layout_sfb, problem_sizes, - expert_offsets); -} - -void cutlass_blockwise_scaled_grouped_mm( - torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, - const torch::Tensor& scales_a, const torch::Tensor& scales_b, - const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets) { - TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor"); - TORCH_CHECK(problem_sizes.size(1) == 3, - "problem_sizes must have shape (num_experts, 3)"); - TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0), - "Number of experts in problem_sizes must match expert_offsets"); - TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, - "problem_sizes must be int32"); - TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn, - "a must be kFloat8_e4m3fn"); - TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn, - "b must be kFloat8_e4m3fn"); - TORCH_CHECK(output.scalar_type() == torch::kBFloat16 || - output.scalar_type() == torch::kHalf, - "output must be bfloat16 or half"); - TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, - "scales_a must be float32"); - TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, - "scales_b must be float32"); - TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32, - "expert_offsets must be int32"); - - TORCH_CHECK(output.dim() == 2, "output must be 2D tensor"); - TORCH_CHECK(a.dim() == 2, "a must be 2D tensor"); - TORCH_CHECK(b.dim() == 3, "b must be 3D tensor"); - TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor"); - 
TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor"); - TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor"); - TORCH_CHECK(problem_sizes.size(1) == 3, - "problem_sizes must have shape (num_experts, 3)"); - TORCH_CHECK(problem_sizes.size(0) == expert_offsets.size(0), - "Number of experts in problem_sizes must match expert_offsets"); - TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, - "problem_sizes must be int32"); - TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor"); - -#if defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100 - if (output.scalar_type() == torch::kBFloat16) { - blockwise_scaled_group_mm_dispatch_shape( - output, a, b, scales_a, scales_b, problem_sizes, expert_offsets); - } else if (output.scalar_type() == torch::kFloat16) { - blockwise_scaled_group_mm_dispatch_shape( - output, a, b, scales_a, scales_b, problem_sizes, expert_offsets); - } else { - TORCH_CHECK(false, "Unsupported output tensor type"); - } -#endif -} - -TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { - m.impl("cutlass_blockwise_scaled_grouped_mm", - &cutlass_blockwise_scaled_grouped_mm); -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 83d4943d62776..6f2c8e915b5cb 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -416,13 +416,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor alpha) -> ()"); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); - // cutlass blockwise scaledgroup GEMM - ops.def( - "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " - "Tensor scales_a, Tensor scales_b, " - "Tensor problem_sizes, Tensor expert_offsets) -> ()"); - // conditionally compiled so impl registration is in source file - // cutlass nvfp4 block scaled group GEMM ops.def( "cutlass_fp4_group_mm(Tensor! 
out, Tensor a, Tensor b," @@ -692,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()"); cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks); - // Copy the cache blocks from src to dst. - cache_ops.def( - "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, " - "Tensor block_mapping) -> ()"); - cache_ops.impl("copy_blocks", torch::kCUDA, ©_blocks); - - cache_ops.def( - "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()"); - cache_ops.impl("copy_blocks_mla", torch::kCUDA, ©_blocks_mla); - // Reshape the key and value tensors and cache them. cache_ops.def( "reshape_and_cache(Tensor key, Tensor value," diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1b6bdabc7a539..4c09808a14333 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -130,6 +130,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ && uv pip install --system *.whl ARG COMMON_WORKDIR +ARG BASE_IMAGE # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks @@ -144,4 +145,9 @@ ENV SAFETENSORS_FAST_GPU=1 # Performance environment variable. 
ENV HIP_FORCE_DEV_KERNARG=1 +# Workaround for ROCm profiler limits +RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf +ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf" +RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt + CMD ["/bin/bash"] diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index a57ee728d9243..c5e94ee1f6928 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,15 +1,15 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete ARG TRITON_BRANCH="57c693b6" ARG TRITON_REPO="https://github.com/ROCm/triton.git" -ARG PYTORCH_BRANCH="1c57644d" -ARG PYTORCH_VISION_BRANCH="v0.23.0" +ARG PYTORCH_BRANCH="89075173" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" +ARG PYTORCH_VISION_BRANCH="v0.24.1" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_AUDIO_BRANCH="v2.9.0" ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git" ARG FA_BRANCH="0e60e394" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="59bd8ff2" +ARG AITER_BRANCH="6af8b687" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -162,4 +162,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ - && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 72d2053102c22..4168c1570d874 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg 
--dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ - add-apt-repository -y ppa:kobuk-team/intel-graphics + add-apt-repository -y ppa:kobuk-team/intel-graphics-staging RUN apt clean && apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ @@ -47,6 +47,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ -r requirements/xpu.txt +# arctic-inference is built from source which needs torch-xpu properly installed +# used for suffix method speculative decoding +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --no-cache-dir arctic-inference==0.1.1 + ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" COPY . . diff --git a/docs/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md index 37b29aa1a4876..06ad5f29a1a65 100644 --- a/docs/deployment/integrations/kserve.md +++ b/docs/deployment/integrations/kserve.md @@ -2,4 +2,4 @@ vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. -Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe. +You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview). 
diff --git a/docs/deployment/integrations/llm-d.md b/docs/deployment/integrations/llm-d.md new file mode 100644 index 0000000000000..cccf1773c6be6 --- /dev/null +++ b/docs/deployment/integrations/llm-d.md @@ -0,0 +1,5 @@ +# llm-d + +vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers. + +You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview). diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index 05814cbad9bfc..77a159009aa8d 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: - [Helm](frameworks/helm.md) - [InftyAI/llmaz](integrations/llmaz.md) +- [llm-d](integrations/llm-d.md) - [KAITO](integrations/kaito.md) - [KServe](integrations/kserve.md) - [Kthena](integrations/kthena.md) diff --git a/docs/design/paged_attention.md b/docs/design/paged_attention.md index 5cc5878425515..53368ab1a79fa 100644 --- a/docs/design/paged_attention.md +++ b/docs/design/paged_attention.md @@ -139,18 +139,18 @@ token data. const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` -
- ![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" } -
+

+ query +

Each thread defines its own `q_ptr` which points to the assigned query token data on global memory. For example, if `VEC_SIZE` is 4 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs. -
- ![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" } -
+

+ q_vecs +

```cpp __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; @@ -187,9 +187,9 @@ key token at different iterations. As shown above, that `k_ptr` points to key token data based on `k_cache` at assigned block, assigned head and assigned token. -
- ![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" } -
+

+ key +

The diagram above illustrates the memory layout for key data. It assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is @@ -202,9 +202,9 @@ iterations. Inside each rectangle, there are a total 32 vecs (128 elements for one token) that will be processed by 2 threads (one thread group) separately. -
- ![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" } -
+

+ k_vecs +

```cpp K_vec k_vecs[NUM_VECS_PER_THREAD] @@ -361,17 +361,17 @@ later steps. Now, it should store the normalized softmax result of ## Value -
- ![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" } -
+

+ value +

-
- ![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" } -
+

+ logits_vec +

-
- ![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" } -
+

+ v_vec +

Now we need to retrieve the value data and perform dot multiplication with `logits`. Unlike query and key, there is no thread group diff --git a/docs/features/README.md b/docs/features/README.md index e9e5232929b72..b9083b9993159 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -64,7 +64,7 @@ th:not(:first-child) { | [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | +| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) | | [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index b02d5ba9e89a2..5c846767bc5b8 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -8,6 +8,16 @@ We recommend installing the library with: pip install nvidia-modelopt ``` +## Supported ModelOpt checkpoint formats + +vLLM detects ModelOpt checkpoints via `hf_quant_config.json` and supports the +following `quantization.quant_algo` values: + +- `FP8`: per-tensor weight scale (+ optional static activation scale). +- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization. +- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks). +- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`). 
+ ## Quantizing HuggingFace Models with PTQ You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory. @@ -80,3 +90,24 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll if __name__ == "__main__": main() ``` + +## Running the OpenAI-compatible server + +To serve a local ModelOpt checkpoint via the OpenAI-compatible API: + +```bash +vllm serve \ + --quantization modelopt \ + --host 0.0.0.0 --port 8000 +``` + +## Testing (local checkpoints) + +vLLM's ModelOpt unit tests are gated by local checkpoint paths and are skipped +by default in CI. To run the tests locally: + +```bash +export VLLM_TEST_MODELOPT_FP8_PC_PT_MODEL_PATH= +export VLLM_TEST_MODELOPT_FP8_PB_WO_MODEL_PATH= +pytest -q tests/quantization/test_modelopt.py +``` diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index d26a5e217f314..586117272d3ba 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -17,6 +17,16 @@ The E4M3 format offers higher precision compared to E5M2. However, due to its sm For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel). +### How FP8 KV Cache Works + +The FP8 KV cache implementation follows this workflow: + +1. **Storage**: Key and Value tensors are quantized to FP8 format using scaling factors before being stored in the KV cache +2. **Retrieval**: When needed for attention computation, cached KV tensors are dequantized back to higher precision (FP16/BF16) +3. **Attention**: The attention-value multiplication (softmax output × V) is performed using the dequantized higher-precision V tensor + +This means the final attention computation operates on dequantized values, not FP8 tensors. 
The quantization reduces memory usage during storage but maintains computation accuracy by using higher precision during the actual attention operations. + ### Performance Impact The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either: diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 70a11d6def566..7b6945cb71c27 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -352,10 +352,17 @@ Supported models: * `zai-org/GLM-4.5` * `zai-org/GLM-4.5-Air` * `zai-org/GLM-4.6` -* `zai-org/GLM-4.6-Air` Flags: `--tool-call-parser glm45` +### GLM-4.7 Models (`glm47`) + +Supported models: + +* `zai-org/GLM-4.7` + +Flags: `--tool-call-parser glm47` + ### Qwen3-Coder Models (`qwen3_xml`) Supported models: diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 9b93a6b9ac12c..cdbe601ca801a 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -28,3 +28,4 @@ The backends below live **outside** the main `vllm` repository and follow the | Cambricon MLU | `vllm-mlu` | | | Baidu Kunlun XPU | N/A, install from source | | | Sophgo TPU | N/A, install from source | | +| Apple Silicon (Metal) | N/A, install from source | | diff --git a/docs/getting_started/installation/cpu.apple.inc.md b/docs/getting_started/installation/cpu.apple.inc.md index 9f1f6e3821397..c5a4d00ddcf4c 100644 --- a/docs/getting_started/installation/cpu.apple.inc.md +++ b/docs/getting_started/installation/cpu.apple.inc.md @@ -4,6 +4,9 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. +!!! 
tip "GPU-Accelerated Inference with vLLM-Metal" + For GPU-accelerated inference on Apple Silicon using Metal, check out [vllm-metal](https://github.com/vllm-project/vllm-metal), a community-maintained hardware plugin that uses MLX as the compute backend. + # --8<-- [end:installation] # --8<-- [start:requirements] diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 62e42a730e9cb..6838fc227f355 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -387,7 +387,7 @@ th { | `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | | `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | -| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | | `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | @@ -418,7 +418,7 @@ th { | `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | ︎| ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | -| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | +| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | | `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. 
| ✅︎ | ✅︎ | | `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | @@ -490,6 +490,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A | `GteNewModel`C | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | `ModernBertModel`C | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | | `NomicBertModel`C | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | +| `LlamaBidirectionalModel`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ | | `LlamaModel`C, `LlamaForCausalLM`C, `MistralModel`C, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | | `Qwen2Model`C, `Qwen2ForCausalLM`C | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | | `Qwen3Model`C, `Qwen3ForCausalLM`C | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | @@ -543,8 +544,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | | `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | -| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. 
| ✅︎ | ✅︎ | -| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | +| `LlamaBidirectionalForSequenceClassification`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen2ForSequenceClassification`C | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | +| `Qwen3ForSequenceClassification`C | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | | `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | @@ -562,6 +564,11 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A !!! note The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture. +!!! note +    `nvidia/llama-nemotron-rerank-1b-v2` requires a specific prompt format to work correctly. + +    Examples: [offline_using_template.py](../../examples/pooling/score/offline_using_template.py) [online_using_template.py](../../examples/pooling/score/online_using_template.py) + !!! note Load the official original `mxbai-rerank-v2` by using the following command.
diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md index 192a61ea5b903..14b336dffa78f 100644 --- a/docs/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -16,7 +16,7 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain` from langchain_community.llms import VLLM llm = VLLM( - model="mosaicml/mpt-7b", + model="Qwen/Qwen3-4B", trust_remote_code=True, # mandatory for hf models max_new_tokens=128, top_k=10, diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 6a08f872def15..fb4b0b634145b 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -669,6 +669,21 @@ You can find the documentation for cross encoder models at [sbert.net](https://w Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py) +#### Score Template + +Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)). + +Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template. + +Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. 
To access the query and document content, use Jinja's `selectattr` filter: + +- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}` +- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}` + +This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future. + +Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) + #### Single inference You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index be0b846995a92..287409fa2b5c1 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -5,130 +5,91 @@ Usage: Single node: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 + -dp=2 \ + -tp=2 Multi-node: Node 0 (assume the node has ip of 10.99.48.128): python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 \ - --node-size=2 \ - --node-rank=0 \ - --master-addr=10.99.48.128 \ - --master-port=13345 + -dp=2 \ + -tp=2 \ + --dp-num-nodes=2 \ + --dp-node-rank=0 \ + --dp-master-addr=10.99.48.128 \ + --dp-master-port=13345 Node 1: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 \ - --node-size=2 \ - --node-rank=1 \ - --master-addr=10.99.48.128 \ - --master-port=13345 + -dp=2 \ + -tp=2 \ + --dp-num-nodes=2 \ + --dp-node-rank=1 \ + --dp-master-addr=10.99.48.128 \ + --dp-master-port=13345 """ import os from time import sleep -from vllm import LLM, SamplingParams +from 
vllm import LLM, EngineArgs, SamplingParams from vllm.platforms import current_platform +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.network_utils import get_open_port -def parse_args(): - import argparse +def create_parser(): + parser = FlexibleArgumentParser(description="Data Parallel Inference") - parser = argparse.ArgumentParser(description="Data Parallel Inference") + # Add all engine args + EngineArgs.add_cli_args(parser) + parser.set_defaults( + model="ibm-research/PowerMoE-3b", + enable_expert_parallel=True, + ) + + # Add DP-specific args (separate from engine args to avoid conflicts) parser.add_argument( - "--model", + "--dp-num-nodes", + type=int, + default=1, + help="Total number of nodes for data parallel.", + ) + parser.add_argument( + "--dp-node-rank", + type=int, + default=0, + help="Rank of the current node for data parallel.", + ) + parser.add_argument( + "--dp-master-addr", type=str, - default="ibm-research/PowerMoE-3b", - help="Model name or path", - ) - parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size") - parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size") - parser.add_argument( - "--node-size", type=int, default=1, help="Total number of nodes" + default="", + help="Master node IP address for DP coordination.", ) parser.add_argument( - "--node-rank", type=int, default=0, help="Rank of the current node" - ) - parser.add_argument( - "--master-addr", type=str, default="", help="Master node IP address" - ) - parser.add_argument("--master-port", type=int, default=0, help="Master node port") - parser.add_argument( - "--enforce-eager", action="store_true", help="Enforce eager mode execution." - ) - parser.add_argument( - "--trust-remote-code", action="store_true", help="Trust remote code." 
- ) - parser.add_argument( - "--max-num-seqs", + "--dp-master-port", type=int, - default=64, - help=("Maximum number of sequences to be processed in a single iteration."), - ) - parser.add_argument( - "--max-model-len", - type=int, - help=("Maximum number of tokens to be processed in a single iteration."), + default=0, + help="Master node port for DP coordination.", ) parser.add_argument( "--timeout", type=int, default=300, - help=("Number of seconds before unresponsive process is killed."), + help="Number of seconds before unresponsive process is killed.", ) - parser.add_argument( - "--gpu-memory-utilization", - type=float, - default=0.8, - help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."), - ) - parser.add_argument( - "--enable-dbo", - action="store_true", - help=("Enable microbatched execution"), - ) - parser.add_argument( - "--compilation-config", - type=int, - help=("Compilation optimization (O) mode 0-3."), - ) - parser.add_argument( - "--quantization", - type=str, - ) - parser.add_argument( - "--disable-expert-parallel", - dest="enable_expert_parallel", - action="store_false", - help="Disable expert parallel (default: enabled).", - ) - parser.set_defaults(enable_expert_parallel=True) - return parser.parse_args() + + return parser def main( - model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, - GPUs_per_dp_rank, - enforce_eager, - enable_expert_parallel, - trust_remote_code, - max_num_seqs, - max_model_len, - compilation_config, - gpu_memory_utilization, - enable_dbo, - quantization, + engine_args, ): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) @@ -173,19 +134,7 @@ def main( ) # Create an LLM. 
- llm = LLM( - model=model, - tensor_parallel_size=GPUs_per_dp_rank, - enforce_eager=enforce_eager, - enable_expert_parallel=enable_expert_parallel, - trust_remote_code=trust_remote_code, - max_num_seqs=max_num_seqs, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enable_dbo=enable_dbo, - quantization=quantization, - compilation_config=compilation_config, - ) + llm = LLM(**engine_args) outputs = llm.generate(prompts, sampling_params) # Print the outputs. for i, output in enumerate(outputs): @@ -204,22 +153,29 @@ def main( if __name__ == "__main__": - args = parse_args() + parser = create_parser() + args = vars(parser.parse_args()) - dp_size = args.dp_size - tp_size = args.tp_size - node_size = args.node_size - node_rank = args.node_rank + # Extract DP-specific args (pop to remove from engine_args) + dp_size = args.pop("data_parallel_size") + dp_num_nodes = args.pop("dp_num_nodes") + dp_node_rank = args.pop("dp_node_rank") + dp_master_addr = args.pop("dp_master_addr") + dp_master_port = args.pop("dp_master_port") + timeout = args.pop("timeout") - if node_size == 1: + # Remaining args are engine args + engine_args = args + + if dp_num_nodes == 1: dp_master_ip = "127.0.0.1" - dp_master_port = get_open_port() + dp_master_port_val = get_open_port() else: - dp_master_ip = args.master_addr - dp_master_port = args.master_port + dp_master_ip = dp_master_addr + dp_master_port_val = dp_master_port - assert dp_size % node_size == 0, "dp_size should be divisible by node_size" - dp_per_node = dp_size // node_size + assert dp_size % dp_num_nodes == 0, "dp_size should be divisible by dp_num_nodes" + dp_per_node = dp_size // dp_num_nodes from multiprocessing import Process @@ -230,34 +186,24 @@ if __name__ == "__main__": procs = [] for local_dp_rank, global_dp_rank in enumerate( - range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node) + range(dp_node_rank * dp_per_node, (dp_node_rank + 1) * dp_per_node) ): proc = Process( target=main, args=( 
- args.model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, - dp_master_port, - tp_size, - args.enforce_eager, - args.enable_expert_parallel, - args.trust_remote_code, - args.max_num_seqs, - args.max_model_len, - args.compilation_config, - args.gpu_memory_utilization, - args.enable_dbo, - args.quantization, + dp_master_port_val, + engine_args, ), ) proc.start() procs.append(proc) exit_code = 0 for proc in procs: - proc.join(timeout=args.timeout) + proc.join(timeout=timeout) if proc.exitcode is None: print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") proc.kill() diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md index b2c3bb974dfab..2a59f86d15fb7 100644 --- a/examples/online_serving/disaggregated_encoder/README.md +++ b/examples/online_serving/disaggregated_encoder/README.md @@ -38,6 +38,8 @@ Encoder engines should be launched with the following flags: - `--max-num-batched-tokens=` **(default: 2048)** – This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager. +- `--convert "mm_encoder_only"` **(Optional)** - The language model is skipped during initialization to reduce device memory usage. 
**Models using this option must implement the `get_language_model_spec` interface.** + ## Local media inputs To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance: diff --git a/examples/pooling/score/offline_using_template.py b/examples/pooling/score/offline_using_template.py new file mode 100644 index 0000000000000..427cbaab6fbc8 --- /dev/null +++ b/examples/pooling/score/offline_using_template.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +from pathlib import Path + +from vllm import LLM + +model_name = "nvidia/llama-nemotron-rerank-1b-v2" + +# Path to template file +template_path = Path(__file__).parent / "template" / "nemotron-rerank.jinja" +chat_template = template_path.read_text() + +llm = LLM(model=model_name, runner="pooling", trust_remote_code=True) + +query = "how much protein should a female eat?" +documents = [ + "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", + "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. 
: 3 a meeting or series of meetings between the leaders of two or more governments.", + "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.", +] + +outputs = llm.score(query, documents, chat_template=chat_template) + +print("-" * 30) +print([output.outputs.score for output in outputs]) +print("-" * 30) diff --git a/examples/pooling/score/online_using_template.py b/examples/pooling/score/online_using_template.py new file mode 100644 index 0000000000000..66b22e0a9563f --- /dev/null +++ b/examples/pooling/score/online_using_template.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +""" +Example of using the rerank API with template. + +run: + vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja +""" + +import json + +import requests + +url = "http://127.0.0.1:8000/rerank" + +headers = {"accept": "application/json", "Content-Type": "application/json"} + +query = "how much protein should a female eat?" +documents = [ + "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", + "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. 
: 3 a meeting or series of meetings between the leaders of two or more governments.", + "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.", +] + +data = { + "model": "nvidia/llama-nemotron-rerank-1b-v2", + "query": query, + "documents": documents, +} + + +def main(): + response = requests.post(url, headers=headers, json=data) + + # Check the response + if response.status_code == 200: + print("Request successful!") + print(json.dumps(response.json(), indent=2)) + else: + print(f"Request failed with status code: {response.status_code}") + print(response.text) + + +if __name__ == "__main__": + main() diff --git a/examples/pooling/score/template/nemotron-rerank.jinja b/examples/pooling/score/template/nemotron-rerank.jinja new file mode 100644 index 0000000000000..0447d7bcd5d59 --- /dev/null +++ b/examples/pooling/score/template/nemotron-rerank.jinja @@ -0,0 +1,3 @@ +question:{{ (messages | selectattr("role", "eq", "query") | first).content }} + + passage:{{ (messages | selectattr("role", "eq", "document") | first).content }} \ No newline at end of file diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 28ab2cee71a6a..f8a629ed46cee 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -557,7 +557,8 @@ def test_rms_group_quant( # To capture subprocess logs, we need to know whether spawn or fork is used. # Force spawn as it is more general. 
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name) + + model_kwargs["attention_config"] = {"backend": backend.name} compilation_config = CompilationConfig( # Testing properties diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py index 9ccb363b088f5..1fda21dea6361 100644 --- a/tests/compile/test_dynamic_shapes_compilation.py +++ b/tests/compile/test_dynamic_shapes_compilation.py @@ -77,6 +77,7 @@ def test_dynamic_shapes_compilation( "evaluate_guards": evaluate_guards, }, }, + max_model_len=1024, ) output = model.generate(prompt) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 6b72c595cd779..7755e9f9b7380 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import itertools import pytest import torch @@ -53,37 +52,61 @@ class TestModel(torch.nn.Module): hidden_size: int, eps: float, group_shape: GroupShape, - cuda_force_torch: bool, + use_aiter: bool = False, + cuda_force_torch: bool = False, + use_aiter_quant_op: bool = True, *args, **kwargs, ): super().__init__(*args, **kwargs) + self.use_aiter = use_aiter + self.use_aiter_quant_op = use_aiter_quant_op self.cuda_force_torch = cuda_force_torch + self.group_shape = group_shape + self.enable_quant_fp8_custom_op = None # Will be set later if applicable + self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)] - if group_shape.is_per_group(): - self.wscale = [ - torch.rand( - (hidden_size // group_shape[1], hidden_size // group_shape[1]), - dtype=torch.float32, - ) - for _ in range(3) - ] - else: - self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] - static = group_shape == GroupShape.PER_TENSOR + + # Setup quantization scale descriptor + static = group_shape == GroupShape.PER_TENSOR and not 
use_aiter quant_scale = ScaleDesc(torch.float32, static, group_shape) self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True) + + # Setup scales if static: self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] else: self.scale = [None for _ in range(3)] + + # Setup weights self.w = [ torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3) ] - if not group_shape.is_per_group(): + if not group_shape.is_per_group() or use_aiter: self.w = [self.w[0].t() for _ in range(3)] + # Setup weight scales if group_shape.is_per_group(): + scale_size = ( + (hidden_size + 128 - 1) // 128 + if use_aiter + else hidden_size // group_shape[1] + ) + wscale_shape: tuple[int, ...] = (scale_size, scale_size) + else: + wscale_shape = (1,) + self.wscale = [torch.rand(wscale_shape, dtype=torch.float32) for _ in range(3)] + + # Setup FP8 linear operation + is_per_group = group_shape.is_per_group() + if is_per_group and use_aiter: + self.fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(128, 128), + act_quant_group_shape=group_shape, + use_aiter_and_is_supported=use_aiter_quant_op, + ) + # AITER blockwise doesn't use enable_quant_fp8_custom_op + elif is_per_group: self.fp8_linear = W8A8BlockFp8LinearOp( weight_group_shape=GroupShape(group_shape[1], group_shape[1]), act_quant_group_shape=group_shape, @@ -91,6 +114,13 @@ class TestModel(torch.nn.Module): use_aiter_and_is_supported=False, ) self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled() + elif use_aiter: + self.fp8_linear = Fp8LinearOp( + act_quant_static=False, + act_quant_group_shape=group_shape, + ) + self.fp8_linear.quant_fp8.use_aiter = use_aiter_quant_op + self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled() else: with override_cutlass_fp8_supported(not cuda_force_torch): self.fp8_linear = Fp8LinearOp( @@ -100,7 +130,6 @@ class TestModel(torch.nn.Module): self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled() 
self.enable_rms_norm_custom_op = self.norm[0].enabled() - self.group_shape = group_shape def forward(self, x): # avoid having graph input be an arg to a pattern directly @@ -126,19 +155,49 @@ class TestModel(torch.nn.Module): y4, resid = self.norm[3](x4, resid) # use resid here return y4 + def ops_in_model_before(self): + if ( + self.use_aiter + and self.group_shape.is_per_group() + and current_platform.is_fp8_fnuz() + ): + return [rocm_aiter_ops.get_group_quant_op()] + if self.use_aiter and self.group_shape.is_per_group(): + return [torch.ops.vllm.triton_per_token_group_quant_fp8.default] + if self.use_aiter and self.use_aiter_quant_op: + return [rocm_aiter_ops.get_per_token_quant_op()] + if self.use_aiter: + return [QUANT_OPS[self.quant_key]] + if self.enable_quant_fp8_custom_op: + return [QUANT_OPS[self.quant_key]] + return [torch.ops.aten.reciprocal] + def ops_in_model_after(self): + if self.use_aiter and self.group_shape.is_per_group(): + from vllm.compilation.rocm_aiter_fusion import ( + AiterFusedAddRMSFp8GroupQuantPattern, + AiterRMSFp8GroupQuantPattern, + ) + + return [ + AiterFusedAddRMSFp8GroupQuantPattern.FUSED_OP, + AiterRMSFp8GroupQuantPattern.FUSED_OP, + ] + if self.use_aiter: + from vllm.compilation.rocm_aiter_fusion import ( + AiterFusedAddRMSNormDynamicQuantPattern, + AiterRMSNormDynamicQuantPattern, + ) + + return [ + AiterFusedAddRMSNormDynamicQuantPattern.FUSED_OP, + AiterRMSNormDynamicQuantPattern.FUSED_OP, + ] return [ FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)], FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)], ] - def ops_in_model_before(self): - return ( - [QUANT_OPS[self.quant_key]] - if self.enable_quant_fp8_custom_op - else [torch.ops.aten.reciprocal] - ) - def ops_in_model_before_partial(self): return ( [RMS_OP, RMS_ADD_OP] @@ -155,67 +214,45 @@ GROUP_SHAPES = [ ] -class TestRmsnormGroupFp8QuantModel(torch.nn.Module): - def __init__(self, hidden_size: int, eps: float, **kwargs): - super().__init__() - self.w8a8_block_fp8_linear 
= W8A8BlockFp8LinearOp( - weight_group_shape=GroupShape(128, 128), - act_quant_group_shape=GroupShape(1, 128), - cutlass_block_fp8_supported=False, - use_aiter_and_is_supported=True, - ) - self.w = [ - torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() - for _ in range(3) - ] +def _run_fusion_test( + model, + fusion_pass, + vllm_config, + dtype, + hidden_size, + num_tokens, +): + """Helper function for common fusion test logic. - scale_hidden_size = (hidden_size + 128 - 1) // 128 - self.wscale = [ - torch.rand((scale_hidden_size, scale_hidden_size), dtype=torch.float32) - for _ in range(3) - ] + Must be called within vllm_config context. + """ + noop_pass = NoOpEliminationPass(vllm_config) + cleanup_pass = PostCleanupPass(vllm_config) - self.norm_weight = [torch.ones(hidden_size) for _ in range(4)] - self.eps = eps + backend = TestBackend(noop_pass, fusion_pass, cleanup_pass) + backend2 = TestBackend(noop_pass, cleanup_pass) - def forward(self, x): - # avoid having graph input be an arg to a pattern directly - x = resid = torch.relu(x) - y = rocm_aiter_ops.rms_norm(x, self.norm_weight[0], self.eps) + x = torch.rand(num_tokens, hidden_size) + torch._dynamo.mark_dynamic(x, 0) - x2 = self.w8a8_block_fp8_linear.apply(y, self.w[0], self.wscale[0]) - # make sure resid is used for replacement to work - y2, resid = rocm_aiter_ops.rms_norm2d_with_add( - x2, resid, self.norm_weight[1], self.eps - ) + model_fused = torch.compile(model, backend=backend) + result_fused = model_fused(x) - x3 = self.w8a8_block_fp8_linear.apply(y2, self.w[1], self.wscale[1]) + model_unfused = torch.compile(model, backend=backend2) + result_unfused = model_unfused(x) - y3, resid = rocm_aiter_ops.rms_norm2d_with_add( - x3, resid, self.norm_weight[2], self.eps - ) + if dtype == torch.float16: + ATOL, RTOL = (2e-3, 2e-3) + else: + ATOL, RTOL = (1e-2, 1e-2) - x4 = self.w8a8_block_fp8_linear.apply(y3, self.w[2], self.wscale[2]) + torch.testing.assert_close(result_fused, result_unfused, 
atol=ATOL, rtol=RTOL) - y4, resid = rocm_aiter_ops.rms_norm2d_with_add( - x4, resid, self.norm_weight[3], self.eps - ) - return y4 + assert fusion_pass.matched_count == 3 + backend.check_before_ops(model.ops_in_model_before()) + backend.check_after_ops(model.ops_in_model_after()) - def ops_in_model_before(self): - return [ - torch.ops.vllm.rocm_aiter_rms_norm, - torch.ops.vllm.rocm_aiter_group_fp8_quant, - ] - - def ops_in_model_before_partial(self): - return [] - - def ops_in_model_after(self): - return [ - torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant, - torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant, - ] + return backend, backend2 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @@ -223,11 +260,8 @@ class TestRmsnormGroupFp8QuantModel(torch.nn.Module): @pytest.mark.parametrize("num_tokens", [257]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("group_shape", GROUP_SHAPES) -@pytest.mark.parametrize( - "model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op", - list(itertools.product([TestModel], [True, False], [True, False])) - + [(TestRmsnormGroupFp8QuantModel, False, False)], -) +@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False]) +@pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False]) # cuda_force_torch used to test torch code path on platforms that # cutlass_fp8_supported() == True. 
@pytest.mark.parametrize( @@ -242,23 +276,13 @@ def test_fusion_rmsnorm_quant( num_tokens, eps, group_shape, - model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op, cuda_force_torch, ): - if model_class is TestRmsnormGroupFp8QuantModel and not IS_AITER_FOUND: - pytest.skip("AITER is not supported on this GPU.") - - torch.set_default_device("cuda") - torch.set_default_dtype(dtype) - torch.manual_seed(1) - maybe_create_device_identity() # needed for certain non-cutlass fp8 paths - if not enable_quant_fp8_custom_op and group_shape.is_per_group(): pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization") - # Skip test for 64-bit group shape when running with cutlass or deepgemm if group_shape == GroupShape(1, 64) and ( cutlass_block_fp8_supported() or is_deep_gemm_supported() ): @@ -269,6 +293,7 @@ def test_fusion_rmsnorm_quant( custom_ops.append("+rms_norm") if enable_quant_fp8_custom_op: custom_ops.append("+quant_fp8") + vllm_config = VllmConfig( model_config=ModelConfig(dtype=dtype), compilation_config=CompilationConfig( @@ -279,60 +304,97 @@ def test_fusion_rmsnorm_quant( ), ), ) + with vllm.config.set_current_vllm_config(vllm_config): - # Reshape pass is needed for the fusion pass to work - noop_pass = NoOpEliminationPass(vllm_config) - if model_class is TestRmsnormGroupFp8QuantModel: - from vllm.compilation.rocm_aiter_fusion import ( - RocmAiterRMSNormFp8GroupQuantFusionPass, - ) + # Setup device before model creation + torch.set_default_device("cuda") + torch.set_default_dtype(dtype) + torch.manual_seed(1) + maybe_create_device_identity() - fusion_pass = RocmAiterRMSNormFp8GroupQuantFusionPass(vllm_config) - else: - fusion_pass = RMSNormQuantFusionPass(vllm_config) - cleanup_pass = PostCleanupPass(vllm_config) - - backend = TestBackend(noop_pass, fusion_pass, cleanup_pass) - backend2 = TestBackend(noop_pass, cleanup_pass) - model = model_class( + fusion_pass = RMSNormQuantFusionPass(vllm_config) + model = TestModel( 
hidden_size=hidden_size, eps=eps, group_shape=group_shape, + use_aiter=False, cuda_force_torch=cuda_force_torch, ) - # First dimension dynamic - x = torch.rand(num_tokens, hidden_size) - torch._dynamo.mark_dynamic(x, 0) - model_fused = torch.compile(model, backend=backend) - result_fused = model_fused(x) - - model_unfused = torch.compile(model, backend=backend2) - result_unfused = model_unfused(x) - - if dtype == torch.float16: - ATOL, RTOL = (2e-3, 2e-3) - else: - ATOL, RTOL = (1e-2, 1e-2) - - torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL) - - assert fusion_pass.matched_count == 3 - backend.check_before_ops(model.ops_in_model_before()) + backend, _ = _run_fusion_test( + model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens + ) backend.check_before_ops( model.ops_in_model_before_partial(), fully_replaced=False ) - backend.check_after_ops(model.ops_in_model_after()) # If RMSNorm custom op is disabled (native/torch impl used), # there's a risk that the fused add doesn't get included in the # replacement and only the rms part gets fused with quant. # Hence, we check only 2 add nodes are left (final fused rmsnorm add). 
- if ( - not enable_rms_norm_custom_op - and model_class is not TestRmsnormGroupFp8QuantModel - ): + if not enable_rms_norm_custom_op: n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g)) # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each) assert n_add_nodes(backend.graph_pre_pass) == 7 assert n_add_nodes(backend.graph_post_pass) == 2 + + +GROUP_SHAPE_QUANT_OPS_MATCHS = [ + (GroupShape.PER_TOKEN, True), + (GroupShape.PER_TOKEN, False), + (GroupShape(1, 128), True), +] + + +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("hidden_size", [256]) +@pytest.mark.parametrize("num_tokens", [257]) +@pytest.mark.parametrize("eps", [1e-5, 1e-6]) +@pytest.mark.parametrize( + "group_shape, use_aiter_quant_op", GROUP_SHAPE_QUANT_OPS_MATCHS +) +@pytest.mark.skipif( + (not current_platform.is_rocm() or not IS_AITER_FOUND), + reason="Only test on ROCm with aiter package installed", +) +def test_aiter_fusion_rmsnorm_quant( + dtype: torch.dtype, + hidden_size: int, + num_tokens: int, + eps: float, + group_shape: GroupShape, + use_aiter_quant_op: bool, + monkeypatch: pytest.MonkeyPatch, +): + vllm_config = VllmConfig( + model_config=ModelConfig(dtype=dtype), + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + custom_ops=["+rms_norm", "+quant_fp8"], + pass_config=PassConfig(fuse_norm_quant=True, eliminate_noops=True), + ), + ) + + with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m: + from vllm.compilation.rocm_aiter_fusion import RocmAiterRMSNormFusionPass + + m.setenv("VLLM_ROCM_USE_AITER", "1") + rocm_aiter_ops.refresh_env_variables() + + torch.set_default_device("cuda") + torch.set_default_dtype(dtype) + torch.manual_seed(1) + maybe_create_device_identity() + + fusion_pass = RocmAiterRMSNormFusionPass(vllm_config) + model = TestModel( + hidden_size=hidden_size, + eps=eps, + group_shape=group_shape, + use_aiter=True, + use_aiter_quant_op=use_aiter_quant_op, + ) + + _run_fusion_test( + 
model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens + ) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 4cf864bdb2de9..9fe1d906d857e 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -8,7 +8,7 @@ import pytest import pytest_asyncio from vllm.assets.audio import AudioAsset -from vllm.multimodal.utils import encode_audio_base64, fetch_audio +from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio from ...utils import RemoteOpenAIServer @@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]: } +@pytest.fixture(scope="session") +def url_encoded_audio() -> dict[str, str]: + return { + audio_url: encode_audio_url(*fetch_audio(audio_url)) + for audio_url in TEST_AUDIO_URLS + } + + def dummy_messages_from_audio_url( audio_urls: str | list[str], content_text: str = "What's happening in this audio?", @@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded( client: openai.AsyncOpenAI, model_name: str, audio_url: str, - base64_encoded_audio: dict[str, str], + url_encoded_audio: dict[str, str], ): - messages = dummy_messages_from_audio_url( - f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}" - ) + messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url]) # test single completion chat_completion = await client.chat.completions.create( @@ -313,7 +319,7 @@ async def test_chat_streaming_input_audio( "format": "wav", }, }, - {"type": "text", "text": "What's happening in this audio?"}, + {"type": "text", "text": "What's a short title for this audio?"}, ], } ] diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 6f2a50020699c..8ef0d7f277d5f 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -1,6 +1,7 @@ # 
SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib +import importlib.util import json import time @@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages( assert ( "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str): + response = await client.chat.completions.create( + model=model_name, + messages=[{"role": "user", "content": "What is the role of AI in medicine?"}], + temperature=0.0, + max_tokens=250, + ) + + choice = response.choices[0] + assert choice.finish_reason == "length", ( + f"Expected finish_reason='length', got {choice.finish_reason}" + ) + assert choice.message.content is not None, ( + "Content should not be None when truncated" + ) + assert len(choice.message.content) > 0, "Content should not be empty" diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 6fb074b8a19ba..d845913b8ee03 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -955,7 +955,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, ], ) @@ -983,7 +982,6 @@ class TestServingChatWithHarmony: input_messages_2, [ {"role": "system"}, - {"role": "developer"}, {"role": "user"}, # The analysis message should be dropped on subsequent inputs because # of the subsequent assistant message to the final channel. 
@@ -1043,7 +1041,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1124,7 +1122,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1205,7 +1203,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1255,7 +1253,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the third turn's input - req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_3, _ = serving_chat._make_request_with_harmony(req_3) verify_harmony_messages( input_messages_3, @@ -1318,7 +1316,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the fourth turn's input - req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_4, _ = serving_chat._make_request_with_harmony(req_4) verify_harmony_messages( input_messages_4, @@ -1374,7 +1372,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": 
"developer"}, {"role": "user", "content": messages[0]["content"]}, # The reasoning that would have resulted in an analysis message is # dropped because of a later assistant message to the final channel. @@ -1406,7 +1403,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, { "role": "assistant", @@ -1436,7 +1432,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, { "role": "assistant", diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 7ecdac518f97f..e6b8a7f895636 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -7,7 +7,7 @@ import openai import pytest import pytest_asyncio -from vllm.multimodal.utils import encode_video_base64, fetch_video +from vllm.multimodal.utils import encode_video_url, fetch_video from ...utils import RemoteOpenAIServer @@ -48,9 +48,9 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_video() -> dict[str, str]: +def url_encoded_video() -> dict[str, str]: return { - video_url: encode_video_base64(fetch_video(video_url)[0]) + video_url: encode_video_url(fetch_video(video_url)[0]) for video_url in TEST_VIDEO_URLS } @@ -175,11 +175,9 @@ async def test_single_chat_session_video_base64encoded( client: openai.AsyncOpenAI, model_name: str, video_url: str, - base64_encoded_video: dict[str, str], + url_encoded_video: dict[str, str], ): - messages = dummy_messages_from_video_url( - f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" - ) + messages = dummy_messages_from_video_url(url_encoded_video[video_url]) # test single completion chat_completion = await client.chat.completions.create( @@ -223,11 +221,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: 
str, video_url: str, - base64_encoded_video: dict[str, str], + url_encoded_video: dict[str, str], ): - messages = dummy_messages_from_video_url( - f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" - ) + messages = dummy_messages_from_video_url(url_encoded_video[video_url]) chat_completion = await client.chat.completions.create( model=model_name, diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index ae8860ee877b4..bc3da6ef1554a 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -9,7 +9,7 @@ import pytest_asyncio from transformers import AutoProcessor from vllm.multimodal.base import MediaWithBytes -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import encode_image_url, fetch_image from ...utils import RemoteOpenAIServer @@ -35,7 +35,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], [ "The image shows a Venn diagram with three over", - "The image shows a colorful Venn diagram with", + "The image displays a Venn diagram with three over", ], [ "This image displays a gradient of colors ranging from", @@ -70,11 +70,9 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_image(local_asset_server) -> dict[str, str]: +def url_encoded_image(local_asset_server) -> dict[str, str]: return { - image_asset: encode_image_base64( - local_asset_server.get_image_asset(image_asset) - ) + image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset)) for image_asset in TEST_IMAGE_ASSETS } @@ -234,11 +232,11 @@ async def test_single_chat_session_image_base64encoded( model_name: str, raw_image_url: str, image_url: str, - base64_encoded_image: dict[str, str], + url_encoded_image: dict[str, str], ): content_text = "What's in this image?" 
messages = dummy_messages_from_image_url( - f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", + url_encoded_image[raw_image_url], content_text, ) @@ -288,15 +286,13 @@ async def test_single_chat_session_image_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_idx: int, - base64_encoded_image: dict[str, str], + url_encoded_image: dict[str, str], ): # NOTE: This test also validates that we pass MM data through beam search raw_image_url = TEST_IMAGE_ASSETS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] - messages = dummy_messages_from_image_url( - f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" - ) + messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url]) chat_completion = await client.chat.completions.create( model=model_name, diff --git a/tests/entrypoints/pooling/embed/test_online_vision.py b/tests/entrypoints/pooling/embed/test_online_vision.py index eebbcdd2e4396..46b2d8a84d5ae 100644 --- a/tests/entrypoints/pooling/embed/test_online_vision.py +++ b/tests/entrypoints/pooling/embed/test_online_vision.py @@ -10,7 +10,7 @@ from transformers import AutoProcessor from tests.utils import VLLM_PATH, RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.multimodal.base import MediaWithBytes -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import fetch_image MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 @@ -48,14 +48,6 @@ def server(): yield remote_server -@pytest.fixture(scope="session") -def base64_encoded_image(local_asset_server) -> dict[str, str]: - return { - image_url: encode_image_base64(local_asset_server.get_image_asset(image_url)) - for image_url in TEST_IMAGE_ASSETS - } - - def get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained( model_name, trust_remote_code=True, num_crops=4 diff --git 
a/tests/entrypoints/pooling/score/test_utils.py b/tests/entrypoints/pooling/score/test_utils.py new file mode 100644 index 0000000000000..92b443c48825f --- /dev/null +++ b/tests/entrypoints/pooling/score/test_utils.py @@ -0,0 +1,352 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import patch + +import pytest + +from vllm.config import ModelConfig +from vllm.entrypoints.chat_utils import ChatTemplateResolutionError +from vllm.entrypoints.score_utils import get_score_prompt +from vllm.inputs import TokensPrompt +from vllm.tokenizers import get_tokenizer + +# A cross-encoder model for testing +CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2" + + +def assert_prompt_tokenization_consistent( + tokenizer, full_prompt, engine_prompt, add_special_tokens=True +): + """Verify that engine_prompt token_ids match tokenizing full_prompt.""" + expected_ids = tokenizer(full_prompt, add_special_tokens=add_special_tokens)[ + "input_ids" + ] + actual_ids = engine_prompt["prompt_token_ids"] + assert actual_ids == expected_ids, ( + f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}" + ) + + +@pytest.fixture(scope="module") +def cross_encoder_model_config(): + return ModelConfig( + CROSS_ENCODER_MODEL_ID, + runner="pooling", + ) + + +@pytest.fixture(scope="module") +def cross_encoder_tokenizer(cross_encoder_model_config): + return get_tokenizer( + CROSS_ENCODER_MODEL_ID, + trust_remote_code=cross_encoder_model_config.trust_remote_code, + ) + + +@pytest.fixture(scope="module") +def llm_reranker_model_config(): + """Model config for LLM-as-reranker style (no pad token).""" + config = ModelConfig( + CROSS_ENCODER_MODEL_ID, + runner="pooling", + ) + # use_pad_token is a property that reads from hf_config, + # so we set it there to override the default (True) + config.hf_config.use_pad_token = False + return config + + +@pytest.fixture +def tokenization_kwargs(): + 
"""Common tokenization kwargs used across tests.""" + return {"add_special_tokens": True, "return_tensors": None} + + +@pytest.fixture +def mock_model_with_score_template(): + """Mock model class that supports score template and tracks post_process calls.""" + + class MockModelWithScoreTemplate: + supports_score_template = True + post_process_called: list[TokensPrompt] = [] + + @staticmethod + def get_score_template(p1: str, p2: str) -> str: + return f"[QUERY]{p1}[SEP][DOC]{p2}" + + @staticmethod + def post_process_tokens(prompt: TokensPrompt) -> None: + MockModelWithScoreTemplate.post_process_called.append(prompt) + + return MockModelWithScoreTemplate + + +@pytest.fixture +def mock_model_no_score_template(): + """Mock model class that does not support score template.""" + + class MockModelNoScoreTemplate: + supports_score_template = False + + return MockModelNoScoreTemplate + + +class TestGetScorePrompt: + """Tests for the get_score_prompt function.""" + + def test_tokenization_kwargs_passed_through( + self, + llm_reranker_model_config, + cross_encoder_tokenizer, + ): + """Test that tokenization kwargs are properly passed through.""" + data_1 = "Query text" + data_2 = "Document text" + + # Test with truncation - custom kwargs for this test + custom_tokenization_kwargs = { + "add_special_tokens": True, + "return_tensors": None, + "truncation": True, + "max_length": 20, + } + + full_prompt, engine_prompt = get_score_prompt( + llm_reranker_model_config, + cross_encoder_tokenizer, + custom_tokenization_kwargs, + data_1, + data_2, + ) + + assert isinstance(full_prompt, str) + assert "prompt_token_ids" in engine_prompt + # With max_length=20 and truncation, should not exceed this + assert len(engine_prompt["prompt_token_ids"]) <= 20 + # Since truncation was applied, token_ids should be a prefix of full encoding + full_ids = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)[ + "input_ids" + ] + actual_ids = engine_prompt["prompt_token_ids"] + assert 
full_ids[: len(actual_ids)] == actual_ids, ( + f"Token IDs are not a prefix of full encoding.\n" + f"Full IDs: {full_ids}\n" + f"Actual IDs: {actual_ids}" + ) + + def test_model_supports_score_template( + self, + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_with_score_template, + ): + """Test when model supports score template (no score_template arg).""" + with patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_with_score_template, + ): + full_prompt, engine_prompt = get_score_prompt( + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + "query text", + "document text", + ) + + assert full_prompt == "[QUERY]query text[SEP][DOC]document text" + assert "prompt_token_ids" in engine_prompt + assert len(engine_prompt["prompt_token_ids"]) > 0 + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) + + def test_model_supports_score_template_but_custom_template_provided( + self, + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_with_score_template, + ): + """Test when model supports score template but custom template is provided.""" + template = ( + 'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}' + ) + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_with_score_template, + ), + ): + full_prompt, engine_prompt = get_score_prompt( + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + "query", + "doc", + score_template=template, # Providing a template + ) + + assert "prompt_token_ids" in engine_prompt + assert full_prompt == "TEMPLATE_USED query doc" + + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) + + def test_not_using_default_template( + self, + llm_reranker_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + 
mock_model_no_score_template, + ): + # FIXME: Models implementing SupportsScoreTemplate must use their custom + # template implementation by default to preserve existing functionality. + # Attempting to use tokenizer_config.json templates would most likely break + # these models, as often they just inherit the template from the original LLM. + # CLI --chat-template overrides are still supported. + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_no_score_template, + ), + patch( + "vllm.entrypoints.score_utils.apply_hf_chat_template", + return_value="test querytest doc", + ), + ): + full_prompt, engine_prompt = get_score_prompt( + llm_reranker_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + "test query", + "test doc", + ) + + assert full_prompt == "test querytest doc" + assert "prompt_token_ids" in engine_prompt + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) + + def test_fallback_with_pad_token( + self, + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_no_score_template, + ): + """Test fallback path when ChatTemplateResolutionError + and use_pad_token=True.""" + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_no_score_template, + ), + patch( + "vllm.entrypoints.score_utils.apply_hf_chat_template", + side_effect=ChatTemplateResolutionError("No template"), + ), + ): + full_prompt, engine_prompt = get_score_prompt( + cross_encoder_model_config, # use_pad_token=True + cross_encoder_tokenizer, + tokenization_kwargs, + "query", + "document", + ) + + assert "prompt_token_ids" in engine_prompt + # Should have token_type_ids from text_pair encoding + assert "token_type_ids" in engine_prompt + assert "query" in full_prompt + assert "document" in full_prompt + assert full_prompt != "querydocument" + assert ( + engine_prompt["prompt_token_ids"] + == cross_encoder_tokenizer( + 
"query", text_pair="document", add_special_tokens=True + )["input_ids"] + ) + + # FIXME(?): add_special_tokens=False is needed because in this case + # full_prompt is obtained by decoding the tokenized prompt, which includes + # special tokens and we would get duplicated special tokens otherwise. + # This is inconsistent with other cases. + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, + full_prompt, + engine_prompt, + add_special_tokens=False, + ) + + def test_fallback_without_pad_token( + self, + llm_reranker_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_no_score_template, + ): + """Test fallback path when ChatTemplateResolutionError + and use_pad_token=False.""" + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_no_score_template, + ), + patch( + "vllm.entrypoints.score_utils.apply_hf_chat_template", + side_effect=ChatTemplateResolutionError("No template"), + ), + ): + full_prompt, engine_prompt = get_score_prompt( + llm_reranker_model_config, # use_pad_token=False + cross_encoder_tokenizer, + tokenization_kwargs, + "query", + "document", + ) + + assert full_prompt == "querydocument" + assert "prompt_token_ids" in engine_prompt + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) + + def test_post_process_tokens_called( + self, + cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + mock_model_with_score_template, + ): + """Test that post_process_tokens is called on the engine prompt.""" + # Reset the call tracker + mock_model_with_score_template.post_process_called.clear() + + with ( + patch( + "vllm.model_executor.model_loader.get_model_cls", + return_value=mock_model_with_score_template, + ), + patch( + "vllm.entrypoints.score_utils.apply_hf_chat_template", + side_effect=ChatTemplateResolutionError("No template"), + ), + ): + full_prompt, engine_prompt = get_score_prompt( + 
cross_encoder_model_config, + cross_encoder_tokenizer, + tokenization_kwargs, + "query", + "doc", + ) + + # post_process_tokens should have been called once + assert len(mock_model_with_score_template.post_process_called) == 1 + assert mock_model_with_score_template.post_process_called[0] is engine_prompt + assert_prompt_tokenization_consistent( + cross_encoder_tokenizer, full_prompt, engine_prompt + ) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index a87a4c35d3dc7..6df2d26f2f0da 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import ( ) from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import ( - encode_audio_base64, - encode_image_base64, - encode_video_base64, + encode_audio_url, + encode_image_url, + encode_video_url, ) from vllm.tokenizers import get_tokenizer from vllm.tokenizers.mistral import MistralTokenizer @@ -141,22 +141,19 @@ def mistral_model_config(): @pytest.fixture(scope="module") def image_url(): image = ImageAsset("cherry_blossom") - base64 = encode_image_base64(image.pil_image) - return f"data:image/jpeg;base64,{base64}" + return encode_image_url(image.pil_image) @pytest.fixture(scope="module") def video_url(): video = VideoAsset("baby_reading", 1) - base64 = encode_video_base64(video.np_ndarrays) - return f"data:video/jpeg;base64,{base64}" + return encode_video_url(video.np_ndarrays) @pytest.fixture(scope="module") def audio_url(): audio = AudioAsset("mary_had_lamb") - base64 = encode_audio_base64(*audio.audio_and_sample_rate) - return f"data:audio/ogg;base64,{base64}" + return encode_audio_url(*audio.audio_and_sample_rate) def _assert_mm_data_is_image_input( diff --git a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml new file mode 100644 index 0000000000000..9fae32734d753 --- /dev/null +++ 
b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml @@ -0,0 +1,11 @@ +model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: >- + --max-model-len 4096 + --tensor-parallel-size 2 + --enable-expert-parallel + --async-scheduling +env: + VLLM_USE_FLASHINFER_MOE_FP8: "1" diff --git a/tests/evals/gsm8k/configs/models-blackwell.txt b/tests/evals/gsm8k/configs/models-blackwell.txt index 39978aa6ffbe9..c27031d25fb8c 100644 --- a/tests/evals/gsm8k/configs/models-blackwell.txt +++ b/tests/evals/gsm8k/configs/models-blackwell.txt @@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml DeepSeek-V2-Lite-Instruct-FP8.yaml Qwen3-30B-A3B-NVFP4.yaml Qwen3-Next-80B-A3B-NVFP4-EP2.yaml +Qwen3-Next-FP8-EP2.yaml diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py index ea6715f5cb532..dd0d3ae0cca47 100644 --- a/tests/evals/gsm8k/test_gsm8k_correctness.py +++ b/tests/evals/gsm8k/test_gsm8k_correctness.py @@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename): print(f"Number of questions: {eval_config['num_questions']}") print(f"Number of few-shot examples: {eval_config['num_fewshot']}") print(f"Server args: {' '.join(server_args)}") + print(f"Environment variables: {env_dict}") # Launch server and run evaluation with RemoteOpenAIServer( diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index acf46d75d62eb..3f76033254d32 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"] RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"] -@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) -@pytest.mark.parametrize("num_layers", NUM_LAYERS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 
-@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@torch.inference_mode() -def test_copy_blocks( - kv_cache_factory, - num_mappings: int, - num_layers: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - seed: int, - kv_cache_dtype: str, - device: str, -) -> None: - if kv_cache_dtype == "fp8" and head_size % 16: - pytest.skip() - current_platform.seed_everything(seed) - torch.set_default_device(device) - torch.cuda.set_device(device) - # Generate random block mappings where each source block is mapped to two - # destination blocks. - assert 2 * num_mappings <= num_blocks - src_blocks = random.sample(range(num_blocks), num_mappings) - remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) - dst_blocks = random.sample(remaining_blocks, 2 * num_mappings) - block_mapping: list[tuple[int, int]] = [] - for i in range(num_mappings): - src = src_blocks[i] - dst1 = dst_blocks[2 * i] - dst2 = dst_blocks[2 * i + 1] - block_mapping.append((src, dst1)) - block_mapping.append((src, dst2)) - - # Create the KV caches. - key_caches, value_caches = kv_cache_factory( - num_blocks, - block_size, - num_layers, - num_heads, - head_size, - kv_cache_dtype, - dtype, - seed, - device, - ) - - # Clone the KV caches. - cloned_key_caches = [key_cache.clone() for key_cache in key_caches] - cloned_value_caches = [value_cache.clone() for value_cache in value_caches] - - # Call the copy blocks kernel. - block_mapping_tensor = torch.tensor( - block_mapping, dtype=torch.int64, device=device - ).view(-1, 2) - - opcheck( - torch.ops._C_cache_ops.copy_blocks, - (key_caches, value_caches, block_mapping_tensor), - test_utils=DEFAULT_OPCHECK_TEST_UTILS, - cond=(head_size == HEAD_SIZES[0]), - ) - ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) - - # Run the reference implementation. 
- for src, dst in block_mapping: - for cloned_key_cache in cloned_key_caches: - cloned_key_cache[dst].copy_(cloned_key_cache[src]) - for cloned_value_cache in cloned_value_caches: - cloned_value_cache[dst].copy_(cloned_value_cache[src]) - - # Compare the results. - for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): - torch.testing.assert_close(key_cache, cloned_key_cache) - for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches): - torch.testing.assert_close(value_cache, cloned_value_cache) - - @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla( torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1) -@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) -@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) -@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) -@pytest.mark.parametrize("num_layers", NUM_LAYERS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@torch.inference_mode() -def test_copy_blocks_mla( - kv_lora_rank: int, - qk_rope_head_dim: int, - block_size: int, - num_blocks: int, - num_layers: int, - dtype: torch.dtype, - seed: int, - device: str, - kv_cache_dtype: str, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - torch.cuda.set_device(device) - - entry_size = kv_lora_rank + qk_rope_head_dim - - kv_caches = [] - for _ in range(num_layers): - kv_cache = _create_mla_cache( - num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device - ) - _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype) - kv_caches.append(kv_cache) - - ref_caches = [kv_cache.clone() for kv_cache in kv_caches] - - 
num_mappings = min(2, num_blocks // 2) - src_blocks = random.sample(range(num_blocks), num_mappings) - remaining = list(set(range(num_blocks)) - set(src_blocks)) - dst_blocks = random.sample(remaining, 2 * num_mappings) - block_mapping = [] - for i in range(num_mappings): - src = src_blocks[i] - dst1 = dst_blocks[2 * i] - dst2 = dst_blocks[2 * i + 1] - block_mapping.append((src, dst1)) - block_mapping.append((src, dst2)) - block_mapping_tensor = torch.tensor( - block_mapping, dtype=torch.int64, device=device - ).view(-1, 2) - - for src, dst in block_mapping: - for ref_cache in ref_caches: - ref_cache[dst].copy_(ref_cache[src]) - - opcheck( - torch.ops._C_cache_ops.copy_blocks_mla, - (kv_caches, block_mapping_tensor), - test_utils=DEFAULT_OPCHECK_TEST_UTILS, - ) - ops.copy_blocks_mla(kv_caches, block_mapping_tensor) - - for kv_cache, ref_cache in zip(kv_caches, ref_caches): - torch.testing.assert_close(kv_cache, ref_cache) - - @pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) @pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) @pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) diff --git a/tests/kernels/core/test_fused_qk_norm_rope.py b/tests/kernels/core/test_fused_qk_norm_rope.py index a23959e353da9..05d61ec02fd29 100644 --- a/tests/kernels/core/test_fused_qk_norm_rope.py +++ b/tests/kernels/core/test_fused_qk_norm_rope.py @@ -13,6 +13,7 @@ DTYPES = [torch.bfloat16, torch.float16] IS_NEOX = [True, False] EPS_VALUES = [1e-5, 1e-6] SEEDS = [13] +PARTIAL_ROPE = [True, False] CUDA_DEVICES = ["cuda:0"] @@ -52,6 +53,7 @@ def _apply_qk_norm_rope( @pytest.mark.parametrize("is_neox", IS_NEOX) @pytest.mark.parametrize("eps", EPS_VALUES) @pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25]) @torch.inference_mode() def test_fused_qk_norm_rope_matches_reference( device: str, @@ -59,6 +61,7 @@ def test_fused_qk_norm_rope_matches_reference( is_neox: bool, eps: float, seed: int, + rotary_ratio: float, ): 
torch.set_default_device(device) current_platform.seed_everything(seed) @@ -76,10 +79,10 @@ def test_fused_qk_norm_rope_matches_reference( k_norm.weight.data.normal_(mean=1.0, std=0.1) q_weight = q_norm.weight.data k_weight = k_norm.weight.data - + rotary_dim = int(head_dim * rotary_ratio) rope = RotaryEmbedding( head_size=head_dim, - rotary_dim=head_dim, + rotary_dim=rotary_dim, max_position_embeddings=4096, base=10000.0, is_neox_style=is_neox, diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 6078ce44cee9f..537dcae4e74b4 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -258,16 +258,16 @@ class Config: f"{self.fe_supported_types()}." ) - # Check block quanization support - is_block_quatized = self.quant_block_shape is not None - if is_block_quatized and self.quant_dtype is None: + # Check block quantization support + is_block_quantized = self.quant_block_shape is not None + if is_block_quantized and self.quant_dtype is None: return False, "No block quantization support." - if is_block_quatized and not self.is_block_quant_supported(): + if is_block_quantized and not self.is_block_quant_supported(): return False, "Mismatched block quantization support." # deep_gemm only works with block-quantized - if self.needs_deep_gemm() and not is_block_quatized: + if self.needs_deep_gemm() and not is_block_quantized: return False, "Needs DeepGEMM but not block quantized." # Check dependencies (turn into asserts?) 
diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py deleted file mode 100644 index 1c10cb3b2c699..0000000000000 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ /dev/null @@ -1,92 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# DeepGEMM Style Cutlass Grouped GEMM Test -# See https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py - -import random - -import pytest -import torch - -from tests.kernels.moe.utils import per_token_cast_to_fp8 -from tests.kernels.utils import baseline_scaled_mm -from vllm import _custom_ops as ops -from vllm.platforms import current_platform -from vllm.utils.deep_gemm import per_block_cast_to_fp8 -from vllm.utils.math_utils import cdiv - - -@pytest.mark.parametrize( - "num_groups, expected_m_per_group, k, n", - [ - (4, 8192, 7168, 4096), - (4, 8192, 2048, 7168), - (8, 4096, 7168, 4096), - (8, 4096, 2048, 7168), - (32, 1024, 7168, 4096), - (32, 1024, 2048, 7168), - ], -) -@pytest.mark.parametrize("out_dtype", [torch.float16]) -@pytest.mark.skipif( - (lambda x: x is None or x.to_int() != 100)( - current_platform.get_device_capability() - ), - reason="Block Scaled Grouped GEMM is only supported on SM100.", -) -def test_cutlass_grouped_gemm( - num_groups: int, - expected_m_per_group: int, - k: int, - n: int, - out_dtype: torch.dtype, -): - device = "cuda" - alignment = 128 - group_ms = [ - int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups) - ] - m = sum([cdiv(m, alignment) * alignment for m in group_ms]) - - x = torch.randn((m, k), device=device, dtype=out_dtype) - y = torch.randn((num_groups, n, k), device=device, dtype=out_dtype) - out = torch.empty((m, n), device=device, dtype=out_dtype) - ref_out = torch.randn((m, n), device=device, dtype=out_dtype) - - ep_offset = [0] + [sum(group_ms[:i]) for i in range(1, num_groups)] + [m] - pb_size = [] - for i in 
range(num_groups): - pb_size.append([ep_offset[i + 1] - ep_offset[i], n, k]) - problem_sizes = torch.tensor(pb_size, device=device, dtype=torch.int32) - expert_offsets = torch.tensor(ep_offset, device=device, dtype=torch.int32) - - x_fp8 = per_token_cast_to_fp8(x) - y_fp8 = ( - torch.empty_like(y, dtype=torch.float8_e4m3fn), - torch.empty( - (num_groups, cdiv(n, 128), k // 128), device=device, dtype=torch.float - ), - ) - for i in range(num_groups): - y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i], [128, 128]) - - for i in range(num_groups): - a = x_fp8[0][ep_offset[i] : ep_offset[i + 1]] - a_scale = x_fp8[1][ep_offset[i] : ep_offset[i + 1]] - b = y_fp8[0][i].t() - b_scale = y_fp8[1][i].t() - baseline = baseline_scaled_mm(a, b, a_scale, b_scale, out_dtype) - ref_out[ep_offset[i] : ep_offset[i + 1]] = baseline - - ops.cutlass_blockwise_scaled_grouped_mm( - out, - x_fp8[0], - y_fp8[0], - x_fp8[1], - y_fp8[1], - problem_sizes, - expert_offsets[:-1], - ) - - torch.testing.assert_close(ref_out, out, atol=5e-1, rtol=1e-3) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index ce99d9691fdc8..fd6ce6bfbd782 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +from vllm.v1.worker.workspace import init_workspace_manager NUM_EXPERTS = [8, 64, 192] EP_SIZE = [1, 4] @@ -487,6 +488,7 @@ def test_mixtral_moe( monkeypatch.setenv("MASTER_ADDR", "localhost") monkeypatch.setenv("MASTER_PORT", "12345") init_distributed_environment() + init_workspace_manager(torch.cuda.current_device()) # Instantiate our and huggingface's MoE blocks vllm_config.compilation_config.static_forward_context = dict() @@ -533,6 +535,11 @@ def test_mixtral_moe( torch.cuda.synchronize() 
torch.cuda.empty_cache() + # FIXME (zyongye) fix this after we move self.kernel + # assignment in FusedMoE.__init__ + + vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts) + # Run forward passes for both MoE blocks hf_states, _ = hf_moe.forward(hf_inputs) vllm_states = vllm_moe.forward(vllm_inputs) diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py index 189cdbae99dcd..11e05a635c1d1 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import tempfile +from pathlib import Path import mteb import numpy as np @@ -19,6 +20,11 @@ from tests.models.utils import ( get_vllm_extra_kwargs, ) +template_home = ( + Path(__file__).parent.parent.parent.parent.parent + / "examples/pooling/score/template" +) + # Most embedding models on the STS12 task (See #17175): # - Model implementation and minor changes in tensor dtype # results in differences less than 1e-4 @@ -102,30 +108,6 @@ class VllmMtebEncoder(mteb.EncoderProtocol): return sim -class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): - mteb_model_meta = _empty_model_meta - - def __init__(self, vllm_model): - self.llm = vllm_model - self.rng = np.random.default_rng(seed=42) - - def predict( - self, - inputs1: DataLoader[mteb.types.BatchedInput], - inputs2: DataLoader[mteb.types.BatchedInput], - *args, - **kwargs, - ) -> np.ndarray: - queries = [text for batch in inputs1 for text in batch["text"]] - corpus = [text for batch in inputs2 for text in batch["text"]] - - outputs = self.llm.score( - queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False - ) - scores = np.array(outputs) - return scores - - class OpenAIClientMtebEncoder(VllmMtebEncoder): def __init__(self, model_name: str, client): self.model_name = model_name @@ -153,6 +135,35 @@ class 
OpenAIClientMtebEncoder(VllmMtebEncoder): return embeds +class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): + mteb_model_meta = _empty_model_meta + + def __init__(self, vllm_model): + self.llm = vllm_model + self.rng = np.random.default_rng(seed=42) + self.chat_template: str | None = getattr(vllm_model, "chat_template", None) + + def predict( + self, + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for text in batch["text"]] + + outputs = self.llm.score( + queries, + corpus, + truncate_prompt_tokens=-1, + use_tqdm=False, + chat_template=self.chat_template, + ) + scores = np.array(outputs) + return scores + + class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): mteb_model_meta = _empty_model_meta @@ -387,6 +398,11 @@ def mteb_test_rerank_models( == model_info.default_pooling_type ) + chat_template: str | None = None + if model_info.chat_template_name is not None: + chat_template = (template_home / model_info.chat_template_name).read_text() + vllm_model.chat_template = chat_template + vllm_main_score = run_mteb_rerank( vllm_mteb_encoder(vllm_model), tasks=MTEB_RERANK_TASKS, diff --git a/tests/models/language/pooling_mteb_test/test_nemotron.py b/tests/models/language/pooling_mteb_test/test_nemotron.py new file mode 100644 index 0000000000000..167c3fcf50d1c --- /dev/null +++ b/tests/models/language/pooling_mteb_test/test_nemotron.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from tests.models.utils import ( + EmbedModelInfo, + LASTPoolingEmbedModelInfo, + LASTPoolingRerankModelInfo, + RerankModelInfo, +) + +from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models + +EMBEDDING_MODELS = [ + LASTPoolingEmbedModelInfo( + "nvidia/llama-nemotron-embed-1b-v2", + 
architecture="LlamaBidirectionalModel", + mteb_score=0.689164662128673, + ) +] + +RERANK_MODELS = [ + LASTPoolingRerankModelInfo( + "nvidia/llama-nemotron-rerank-1b-v2", + architecture="LlamaBidirectionalForSequenceClassification", + chat_template_name="nemotron-rerank.jinja", + mteb_score=0.33994, + ), +] + + +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) +def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: + mteb_test_embed_models(hf_runner, vllm_runner, model_info) + + +@pytest.mark.parametrize("model_info", RERANK_MODELS) +def test_rerank_models_mteb( + hf_runner, vllm_runner, model_info: RerankModelInfo +) -> None: + mteb_test_rerank_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/multimodal/conftest.py b/tests/models/multimodal/conftest.py index 4243298cdc896..31d99218c8276 100644 --- a/tests/models/multimodal/conftest.py +++ b/tests/models/multimodal/conftest.py @@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items): return # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers - # accuracy issues + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_mem_efficient_sdp(False) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 6640e1ff9474d..299f57f6c4f86 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = { max_model_len=8192, use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, + num_logprobs=10 if current_platform.is_rocm() else 5, ), "intern_vl-hf": VLMTestInfo( models=["OpenGVLab/InternVL3-1B-hf"], diff --git a/tests/models/multimodal/generation/test_keye.py b/tests/models/multimodal/generation/test_keye.py 
index 6f98bde1d91ea..4205a8b2d1ac4 100644 --- a/tests/models/multimodal/generation/test_keye.py +++ b/tests/models/multimodal/generation/test_keye.py @@ -8,7 +8,7 @@ from PIL.Image import Image from transformers import AutoProcessor from vllm import LLM, EngineArgs, SamplingParams -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview" @@ -31,10 +31,7 @@ def test_keye_vl( question: str, ): images = [asset.pil_image for asset in image_assets] - - image_urls = [ - f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images - ] + image_urls = [encode_image_url(image) for image in images] engine_args = EngineArgs( model=MODEL_NAME, diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py index a4e4ce312ddd4..8cea6135ba6a2 100644 --- a/tests/models/multimodal/generation/test_vit_backend_functionality.py +++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py @@ -15,7 +15,7 @@ from transformers import AutoProcessor from vllm import LLM, EngineArgs, SamplingParams from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url from vllm.multimodal.video import sample_frames_from_video from vllm.platforms import current_platform @@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config): """Build Dots.OCR specific prompt with OCR instructions.""" # Use only stop_sign image for Dots.OCR image = images[0] # Already filtered to stop_sign - - image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}" + image_url = encode_image_url(image) placeholders = [{"type": "image_url", "image_url": {"url": image_url}}] messages = [ @@ -204,9 +203,7 @@ def build_processor_prompt(images, config): config["model_name"], trust_remote_code=True ) - 
image_urls = [ - f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images - ] + image_urls = [encode_image_url(img) for img in images] placeholders = [{"type": "image", "image": url} for url in image_urls] messages = [ { @@ -225,9 +222,7 @@ def build_processor_prompt(images, config): def build_ovis_prompt(images, config): """Build Ovis2.5 specific prompt with custom format.""" - image_urls = [ - f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images - ] + image_urls = [encode_image_url(img) for img in images] placeholders = "\n".join( f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py index 0eaef49e2395c..9f8415c0c390c 100644 --- a/tests/models/multimodal/generation/test_voxtral.py +++ b/tests/models/multimodal/generation/test_voxtral.py @@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets): assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] + assert choice.message.content == "In the first audio clip, you hear a brief" assert choice.finish_reason == "length" diff --git a/tests/models/multimodal/pooling/conftest.py b/tests/models/multimodal/pooling/conftest.py new file mode 100644 index 0000000000000..401bc39b4b109 --- /dev/null +++ b/tests/models/multimodal/pooling/conftest.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Pytest configuration for vLLM pooling tests.""" + +import pytest + +from vllm.platforms import current_platform + + +@pytest.fixture +def siglip_attention_config(): + """Return attention config for SigLIP tests on ROCm. + + On ROCm, SigLIP tests require FLEX_ATTENTION backend. 
+ """ + if current_platform.is_rocm(): + return {"backend": "FLEX_ATTENTION"} + return None diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index cb875436857cf..46fd4249ea4f8 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -138,7 +138,7 @@ def create_batched_mm_kwargs( ) -# TODO(Isotr0py): Don't initalize model during test +# TODO(Isotr0py): Don't initialize model during test @contextmanager def initialize_dummy_model( model_cls: type[nn.Module], diff --git a/tests/models/registry.py b/tests/models/registry.py index 82b9303b2a21b..2922414cdaa6a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -215,7 +215,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True, ), "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"), - "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"), + # FIXME: databricks/dbrx-instruct has been deleted + "DbrxForCausalLM": _HfExamplesInfo( + "databricks/dbrx-instruct", is_available_online=False + ), "DeciLMForCausalLM": _HfExamplesInfo( "nvidia/Llama-3_3-Nemotron-Super-49B-v1", trust_remote_code=True, @@ -366,7 +369,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"tiny": "TitanML/tiny-mixtral"}, ), "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False), - "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), + # FIXME: mosaicml/mpt-7b has been deleted + "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b", is_available_online=False), "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"), "NemotronHForCausalLM": _HfExamplesInfo( "nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True @@ -484,6 +488,9 @@ _EMBEDDING_EXAMPLE_MODELS = { ), "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), + 
"LlamaBidirectionalModel": _HfExamplesInfo( + "nvidia/llama-nemotron-embed-1b-v2", trust_remote_code=True + ), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "ModernBertModel": _HfExamplesInfo( "Alibaba-NLP/gte-modernbert-base", trust_remote_code=True @@ -550,6 +557,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { trust_remote_code=True, hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, ), + "LlamaBidirectionalForSequenceClassification": _HfExamplesInfo( + "nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True + ), "ModernBertForSequenceClassification": _HfExamplesInfo( "Alibaba-NLP/gte-reranker-modernbert-base" ), @@ -850,6 +860,11 @@ _MULTIMODAL_EXAMPLE_MODELS = { # disable this temporarily until we support HF format is_available_online=False, ), + "VoxtralStreamingGeneration": _HfExamplesInfo( + "", + # disable this temporarily until we support HF format + is_available_online=False, + ), # [Encoder-decoder] "WhisperForConditionalGeneration": _HfExamplesInfo( "openai/whisper-large-v3-turbo", diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py index 15764145bc1a2..24b624e269583 100644 --- a/tests/models/test_terratorch.py +++ b/tests/models/test_terratorch.py @@ -38,7 +38,7 @@ def test_inference( max_num_seqs=32, default_torch_num_threads=1, ) as vllm_model: - vllm_output = vllm_model.llm.encode(prompt) + vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin") assert torch.equal( torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False) ) diff --git a/tests/models/utils.py b/tests/models/utils.py index d84b4b820533e..bf26c21fb5f58 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -399,6 +399,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo): @dataclass class RerankModelInfo(ModelInfo): mteb_score: float | None = None + chat_template_name: str | None = None @dataclass diff --git a/tests/quantization/test_compressed_tensors.py 
b/tests/quantization/test_compressed_tensors.py index 412b21328a325..535f028202275 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -83,7 +83,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") with vllm_runner(model_path, enforce_eager=True) as llm: @@ -161,7 +161,7 @@ def test_compressed_tensors_w8a8_logprobs( current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") if use_aiter: if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL: @@ -231,7 +231,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token( current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") if use_aiter: if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 62203186510ce..a4b6d35987e13 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.quantization.fp8 import ( Fp8Config, Fp8KVCacheMethod, Fp8LinearMethod, + Fp8MoeBackend, Fp8MoEMethod, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -216,7 +217,7 @@ def test_scaled_fp8_quant(dtype) -> None: ref_y, inv_scale = ops.scaled_fp8_quant(x, None) ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype) - # Reference dynamic quantizaton + # Reference dynamic quantization y 
= quantize_ref(x, inv_scale) torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype)) @@ -324,7 +325,10 @@ def test_fp8_reloading( weight_loader=default_weight_loader, ) + # Fp8LinearMethod uses use_marlin + # Fp8MoEMethod uses fp8_backend method.use_marlin = use_marlin + method.fp8_backend = Fp8MoeBackend.MARLIN if use_marlin else None # capture weights format during loading original_metadata = [ diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index 0298994c396f6..154b29d7017ac 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`. """ import os +from typing import NoReturn import pytest import torch @@ -19,6 +20,28 @@ def enable_pickle(monkeypatch): monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") +def _skip(msg: str) -> NoReturn: + pytest.skip(msg) + raise RuntimeError(msg) + + +def _snapshot_download_or_skip(model_id: str) -> str: + try: + from huggingface_hub import snapshot_download + except Exception as e: # pragma: no cover + _skip(f"huggingface_hub is required to download {model_id}: {e}") + + try: + return snapshot_download( + repo_id=model_id, + repo_type="model", + # These checkpoints are already small; download full repo for simplicity. 
+ allow_patterns=["*"], + ) + except Exception as e: + _skip(f"Failed to download {model_id} from the HF Hub: {e}") + + @pytest.mark.skipif( not is_quant_method_supported("modelopt"), reason="ModelOpt FP8 is not supported on this GPU type.", @@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner): output = llm.generate_greedy(["Hello my name is"], max_tokens=4) assert output print(f"ModelOpt FP8 output: {output}") + + +@pytest.mark.skipif( + not is_quant_method_supported("modelopt"), + reason="ModelOpt FP8 is not supported on this GPU type.", +) +def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner): + """Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup.""" + model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt" + model_path = _snapshot_download_or_skip(model_id) + + with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8PcPtLinearMethod, + ) + + assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod) + + assert qkv_proj.weight.dtype == torch.float8_e4m3fn + assert o_proj.weight.dtype == torch.float8_e4m3fn + assert gate_up_proj.weight.dtype == torch.float8_e4m3fn + assert down_proj.weight.dtype == torch.float8_e4m3fn + + # Per-channel scales; activations are dynamically scaled per token. 
+ assert hasattr(qkv_proj, "weight_scale") + assert qkv_proj.weight_scale.dtype == torch.float32 + assert qkv_proj.weight_scale.dim() == 1 + assert not hasattr(qkv_proj, "input_scale") + + assert hasattr(o_proj, "weight_scale") + assert o_proj.weight_scale.dtype == torch.float32 + assert o_proj.weight_scale.dim() == 1 + assert not hasattr(o_proj, "input_scale") + + assert hasattr(gate_up_proj, "weight_scale") + assert gate_up_proj.weight_scale.dtype == torch.float32 + assert gate_up_proj.weight_scale.dim() == 1 + assert not hasattr(gate_up_proj, "input_scale") + + assert hasattr(down_proj, "weight_scale") + assert down_proj.weight_scale.dtype == torch.float32 + assert down_proj.weight_scale.dim() == 1 + assert not hasattr(down_proj, "input_scale") + + llm.apply_model(check_model) + + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) + assert output + print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}") + + +@pytest.mark.skipif( + not is_quant_method_supported("modelopt"), + reason="ModelOpt FP8 is not supported on this GPU type.", +) +def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner): + """Test ModelOpt FP8_PB_WO checkpoint setup.""" + model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo" + model_path = _snapshot_download_or_skip(model_id) + + with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8PbWoLinearMethod, + ) + + assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod) + + assert 
qkv_proj.weight.dtype == torch.float8_e4m3fn + assert o_proj.weight.dtype == torch.float8_e4m3fn + assert gate_up_proj.weight.dtype == torch.float8_e4m3fn + assert down_proj.weight.dtype == torch.float8_e4m3fn + + # Block scales; should be materialized as a 2D [out_blk, in_blk] tensor. + assert hasattr(qkv_proj, "weight_scale") + assert qkv_proj.weight_scale.dtype == torch.float32 + assert qkv_proj.weight_scale.dim() == 2 + + assert hasattr(o_proj, "weight_scale") + assert o_proj.weight_scale.dtype == torch.float32 + assert o_proj.weight_scale.dim() == 2 + + assert hasattr(gate_up_proj, "weight_scale") + assert gate_up_proj.weight_scale.dtype == torch.float32 + assert gate_up_proj.weight_scale.dim() == 2 + + assert hasattr(down_proj, "weight_scale") + assert down_proj.weight_scale.dtype == torch.float32 + assert down_proj.weight_scale.dim() == 2 + + llm.apply_model(check_model) + + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) + assert output + print(f"ModelOpt FP8_PB_WO output: {output}") diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index 2017e34030d60..ebf199a5056fb 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -18,25 +18,37 @@ for i in {1..5}; do echo "Checking metadata.json URL (attempt $i)..." if curl --fail "$meta_json_url" > metadata.json; then echo "INFO: metadata.json URL is valid." - # check whether it is valid json by python + # check whether it is valid json by python (printed to stdout) if python3 -m json.tool metadata.json; then - echo "INFO: metadata.json is valid JSON. Proceeding with the test." + echo "INFO: metadata.json is valid JSON. Proceeding with the check." 
+ # check whether there is an object in the json matching: + # "package_name": "vllm", and "platform_tag" matches the current architecture + # see `determine_wheel_url` in setup.py for more details + if python3 -c "import platform as p,json as j,sys as s; d = j.load(open('metadata.json')); \ + s.exit(int(not any(o.get('package_name') == 'vllm' and p.machine() in o.get('platform_tag') \ + for o in d)))" 2>/dev/null; then + echo "INFO: metadata.json contains a pre-compiled wheel for the current architecture." + break + else + echo "WARN: metadata.json does not have a pre-compiled wheel for the current architecture." + fi else echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!" + echo "INFO: metadata.json content:" + cat metadata.json exit 1 fi - break fi - # failure handling + # failure handling & retry logic if [ $i -eq 5 ]; then - echo "ERROR: metadata.json URL is still not valid after 5 attempts." - echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists." + echo "ERROR: metadata is still not available after 5 attempts." + echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit is available." echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes." echo " NOTE: If it fails, please report in #sig-ci channel." exit 1 else - echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..." - sleep 180 + echo "WARNING: metadata is not available. Retrying after 5 minutes..." 
+ sleep 300 fi done diff --git a/tests/test_attention_backend_registry.py b/tests/test_attention_backend_registry.py new file mode 100644 index 0000000000000..7b90b949aa457 --- /dev/null +++ b/tests/test_attention_backend_registry.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionImpl, +) +from vllm.attention.backends.registry import ( + AttentionBackendEnum, + MambaAttentionBackendEnum, + register_backend, +) + + +class CustomAttentionImpl(AttentionImpl): + """Mock custom attention implementation for testing.""" + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, *args, **kwargs): + """Mock forward pass.""" + pass + + +class CustomAttentionBackend(AttentionBackend): + """Mock custom attention backend for testing.""" + + @staticmethod + def get_name(): + return "CUSTOM" + + @staticmethod + def get_impl_cls(): + return CustomAttentionImpl + + @staticmethod + def get_builder_cls(): + """Mock builder class.""" + return None + + @staticmethod + def get_required_kv_cache_layout(): + """Mock KV cache layout.""" + return None + + +class CustomMambaAttentionImpl(AttentionImpl): + """Mock custom mamba attention implementation for testing.""" + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, *args, **kwargs): + """Mock forward pass.""" + pass + + +class CustomMambaAttentionBackend(AttentionBackend): + """Mock custom mamba attention backend for testing.""" + + @staticmethod + def get_name(): + return "CUSTOM_MAMBA" + + @staticmethod + def get_impl_cls(): + return CustomMambaAttentionImpl + + @staticmethod + def get_builder_cls(): + """Mock builder class.""" + return None + + @staticmethod + def get_required_kv_cache_layout(): + """Mock KV cache layout.""" + return None + + +def test_custom_is_not_alias_of_any_backend(): + # Get all members of 
AttentionBackendEnum + all_backends = list(AttentionBackendEnum) + + # Find any aliases of CUSTOM + aliases = [] + for backend in all_backends: + if backend.name != "CUSTOM" and backend is AttentionBackendEnum.CUSTOM: + aliases.append(backend.name) + + # CUSTOM should not be an alias of any other backend + assert len(aliases) == 0, ( + f"BUG! CUSTOM is an alias of: {', '.join(aliases)}!\n" + f"CUSTOM.value = {repr(AttentionBackendEnum.CUSTOM.value)}\n" + f"This happens when CUSTOM has the same value as another backend.\n" + f"When you register to CUSTOM, you're actually registering to {aliases[0]}!\n" + f"All backend values:\n" + + "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends) + ) + + # Verify CUSTOM has its own unique identity + assert AttentionBackendEnum.CUSTOM.name == "CUSTOM", ( + f"CUSTOM.name should be 'CUSTOM', but got '{AttentionBackendEnum.CUSTOM.name}'" + ) + + +def test_register_custom_backend_with_class_path(): + # Register with explicit class path + register_backend( + backend=AttentionBackendEnum.CUSTOM, + class_path="tests.test_attention_backend_registry.CustomAttentionBackend", + is_mamba=False, + ) + + # Check that CUSTOM backend is registered + assert AttentionBackendEnum.CUSTOM.is_overridden(), ( + "CUSTOM should be overridden after registration" + ) + + # Get the registered class path + class_path = AttentionBackendEnum.CUSTOM.get_path() + assert class_path == "tests.test_attention_backend_registry.CustomAttentionBackend" + + # Get the backend class + backend_cls = AttentionBackendEnum.CUSTOM.get_class() + assert backend_cls.get_name() == "CUSTOM" + assert backend_cls.get_impl_cls() == CustomAttentionImpl + + +def test_mamba_custom_is_not_alias_of_any_backend(): + # Get all mamba backends + all_backends = list(MambaAttentionBackendEnum) + + # Find any aliases of CUSTOM + aliases = [] + for backend in all_backends: + if backend.name != "CUSTOM" and backend is MambaAttentionBackendEnum.CUSTOM: + aliases.append(backend.name) + + # 
CUSTOM should not be an alias of any other backend + assert len(aliases) == 0, ( + f"BUG! MambaAttentionBackendEnum.CUSTOM is an alias of: {', '.join(aliases)}!\n" + f"CUSTOM.value = {repr(MambaAttentionBackendEnum.CUSTOM.value)}\n" + f"All mamba backend values:\n" + + "\n".join(f" {b.name}: {repr(b.value)}" for b in all_backends) + ) + + +def test_register_custom_mamba_backend_with_class_path(): + # Register with explicit class path + register_backend( + backend=MambaAttentionBackendEnum.CUSTOM, + class_path="tests.test_attention_backend_registry.CustomMambaAttentionBackend", + is_mamba=True, + ) + + # Check that the backend is registered + assert MambaAttentionBackendEnum.CUSTOM.is_overridden() + + # Get the registered class path + class_path = MambaAttentionBackendEnum.CUSTOM.get_path() + assert ( + class_path + == "tests.test_attention_backend_registry.CustomMambaAttentionBackend" + ) + + # Get the backend class + backend_cls = MambaAttentionBackendEnum.CUSTOM.get_class() + assert backend_cls.get_name() == "CUSTOM_MAMBA" + assert backend_cls.get_impl_cls() == CustomMambaAttentionImpl diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py index e8826eb441a24..44cbdeed45074 100644 --- a/tests/test_routing_simulator.py +++ b/tests/test_routing_simulator.py @@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device): envs.environment_variables[env_name] = lambda s=strategy: s # Test the select_experts method - topk_weights, topk_ids, _ = fused_moe.select_experts( + topk_weights, topk_ids = fused_moe.select_experts( hidden_states=hidden_states, router_logits=router_logits, ) diff --git a/tests/tokenizers_/test_detokenize.py b/tests/tokenizers_/test_detokenize.py index d307993d04df9..ad6c5fb415aad 100644 --- a/tests/tokenizers_/test_detokenize.py +++ b/tests/tokenizers_/test_detokenize.py @@ -38,7 +38,8 @@ TOKENIZERS = [ "EleutherAI/gpt-j-6b", "EleutherAI/pythia-70m", "bigscience/bloom-560m", - "mosaicml/mpt-7b", + # FIXME: 
mosaicml/mpt-7b has been deleted + # "mosaicml/mpt-7b", "tiiuae/falcon-7b", "meta-llama/Llama-3.2-1B-Instruct", "codellama/CodeLlama-7b-hf", diff --git a/tests/tool_parsers/test_mistral_tool_parser.py b/tests/tool_parsers/test_mistral_tool_parser.py index 9400a67267f4c..d2502079d0de9 100644 --- a/tests/tool_parsers/test_mistral_tool_parser.py +++ b/tests/tool_parsers/test_mistral_tool_parser.py @@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer( "single_tool_add", "single_tool_weather", "multiple_tool_calls", + "complex", + "wrong_json", ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer( ], None, ), + ( + # Complex + """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""", # noqa: E501 + [ + ToolCall( + function=FunctionCall( + name="bash", + arguments=json.dumps( + {"command": "print(\"hello world!\")\nre.compile(r'{}')"} + )[:-2], + ) + ) + ], + "hi{hi", + ), + ( + # Wrong json + """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 + [ + ToolCall( + function=FunctionCall( + name="bash", + arguments=json.dumps( + {"command": "print(\"hello world!\")\nre.compile(r'{}')"} + ), + ) + ) + ], + "hi{hi", + ), ], ) def test_extract_tool_calls( @@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming( ), ( # Complex - """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 + """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 [ ToolCall( function=FunctionCall( @@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming( ) ) ], - "", + "hi{hi", ), ], ) diff --git a/tests/utils.py b/tests/utils.py index d8102331b3612..1b338e93182a5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -106,6 +106,7 @@ class RemoteOpenAIServer: env.update(env_dict) serve_cmd = ["vllm", "serve", model, 
*vllm_serve_args] print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}") + print(f"Environment variables: {env}") self.proc: subprocess.Popen = subprocess.Popen( serve_cmd, env=env, diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 0880a17c78d40..977ec71bcbecf 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int): assert len(manager.block_pool.cached_block_hash_to_block) == 0 +def test_null_parent_block_hash(): + block_size = 1 + num_cached_blocks = 2 + num_full_blocks = 4 + + pool = BlockPool( + num_gpu_blocks=8, + enable_caching=True, + hash_block_size=block_size, + enable_kv_cache_events=True, + ) + + req = make_request( + "req_null_parent", + prompt_token_ids=[10, 11, 12, 13], + block_size=block_size, + hash_fn=sha256, + ) + assert len(req.block_hashes) == num_full_blocks + + # Physical parent is `null_block` (no hash), while the logical parent hash + # still exists in `request.block_hashes[num_cached_blocks - 1]`. 
+ assert pool.null_block.block_hash is None + new_blocks = pool.get_new_blocks(num_full_blocks - 1) + blocks = [ + new_blocks[: num_cached_blocks - 1], + pool.null_block, # physical parent + *new_blocks[num_cached_blocks - 1 :], + ] + + pool.cache_full_blocks( + request=req, + blocks=blocks, + num_cached_blocks=num_cached_blocks, + num_full_blocks=num_full_blocks, + block_size=block_size, + kv_cache_group_id=0, + ) + + events = pool.take_events() + assert len(events) == 1 + event = events[0] + assert isinstance(event, BlockStored) + + expected_parent = kv_cache_utils.maybe_convert_block_hash( + req.block_hashes[num_cached_blocks - 1] + ) + assert event.parent_block_hash == expected_parent + assert event.parent_block_hash is not None + + expected_new_hashes = [ + kv_cache_utils.maybe_convert_block_hash(h) + for h in req.block_hashes[num_cached_blocks:num_full_blocks] + ] + assert event.block_hashes == expected_new_hashes + + # Ensure we didn't accidentally assign a hash to the null block. + assert pool.null_block.block_hash is None + # Sanity check: newly cached physical blocks should have hashes assigned. 
+ assert blocks[num_cached_blocks].block_hash is not None + assert blocks[num_full_blocks - 1].block_hash is not None + + @pytest.mark.parametrize("blocks_to_cache", [2, 3, 10]) def test_kv_cache_events_with_lora(blocks_to_cache: int): """Test BlockStored events contain correct lora_id when using LoRA requests.""" diff --git a/tests/v1/ec_connector/integration/test_epd_correctness.py b/tests/v1/ec_connector/integration/test_epd_correctness.py index 616d34441ab8e..eae4b7427240f 100644 --- a/tests/v1/ec_connector/integration/test_epd_correctness.py +++ b/tests/v1/ec_connector/integration/test_epd_correctness.py @@ -31,7 +31,7 @@ import openai import requests from vllm.assets.image import ImageAsset -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url MAX_OUTPUT_LEN = 256 @@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [ "content": [ { "type": "image_url", - "image_url": { - "url": f"data:image;base64,{encode_image_base64(image_1)}" - }, + "image_url": {"url": encode_image_url(image_1)}, }, {"type": "text", "text": "What's in this image?"}, ], @@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [ "content": [ { "type": "image_url", - "image_url": { - "url": f"data:image;base64,{encode_image_base64(image_2)}" - }, + "image_url": {"url": encode_image_url(image_2)}, }, { "type": "image_url", diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 224e5d741024b..11681cfcebca4 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind): # Use multi-abort to abort multiple requests at once abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT] - await engine.abort(abort_request_ids) + await engine.abort(abort_request_ids, internal=False) # Wait for all tasks to complete results = await asyncio.gather(*tasks, return_exceptions=True) @@ -609,7 +609,7 @@ async def 
test_abort_final_output(output_kind: RequestOutputKind): await asyncio.sleep(0.5) # Abort the request - await engine.abort(request_id) + await engine.abort(request_id, internal=False) # Wait for generation to complete and return final output final_output = await generated diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 5fa16897b4e0c..4f96ded7ec351 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) PROMPT = "I am Gyoubu Masataka Oniwa" PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids +_REQUEST_COUNTER = 0 + def make_request() -> EngineCoreRequest: + global _REQUEST_COUNTER + _REQUEST_COUNTER += 1 + request_id = f"request-{_REQUEST_COUNTER}" return EngineCoreRequest( - request_id=str(uuid.uuid4()), + request_id=request_id, + external_req_id=f"{request_id}-{uuid.uuid4()}", prompt_token_ids=PROMPT_TOKENS, mm_features=None, sampling_params=SamplingParams(), diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 770560a5e549e..c8d25f9700bf1 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import importlib import os import signal import time import uuid from dataclasses import dataclass from threading import Thread +from types import SimpleNamespace from typing import Any from unittest.mock import MagicMock @@ -24,7 +26,11 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils.torch_utils import set_default_torch_num_threads from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core import EngineCore -from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient +from vllm.v1.engine.core_client import ( + AsyncMPClient, + EngineCoreClient, + SyncMPClient, +) from 
vllm.v1.engine.utils import CoreEngineProcManager from vllm.v1.executor.abstract import Executor @@ -39,6 +45,8 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) PROMPT = "Hello my name is Robert and I love quantization kernels" PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids +_REQUEST_COUNTER = 0 + def make_request( params: SamplingParams, prompt_tokens_ids: list[int] | None = None @@ -46,8 +54,12 @@ def make_request( if not prompt_tokens_ids: prompt_tokens_ids = PROMPT_TOKENS + global _REQUEST_COUNTER + _REQUEST_COUNTER += 1 + request_id = f"request-{_REQUEST_COUNTER}" return EngineCoreRequest( - request_id=str(uuid.uuid4()), + request_id=request_id, + external_req_id=f"{request_id}-{uuid.uuid4()}", prompt_token_ids=prompt_tokens_ids, mm_features=None, sampling_params=params, @@ -60,6 +72,91 @@ def make_request( ) +def _reload_envs_module(): + import vllm.envs as envs_mod + + cache_clear = getattr(getattr(envs_mod, "__getattr__", None), "cache_clear", None) + if cache_clear is not None: + cache_clear() + return importlib.reload(envs_mod) + + +def _reload_core_client_module(): + module = importlib.import_module("vllm.v1.engine.core_client") + return importlib.reload(module) + + +def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch): + timeout_value = 654 + monkeypatch.setenv("VLLM_ENGINE_READY_TIMEOUT_S", str(timeout_value)) + + # Ensure that the environment variable is loaded if caching is enabled + _reload_envs_module() + core_client_mod = _reload_core_client_module() + + poll_timeouts: list[int] = [] + + class ShadowSocket: + def poll(self, timeout: int) -> int: + # Capture the timeout value for each poll call + poll_timeouts.append(timeout) + return 1 + + def recv_multipart(self): + return (b"\x00\x00", b"ready") + + class DummySocket: + def send_multipart(self, _msg, *, copy: bool = False, track: bool = False): + if track: + return SimpleNamespace(done=True) + + def recv_multipart(self, *, copy: bool = False): + return (b"", b"") + + def 
close(self, *, linger: int = 0): + pass + + def bind(self, _address): + pass + + def connect(self, _address): + pass + + def setsockopt(self, *_args, **_kwargs): + pass + + monkeypatch.setattr(core_client_mod.zmq.Socket, "shadow", lambda *_: ShadowSocket()) + monkeypatch.setattr( + core_client_mod, "make_zmq_socket", lambda *_, **__: DummySocket() + ) + + parallel_config = SimpleNamespace( + data_parallel_size=1, + data_parallel_rank=0, + data_parallel_size_local=1, + data_parallel_rank_local=None, + data_parallel_hybrid_lb=False, + data_parallel_external_lb=False, + ) + vllm_config = SimpleNamespace(parallel_config=parallel_config) + + client = core_client_mod.MPClient( + asyncio_mode=False, + vllm_config=vllm_config, + executor_class=object, + log_stats=False, + client_addresses={ + "input_address": "inproc://input", + "output_address": "inproc://output", + }, + ) + try: + # timeout_value is in seconds, but poll receives milliseconds + assert poll_timeouts == [timeout_value * 1000] + finally: + client.shutdown() + + def loop_until_done(client: EngineCoreClient, outputs: dict): while True: engine_core_outputs = client.get_output().outputs diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py index 77e67d54e587e..67a3b6b012dcc 100644 --- a/tests/v1/engine/test_fast_incdec_prefix_err.py +++ b/tests/v1/engine/test_fast_incdec_prefix_err.py @@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case(): params = SamplingParams(skip_special_tokens=True) request = EngineCoreRequest( request_id="test", + external_req_id="test-ext", prompt_token_ids=prompt_token_ids, mm_features=None, sampling_params=params, diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 990aa9d925855..f1185222f7137 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -58,12 +58,12 @@ def test_incremental_detokenization( output_processor = 
OutputProcessor( dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval ) - engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens) # Make N requests. requests = [ EngineCoreRequest( - request_id=f"request-{idx}", + request_id=f"request-{idx}-int", + external_req_id=f"request-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -83,6 +83,11 @@ def test_incremental_detokenization( for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + request_ids=[req.request_id for req in requests], + ) + # Add requests to the detokenizer. for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): output_processor.add_request(request, prompt) @@ -438,15 +443,6 @@ def test_logprobs_processor( dummy_test_vectors, ): output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens, - generated_logprobs_raw=None - if num_sample_logprobs is None - else dummy_test_vectors.generation_logprobs, - prompt_logprobs_raw=None - if num_prompt_logprobs is None - else dummy_test_vectors.prompt_logprobs, - ) # Make N requests. 
request_id_list = [ @@ -454,7 +450,8 @@ def test_logprobs_processor( ] requests = [ EngineCoreRequest( - request_id=request_id_list[idx], + request_id=request_id_list[idx] + "-int", + external_req_id=request_id_list[idx], prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -476,6 +473,17 @@ def test_logprobs_processor( for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=None + if num_sample_logprobs is None + else dummy_test_vectors.generation_logprobs, + prompt_logprobs_raw=None + if num_prompt_logprobs is None + else dummy_test_vectors.prompt_logprobs, + request_ids=[req.request_id for req in requests], + ) + # Add requests to the detokenizer. for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): output_processor.add_request(request, prompt) @@ -621,19 +629,12 @@ def test_stop_token( ] prompt_string = dummy_test_vectors.prompt_strings[0] prompt_tokens = dummy_test_vectors.prompt_tokens[0] - engine_core = MockEngineCore( - tokens_list=[generation_tokens], - generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, - prompt_logprobs_raw=None, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids, - ignore_eos=ignore_eos, - ) # Make request. request_id = "request-0" request = EngineCoreRequest( request_id=request_id, + external_req_id=request_id + "-ext", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=eos_token_id, @@ -655,6 +656,16 @@ def test_stop_token( pooling_params=None, ) + engine_core = MockEngineCore( + tokens_list=[generation_tokens], + generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, + prompt_logprobs_raw=None, + eos_token_id=eos_token_id, + stop_token_ids=stop_token_ids, + ignore_eos=ignore_eos, + request_ids=[request.request_id], + ) + # Add request to the detokenizer. 
output_processor.add_request(request, prompt_string) @@ -720,13 +731,6 @@ def test_stop_string( dummy_test_vectors, ): output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) - engine_core = MockEngineCore( - tokens_list=dummy_test_vectors.generation_tokens, - generated_logprobs_raw=dummy_test_vectors.generation_logprobs - if num_sample_logprobs - else None, - prompt_logprobs_raw=None, - ) # Make N requests. request_id_list = [ @@ -734,7 +738,8 @@ def test_stop_string( ] requests = [ EngineCoreRequest( - request_id=request_id_list[idx], + request_id=request_id_list[idx] + "-int", + external_req_id=request_id_list[idx], prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -756,6 +761,15 @@ def test_stop_string( for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs + if num_sample_logprobs + else None, + prompt_logprobs_raw=None, + request_ids=[req.request_id for req in requests], + ) + # Add requests to the detokenizer. for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): output_processor.add_request(request, prompt) @@ -813,9 +827,12 @@ def test_stop_string( for idx, (ref_gen_str, stop_str) in enumerate( zip(dummy_test_vectors.generation_strings, STOP_STRINGS) ): - # Request should be aborted. + # Request should be aborted (check internal ID in abort list). + internal_request_id = f"request-{idx}-int" + assert internal_request_id in aborted + + # Use external ID for collecting outputs request_id = f"request-{idx}" - assert request_id in aborted # Collected values that were generated. 
gen_str = gen_strings[request_id] @@ -848,13 +865,13 @@ def test_stop_string( def test_iteration_stats(dummy_test_vectors): output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True) - engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) engine_core_timestamp = time.monotonic() # Make N requests. requests = [ EngineCoreRequest( request_id=f"request-{idx}", + external_req_id=f"request-{idx}-ext", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -868,6 +885,11 @@ def test_iteration_stats(dummy_test_vectors): for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + dummy_test_vectors.generation_tokens, + request_ids=[req.request_id for req in requests], + ) + # Add all requests except one to the OutputProcessor. num_active = len(dummy_test_vectors.generation_tokens) - 1 for request in requests[:num_active]: @@ -922,7 +944,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): output_processor = OutputProcessor( dummy_test_vectors.tokenizer, log_stats=log_stats ) - engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) engine_core_timestamp = time.monotonic() # Create LoRA requests @@ -936,7 +957,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): lora_assignments = [lora1, lora2, None] requests = [ EngineCoreRequest( - request_id=f"request-{idx}", + request_id=f"request-{idx}-int", + external_req_id=f"request-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -950,6 +972,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] + engine_core = MockEngineCore( + dummy_test_vectors.generation_tokens, + request_ids=[req.request_id for req in requests], + ) + # Add all requests to the OutputProcessor for request in requests: output_processor.add_request(request, None) @@ -1015,9 +1042,9 @@ def 
test_lora_request_tracking(log_stats: bool, dummy_test_vectors): outputs = EngineCoreOutputs( outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() ) - # Find and mark request-0 as finished (it uses lora-1) + # Find and mark request-0-int as finished (it uses lora-1) for output in outputs.outputs: - if output.request_id == "request-0": + if output.request_id == "request-0-int": output.finish_reason = FinishReason.LENGTH break @@ -1040,9 +1067,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): outputs = EngineCoreOutputs( outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() ) - # Find and mark request-1 as finished (it uses lora-2) + # Find and mark request-1-int as finished (it uses lora-2) for output in outputs.outputs: - if output.request_id == "request-1": + if output.request_id == "request-1-int": output.finish_reason = FinishReason.LENGTH break @@ -1064,9 +1091,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): outputs = EngineCoreOutputs( outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() ) - # Find and mark request-2 as finished (it has no LoRA) + # Find and mark request-2-int as finished (it has no LoRA) for output in outputs.outputs: - if output.request_id == "request-2": + if output.request_id == "request-2-int": output.finish_reason = FinishReason.LENGTH break @@ -1107,7 +1134,9 @@ async def test_request_output_collector(): for idx in range(NUM_REQS) ] - collector = RequestOutputCollector(RequestOutputKind.DELTA) + collector = RequestOutputCollector( + RequestOutputKind.DELTA, request_id="my-request-id-int" + ) # CASE 1: Put then get. 
outputs = make_outputs() @@ -1163,7 +1192,9 @@ async def test_request_output_collector(): @pytest.mark.asyncio async def test_cumulative_output_collector_n(): """Test collector correctly handles multiple outputs by index.""" - collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE) + collector = RequestOutputCollector( + RequestOutputKind.CUMULATIVE, request_id="my-request-id-int" + ) outputs = [ RequestOutput( request_id="my-request-id", @@ -1242,11 +1273,13 @@ async def test_cumulative_output_collector_n(): @pytest.mark.parametrize("runner", ["generate", "pooling"]) -def test_abort_requests(runner: str, dummy_test_vectors): +@pytest.mark.parametrize("abort_by", ["internal", "external"]) +def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors): output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True) requests = [ EngineCoreRequest( request_id=f"request-{idx}", + external_req_id=f"external-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, eos_token_id=None, @@ -1265,8 +1298,13 @@ def test_abort_requests(runner: str, dummy_test_vectors): output_kind = request.sampling_params.output_kind else: output_kind = request.pooling_params.output_kind - queue = RequestOutputCollector(output_kind=output_kind) + queue = RequestOutputCollector( + output_kind=output_kind, request_id=request.request_id + ) output_processor.add_request(request, None, queue=queue) for request in requests: - output_processor.abort_requests([request.request_id]) + if abort_by == "internal": + output_processor.abort_requests([request.request_id], internal=True) + else: + output_processor.abort_requests([request.external_req_id], internal=False) diff --git a/tests/v1/engine/test_parallel_sampling.py b/tests/v1/engine/test_parallel_sampling.py index 736c0e54837fe..fe6f15df20982 100644 --- a/tests/v1/engine/test_parallel_sampling.py +++ b/tests/v1/engine/test_parallel_sampling.py @@ -4,11 +4,12 @@ from vllm import SamplingParams from vllm.outputs 
import CompletionOutput from vllm.sampling_params import RequestOutputKind +from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.parallel_sampling import ParentRequest def test_parent_request_to_output_stream() -> None: - parent_request = ParentRequest("parent_id", SamplingParams(n=2)) + parent_request = ParentRequest(make_request(SamplingParams(n=2))) parent_request.child_requests = {"child_id_0", "child_id_1"} output_0 = CompletionOutput( index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None @@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None: index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None ) # Request not finished - assert ("parent_id", [output_0], False) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert ("parent_id", [output_1], False) == parent_request.get_outputs( - "child_id_1", output_1 - ) - assert ("parent_id", [output_0], False) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert ("parent_id", [output_1], False) == parent_request.get_outputs( - "child_id_1", output_1 - ) + assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0) + assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1) + assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0) + assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1) # output_1 finished output_1.finish_reason = "ended" - assert ("parent_id", [output_0], False) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert ("parent_id", [output_1], False) == parent_request.get_outputs( - "child_id_1", output_1 - ) + assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0) + assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1) # Finished output_1 had already returned, DO NOT returned again - assert ("parent_id", [output_0], False) == 
parent_request.get_outputs( - "child_id_0", output_0 - ) - assert parent_request.get_outputs("child_id_1", output_1) == ( - "parent_id", - [], - False, - ) + assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0) + assert parent_request.get_outputs("child_id_1", output_1) == ([], False) # output_0 finished output_0.finish_reason = "ended" - assert ("parent_id", [output_0], True) == parent_request.get_outputs( - "child_id_0", output_0 - ) - assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True) + assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0) + assert parent_request.get_outputs("child_id_1", output_1) == ([], True) # Finished output_0 had already returned, DO NOT returned again - assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True) - assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True) + assert parent_request.get_outputs("child_id_0", output_0) == ([], True) + assert parent_request.get_outputs("child_id_1", output_1) == ([], True) def test_parent_request_to_output_final_only() -> None: parent_request = ParentRequest( - "parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY) + make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)) ) parent_request.child_requests = {"child_id_0", "child_id_1"} output_0 = CompletionOutput( @@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None: index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None ) # Request not finished, return nothing - assert parent_request.get_outputs("child_id_0", output_0) == ( - "parent_id", - [], - False, - ) - assert parent_request.get_outputs("child_id_1", output_1) == ( - "parent_id", - [], - False, - ) + assert parent_request.get_outputs("child_id_0", output_0) == ([], False) + assert parent_request.get_outputs("child_id_1", output_1) == ([], False) # output_1 finished, but 
outputs won't be returned until all child requests finished output_1.finish_reason = "ended" - assert parent_request.get_outputs("child_id_0", output_0) == ( - "parent_id", - [], - False, - ) - assert parent_request.get_outputs("child_id_1", output_1) == ( - "parent_id", - [], - False, - ) + assert parent_request.get_outputs("child_id_0", output_0) == ([], False) + assert parent_request.get_outputs("child_id_1", output_1) == ([], False) # output_0 finished, as all child requests finished, the output would be returned output_0.finish_reason = "ended" - assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs( + assert ([output_0, output_1], True) == parent_request.get_outputs( "child_id_0", output_0 ) - assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs( + assert ([output_0, output_1], True) == parent_request.get_outputs( "child_id_1", output_1 ) + + +def make_request(sampling_params: SamplingParams) -> EngineCoreRequest: + return EngineCoreRequest( + request_id="parent_id", + external_req_id="ext_parent_id", + prompt_token_ids=None, + mm_features=None, + sampling_params=sampling_params, + pooling_params=None, + eos_token_id=None, + arrival_time=0.0, + lora_request=None, + cache_salt=None, + data_parallel_rank=None, + ) diff --git a/tests/v1/engine/test_preprocess_error_handling.py b/tests/v1/engine/test_preprocess_error_handling.py index 0586cc64fa104..821ac168d97a9 100644 --- a/tests/v1/engine/test_preprocess_error_handling.py +++ b/tests/v1/engine/test_preprocess_error_handling.py @@ -5,6 +5,7 @@ import pytest import torch.cuda from vllm import LLM, SamplingParams +from vllm.platforms import current_platform from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core import EngineCore @@ -14,6 +15,11 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch): """Test that preprocessing errors are handled gracefully.""" + if 
current_platform.is_rocm(): + pytest.skip( + "Skipped on ROCm: this test only works with 'fork', but ROCm uses 'spawn'." + ) + assert not torch.cuda.is_initialized(), ( "fork needs to be used for the engine " "core process and this isn't possible if cuda is already initialized" diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 1b11b8af49d17..1a16e391316f1 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -6,6 +6,7 @@ import pytest from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.multimodal import MultiModalUUIDDict from vllm.sampling_params import SamplingParams from vllm.v1.engine import input_processor as input_processor_mod from vllm.v1.engine.input_processor import InputProcessor @@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False ) - captured: dict[str, object] = {} + captured: dict[str, MultiModalUUIDDict] = {} def fake_preprocess( prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None @@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): ) # Expect request-id-based overrides are passed through - assert captured["mm_uuids"] == { - "image": [f"{request_id}-image-0", f"{request_id}-image-1"], - "video": [f"{request_id}-video-0"], - } + mm_uuids = captured["mm_uuids"] + assert set(mm_uuids.keys()) == {"image", "video"} + assert len(mm_uuids["image"]) == 2 + assert len(mm_uuids["video"]) == 1 + assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[ + "image" + ][0].endswith("-0") + assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[ + "image" + ][1].endswith("-1") + assert 
mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[ + "video" + ][0].endswith("-0") diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 3541ef89bfc14..d14775668147e 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -343,6 +343,7 @@ class MockEngineCore: eos_token_id: int | None = None, stop_token_ids: list[int] | None = None, ignore_eos: bool = False, + request_ids: list[str] | None = None, ) -> None: self.num_requests = len(tokens_list) self.tokens_list = tokens_list @@ -355,6 +356,11 @@ class MockEngineCore: self.eos_token_id = eos_token_id self.stop_token_ids = stop_token_ids self.ignore_eos = ignore_eos + self.request_ids = ( + request_ids + if request_ids is not None + else [f"request-{i}" for i in range(self.num_requests)] + ) def get_outputs(self) -> list[EngineCoreOutput]: do_logprobs = self.do_logprobs @@ -386,7 +392,7 @@ class MockEngineCore: prompt_logprobs = None new_token_id = token_ids[token_idx] output = EngineCoreOutput( - request_id=f"request-{req_idx}", + request_id=self.request_ids[req_idx], new_token_ids=[new_token_id], new_logprobs=logprobs, new_prompt_logprobs_tensors=prompt_logprobs, diff --git a/tests/v1/entrypoints/openai/serving_responses/test_image.py b/tests/v1/entrypoints/openai/serving_responses/test_image.py index be5693bbf2736..644d8ce00686e 100644 --- a/tests/v1/entrypoints/openai/serving_responses/test_image.py +++ b/tests/v1/entrypoints/openai/serving_responses/test_image.py @@ -8,7 +8,7 @@ import pytest import pytest_asyncio from tests.utils import RemoteOpenAIServer -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url # Use a small vision model for testing MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" @@ -52,9 +52,9 @@ async def client(image_server): @pytest.fixture(scope="session") -def base64_encoded_image(local_asset_server) -> dict[str, str]: +def url_encoded_image(local_asset_server) -> dict[str, str]: return { - 
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url)) + image_url: encode_image_url(local_asset_server.get_image_asset(image_url)) for image_url in TEST_IMAGE_ASSETS } @@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, raw_image_url: str, - base64_encoded_image: dict[str, str], + url_encoded_image: dict[str, str], ): content_text = "What's in this image?" messages = [ @@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded( "content": [ { "type": "input_image", - "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501 + "image_url": url_encoded_image[raw_image_url], "detail": "auto", }, {"type": "input_text", "text": content_text}, diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py index 75edb79fb4af4..8312231716935 100644 --- a/tests/v1/kv_connector/unit/test_example_connector.py +++ b/tests/v1/kv_connector/unit/test_example_connector.py @@ -9,7 +9,7 @@ from PIL import Image from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.config import KVTransferConfig -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url from vllm.platforms import current_platform MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8" @@ -74,7 +74,7 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]): placeholders = [ { "type": "image_url", - "image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"}, + "image_url": {"url": encode_image_url(image_pil)}, } for image_pil in image_urls ] diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 20ef566416b8f..f4389a405196f 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ 
b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import ( has_kv_transfer_group, ) from vllm.forward_context import ForwardContext +from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.platforms.interface import Platform from vllm.sampling_params import SamplingParams from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput from vllm.v1.request import RequestStatus @@ -1265,6 +1268,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend): run_test_and_cleanup() +class RequestIdMapper: + """Helper class to map external request IDs to internal request IDs.""" + + def __init__(self, output_processor: OutputProcessor): + self.req_id_mapping: dict[str, str] = {} + self.original_add_request = output_processor.add_request + output_processor.add_request = self._add_request + + def _add_request(self, request: EngineCoreRequest, *args, **kwargs): + self.req_id_mapping[request.external_req_id] = request.request_id + return self.original_add_request(request, *args, **kwargs) + + def __call__(self, external_req_id: str) -> str: + return self.req_id_mapping[external_req_id] + + def _run_abort_timeout_test(llm: LLM, timeout: int): """Helper function to run the abort timeout test logic.""" remote_prefill_opts = { @@ -1286,24 +1305,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int): 0 ].req_to_blocks + id_mapper = RequestIdMapper(llm.llm_engine.output_processor) + + def req_id(outputs: list[RequestOutput]) -> str: + assert len(outputs) == 1 + return id_mapper(outputs[0].request_id) + padding = "Just making this request a little longer so that we're sure " "we're not hitting the small-request lower bound beneath which we don't " "actually trigger the 
whole kv transfer, but rather just recompute the " "blocks on D." - _ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params) + req0_id = req_id( + llm.generate([f"What is the capital of Japan? {padding}"], sampling_params) + ) # Request finished but not freed - assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks + assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks # Some other request, 0 still not freed - _ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params) - assert "0" in req_to_blocks - assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks + req1_id = req_id( + llm.generate([f"What is the capital of Italy? {padding}"], sampling_params) + ) + assert req0_id in req_to_blocks + assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks # Wait for timeout and trigger another scheduler loop time.sleep(timeout) _ = llm.generate([f"What is the capital of France? {padding}"], sampling_params) # Request-0 times out and is cleared! 
- assert "0" not in req_to_blocks + assert req0_id not in req_to_blocks # Need to shutdown the background thread to release NIXL side channel port llm.llm_engine.engine_core.shutdown() diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index f63cd3a6e42aa..a5e326e82c592 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -306,10 +306,16 @@ def test_prepare_inputs_padded(): proposer = _create_proposer("eagle", num_speculative_tokens) - output_metadata, token_indices_to_sample = proposer.prepare_inputs_padded( - common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count + output_metadata, token_indices_to_sample, num_rejected_tokens_gpu = ( + proposer.prepare_inputs_padded( + common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count + ) ) + # Verify num_rejected_tokens_gpu is calculated correctly + expected_num_rejected = torch.tensor([1, 0, 2], dtype=torch.int32, device=device) + assert torch.equal(num_rejected_tokens_gpu, expected_num_rejected) + assert output_metadata.max_query_len == 3 assert torch.equal(output_metadata.query_start_loc, expected_query_start_loc) assert torch.equal(token_indices_to_sample, expected_token_indices_to_sample) diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index 5bf823417d4dc..3caa7c14b393b 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -4,7 +4,7 @@ import openai import pytest -from vllm.multimodal.utils import encode_image_base64 +from vllm.multimodal.utils import encode_image_url from vllm.platforms import current_platform from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS @@ -12,11 +12,9 @@ from ...utils import RemoteOpenAIServer @pytest.fixture(scope="session") -def base64_encoded_image(local_asset_server) -> dict[str, str]: +def url_encoded_image(local_asset_server) -> dict[str, str]: return { - image_asset: encode_image_base64( - 
local_asset_server.get_image_asset(image_asset) - ) + image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset)) for image_asset in TEST_IMAGE_ASSETS } @@ -24,19 +22,16 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]: @pytest.mark.asyncio @pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU") @pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"]) -async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]): +async def test_basic_vision(model_name: str, url_encoded_image: dict[str, str]): pytest.skip("Skip this test until it's fixed.") - def whats_in_this_image_msg(b64): + def whats_in_this_image_msg(url): return [ { "role": "user", "content": [ {"type": "text", "text": "What's in this image?"}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, - }, + {"type": "image_url", "image_url": {"url": url}}, ], } ] @@ -63,14 +58,14 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str # Other requests now should be much faster for image_url in TEST_IMAGE_ASSETS: - image_base64 = base64_encoded_image[image_url] - chat_completion_from_base64 = await client.chat.completions.create( + image_url = url_encoded_image[image_url] + chat_completion_from_url = await client.chat.completions.create( model=model_name, - messages=whats_in_this_image_msg(image_base64), + messages=whats_in_this_image_msg(image_url), max_completion_tokens=24, temperature=0.0, ) - result = chat_completion_from_base64 + result = chat_completion_from_url assert result choice = result.choices[0] assert choice.finish_reason == "length" diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 0eae279acf5be..299c8219120ae 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -4,6 +4,7 @@ import functools from collections.abc import Callable import torch +from torch._ops import OpOverload import vllm.envs as envs from vllm.platforms import 
current_platform @@ -433,16 +434,16 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_impl( from aiter import rmsnorm2d_fwd_with_add residual_out = torch.empty_like(residual) - output = torch.empty_like(x) + out = torch.empty_like(x) rmsnorm2d_fwd_with_add( - output, # output + out, # output x, # input residual, # residual input residual_out, # residual output weight, variance_epsilon, ) - return output, residual_out + return out, residual_out def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( @@ -451,7 +452,84 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( weight: torch.Tensor, variance_epsilon: float, ) -> tuple[torch.Tensor, torch.Tensor]: - return torch.empty_like(x), torch.empty_like(residual) + residual_out = torch.empty_like(residual) + out = torch.empty_like(x) + return out, residual_out + + +def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + epsilon: float, + quant_dtype: torch.dtype, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + import aiter as rocm_aiter + + assert quant_dtype in [torch.int8, _FP8_DTYPE] + + y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device) + out = torch.empty(x.shape, dtype=quant_dtype, device=x.device) + residual_out = torch.empty_like(x) + + rocm_aiter.rmsnorm2d_fwd_with_add_dynamicquant( + out, + x, + residual, + residual_out, + y_scale, + weight, + epsilon, + use_model_sensitive_rmsnorm=0, + ) + + return out, residual_out, y_scale + + +def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + epsilon: float, + quant_dtype: torch.dtype, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device) + out = torch.empty(x.shape, dtype=quant_dtype, device=x.device) + residual_out = torch.empty_like(x) + + return out, residual_out, y_scale + + +def _rocm_aiter_rmsnorm_fused_dynamic_quant_impl( + x: 
torch.Tensor, + weight: torch.Tensor, + epsilon: float, + quant_dtype: torch.dtype, +) -> tuple[torch.Tensor, torch.Tensor]: + import aiter as rocm_aiter + + assert quant_dtype in [torch.int8, _FP8_DTYPE] + + y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device) + out = torch.empty(x.shape, dtype=quant_dtype, device=x.device) + + rocm_aiter.rmsnorm2d_fwd_with_dynamicquant( + out, x, y_scale, weight, epsilon, use_model_sensitive_rmsnorm=0 + ) + + return out, y_scale + + +def _rocm_aiter_rmsnorm_fused_dynamic_quant_fake( + x: torch.Tensor, + weight: torch.Tensor, + epsilon: float, + quant_dtype: torch.dtype, +) -> tuple[torch.Tensor, torch.Tensor]: + y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device) + out = torch.empty(x.shape, dtype=quant_dtype, device=x.device) + + return out, y_scale def _rocm_aiter_per_tensor_quant_impl( @@ -527,7 +605,11 @@ def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl( dtype_quant=AITER_FP8_DTYPE, res1=residual, ) - return (x_quant, x_quant_scales, res) + return ( + x_quant, + res, + x_quant_scales, + ) def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake( @@ -541,8 +623,8 @@ def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake( scale_shape = (M, (N + group_size - 1) // group_size) return ( torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device), - torch.empty(scale_shape, dtype=torch.float32, device=x.device), torch.empty_like(residual, device=residual.device), + torch.empty(scale_shape, dtype=torch.float32, device=x.device), ) @@ -761,7 +843,7 @@ class rocm_aiter_ops: @classmethod @if_aiter_supported - def is_linear_fp8_enaled(cls) -> bool: + def is_linear_fp8_enabled(cls) -> bool: return cls.is_linear_enabled() @classmethod @@ -901,6 +983,20 @@ class rocm_aiter_ops: dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_rmsnorm_fused_dynamic_quant", + op_func=_rocm_aiter_rmsnorm_fused_dynamic_quant_impl, + 
fake_impl=_rocm_aiter_rmsnorm_fused_dynamic_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_rmsnorm_fused_add_dynamic_quant", + op_func=_rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl, + fake_impl=_rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + direct_register_custom_op( op_name="rocm_aiter_rmsnorm_fp8_group_quant", op_func=_rocm_aiter_rmsnorm_fp8_group_quant_impl, @@ -936,13 +1032,54 @@ class rocm_aiter_ops: direct_register_custom_op( op_name="rocm_aiter_per_token_quant", op_func=_rocm_aiter_per_token_quant_impl, - mutates_args=["scale"], fake_impl=_rocm_aiter_per_token_quant_fake, dispatch_key=current_platform.dispatch_key, ) _OPS_REGISTERED = True + @staticmethod + def get_rmsnorm_fused_add_op() -> OpOverload: + return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default + + @staticmethod + def get_rmsnorm_op() -> OpOverload: + return torch.ops.vllm.rocm_aiter_rms_norm.default + + @staticmethod + def get_rmsnorm_fused_add_dynamic_quant_op() -> OpOverload: + return torch.ops.vllm.rocm_aiter_rmsnorm_fused_add_dynamic_quant.default + + @staticmethod + def get_rmsnorm_fused_dynamic_quant_op() -> OpOverload: + return torch.ops.vllm.rocm_aiter_rmsnorm_fused_dynamic_quant.default + + @staticmethod + def get_rmsnorm_group_fused_quant_op() -> OpOverload: + return torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default + + @staticmethod + def get_rmsnorm_group_add_fused_quant_op() -> OpOverload: + return torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default + + @staticmethod + def get_per_token_quant_op() -> OpOverload: + return torch.ops.vllm.rocm_aiter_per_token_quant.default + + @staticmethod + def get_group_quant_op() -> OpOverload: + return torch.ops.vllm.rocm_aiter_group_fp8_quant.default + + @staticmethod + def get_act_mul_fused_fp8_group_quant_op() -> OpOverload: + return 
torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default + + @staticmethod + def rms_norm( + x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float + ) -> torch.Tensor: + return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon) + @staticmethod def rms_norm2d_with_add( x: torch.Tensor, @@ -954,12 +1091,6 @@ class rocm_aiter_ops: x, residual, weight, variance_epsilon ) - @staticmethod - def rms_norm( - x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float - ) -> torch.Tensor: - return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon) - @staticmethod def gemm_a8w8( A: torch.Tensor, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index cf7f17a033be3..c1519fc177250 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -788,20 +788,6 @@ def cutlass_scaled_mm_supports_fp4(cuda_device_capability: int) -> bool: return torch.ops._C.cutlass_scaled_mm_supports_fp4(cuda_device_capability) -def cutlass_blockwise_scaled_grouped_mm( - output: torch.Tensor, - a: torch.Tensor, - b: torch.Tensor, - scales_a: torch.Tensor, - scales_b: torch.Tensor, - problem_sizes: torch.Tensor, - expert_offsets: torch.Tensor, -): - torch.ops._C.cutlass_blockwise_scaled_grouped_mm( - output, a, b, scales_a, scales_b, problem_sizes, expert_offsets - ) - - def cutlass_scaled_fp4_mm( a: torch.Tensor, b: torch.Tensor, @@ -2342,18 +2328,6 @@ def concat_and_cache_mla( ) -def copy_blocks( - key_caches: list[torch.Tensor], - value_caches: list[torch.Tensor], - block_mapping: torch.Tensor, -) -> None: - torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) - - -def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None: - torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping) - - def swap_blocks( src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor ) -> None: diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 95c17cb331f67..239f5376eb462 100644 --- a/vllm/_ipex_ops.py 
+++ b/vllm/_ipex_ops.py @@ -383,18 +383,6 @@ class ipex_ops: ) return None - @staticmethod - def copy_blocks( - key_caches: list[torch.Tensor], - value_caches: list[torch.Tensor], - block_mapping: torch.Tensor, - ) -> None: - torch.xpu.copy_blocks( # type: ignore - key_caches, - value_caches, - block_mapping, - ) - @staticmethod def swap_blocks( src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor ) -> None: diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index ed0021db204ac..416b996df9f22 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -77,7 +77,8 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): ) CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend" # Placeholder for third-party/custom backends - must be registered before use - CUSTOM = "" + # set to None to avoid aliasing with another backend whose value is an empty string + CUSTOM = None def get_path(self, include_classname: bool = True) -> str: """Get the class path for this backend (respects overrides. @@ -139,7 +140,8 @@ class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend" GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend" # Placeholder for third-party/custom backends - must be registered before use - CUSTOM = "" + # set to None to avoid aliasing with another backend whose value is an empty string + CUSTOM = None def get_path(self, include_classname: bool = True) -> str: """Get the class path for this backend (respects overrides.
diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/attention/layers/mm_encoder_attention.py index 25f54cc867b5a..1c1623b13f55a 100644 --- a/vllm/attention/layers/mm_encoder_attention.py +++ b/vllm/attention/layers/mm_encoder_attention.py @@ -136,7 +136,7 @@ class MMEncoderAttention(CustomOp): cu_seqlens=cu_seqlens, ) if is_reshaped: - output = output.view(bsz, q_len, -1) + output = output.reshape(bsz, q_len, -1) return output def _forward_fa( @@ -174,7 +174,7 @@ class MMEncoderAttention(CustomOp): fa_version=self._fa_version, ) if is_reshaped: - output = output.view(bsz, q_len, -1) + output = output.reshape(bsz, q_len, -1) return output def forward_native( diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py index 16106f3c93a6a..f347fb3fbba51 100644 --- a/vllm/attention/ops/merge_attn_states.py +++ b/vllm/attention/ops/merge_attn_states.py @@ -15,7 +15,7 @@ def merge_attn_states( output_lse: torch.Tensor | None = None, ) -> None: # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel - # is not support for FP8 dtype, fallback to use Triton kernel. + # does not support FP8 dtype, fallback to use Triton kernel. 
def supported_dtypes(o: torch.Tensor) -> bool: return o.dtype in [torch.float32, torch.half, torch.bfloat16] diff --git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py index ec9ed34f561b4..7301aa3e5932d 100644 --- a/vllm/compilation/matcher_utils.py +++ b/vllm/compilation/matcher_utils.py @@ -6,11 +6,13 @@ import torch from torch._higher_order_ops import auto_functionalized from torch._ops import OpOverload +from vllm._aiter_ops import rocm_aiter_ops from vllm.config import get_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, QuantKey, _normalize_quant_group_shape, kFp8Dynamic64Sym, @@ -150,26 +152,50 @@ class MatcherRotaryEmbedding(MatcherCustomOp): class MatcherRMSNorm(MatcherCustomOp): - def __init__(self, epsilon: float, enabled: bool | None = None): + def __init__( + self, + epsilon: float, + enabled: bool | None = None, + match_rocm_aiter: bool = False, + ): if enabled is None: enabled = RMSNorm.enabled() super().__init__(enabled) self.epsilon = epsilon + self._rmsnorm_op = RMS_OP + self.match_rocm_aiter = match_rocm_aiter + + if match_rocm_aiter: + self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_op() def inputs(self): input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16) weight = self.empty(16) return [input, weight] + def forward_rocm_aiter( + self, + input: torch.Tensor, + weight: torch.Tensor, + ) -> torch.Tensor: + return self._rmsnorm_op( + x=input, + weight=weight, + variance_epsilon=self.epsilon, + ) + def forward_custom( self, input: torch.Tensor, weight: torch.Tensor, ) -> torch.Tensor: + if self.match_rocm_aiter: + return self.forward_rocm_aiter(input, weight) + result = torch.empty_like(input) _, result = auto_functionalized( - RMS_OP, + self._rmsnorm_op, 
result=result, input=input, weight=weight, @@ -189,12 +215,23 @@ class MatcherRMSNorm(MatcherCustomOp): class MatcherFusedAddRMSNorm(MatcherCustomOp): - def __init__(self, epsilon: float, enabled: bool | None = None): + def __init__( + self, + epsilon: float, + enabled: bool | None = None, + match_rocm_aiter: bool = False, + ): if enabled is None: enabled = RMSNorm.enabled() super().__init__(enabled) self.epsilon = epsilon + self.match_rocm_aiter = match_rocm_aiter + + self._rmsnorm_op = RMS_ADD_OP + + if match_rocm_aiter: + self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_fused_add_op() def inputs(self): input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16) @@ -202,14 +239,27 @@ class MatcherFusedAddRMSNorm(MatcherCustomOp): residual = self.empty(5, 16) return [input, weight, residual] + def forward_rocm_aiter( + self, + input: torch.Tensor, + weight: torch.Tensor, + residual: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + return self._rmsnorm_op( + x=input, residual=residual, weight=weight, variance_epsilon=self.epsilon + ) + def forward_custom( self, input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: + if self.match_rocm_aiter: + return self.forward_rocm_aiter(input, weight, residual) + _, result, residual = auto_functionalized( - RMS_ADD_OP, + self._rmsnorm_op, input=input, residual=residual, weight=weight, @@ -236,22 +286,46 @@ class MatcherQuantFP8(MatcherCustomOp): enabled: bool | None = None, has_col_major_scales: bool = False, is_e8m0: bool = False, + match_rocm_aiter: bool = False, ): if enabled is None: enabled = QuantFP8.enabled() super().__init__(enabled) self.quant_key = quant_key - assert quant_key in QUANT_OPS, f"unsupported quantization scheme {quant_key}" - self.QUANT_OP = QUANT_OPS[quant_key] - self.has_col_major_scales = has_col_major_scales self.is_e8m0 = is_e8m0 + self.match_rocm_aiter = match_rocm_aiter + + if match_rocm_aiter: + assert not 
quant_key.scale.group_shape.is_per_tensor(), ( + "ROCm aiter fusion pass does not support per tensor quantization" + ) + if quant_key.scale.group_shape.is_per_token(): + self.QUANT_OP = rocm_aiter_ops.get_per_token_quant_op() + else: + assert quant_key.scale.group_shape.col == 128, ( + "ROCm aiter fusion pass currently supports " + "quantization operation with group_size 128" + ) + if current_platform.is_fp8_fnuz(): + self.QUANT_OP = rocm_aiter_ops.get_group_quant_op() + else: + self.QUANT_OP = ( + torch.ops.vllm.triton_per_token_group_quant_fp8.default + ) + + else: + assert quant_key in QUANT_OPS, ( + f"unsupported quantization scheme {quant_key}" + ) + self.QUANT_OP = QUANT_OPS[quant_key] + + assert quant_key.dtype == current_platform.fp8_dtype(), ( + "Only QuantFP8 supported by" + ) + assert quant_key.scale2 is None - assert quant_key.dtype == current_platform.fp8_dtype(), ( - "Only QuantFP8 supported by" - ) - assert quant_key.scale2 is None self.quant_fp8 = QuantFP8( quant_key.scale.static, quant_key.scale.group_shape, @@ -259,11 +333,29 @@ class MatcherQuantFP8(MatcherCustomOp): use_ue8m0=is_e8m0, ) + def forward_rocm_aiter( + self, + input: torch.Tensor, + scale: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + quant_key_group_shape = self.quant_key.scale.group_shape + if quant_key_group_shape == GroupShape.PER_TOKEN: + return self.QUANT_OP( + x=input, + quant_dtype=self.quant_key.dtype, + scale=scale, + ) + else: + return self.QUANT_OP(input, quant_key_group_shape.col) + def forward_custom( self, input: torch.Tensor, scale: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: + if self.match_rocm_aiter: + return self.forward_rocm_aiter(input, scale) + result = torch.empty( input.shape, device=input.device, dtype=self.quant_key.dtype ) diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 4ebb386f75ed8..4c2dee505a941 100644 --- a/vllm/compilation/pass_manager.py +++ 
b/vllm/compilation/pass_manager.py @@ -16,7 +16,7 @@ from .vllm_inductor_pass import VllmInductorPass if rocm_aiter_ops.is_enabled(): from vllm.compilation.rocm_aiter_fusion import ( - RocmAiterRMSNormFp8GroupQuantFusionPass, + RocmAiterRMSNormFusionPass, RocmAiterSiluMulFp8GroupQuantFusionPass, ) @@ -117,7 +117,9 @@ class PostGradPassManager(CustomGraphPass): if self.pass_config.fuse_norm_quant: self.passes += [RMSNormQuantFusionPass(config)] if rocm_aiter_ops.is_enabled(): - self.passes += [RocmAiterRMSNormFp8GroupQuantFusionPass(config)] + self.passes += [ + RocmAiterRMSNormFusionPass(config), + ] if self.pass_config.fuse_act_quant: self.passes += [ActivationQuantFusionPass(config)] if rocm_aiter_ops.is_enabled(): diff --git a/vllm/compilation/rocm_aiter_fusion.py b/vllm/compilation/rocm_aiter_fusion.py index 8b5db9de38181..f66bb76b97f05 100644 --- a/vllm/compilation/rocm_aiter_fusion.py +++ b/vllm/compilation/rocm_aiter_fusion.py @@ -9,60 +9,195 @@ from torch._inductor.pattern_matcher import PatternMatcherPass from torch._ops import OpOverload import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401 +from vllm._aiter_ops import rocm_aiter_ops from vllm.compilation.activation_quant_fusion import ActivationQuantPattern from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + QuantKey, + ScaleDesc, +) from vllm.platforms import current_platform -from .fusion import empty_bf16 +from .fusion import ( + FusedRMSQuantKey, +) from .inductor_pass import enable_fake_mode -from .matcher_utils import MatcherSiluAndMul +from .matcher_utils import ( + MatcherFusedAddRMSNorm, + MatcherQuantFP8, + MatcherRMSNorm, + MatcherSiluAndMul, +) from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass logger = init_logger(__name__) FP8_DTYPE = current_platform.fp8_dtype() -AITER_RMS_GROUP_QUANT_OP = 
torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default -AITER_RMS_ADD_GROUP_QUANT_OP = ( - torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default -) -AITER_RMS_OP = torch.ops.vllm.rocm_aiter_rms_norm.default -AITER_RMS_ADD_OP = torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default +class AiterRMSNormQuantPattern: + def __init__( + self, epsilon: float, key: FusedRMSQuantKey, match_aiter_quant: bool = True + ): + self.epsilon = epsilon + self.quant_dtype = key.quant.dtype -AITER_GROUP_FP8_QUANT_OP = torch.ops.vllm.rocm_aiter_group_fp8_quant.default -TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default - -FUSED_SILU_MUL_QUANT_OP = torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default + self.rmsnorm_matcher = ( + MatcherRMSNorm(epsilon, match_rocm_aiter=True) + if not key.fused_add + else MatcherFusedAddRMSNorm(epsilon, match_rocm_aiter=True) + ) + self.quant_matcher = MatcherQuantFP8( + key.quant, + match_rocm_aiter=match_aiter_quant, + ) -class AiterRMSFp8GroupQuantPattern: +class AiterRMSNormDynamicQuantPattern(AiterRMSNormQuantPattern): + """AITER RMSNorm + Dynamic Quantization pattern.""" + + FUSED_OP = rocm_aiter_ops.get_rmsnorm_fused_dynamic_quant_op() + + def __init__( + self, + epsilon: float, + quant_dtype: torch.dtype, + match_aiter_quant: bool = True, + group_shape: GroupShape = GroupShape.PER_TOKEN, + symmetric=True, + ): + scale = ScaleDesc(torch.float32, False, group_shape) + key = FusedRMSQuantKey( + fused_add=False, + quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), + ) + + super().__init__(epsilon, key, match_aiter_quant) + + def register(self, pm_pass): + def pattern( + input: torch.Tensor, + weight: torch.Tensor, + ): + result_rms = self.rmsnorm_matcher(input, weight) + result, scale = self.quant_matcher(result_rms) + return result, scale + + def replacement( + input: torch.Tensor, + weight: torch.Tensor, + ): + result = self.FUSED_OP( + x=input, + weight=weight, + 
epsilon=self.epsilon, + quant_dtype=self.quant_dtype, + ) + + return result[0], result[1] + + pm.register_replacement( + pattern, + replacement, + self.rmsnorm_matcher.inputs(), + pm.fwd_only, + pm_pass, + ) + + +class AiterFusedAddRMSNormDynamicQuantPattern(AiterRMSNormQuantPattern): + """AITER RMSNorm Fused Add + Dynamic Quantization pattern.""" + + FUSED_OP = rocm_aiter_ops.get_rmsnorm_fused_add_dynamic_quant_op() + + def __init__( + self, + epsilon: float, + quant_dtype: torch.dtype, + match_aiter_quant: bool = True, + group_shape: GroupShape = GroupShape.PER_TOKEN, + symmetric=True, + ): + scale = ScaleDesc(torch.float32, False, group_shape) + key = FusedRMSQuantKey( + fused_add=True, + quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), + ) + + super().__init__(epsilon, key, match_aiter_quant) + + def register(self, pm_pass): + def pattern( + input: torch.Tensor, + weight: torch.Tensor, + residual: torch.Tensor, + ): + result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual) + result, scale = self.quant_matcher(result_rms) + + return result, residual_out, scale + + def replacement( + input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor + ): + result = self.FUSED_OP( + x=input, + residual=residual, + weight=weight, + epsilon=self.epsilon, + quant_dtype=self.quant_dtype, + ) + + return result[0], result[1], result[2] + + pm.register_replacement( + pattern, + replacement, + self.rmsnorm_matcher.inputs(), + pm.fwd_only, + pm_pass, + ) + + +class AiterRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern): """ This pattern fuses aiter rms_norm & group fp8 quant custom ops into an aiter rms_norm_group_fp8_quant op. 
""" - def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload): - self.epsilon = epsilon - self.quant_dtype = quant_dtype - self.quant_op = quant_op + FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_fused_quant_op() + + def __init__( + self, + epsilon: float, + quant_dtype: torch.dtype, + group_shape: GroupShape, + match_aiter_quant: bool = True, + symmetric=True, + ): + scale = ScaleDesc(torch.float32, False, group_shape) + key = FusedRMSQuantKey( + fused_add=False, + quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), + ) + + super().__init__(epsilon, key, match_aiter_quant) def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, weight: torch.Tensor, ): - at1 = AITER_RMS_OP(x=input, weight=weight, variance_epsilon=self.epsilon) - - at2 = self.quant_op(at1, 128) - - return at2[0], at2[1] + result_rms = self.rmsnorm_matcher(input, weight) + result, scale = self.quant_matcher(result_rms) + return result, scale def replacement( input: torch.Tensor, weight: torch.Tensor, ): - at = AITER_RMS_GROUP_QUANT_OP( + at = self.FUSED_OP( x=input, weight=weight, variance_epsilon=self.epsilon, @@ -71,49 +206,52 @@ class AiterRMSFp8GroupQuantPattern: return at[0], at[1] - inputs = [ - empty_bf16(5, 4), # input - empty_bf16(1, 5), # weight - ] - - pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + pm.register_replacement( + pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass + ) -class AiterFusedAddRMSFp8GroupQuantPattern: +class AiterFusedAddRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern): """ This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops into a aiter rms_norm_with_add_group_fp8_quant op. 
""" - def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload): - self.epsilon = epsilon - self.quant_dtype = quant_dtype - self.quant_op = quant_op + FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_add_fused_quant_op() + + def __init__( + self, + epsilon: float, + quant_dtype: torch.dtype, + group_shape: GroupShape, + match_aiter_quant: bool = True, + symmetric=True, + ): + scale = ScaleDesc(torch.float32, False, group_shape) + key = FusedRMSQuantKey( + fused_add=True, + quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric), + ) + + super().__init__(epsilon, key, match_aiter_quant) def register(self, pm_pass: PatternMatcherPass): def pattern( input: torch.Tensor, - residual: torch.Tensor, weight: torch.Tensor, + residual: torch.Tensor, ): - at1 = AITER_RMS_ADD_OP( - x=input, - residual=residual, - weight=weight, - variance_epsilon=self.epsilon, - ) + result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual) + result, scale = self.quant_matcher(result_rms) - at2 = self.quant_op(at1[0], 128) - - # result, scale, residual - return at2[0], at2[1], at1[1] + return result, residual_out, scale def replacement( input: torch.Tensor, - residual: torch.Tensor, weight: torch.Tensor, + residual: torch.Tensor, ): - at = AITER_RMS_ADD_GROUP_QUANT_OP( + at = self.FUSED_OP( x=input, residual=residual, weight=weight, @@ -124,18 +262,15 @@ class AiterFusedAddRMSFp8GroupQuantPattern: # result, scale, residual return at[0], at[1], at[2] - inputs = [ - empty_bf16(5, 4), # input - empty_bf16(5, 4), # residual - empty_bf16(1, 5), # weight - ] - - pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + pm.register_replacement( + pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass + ) -class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass): +class RocmAiterRMSNormFusionPass(VllmPatternMatcherPass): """ - This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op. 
+ This pass fuses aiter rms_norm & vllm/aiter quant custom ops + into a fused rms_norm_quant op. It also supports fused_add_rms_norm. """ @@ -144,20 +279,33 @@ class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass): super().__init__(config) self.patterns: PatternMatcherPass = PatternMatcherPass( - pass_name="rocm_aiter_rms_norm_fp8_group_quant_fusion_pass" + pass_name="rocm_aiter_rms_norm_quant_fusion_pass" ) # Make sure fused add patterns are before simple rms norm, # as the latter is a subset of the former in torch ops for epsilon in [1e-5, 1e-6]: - # Fuse rms_norm + dynamic group fp8 quant - for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]: - AiterRMSFp8GroupQuantPattern(epsilon, FP8_DTYPE, quant_op).register( - self.patterns - ) + # Fuse aiter rms_norm + aiter dynamic group fp8 quant + AiterRMSFp8GroupQuantPattern( + epsilon, FP8_DTYPE, GroupShape(1, 128) + ).register(self.patterns) - AiterFusedAddRMSFp8GroupQuantPattern( - epsilon, FP8_DTYPE, quant_op + # Fuse aiter fused_add_rms_norm + aiter dynamic group fp8 quant + AiterFusedAddRMSFp8GroupQuantPattern( + epsilon, FP8_DTYPE, GroupShape(1, 128) + ).register(self.patterns) + + for match_aiter_quant in [True, False]: + # Fuse aiter rms_norm + (aiter / vllm built-in) + # dynamic per-token fp8 quant + AiterRMSNormDynamicQuantPattern( + epsilon, FP8_DTYPE, match_aiter_quant=match_aiter_quant + ).register(self.patterns) + + # Fuse aiter fused_add_rms_norm + (aiter / vllm built-in) + # dynamic per-token fp8 quant + AiterFusedAddRMSNormDynamicQuantPattern( + epsilon, FP8_DTYPE, match_aiter_quant=match_aiter_quant ).register(self.patterns) self.dump_patterns(config, self.patterns) @@ -169,6 +317,8 @@ class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass): def uuid(self) -> Any: fusion_patterns = [ + AiterRMSNormDynamicQuantPattern, + AiterFusedAddRMSNormDynamicQuantPattern, AiterRMSFp8GroupQuantPattern, AiterFusedAddRMSFp8GroupQuantPattern, ] @@ -181,6 +331,8 @@ class 
AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): ops into an aiter silu_and_mul_group_fp8_quant op. """ + FUSED_SILU_MUL_QUANT_OP = rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_op() + def __init__(self, quant_op: OpOverload): self.silu_and_mul_matcher = MatcherSiluAndMul() self.quant_op = quant_op @@ -196,7 +348,7 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): def replacement( input: torch.Tensor, ): - at = FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128) + at = self.FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128) return at[0], at[1] inputs = [ @@ -216,6 +368,11 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass): https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 """ + AITER_GROUP_FP8_QUANT_OP = rocm_aiter_ops.get_group_quant_op() + TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default + + QUANT_OPS = [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP] + @enable_fake_mode def __init__(self, config: VllmConfig): super().__init__(config) @@ -224,7 +381,7 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass): pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass" ) - for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]: + for quant_op in self.QUANT_OPS: AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns) self.dump_patterns(config, self.patterns) diff --git a/vllm/config/model.py b/vllm/config/model.py index db5789b709372..e26b227de976c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -11,7 +11,6 @@ import torch from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE -from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs from vllm.attention.backends.registry import AttentionBackendEnum @@ -29,6 +28,7 @@ from 
vllm.transformers_utils.config import ( get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, + is_rope_parameters_nested, try_get_dense_modules, try_get_generation_config, try_get_safetensors_metadata, @@ -71,7 +71,7 @@ else: logger = init_logger(__name__) RunnerOption = Literal["auto", RunnerType] -ConvertType = Literal["none", "embed", "classify", "reward"] +ConvertType = Literal["none", "embed", "classify", "reward", "mm_encoder_only"] ConvertOption = Literal["auto", ConvertType] TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] @@ -843,12 +843,18 @@ class ModelConfig: producer_name = quant_cfg.get("producer", {}).get("name") if producer_name == "modelopt": quant_algo = quant_cfg.get("quantization", {}).get("quant_algo") - if quant_algo == "FP8": - quant_cfg["quant_method"] = "modelopt" - elif quant_algo == "NVFP4": - quant_cfg["quant_method"] = "modelopt_fp4" - elif quant_algo is not None: - raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") + if quant_algo is not None: + quant_algo_upper = str(quant_algo).upper() + if quant_algo_upper in { + "FP8", + "FP8_PER_CHANNEL_PER_TOKEN", + "FP8_PB_WO", + }: + quant_cfg["quant_method"] = "modelopt" + elif quant_algo_upper == "NVFP4": + quant_cfg["quant_method"] = "modelopt_fp4" + else: + raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") return quant_cfg @@ -1088,11 +1094,10 @@ class ModelConfig: # The size of inputs_embeds is usually identical to the size # of the hidden states, however there are exceptions, such as # embedding models like CLIP and SigLIP - for target_attr in ("projection_dim", "projection_size"): - if hasattr(self.hf_text_config, target_attr): - return getattr(self.hf_text_config, target_attr) - - return self.get_hidden_size() + names = ("projection_dim", "projection_size") + return getattr_iter( + self.hf_text_config, names, 
default_factory=self.get_hidden_size + ) @property def is_deepseek_mla(self) -> bool: @@ -1225,14 +1230,12 @@ class ModelConfig: # For ChatGLM: "multi_query_group_num", ] - for attr in attributes: - num_kv_heads = getattr(self.hf_text_config, attr, None) - if num_kv_heads is not None: - return num_kv_heads - # For non-grouped-query attention models, the number of KV heads is # equal to the number of attention heads. - return self.hf_text_config.num_attention_heads + default_factory = lambda: self.hf_text_config.num_attention_heads + return getattr_iter( + self.hf_text_config, attributes, default_factory=default_factory + ) def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: """Returns the number of KV heads per GPU.""" @@ -1536,6 +1539,10 @@ class ModelConfig: def is_multimodal_raw_input_only_model(self) -> bool: return self._model_info.supports_multimodal_raw_input_only + @property + def requires_raw_input_tokens(self) -> bool: + return self._model_info.requires_raw_input_tokens + @property def is_cross_encoder(self) -> bool: return ( @@ -2119,9 +2126,7 @@ def _get_and_verify_max_len( # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. # To simplify the verification, we convert it to dict[str, TypedDict]. 
rope_parameters = getattr(hf_config, "rope_parameters", None) - if rope_parameters and not set(rope_parameters.keys()).issubset( - ALLOWED_LAYER_TYPES - ): + if rope_parameters and not is_rope_parameters_nested(rope_parameters): rope_parameters = {"": rope_parameters} # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 470296517deb1..614373782d12f 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -9,7 +9,7 @@ import inspect import json import pathlib import textwrap -from collections.abc import Iterable, Mapping, Sequence, Set +from collections.abc import Callable, Iterable, Mapping, Sequence, Set from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar @@ -74,7 +74,11 @@ def get_field(cls: ConfigType, name: str) -> Field: def getattr_iter( - object: object, names: Iterable[str], default: Any, warn: bool = False + object: object, + names: Iterable[str], + default: Any | None = None, + default_factory: Callable[[], Any] | None = None, + warn: bool = False, ) -> Any: """ A helper function that retrieves an attribute from an object which may @@ -96,7 +100,7 @@ def getattr_iter( names[0], ) return getattr(object, name) - return default + return default_factory() if default_factory is not None else default def contains_object_print(text: str) -> bool: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py index 705960aebe2da..9a15d3fa6ed09 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py @@ -408,7 +408,13 @@ class MooncakeConnectorWorker: self.engine = TransferEngine() self.hostname = get_ip() - ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", 
"rdma", "") + protocol = self.vllm_config.kv_transfer_config.kv_connector_extra_config.get( # type: ignore[union-attr] + "mooncake_protocol", "rdma" + ) + logger.info( + "The Mooncake Transfer Engine is using %s as its protocol.", protocol + ) + ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", protocol, "") if ret_value != 0: raise RuntimeError("Mooncake Transfer Engine initialization failed.") diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index d94951a0cffc8..bf656cf23de65 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -71,7 +71,11 @@ class EngineClient(ABC): truncate_prompt_tokens: int | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from a pooling model.""" + """Generate outputs for a request from a pooling model. + + NOTE: truncate_prompt_tokens is deprecated in v0.14. + TODO: Remove this argument in v0.15. + """ ... @abstractmethod diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index ab055dfb1fb0e..5e31f60ad0ca8 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -67,6 +67,15 @@ else: logger = init_logger(__name__) + +class ChatTemplateResolutionError(ValueError): + """Raised when chat template resolution fails. + + This is a subclass of ValueError for backward compatibility with + existing exception handlers. + """ + + MODALITY_PLACEHOLDERS_MAP = { "image": "<##IMAGE##>", "audio": "<##AUDIO##>", @@ -1814,7 +1823,7 @@ def apply_hf_chat_template( ) if hf_chat_template is None: - raise ValueError( + raise ChatTemplateResolutionError( "As of transformers v4.44, default chat template is no longer " "allowed, so you must provide a chat template if the tokenizer " "does not define one." 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2768e267f4837..6be1f1a126f55 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1280,6 +1280,7 @@ class LLM: pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, tokenization_kwargs: dict[str, Any] | None = None, + score_template: str | None = None, ) -> list[ScoringRequestOutput]: model_config = self.model_config @@ -1313,6 +1314,7 @@ class LLM: data_2=d, tokenizer=tokenizer, tokenization_kwargs=tokenization_kwargs, + score_template=score_template, ) if token_type_ids := engine_prompt.pop("token_type_ids", None): @@ -1347,6 +1349,7 @@ class LLM: use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + chat_template: str | None = None, ) -> list[ScoringRequestOutput]: """Generate similarity scores for all pairs `` or ``. @@ -1379,6 +1382,8 @@ class LLM: lora_request: LoRA request to use for generation, if any. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. + chat_template: The chat template to use for the scoring. If None, we + use the model's default chat template. Returns: A list of `ScoringRequestOutput` objects containing the generated scores in the same order as the input prompts. @@ -1406,6 +1411,11 @@ class LLM: ): raise ValueError("Score API is only enabled for num_labels == 1.") + if not model_config.is_cross_encoder and chat_template is not None: + raise ValueError( + "chat_template is only supported for cross-encoder models." 
+ ) + # the tokenizer for models such as # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # lists of tokens to the `text` and `text_pair` kwargs @@ -1475,6 +1485,7 @@ class LLM: use_tqdm, pooling_params, lora_request, + score_template=chat_template, ) else: return self._embedding_score( @@ -1610,7 +1621,7 @@ class LLM: added_request_ids.append(request_id) except Exception as e: if added_request_ids: - self.llm_engine.abort_request(added_request_ids) + self.llm_engine.abort_request(added_request_ids, internal=True) raise e def _validate_mm_data_and_uuids( @@ -1720,7 +1731,7 @@ class LLM: priority=priority, prompt_text=prompt_text, ) - return request_id + return engine_request.request_id def _run_engine( self, *, use_tqdm: bool | Callable[..., tqdm] = True diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index d45773f5364e3..bc8855a76e2a2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -909,6 +909,16 @@ def build_app(args: Namespace) -> FastAPI: @app.exception_handler(RequestValidationError) async def validation_exception_handler(_: Request, exc: RequestValidationError): + from vllm.entrypoints.openai.protocol import VLLMValidationError + + param = None + for error in exc.errors(): + if "ctx" in error and "error" in error["ctx"]: + ctx_error = error["ctx"]["error"] + if isinstance(ctx_error, VLLMValidationError): + param = ctx_error.parameter + break + exc_str = str(exc) errors_str = str(exc.errors()) @@ -922,6 +932,7 @@ def build_app(args: Namespace) -> FastAPI: message=message, type=HTTPStatus.BAD_REQUEST.phrase, code=HTTPStatus.BAD_REQUEST, + param=param, ) ) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) @@ -1145,6 +1156,7 @@ async def init_app_state( engine_client, state.openai_serving_models, request_logger=request_logger, + score_template=resolved_chat_template, log_error_stack=args.log_error_stack, ) if ("embed" in 
supported_tasks or "score" in supported_tasks) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a3c347cb1bd3f..982f5533ad7f9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -131,6 +131,36 @@ class ErrorResponse(OpenAIBaseModel): error: ErrorInfo +class VLLMValidationError(ValueError): + """vLLM-specific validation error for request validation failures. + + Args: + message: The error message describing the validation failure. + parameter: Optional parameter name that failed validation. + value: Optional value that was rejected during validation. + """ + + def __init__( + self, + message: str, + *, + parameter: str | None = None, + value: Any = None, + ) -> None: + super().__init__(message) + self.parameter = parameter + self.value = value + + def __str__(self): + base = super().__str__() + extras = [] + if self.parameter is not None: + extras.append(f"parameter={self.parameter}") + if self.value is not None: + extras.append(f"value={self.value}") + return f"{base} ({', '.join(extras)})" if extras else base + + class ModelPermission(OpenAIBaseModel): id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}") object: str = "model_permission" @@ -466,7 +496,9 @@ class ResponsesRequest(OpenAIBaseModel): @model_validator(mode="before") def validate_prompt(cls, data): if data.get("prompt") is not None: - raise ValueError("prompt template is not supported") + raise VLLMValidationError( + "prompt template is not supported", parameter="prompt" + ) return data @model_validator(mode="before") @@ -850,7 +882,10 @@ class ChatCompletionRequest(OpenAIBaseModel): @classmethod def validate_stream_options(cls, data): if data.get("stream_options") and not data.get("stream"): - raise ValueError("Stream options can only be defined when `stream=True`.") + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter="stream_options", + ) return data @@ 
-859,19 +894,29 @@ class ChatCompletionRequest(OpenAIBaseModel): def check_logprobs(cls, data): if (prompt_logprobs := data.get("prompt_logprobs")) is not None: if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): - raise ValueError( - "`prompt_logprobs` are not available when `stream=True`." + raise VLLMValidationError( + "`prompt_logprobs` are not available when `stream=True`.", + parameter="prompt_logprobs", ) if prompt_logprobs < 0 and prompt_logprobs != -1: - raise ValueError("`prompt_logprobs` must be a positive value or -1.") + raise VLLMValidationError( + "`prompt_logprobs` must be a positive value or -1.", + parameter="prompt_logprobs", + value=prompt_logprobs, + ) if (top_logprobs := data.get("top_logprobs")) is not None: if top_logprobs < 0 and top_logprobs != -1: - raise ValueError("`top_logprobs` must be a positive value or -1.") + raise VLLMValidationError( + "`top_logprobs` must be a positive value or -1.", + parameter="top_logprobs", + value=top_logprobs, + ) if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"): - raise ValueError( - "when using `top_logprobs`, `logprobs` must be set to true." + raise VLLMValidationError( + "when using `top_logprobs`, `logprobs` must be set to true.", + parameter="top_logprobs", ) return data @@ -1285,9 +1330,10 @@ class CompletionRequest(OpenAIBaseModel): for k in ("json", "regex", "choice") ) if count > 1: - raise ValueError( + raise VLLMValidationError( "You can only use one kind of constraints for structured " - "outputs ('json', 'regex' or 'choice')." + "outputs ('json', 'regex' or 'choice').", + parameter="structured_outputs", ) return data @@ -1296,14 +1342,23 @@ class CompletionRequest(OpenAIBaseModel): def check_logprobs(cls, data): if (prompt_logprobs := data.get("prompt_logprobs")) is not None: if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): - raise ValueError( - "`prompt_logprobs` are not available when `stream=True`." 
+ raise VLLMValidationError( + "`prompt_logprobs` are not available when `stream=True`.", + parameter="prompt_logprobs", ) if prompt_logprobs < 0 and prompt_logprobs != -1: - raise ValueError("`prompt_logprobs` must be a positive value or -1.") + raise VLLMValidationError( + "`prompt_logprobs` must be a positive value or -1.", + parameter="prompt_logprobs", + value=prompt_logprobs, + ) if (logprobs := data.get("logprobs")) is not None and logprobs < 0: - raise ValueError("`logprobs` must be a positive value.") + raise VLLMValidationError( + "`logprobs` must be a positive value.", + parameter="logprobs", + value=logprobs, + ) return data @@ -1311,7 +1366,10 @@ class CompletionRequest(OpenAIBaseModel): @classmethod def validate_stream_options(cls, data): if data.get("stream_options") and not data.get("stream"): - raise ValueError("Stream options can only be defined when `stream=True`.") + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter="stream_options", + ) return data @@ -2138,7 +2196,15 @@ class TranscriptionRequest(OpenAIBaseModel): stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] stream = data.get("stream", False) if any(bool(data.get(so, False)) for so in stream_opts) and not stream: - raise ValueError("Stream options can only be defined when `stream=True`.") + # Find which specific stream option was set + invalid_param = next( + (so for so in stream_opts if data.get(so, False)), + "stream_include_usage", + ) + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter=invalid_param, + ) return data @@ -2351,7 +2417,15 @@ class TranslationRequest(OpenAIBaseModel): stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] stream = data.get("stream", False) if any(bool(data.get(so, False)) for so in stream_opts) and not stream: - raise ValueError("Stream options can only be defined when `stream=True`.") + # Find which specific stream 
option was set + invalid_param = next( + (so for so in stream_opts if data.get(so, False)), + "stream_include_usage", + ) + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter=invalid_param, + ) return data diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 837e742e6be49..2cdb6a6f8eea2 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -495,6 +495,7 @@ async def run_batch( engine_client, openai_serving_models, request_logger=request_logger, + score_template=None, ) if ("embed" in supported_tasks or enable_serving_reranking) else None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d437d1e5c3b06..690fb22e2274d 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -417,8 +417,7 @@ class OpenAIServingChat(OpenAIServing): generators.append(generator) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) assert len(generators) == 1 (result_generator,) = generators @@ -448,8 +447,7 @@ class OpenAIServingChat(OpenAIServing): except GenerationError as e: return self._convert_generation_error_to_response(e) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) def get_chat_request_role(self, request: ChatCompletionRequest) -> str: if request.add_generation_prompt: @@ -682,7 +680,7 @@ class OpenAIServingChat(OpenAIServing): tool_parsers = [None] * num_choices except Exception as e: logger.exception("Error in tool parser creation.") - data = self.create_streaming_error_response(str(e)) + data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" yield "data: [DONE]\n\n" return @@ -811,6 +809,11 @@ class 
OpenAIServingChat(OpenAIServing): delta_text += harmony_parser.last_content_delta or "" cur_channel = harmony_parser.current_channel cur_recipient = harmony_parser.current_recipient + # handle the case where several tokens were generated at once + # including the final token, leading to a delta in the text + # but the current channel to be empty (start state) + if not cur_channel and delta_text: + cur_channel = "final" else: delta_text = output.text @@ -1323,9 +1326,8 @@ class OpenAIServingChat(OpenAIServing): except GenerationError as e: yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: - # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") - data = self.create_streaming_error_response(str(e)) + data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished yield "data: [DONE]\n\n" @@ -1349,8 +1351,7 @@ class OpenAIServingChat(OpenAIServing): except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) assert final_res is not None @@ -1828,10 +1829,11 @@ class OpenAIServingChat(OpenAIServing): messages.append(sys_msg) # Add developer message. - dev_msg = get_developer_message( - tools=request.tools if should_include_tools else None - ) - messages.append(dev_msg) + if request.tools: + dev_msg = get_developer_message( + tools=request.tools if should_include_tools else None + ) + messages.append(dev_msg) # Add user message. 
messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 265ca9915e5db..d9a8ccb9f851d 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -23,6 +23,7 @@ from vllm.entrypoints.openai.protocol import ( PromptTokenUsageInfo, RequestResponseMetadata, UsageInfo, + VLLMValidationError, ) from vllm.entrypoints.openai.serving_engine import ( GenerationError, @@ -247,8 +248,7 @@ class OpenAIServingCompletion(OpenAIServing): generators.append(generator) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) result_generator = merge_async_iterators(*generators) @@ -308,8 +308,7 @@ class OpenAIServingCompletion(OpenAIServing): except GenerationError as e: return self._convert_generation_error_to_response(e) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) # When user requests streaming but we don't stream, we still need to # return a streaming response with a single event. 
@@ -510,9 +509,8 @@ class OpenAIServingCompletion(OpenAIServing): except GenerationError as e: yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: - # TODO: Use a vllm-specific Validation Error logger.exception("Error in completion stream generator.") - data = self.create_streaming_error_response(str(e)) + data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" yield "data: [DONE]\n\n" @@ -660,8 +658,11 @@ class OpenAIServingCompletion(OpenAIServing): token = f"token_id:{token_id}" else: if tokenizer is None: - raise ValueError( - "Unable to get tokenizer because `skip_tokenizer_init=True`" + raise VLLMValidationError( + "Unable to get tokenizer because " + "`skip_tokenizer_init=True`", + parameter="skip_tokenizer_init", + value=True, ) token = tokenizer.decode(token_id) @@ -720,6 +721,15 @@ class OpenAIServingCompletion(OpenAIServing): request: CompletionRequest, max_input_length: int | None = None, ) -> RenderConfig: + # Validate max_tokens before using it + if request.max_tokens is not None and request.max_tokens > self.max_model_len: + raise VLLMValidationError( + f"'max_tokens' ({request.max_tokens}) cannot be greater than " + f"the model's maximum context length ({self.max_model_len}).", + parameter="max_tokens", + value=request.max_tokens, + ) + max_input_tokens_len = self.max_model_len - (request.max_tokens or 0) return RenderConfig( max_length=max_input_tokens_len, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index b9771963c6d4c..5ea2a7a572650 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -57,6 +57,7 @@ from vllm.entrypoints.openai.protocol import ( TranscriptionRequest, TranscriptionResponse, TranslationRequest, + VLLMValidationError, ) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.pooling.classify.protocol import ( @@ -322,8 +323,10 
@@ class OpenAIServing: input_processor = self.input_processor tokenizer = input_processor.tokenizer if tokenizer is None: - raise ValueError( - "You cannot use beam search when `skip_tokenizer_init=True`" + raise VLLMValidationError( + "You cannot use beam search when `skip_tokenizer_init=True`", + parameter="skip_tokenizer_init", + value=True, ) eos_token_id: int = tokenizer.eos_token_id # type: ignore @@ -706,8 +709,7 @@ class OpenAIServing: return None except Exception as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) async def _collect_batch( self, @@ -738,14 +740,43 @@ class OpenAIServing: return None except Exception as e: - return self.create_error_response(str(e)) + return self.create_error_response(e) def create_error_response( self, - message: str, + message: str | Exception, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + param: str | None = None, ) -> ErrorResponse: + exc: Exception | None = None + + if isinstance(message, Exception): + exc = message + + from vllm.entrypoints.openai.protocol import VLLMValidationError + + if isinstance(exc, VLLMValidationError): + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = exc.parameter + elif isinstance(exc, (ValueError, TypeError, RuntimeError)): + # Common validation errors from user input + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = None + elif exc.__class__.__name__ == "TemplateError": + # jinja2.TemplateError (avoid importing jinja2) + err_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + param = None + else: + err_type = "InternalServerError" + status_code = HTTPStatus.INTERNAL_SERVER_ERROR + param = None + + message = str(exc) + if self.log_error_stack: exc_type, _, _ = sys.exc_info() if exc_type is not None: @@ -753,18 +784,27 @@ class OpenAIServing: else: traceback.print_stack() return ErrorResponse( - 
error=ErrorInfo(message=message, type=err_type, code=status_code.value) + error=ErrorInfo( + message=message, + type=err_type, + code=status_code.value, + param=param, + ) ) def create_streaming_error_response( self, - message: str, + message: str | Exception, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + param: str | None = None, ) -> str: json_str = json.dumps( self.create_error_response( - message=message, err_type=err_type, status_code=status_code + message=message, + err_type=err_type, + status_code=status_code, + param=param, ).model_dump() ) return json_str @@ -825,6 +865,7 @@ class OpenAIServing: message=f"The model `{request.model}` does not exist.", err_type="NotFoundError", status_code=HTTPStatus.NOT_FOUND, + param="model", ) def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None: @@ -991,11 +1032,13 @@ class OpenAIServing: ClassificationChatRequest: "classification", } operation = operations.get(type(request), "embedding generation") - raise ValueError( + raise VLLMValidationError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " f"{token_num} tokens in the input for {operation}. " - f"Please reduce the length of the input." + f"Please reduce the length of the input.", + parameter="input_tokens", + value=token_num, ) return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids) @@ -1017,20 +1060,24 @@ class OpenAIServing: # Note: input length can be up to model context length - 1 for # completion-like requests. if token_num >= self.max_model_len: - raise ValueError( + raise VLLMValidationError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, your request has " f"{token_num} input tokens. Please reduce the length of " - "the input messages." 
+ "the input messages.", + parameter="input_tokens", + value=token_num, ) if max_tokens is not None and token_num + max_tokens > self.max_model_len: - raise ValueError( + raise VLLMValidationError( "'max_tokens' or 'max_completion_tokens' is too large: " f"{max_tokens}. This model's maximum context length is " f"{self.max_model_len} tokens and your request has " f"{token_num} input tokens ({max_tokens} > {self.max_model_len}" - f" - {token_num})." + f" - {token_num}).", + parameter="max_tokens", + value=max_tokens, ) return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 1f9b5704624ab..e9eaaa49275d3 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import ( ResponsesResponse, ResponseUsage, StreamingResponsesResponse, + VLLMValidationError, ) from vllm.entrypoints.openai.serving_engine import ( GenerationError, @@ -271,6 +272,7 @@ class OpenAIServingResponses(OpenAIServing): err_type="invalid_request_error", message=error_message, status_code=HTTPStatus.BAD_REQUEST, + param="input", ) return None @@ -282,6 +284,7 @@ class OpenAIServingResponses(OpenAIServing): err_type="invalid_request_error", message="logprobs are not supported with gpt-oss models", status_code=HTTPStatus.BAD_REQUEST, + param="logprobs", ) if request.store and not self.enable_store and request.background: return self.create_error_response( @@ -294,6 +297,7 @@ class OpenAIServingResponses(OpenAIServing): "the vLLM server." 
), status_code=HTTPStatus.BAD_REQUEST, + param="background", ) if request.previous_input_messages and request.previous_response_id: return self.create_error_response( @@ -301,6 +305,7 @@ class OpenAIServingResponses(OpenAIServing): message="Only one of `previous_input_messages` and " "`previous_response_id` can be set.", status_code=HTTPStatus.BAD_REQUEST, + param="previous_response_id", ) return None @@ -457,8 +462,7 @@ class OpenAIServingResponses(OpenAIServing): ) generators.append(generator) except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) assert len(generators) == 1 (result_generator,) = generators @@ -546,7 +550,7 @@ class OpenAIServingResponses(OpenAIServing): except GenerationError as e: return self._convert_generation_error_to_response(e) except Exception as e: - return self.create_error_response(str(e)) + return self.create_error_response(e) async def _make_request( self, @@ -630,8 +634,7 @@ class OpenAIServingResponses(OpenAIServing): except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) # NOTE: Implementation of stauts is still WIP, but for now # we guarantee that if the status is not "completed", it is accurate. 
@@ -1074,7 +1077,7 @@ class OpenAIServingResponses(OpenAIServing): response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) - response = self.create_error_response(str(e)) + response = self.create_error_response(e) finally: new_event_signal.set() @@ -1099,7 +1102,7 @@ class OpenAIServingResponses(OpenAIServing): response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) - response = self.create_error_response(str(e)) + response = self.create_error_response(e) if isinstance(response, ErrorResponse): # If the request has failed, update the status to "failed". @@ -1116,7 +1119,11 @@ class OpenAIServingResponses(OpenAIServing): starting_after: int | None = None, ) -> AsyncGenerator[StreamingResponsesResponse, None]: if response_id not in self.event_store: - raise ValueError(f"Unknown response_id: {response_id}") + raise VLLMValidationError( + f"Unknown response_id: {response_id}", + parameter="response_id", + value=response_id, + ) event_deque, new_event_signal = self.event_store[response_id] start_index = 0 if starting_after is None else starting_after + 1 @@ -1172,6 +1179,7 @@ class OpenAIServingResponses(OpenAIServing): return self.create_error_response( err_type="invalid_request_error", message="Cannot cancel a synchronous response.", + param="response_id", ) # Update the status to "cancelled". @@ -1191,6 +1199,7 @@ class OpenAIServingResponses(OpenAIServing): err_type="invalid_request_error", message=f"Response with id '{response_id}' not found.", status_code=HTTPStatus.NOT_FOUND, + param="response_id", ) def _make_store_not_supported_error(self) -> ErrorResponse: @@ -1203,6 +1212,7 @@ class OpenAIServingResponses(OpenAIServing): "starting the vLLM server." 
), status_code=HTTPStatus.BAD_REQUEST, + param="store", ) async def _process_simple_streaming_events( diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 3e648f44f380b..22da46902da14 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -30,6 +30,7 @@ from vllm.entrypoints.openai.protocol import ( TranslationSegment, TranslationStreamResponse, UsageInfo, + VLLMValidationError, ) from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest from vllm.entrypoints.openai.serving_models import OpenAIServingModels @@ -259,7 +260,11 @@ class OpenAISpeechToText(OpenAIServing): ) if len(audio_data) / 1024**2 > self.max_audio_filesize_mb: - raise ValueError("Maximum file size exceeded.") + raise VLLMValidationError( + "Maximum file size exceeded", + parameter="audio_filesize_mb", + value=len(audio_data) / 1024**2, + ) with io.BytesIO(audio_data) as bytes_: # NOTE resample to model SR here for efficiency. 
This is also a @@ -287,12 +292,18 @@ class OpenAISpeechToText(OpenAIServing): ) if request.response_format == "verbose_json": if not isinstance(prompt, dict): - raise ValueError(f"Expected prompt to be a dict,got {type(prompt)}") + raise VLLMValidationError( + "Expected prompt to be a dict", + parameter="prompt", + value=type(prompt).__name__, + ) prompt_dict = cast(dict, prompt) decoder_prompt = prompt.get("decoder_prompt") if not isinstance(decoder_prompt, str): - raise ValueError( - f"Expected decoder_prompt to bestr, got {type(decoder_prompt)}" + raise VLLMValidationError( + "Expected decoder_prompt to be str", + parameter="decoder_prompt", + value=type(decoder_prompt).__name__, ) prompt_dict["decoder_prompt"] = decoder_prompt.replace( "<|notimestamps|>", "<|0.00|>" @@ -412,7 +423,7 @@ class OpenAISpeechToText(OpenAIServing): except ValueError as e: logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) + return self.create_error_response(e) list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None try: @@ -448,8 +459,7 @@ class OpenAISpeechToText(OpenAIServing): for i, prompt in enumerate(prompts) ] except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) if request.stream: return stream_generator_method( @@ -523,8 +533,7 @@ class OpenAISpeechToText(OpenAIServing): except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) + return self.create_error_response(e) async def _speech_to_text_stream_generator( self, @@ -634,9 +643,8 @@ class OpenAISpeechToText(OpenAIServing): ) except Exception as e: - # TODO: Use a vllm-specific Validation Error logger.exception("Error in %s stream generator.", self.task_type) - data = self.create_streaming_error_response(str(e)) 
+ data = self.create_streaming_error_response(e) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished yield "data: [DONE]\n\n" diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index edbfcd03ac92c..9762b23639853 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -52,6 +52,7 @@ class ServingScores(OpenAIServing): models: OpenAIServingModels, *, request_logger: RequestLogger | None, + score_template: str | None = None, log_error_stack: bool = False, ) -> None: super().__init__( @@ -60,6 +61,7 @@ class ServingScores(OpenAIServing): request_logger=request_logger, log_error_stack=log_error_stack, ) + self.score_template = score_template async def _embedding_score( self, @@ -169,6 +171,7 @@ class ServingScores(OpenAIServing): data_2=data_2, tokenizer=tokenizer, tokenization_kwargs=tokenization_kwargs, + score_template=self.score_template, ) self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt) if request.mm_processor_kwargs is not None: diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 0f89c840be80f..ca2e27fa4428b 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -12,6 +12,7 @@ import torch from pydantic import Field from vllm.config import ModelConfig +from vllm.entrypoints.openai.protocol import VLLMValidationError from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt from vllm.inputs.parse import get_prompt_components, parse_raw_prompts from vllm.tokenizers import TokenizerLike @@ -162,8 +163,9 @@ class BaseRenderer(ABC): ) -> list[EmbedsPrompt]: """Load and validate base64-encoded embeddings into prompt objects.""" if not self.model_config.enable_prompt_embeds: - raise ValueError( - "You must set `--enable-prompt-embeds` to input `prompt_embeds`." 
+ raise VLLMValidationError( + "You must set `--enable-prompt-embeds` to input `prompt_embeds`.", + parameter="prompt_embeds", ) def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: @@ -396,10 +398,12 @@ class CompletionRenderer(BaseRenderer): ) -> TokensPrompt: """Create validated TokensPrompt.""" if max_length is not None and len(token_ids) > max_length: - raise ValueError( + raise VLLMValidationError( f"This model's maximum context length is {max_length} tokens. " f"However, your request has {len(token_ids)} input tokens. " - "Please reduce the length of the input messages." + "Please reduce the length of the input messages.", + parameter="input_tokens", + value=len(token_ids), ) tokens_prompt = TokensPrompt(prompt_token_ids=token_ids) diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 072ddd4c90b16..d7c31cddffad6 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -11,9 +11,11 @@ from vllm.entrypoints.chat_utils import ( ChatCompletionContentPartImageEmbedsParam, ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam, + ChatTemplateResolutionError, MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part, + apply_hf_chat_template, ) from vllm.inputs import TokensPrompt from vllm.model_executor.models.interfaces import supports_score_template @@ -139,10 +141,8 @@ def _parse_score_content( return next(iter(mm_placeholder_storage.values()))[0] -def apply_score_template( - model_config: ModelConfig, - prompt_1: str, - prompt_2: str, +def _apply_model_score_template( + model_config: ModelConfig, prompt_1: str, prompt_2: str ) -> str: # NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. 
gguf) from vllm.model_executor.model_loader import get_model_cls @@ -181,6 +181,7 @@ def get_score_prompt( tokenization_kwargs: dict[str, Any], data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, + score_template: str | None = None, ) -> tuple[str, TokensPrompt]: prompt_1, prompt_2, mm_data = parse_score_data( data_1, @@ -190,19 +191,48 @@ def get_score_prompt( from vllm.model_executor.model_loader import get_model_cls model = get_model_cls(model_config) - if supports_score_template(model): - full_prompt = apply_score_template(model_config, prompt_1, prompt_2) - prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) - elif model_config.use_pad_token: - # cross_encoder models defaults to using pad_token. - prompt_inputs = tokenizer( - text=prompt_1, text_pair=prompt_2, **tokenization_kwargs - ) - full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) + + def default_tokenizer_encode(): + if supports_score_template(model): + full_prompt = _apply_model_score_template(model_config, prompt_1, prompt_2) + prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + else: + if model_config.use_pad_token: + # cross_encoder models defaults to using pad_token. + prompt_inputs = tokenizer( + text=prompt_1, text_pair=prompt_2, **tokenization_kwargs + ) + full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) + else: + # `llm as reranker` models defaults to not using pad_token. + full_prompt = prompt_1 + prompt_2 + prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) + return full_prompt, prompt_inputs + + # FIXME: For now, we only apply a template when one is explicitly provided. + # We cannot rely on the tokenizer's chat template because many models + # inherit junk templates from their base LLM, which breaks both the models + # and the tests that use them. + if score_template is None: + full_prompt, prompt_inputs = default_tokenizer_encode() else: - # `llm as reranker` models defaults to not using pad_token. 
- full_prompt = prompt_1 + prompt_2 - prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) + # FIXME: Try applying a score template from the CLI arg or tokenizer_config.json + # If that fails because there is no such template, + # fall back to the default implementation. + try: + full_prompt = apply_hf_chat_template( + tokenizer, + [ + {"role": "query", "content": prompt_1}, + {"role": "document", "content": prompt_2}, + ], + score_template, + tools=None, + model_config=model_config, + ) + prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) + except ChatTemplateResolutionError: + full_prompt, prompt_inputs = default_tokenizer_encode() engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"]) diff --git a/vllm/envs.py b/vllm/envs.py index f6db42e9124d6..1d4128d74b95c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -24,6 +24,7 @@ if TYPE_CHECKING: LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: str | None = None VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 + VLLM_ENGINE_READY_TIMEOUT_S: int = 600 VLLM_API_KEY: str | None = None VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False S3_ACCESS_KEY_ID: str | None = None @@ -604,6 +605,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ENGINE_ITERATION_TIMEOUT_S": lambda: int( os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60") ), + # Timeout in seconds for waiting for engine cores to become ready + # during startup. Default is 600 seconds (10 minutes). 
+ "VLLM_ENGINE_READY_TIMEOUT_S": lambda: int( + os.environ.get("VLLM_ENGINE_READY_TIMEOUT_S", "600") + ), # API key for vLLM API server "VLLM_API_KEY": lambda: os.environ.get("VLLM_API_KEY", None), # Whether to log responses from API Server for debugging diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 033cc1f544b3b..7a569ec32eac9 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -186,6 +186,7 @@ class DPMetadata: class ForwardContext: # copy from vllm_config.compilation_config.static_forward_context no_compile_layers: dict[str, Any] + attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]] """ Type Dict[str, AttentionMetadata] for v1, map from layer_name of each attention layer to its attention metadata @@ -193,7 +194,6 @@ class ForwardContext: for each microbatch. Set dynamically for each forward pass """ - attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]] # TODO: remove after making all virtual_engines share the same kv cache virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py index f5e36697ed18c..bc88c71eaf8d9 100644 --- a/vllm/lora/lora_model.py +++ b/vllm/lora/lora_model.py @@ -12,7 +12,6 @@ from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import ( get_lora_id, is_base_embeddding_weights, - is_regex_target_modules, parse_fine_tuned_lora_name, ) from vllm.model_executor.model_loader.tensorizer import TensorizerConfig @@ -201,37 +200,13 @@ class LoRAModel: for module in f.keys(): # noqa tensors[module] = f.get_tensor(module) elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path): - # When a bin/pt file is provided, we rely on config to find - # unexpected modules. 
- unexpected_modules = [] - target_modules = peft_helper.target_modules - if not isinstance(target_modules, list): - target_modules = [target_modules] - for module in target_modules: - # Compatible with more modules, - # such as:layers.11.self_attn.k_proj - part_name = module.split(".")[-1] - if part_name not in expected_lora_modules: - unexpected_modules.append(module) - # loaded lora's target modules must be a subset of - # expected_lora_modules. It is not reliable. See - # https://github.com/vllm-project/vllm/pull/5909. But there's no - # other better mechanism. - if unexpected_modules and not is_regex_target_modules( - peft_helper.target_modules, expected_lora_modules - ): - raise ValueError( - f"While loading {lora_dir}, expected" - f" target modules in {expected_lora_modules}" - f" but received {unexpected_modules}." - f" Please verify that the loaded LoRA module is correct" - ) lora_file_path = ( lora_bin_file_path if os.path.isfile(lora_bin_file_path) else lora_pt_file_path ) tensors = torch.load(lora_file_path, map_location=device, weights_only=True) + check_unexpected_modules(tensors) else: raise ValueError(f"{lora_dir} doesn't contain tensors") diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 8ed42382e3a86..ed9e916455e5f 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -11,9 +11,11 @@ import torch from vllm import envs from vllm.logger import init_logger +from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.platforms import current_platform logger = init_logger(__name__) +is_batch_invariant = vllm_is_batch_invariant() _LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} _LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} @@ -150,7 +152,8 @@ def _get_lora_b_ptr( @functools.lru_cache def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None: user_defined_config_folder = 
envs.VLLM_TUNED_CONFIG_FOLDER - if user_defined_config_folder is not None: + # Avoid optimizing for the batch invariant case. Use default config + if user_defined_config_folder is not None and not is_batch_invariant: gpu_name = torch.cuda.get_device_name() gpu_name = gpu_name.replace(" ", "_") gpu_name = gpu_name.replace("-", "_") @@ -203,11 +206,14 @@ def get_lora_op_configs( # default config default = {} if op_type == "shrink": + split_k = 64 if batch < 128 else 8 + if is_batch_invariant: + split_k = 1 default = { "block_m": 32, "block_n": 16, "block_k": 256 if batch < 128 else 32, - "split_k": 64 if batch < 128 else 8, + "split_k": split_k, "num_warps": 4, "num_ctas": 1, "group_size_m": 8, diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 4d264c06826b8..75aeccd004422 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -5,7 +5,6 @@ import os from typing import TYPE_CHECKING, Optional import huggingface_hub -import regex as re from huggingface_hub.utils import ( EntryNotFoundError, HfHubHTTPError, @@ -186,39 +185,6 @@ def is_base_embeddding_weights(name: str) -> bool: return name.endswith(embedding_suffixes) -def is_regex_target_modules( - load_modules: str | list[str], expected_lora_modules: set[str] -) -> bool: - """ - PEFT supports passing `target_modules` in the form of regular expressions, - such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to - determine whether the suffix in the regular expression is present in the - `expected_lora_modules`. - """ - - def is_valid_regex(pattern): - try: - re.compile(pattern) - return True - except re.error: - return False - - def is_subset(sub_list, full_set): - return set(sub_list).issubset(full_set) - - # Similar to PEFT's processing logic, regex-related operations are only - # executed when the load_modules is a `str`. 
- if not isinstance(load_modules, str): - return False - - if is_valid_regex(load_modules): - match = re.search(r"\((.*?)\)\$?$", load_modules) - if match: - suffix = match.group(1).split("|") - return is_subset(suffix, expected_lora_modules) - return False - - def get_supported_lora_modules(model: nn.Module) -> list[str]: """ In vLLM, all linear layers support LoRA. diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 8fee4038b60b8..3d248e7fb9945 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.utils import activation_without_mul +from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import ( + ZeroExpertFusedMoE, +) from vllm.triton_utils import HAS_TRITON _config: dict[str, Any] | None = None @@ -54,6 +57,7 @@ __all__ = [ "FusedMoEPrepareAndFinalize", "RoutingMethodType", "SharedFusedMoE", + "ZeroExpertFusedMoE", "activation_without_mul", "override_config", "get_config", diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index a9a2990ca2b53..d581e91f36d03 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_Scheme, ) from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_triton_kernels from vllm.utils.math_utils import cdiv @@ -39,6 +40,7 @@ if has_triton_kernels(): def _get_config_dtype_str( dtype: torch.dtype, use_fp8_w8a8: bool = False, + use_fp8_w8a16: bool = 
False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, ocp_mx_scheme: str | None = None, @@ -50,6 +52,8 @@ def _get_config_dtype_str( """ if use_fp8_w8a8: return "fp8_w8a8" + elif use_fp8_w8a16: + return "fp8_w8a16" elif use_int8_w8a16: return "int8_w8a16" elif use_int4_w4a16: @@ -319,6 +323,10 @@ class FusedMoEQuantConfig: def use_int8_w8a16(self) -> bool: return self._a1.dtype is None and self._w1.dtype == torch.int8 + @property + def use_fp8_w8a16(self) -> bool: + return self._a1.dtype is None and self._w1.dtype == current_platform.fp8_dtype() + @property def use_int4_w4a16(self) -> bool: return self._a1.dtype is None and self._w1.dtype == "int4" @@ -362,6 +370,7 @@ class FusedMoEQuantConfig: """ return _get_config_dtype_str( use_fp8_w8a8=self.use_fp8_w8a8, + use_fp8_w8a16=self.use_fp8_w8a16, use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, ocp_mx_scheme=self.ocp_mx_scheme, @@ -680,7 +689,6 @@ def int4_w4a16_moe_quant_config( ) -> FusedMoEQuantConfig: """ Construct a quant config for 16-bit float activations and int4 weights. - Note: Activations are pre-quantized. """ group_shape = GroupShape(*block_shape) if block_shape is not None else None return FusedMoEQuantConfig( @@ -691,6 +699,27 @@ def int4_w4a16_moe_quant_config( ) +def fp8_w8a16_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for 16-bit float activations and fp8 weights. 
+ """ + group_shape = GroupShape(*block_shape) if block_shape is not None else None + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(), + _a2=FusedMoEQuantDesc(), + _w1=FusedMoEQuantDesc( + current_platform.fp8_dtype(), group_shape, w1_scale, None, None + ), + _w2=FusedMoEQuantDesc( + current_platform.fp8_dtype(), group_shape, w2_scale, None, None + ), + ) + + def int8_w8a16_moe_quant_config( w1_scale: torch.Tensor, w2_scale: torch.Tensor, @@ -700,7 +729,6 @@ def int8_w8a16_moe_quant_config( ) -> FusedMoEQuantConfig: """ Construct a quant config for 16-bit float activations and int8 weights. - Note: Activations are pre-quantized. """ group_shape = GroupShape(*block_shape) if block_shape is not None else None return FusedMoEQuantConfig( diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 4a0b4e82c1b39..9281780fca478 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP, ) -from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize, _resize_cache +from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -896,162 +896,6 @@ def cutlass_moe_fp4( ) -def _valid_cutlass_block_scaled_grouped_gemm( - w1: torch.Tensor, - w2: torch.Tensor, - inplace: bool, - activation: str, - apply_router_weight_on_input: bool, - expert_map: torch.Tensor | None, -) -> bool: - def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int): - return N % 128 == 0 and K % 128 == 0 - - _, K, N = w2.size() - if not _valid_cutlass_block_scaled_grouped_gemm_shape(N, K): - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: unaligned problem size. 
" - "N: %s, K: %s", - N, - K, - ) - return False - - if w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn: - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s). " - "w1.dtype: %s, w2.dtype: %s", - w1.dtype, - w2.dtype, - ) - return False - - if expert_map is not None: - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: expert_parallel is not supported." - ) - return False - - if activation != "silu": - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: only activation silu is supported." - ) - return False - - if apply_router_weight_on_input: - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled:" - " apply_router_weight_on_input is not supported." - ) - return False - - if inplace: - logger.debug_once( - "CutlassBlockScaledGroupedGemm disabled: inplace is not supported." - ) - return False - - return True - - -# TODO(bnell): would be nice combine/integrate with regular cutlass_fp8. -def run_cutlass_block_scaled_fused_experts( - a: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, -) -> torch.Tensor: - w1_q = w1.transpose(1, 2) - w2_q = w2.transpose(1, 2) - w1_scale = w1_scale.transpose(1, 2) - w2_scale = w2_scale.transpose(1, 2) - - assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" - assert a.shape[0] == topk_ids.shape[0], ( - "a and topk_ids must have the same batch size" - ) - assert w1_q.dtype == torch.float8_e4m3fn, "w1_q must be float8_e4m3fn" - assert w2_q.dtype == torch.float8_e4m3fn, "w2_q must be float8_e4m3fn" - assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1" - assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2" - assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch" - assert w1_q.shape[0] == w1_scale.shape[0], "w1_scale expert number mismatch" - assert w1_q.shape[0] == w2_scale.shape[0], "w2_scale expert 
number mismatch" - assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype" - - out_dtype = a.dtype - num_experts = w1_q.size(0) - m = a.size(0) - k = w1_q.size(1) - n = w2_q.size(1) - - topk = topk_ids.size(1) - - a_q, a1_scale = _fp8_quantize( - a, A_scale=None, per_act_token=False, block_shape=[128, 128] - ) - device = a_q.device - - expert_offsets = torch.empty((num_experts + 1,), dtype=torch.int32, device=device) - problem_sizes1 = torch.empty((num_experts, 3), dtype=torch.int32, device=device) - problem_sizes2 = torch.empty((num_experts, 3), dtype=torch.int32, device=device) - - a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) - c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device) - - ops.get_cutlass_moe_mm_data( - topk_ids, - expert_offsets, - problem_sizes1, - problem_sizes2, - a_map, - c_map, - num_experts, - n, - k, - ) - - rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype) - rep_a1_scales = a1_scale[a_map] - - c1 = torch.empty((m * topk, n * 2), dtype=out_dtype, device=device) - c2 = torch.empty((m * topk, k), dtype=out_dtype, device=device) - - ops.cutlass_blockwise_scaled_grouped_mm( - c1, - rep_a_q, - w1_q, - rep_a1_scales, - w1_scale, - problem_sizes1, - expert_offsets[:-1], - ) - - intermediate = torch.empty((m * topk, n), dtype=out_dtype, device=device) - torch.ops._C.silu_and_mul(intermediate, c1) - - intermediate_q, a2_scale = _fp8_quantize( - intermediate, A_scale=None, per_act_token=False, block_shape=[128, 128] - ) - - ops.cutlass_blockwise_scaled_grouped_mm( - c2, - intermediate_q, - w2_q, - a2_scale, - w2_scale, - problem_sizes2, - expert_offsets[:-1], - ) - - return ( - c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype) - ).sum(dim=1) - - # W4A8 def run_cutlass_moe_w4a8_fp8( output: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 
92d72b75656cd..295a2a28156ed 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( batched_moe_align_block_size, moe_align_block_size, ) -from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP, -) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP, @@ -26,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_moe_intermediate_size, marlin_quant_input, ) +from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -542,9 +540,11 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): is_k_full: bool = True, ): # TODO (varun) : Enable activation quantization - assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, ( - "Supports only mxfp4_w4a16 or int4_w4a16" - ) + assert ( + quant_config.use_mxfp4_w4a16 + or quant_config.use_int4_w4a16 + or quant_config.use_fp8_w8a16 + ), "Supports only mxfp4_w4a16, int4_w4a16 or fp8_w8a16" self.w13_g_idx = w13_g_idx self.w2_g_idx = w2_g_idx self.w13_g_idx_sort_indices = w13_g_idx_sort_indices @@ -555,11 +555,17 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): @property def quant_type_id(self) -> int: # uint4b8 will be set for int4 weight and float4_e2m1f will be used for mxfp4 - return ( - scalar_types.uint4b8.id - if self.quant_config.use_int4_w4a16 - else scalar_types.float4_e2m1f.id - ) + if self.quant_config.use_int4_w4a16: + return scalar_types.uint4b8.id + elif self.quant_config.use_mxfp4_w4a16: + return scalar_types.float4_e2m1f.id + elif ( + self.quant_config.use_fp8_w8a16 + and current_platform.fp8_dtype() == torch.float8_e4m3fn + ): + return scalar_types.float8_e4m3fn.id + else: + raise NotImplementedError("Unsupported 
quantization type.") def moe_problem_size( self, @@ -711,16 +717,6 @@ class MarlinExperts(MarlinExpertsBase): ops.moe_sum(input, output) -def modular_marlin_fused_moe( - quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None -) -> mk.FusedMoEModularKernel: - return mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), - MarlinExperts(quant_config), - shared_experts, - ) - - class BatchedMarlinExperts(MarlinExpertsBase): def __init__( self, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 37f8e7780f999..bf51554341607 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -25,10 +25,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, _get_config_dtype_str, ) -from vllm.model_executor.layers.fused_moe.cutlass_moe import ( - _valid_cutlass_block_scaled_grouped_gemm, - run_cutlass_block_scaled_fused_experts, -) from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( _valid_deep_gemm, deep_gemm_moe_fp8, @@ -1678,11 +1674,9 @@ def fused_experts( expert_map: torch.Tensor | None = None, quant_config: FusedMoEQuantConfig | None = None, allow_deep_gemm: bool = False, - allow_cutlass_block_scaled_grouped_gemm: bool = False, ) -> torch.Tensor: if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG - use_fp8_w8a8 = quant_config.use_fp8_w8a8 # For now, disable DeepGemm for small N (<= 512) until better # permute/unpermute ops are available. 
@@ -1712,23 +1706,6 @@ def fused_experts( a2_scale=quant_config.a2_scale, apply_router_weight_on_input=apply_router_weight_on_input, ) - elif ( - allow_cutlass_block_scaled_grouped_gemm - and use_fp8_w8a8 - and _valid_cutlass_block_scaled_grouped_gemm( - w1, w2, inplace, activation, apply_router_weight_on_input, expert_map - ) - ): - assert quant_config is not None - return run_cutlass_block_scaled_fused_experts( - a=hidden_states, - w1=w1, - w2=w2, - w1_scale=quant_config.w1_scale, - w2_scale=quant_config.w2_scale, - topk_weights=topk_weights, - topk_ids=topk_ids, - ) else: return dispatch_fused_experts_func(inplace)( hidden_states=hidden_states, @@ -2155,6 +2132,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): torch.float16, torch.bfloat16, torch.float8_e4m3fn, + torch.float8_e4m3fnuz, ] E, num_tokens, N, K, top_k_num = self.moe_problem_size( @@ -2179,7 +2157,10 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): compute_type = tl.float16 elif hidden_states.dtype == torch.float32: compute_type = tl.float32 - elif hidden_states.dtype == torch.float8_e4m3fn: + elif ( + hidden_states.dtype == torch.float8_e4m3fn + or hidden_states.dtype == torch.float8_e4m3fnuz + ): compute_type = tl.bfloat16 else: raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 9c9bc2514bb4b..30ff1bf2f008a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -92,7 +92,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, zero_expert_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -110,10 +110,4 
@@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): expert_map=None if self.disable_expert_map else layer.expert_map, ) - if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 6a65b06014bca..2e7267d56d838 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -32,7 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, RoutingMethodType, ) -from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) @@ -350,8 +349,6 @@ class FusedMoE(CustomOp): num_redundant_experts: int = 0, has_bias: bool = False, is_sequence_parallel=False, - zero_expert_num: int | None = 0, - zero_expert_type: str | None = None, expert_mapping: list[tuple[str, str, int, str]] | None = None, n_shared_experts: int | None = None, routing_method_type: int | None = None, @@ -409,8 +406,6 @@ class FusedMoE(CustomOp): self.global_num_experts = num_experts + num_redundant_experts self.logical_num_experts = num_experts - self.zero_expert_num = zero_expert_num - self.zero_expert_type = zero_expert_type # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping @@ -1525,15 +1520,15 @@ class FusedMoE(CustomOp): self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: + ) -> tuple[torch.Tensor, torch.Tensor]: """ Route the input hidden states to the top-k experts based on the router logits. 
Returns: - (topk_weights, topk_ids, zero_expert_result) - (tuple[torch.Tensor, torch.Tensor, torch.Tensor]): - The weights, expert ids, and zero expert computation result. + (topk_weights, topk_ids) + (tuple[torch.Tensor, torch.Tensor]): + The weights and expert ids. **Compatibility**: When EPLB is not enabled, the returned ids are equivalent to global logical ids, so should be compatible with @@ -1655,23 +1650,7 @@ class FusedMoE(CustomOp): assert topk_ids.dtype == indices_type or indices_type is None - # Compute zero expert result if needed - if ( - self.zero_expert_num is not None - and self.zero_expert_num > 0 - and self.zero_expert_type is not None - and self.global_num_experts is not None - ): - zero_expert_result = zero_experts_compute_triton( - expert_indices=topk_ids, - expert_scales=topk_weights, - num_experts=self.global_num_experts, - zero_expert_type=self.zero_expert_type, - hidden_states=hidden_states, - ) - else: - zero_expert_result = None - return topk_weights, topk_ids, zero_expert_result + return topk_weights, topk_ids def must_reduce_shared_expert_outputs(self) -> bool: """ @@ -1736,14 +1715,7 @@ class FusedMoE(CustomOp): fused_output = torch.ops.vllm.moe_forward( hidden_states, router_logits, self.layer_name ) - if self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(fused_output, tuple) - fused_output, zero_expert_result = fused_output - return (reduce_output(fused_output) + zero_expert_result)[ - ..., :og_hidden_states - ] - else: - return reduce_output(fused_output)[..., :og_hidden_states] + return reduce_output(fused_output)[..., :og_hidden_states] else: if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we @@ -1841,13 +1813,6 @@ class FusedMoE(CustomOp): final_hidden_states, ) - if self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, tuple) - assert self.shared_experts is None - 
final_hidden_states, zero_expert_result = final_hidden_states - if zero_expert_result is not None: - final_hidden_states += zero_expert_result - if not skip_result_store: if self.shared_experts is None: full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( @@ -2030,9 +1995,6 @@ class FusedMoE(CustomOp): shared_output, final_hidden_states, ) - elif self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, tuple) - final_hidden_states, zero_expert_result = final_hidden_states def combine_output(states: torch.Tensor) -> torch.Tensor: if do_naive_dispatch_combine: @@ -2051,9 +2013,6 @@ class FusedMoE(CustomOp): final_hidden_states[0], combine_output(final_hidden_states[1]), ) - elif self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, torch.Tensor) - return (combine_output(final_hidden_states), zero_expert_result) else: return combine_output(final_hidden_states) diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index e27e2eb32da0f..5d806fa843a3c 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -13,6 +13,10 @@ from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): + def __init__(self, defer_input_quant: bool = False) -> None: + super().__init__() + self.defer_input_quant = defer_input_quant + @property def activation_format(self) -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard @@ -48,6 +52,11 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): # Note: do not use inplace for shared experts overlap a1 = a1 * topk_weights.to(a1.dtype) + # Defer input quant to moe kernel for backends (e.g. AITER, FI) + # which use a single kernel call for quant + experts. 
+ if self.defer_input_quant: + return a1, None, None, None, None + a1q, a1q_scale = moe_kernel_quantize_input( a1, quant_config.a1_scale, diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 882ad0a537cd5..ebd9e3a4a8f2a 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -5,11 +5,15 @@ from functools import lru_cache import torch +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._aiter_ops import rocm_aiter_ops from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceNoOP, +) class QuantMethod(IntEnum): @@ -263,3 +267,78 @@ def rocm_aiter_fused_experts( a2_scale=quant_config.a2_scale, doweight_stage1=apply_router_weight_on_input, ) + + +class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute): + def __init__(self, quant_config): + super().__init__(quant_config) + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_expert_map(self): + return True + + def supports_chunking(self): + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + return TopKWeightAndReduceNoOP() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # Workspaces are managed internally by AITER. 
+ workspace1 = (0,) + workspace2 = (0,) + output = (M, K) + return (workspace1, workspace2, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + assert a1q_scale is None + assert a2_scale is None + assert expert_tokens_meta is None + + result = rocm_aiter_fused_experts( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + quant_config=self.quant_config, + ) + assert result.shape == output.shape + output.copy_(result) diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 6182f10aa70f0..82dbccf3fa9da 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -6,6 +6,7 @@ import torch import torch.nn.functional as F import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._aiter_ops import rocm_aiter_ops from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp @@ -23,6 +24,9 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, ) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from 
vllm.platforms.interface import CpuArchEnum @@ -30,9 +34,9 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe if current_platform.is_cuda_alike(): from .fused_batched_moe import BatchedTritonExperts - from .fused_moe import TritonExperts, fused_experts + from .fused_moe import TritonExperts else: - fused_experts = None # type: ignore + TritonExperts = None # type: ignore if current_platform.is_tpu(): from .moe_pallas import fused_moe as fused_moe_pallas @@ -265,6 +269,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) else: layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) + elif current_platform.is_cuda_alike(): + self.moe_quant_config = self.get_fused_moe_quant_config(layer) + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + TritonExperts(self.moe_quant_config), + shared_experts=None, + ) def apply( self, @@ -278,9 +289,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): router_logits=router_logits, ) - def get_fused_moe_quant_config( - self, layer: torch.nn.Module - ) -> FusedMoEQuantConfig | None: + def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: if self.moe.has_bias: return biased_moe_quant_config( layer.w13_bias, @@ -295,7 +304,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, zero_expert_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -322,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: - result = fused_experts( + result = self.kernel( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -330,19 +339,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, 
CustomOp): topk_ids=topk_ids, inplace=True, activation=layer.activation, - quant_config=self.moe_quant_config, apply_router_weight_on_input=layer.apply_router_weight_on_input, global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, ) - if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result def forward_cpu( self, diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py new file mode 100644 index 0000000000000..97d21767f4fc3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from contextlib import contextmanager + +import torch +from torch import nn + +from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +class ZeroExpertFusedMoE(FusedMoE): + """ + A FusedMoE operation that also computes the results of zero experts. + Zero experts perform identity operations (scaled pass-through) instead + of full MLP computations. + + This class uses memoization to avoid redundant routing computation: + routing is computed once and reused for both zero expert computation + and the main FusedMoE forward pass. + """ + + def __init__( + self, + zero_expert_num: int, + zero_expert_type: str, + router: nn.Module, + **kwargs, + ): + # ZeroExpertFusedMoE manages its own custom_routing_function for memoization + assert ( + "custom_routing_function" not in kwargs + or kwargs.get("custom_routing_function") is None + ), ( + "ZeroExpertFusedMoE does not support external custom_routing_function. 
" + "It manages its own for routing memoization." + ) + + # Automatically slice router's e_score_correction_bias to only include + # real experts (not zero_experts) for the base FusedMoE. + # The full bias will be used temporarily in forward() for routing. + if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs: + num_real_experts = kwargs["num_experts"] + router_bias = router.e_score_correction_bias + user_bias = kwargs.get("e_score_correction_bias") + + # Use router's bias if: + # 1. User didn't provide bias, or + # 2. User provided full bias (same size as router) + if user_bias is None or user_bias.shape[0] == router_bias.shape[0]: + kwargs["e_score_correction_bias"] = router_bias[:num_real_experts] + + # FusedMoE no longer accepts zero_expert_num/zero_expert_type. + # We handle zero experts ourselves in forward(). + super().__init__(**kwargs) + # Store the actual zero_expert_num and zero_expert_type for our own use + self._actual_zero_expert_num = zero_expert_num + self._actual_zero_expert_type = zero_expert_type + self._router = router # Full router (includes zero experts) + + # Expose zero_expert_num and zero_expert_type as attributes for + # compatibility with quantization methods that check these attributes + self.zero_expert_num = 0 + self.zero_expert_type = None + + # Memoization state for routing results + self._memoized_topk_weights: torch.Tensor | None = None + self._memoized_topk_ids: torch.Tensor | None = None + + # Create custom_routing_function to reuse memoized routing results + def custom_routing_function(hidden_states, gating_output, topk, renormalize): + """Return memoized `topk_weights` and `topk_ids`.""" + if self._memoized_topk_weights is None or self._memoized_topk_ids is None: + raise RuntimeError( + "ZeroExpertFusedMoE: routing results not memoized. " + "Call select_experts first to compute routing." 
+ ) + return self._memoized_topk_weights, self._memoized_topk_ids + + self.custom_routing_function = custom_routing_function + + @contextmanager + def _temporarily_set_attrs(self, **attrs): + """ + Temporarily set attributes using object.__setattr__ and restore them. + + This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues. + When PyTorch Dynamo traces the forward pass, it cannot handle + nn.Module.__setattr__ calls (which include parameter registration logic), + resulting in "Unsupported" errors. Using object.__setattr__ directly + sets the attribute without triggering nn.Module's custom __setattr__, + allowing Dynamo to trace the code successfully. + """ + originals = {key: getattr(self, key) for key in attrs} + try: + for key, value in attrs.items(): + object.__setattr__(self, key, value) + yield + finally: + for key, value in originals.items(): + object.__setattr__(self, key, value) + + def _compute_zero_expert_result( + self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ) -> torch.Tensor | None: + """Compute zero expert results using pre-computed routing.""" + if ( + self._actual_zero_expert_num is None + or self._actual_zero_expert_num <= 0 + or self._actual_zero_expert_type is None + ): + return None + + return zero_experts_compute_triton( + expert_indices=topk_ids.clone(), + expert_scales=topk_weights.clone(), + num_experts=self.logical_num_experts, + zero_expert_type=self._actual_zero_expert_type, + hidden_states=hidden_states, + ) + + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, # Full logits including zero experts + ) -> torch.Tensor: + """ + Forward pass with zero expert support and routing memoization. 
+ + Args: + hidden_states: Input hidden states + router_logits: Full router logits (including zero experts) + + Returns: + Combined output from real experts and zero experts + """ + # Prepare temporary attribute overrides for routing computation + temp_attrs = { + "custom_routing_function": None, # Disable for first routing + } + if self._router is not None: + temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias + + # Compute routing with temporary attributes + # Pass full router_logits (including zero experts) so that zero experts + # can be properly identified in topk_ids + with self._temporarily_set_attrs(**temp_attrs): + topk_weights, topk_ids = self.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, # Full logits (includes zero experts) + ) + + # Compute zero expert result if needed + zero_expert_result = self._compute_zero_expert_result( + hidden_states=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + ) + + # Memoize routing results for reuse in super().forward() + self._memoized_topk_weights = topk_weights + self._memoized_topk_ids = topk_ids + + # Slice router_logits for real experts only + router_logits_sliced = router_logits[..., : self.logical_num_experts] + + # Compute real expert results (will reuse memoized routing via + # custom_routing_function) + # zero_expert_num is already 0, so FusedMoE won't handle zero experts + fused_out = super().forward( + hidden_states=hidden_states, + router_logits=router_logits_sliced, + ) + + # Combine results + # Both zero_expert_result and fused_out are computed from the same + # hidden_states, so they should be on the same device. 
+ if zero_expert_result is not None: + fused_out = fused_out + zero_expert_result + + # Clear memoization after use + self._memoized_topk_weights = None + self._memoized_topk_ids = None + + return fused_out diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 4ca4f75711ac7..402f0bf69ceaa 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -53,6 +53,8 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", + "ModelOptFp8PcPtLinearMethod", + "ModelOptFp8PbWoLinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", "HQQMarlinMethod", diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py index 0bbad17d7ebc7..c9a80e9f7317d 100644 --- a/vllm/model_executor/layers/mamba/short_conv.py +++ b/vllm/model_executor/layers/mamba/short_conv.py @@ -118,6 +118,7 @@ class ShortConv(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) state_indices_tensor = attn_metadata.state_indices_tensor has_initial_states_p = attn_metadata.has_initial_states_p + query_start_loc_p = attn_metadata.query_start_loc_p BCx, _ = self.in_proj(hidden_states) @@ -165,11 +166,6 @@ class ShortConv(MambaBase, CustomOp): [num_decodes, num_prefills], dim=0, ) - query_start_loc_p = ( - attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes - if has_prefill - else None - ) conv_output_list = [] diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 314848721a80a..602d02d2f15a4 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -764,7 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert layer.activation == "silu", "Only SiLU activation is supported." 
- topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 1fd959cb3423d..efe5677045e4b 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -500,7 +500,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fc359a3067a9c..f4038801c266b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -574,7 +574,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): e_score_correction_bias=layer.e_score_correction_bias, ) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1166,7 +1166,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1403,7 +1403,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from 
vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1765,7 +1765,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): f"{layer.activation} not supported for Marlin MoE." ) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1991,7 +1991,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -2607,7 +2607,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet." 
) assert self.moe_quant_config is not None - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index ee99572f5f499..758a54c10605a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -61,7 +61,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): ) self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() - self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled() if self.weight_block_size is not None: assert not self.is_static_input_scheme diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 11097cf36f5ca..56b11b22f7ff5 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -142,7 +142,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ec3fc5ace17d8..9da19c082dc27 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from 
enum import Enum -from functools import partial from typing import TYPE_CHECKING, Any, Optional import torch @@ -33,8 +32,8 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, RoutingMethodType, fp8_w8a8_moe_quant_config, + fp8_w8a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.linear import ( LinearBase, @@ -51,7 +50,6 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( FlashinferMoeBackend, apply_flashinfer_per_tensor_scale_fp8, build_flashinfer_fp8_cutlass_moe_prepare_finalize, - flashinfer_cutlass_moe_fp8, get_flashinfer_moe_backend, register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights, @@ -97,7 +95,6 @@ from vllm.model_executor.parameter import ( ) from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.platforms import current_platform -from vllm.scalar_type import scalar_types from vllm.utils.deep_gemm import ( is_deep_gemm_e8m0_used, is_deep_gemm_supported, @@ -118,20 +115,22 @@ class Fp8MoeBackend(Enum): FLASHINFER_TRTLLM = 1 FLASHINFER_CUTLASS = 2 DEEPGEMM = 3 - CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4 - MARLIN = 5 - TRITON = 6 + MARLIN = 4 + TRITON = 5 + AITER = 6 def get_fp8_moe_backend( block_quant: bool, moe_parallel_config: FusedMoEParallelConfig, with_lora_support: bool, -) -> Fp8MoeBackend: +) -> Fp8MoeBackend | None: """ Select the primary FP8 MoE backend Note: Shape-specific fallbacks may still occur at runtime. """ + if current_platform.is_xpu(): + return None if with_lora_support: return Fp8MoeBackend.TRITON # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100. 
@@ -191,16 +190,9 @@ def get_fp8_moe_backend( logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local") return Fp8MoeBackend.DEEPGEMM - # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights - if ( - current_platform.is_cuda() - and current_platform.is_device_capability_family(100) - and block_quant - ): - logger.info_once( - "Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local" - ) - return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM + if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MOE: + logger.info_once("Using ROCm AITER backend for FP8 MoE", scope="local") + return Fp8MoeBackend.AITER # default to Triton logger.info_once("Using Triton backend for FP8 MoE") @@ -306,6 +298,13 @@ class Fp8Config(QuantizationConfig): return UnquantizedLinearMethod() return XPUFp8LinearMethod(fp8_config) elif isinstance(layer, FusedMoE): + if is_layer_skipped( + prefix=prefix, + ignored_layers=self.ignored_layers, + fused_mapping=self.packed_modules_mapping, + ): + return UnquantizedFusedMoEMethod(layer.moe_config) + return XPUFp8MoEMethod(fp8_config, layer) elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) @@ -420,7 +419,7 @@ class Fp8LinearMethod(LinearMethodBase): if vllm_is_batch_invariant(): self.use_marlin = False - self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled() self.use_deep_gemm = is_deep_gemm_supported() self.weight_block_size = self.quant_config.weight_block_size @@ -734,27 +733,33 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) self.marlin_input_dtype = None - self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN self.flashinfer_moe_backend: FlashinferMoeBackend | None = None if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM: self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: self.flashinfer_moe_backend = 
FlashinferMoeBackend.CUTLASS - if self.block_quant: - assert self.weight_block_size == [128, 128], ( - f"Only support weight_block_size == [128, 128], " - f"got {self.weight_block_size}" + if self.block_quant and self.weight_block_size != [128, 128]: + raise NotImplementedError( + "FlashInfer CUTLASS FP8 MoE backend only supports block " + "size [128, 128]." + ) + if not self.block_quant: + if layer.renormalize or layer.custom_routing_function is not None: + raise NotImplementedError( + "FlashInfer CUTLASS FP8 MoE backend does not support custom routing " + f"function or renormalization, but got {layer.renormalize} and " + f"{layer.custom_routing_function}." + ) + if layer.scoring_func != "sigmoid": + raise NotImplementedError( + "FlashInfer CUTLASS FP8 MoE backend only supports " + f"'sigmoid' scoring function, but got {layer.scoring_func}." + ) + if layer.activation != "silu": + raise NotImplementedError( + "FlashInfer CUTLASS FP8 MoE backend only supports SiLU " + f"activation function, but got {layer.activation}." ) - self.flashinfer_moe_fn = partial( - flashinfer_cutlass_moe_fp8, - moe=self.moe, - use_deepseek_fp8_block_scale=self.block_quant, - ) - - self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM - self.allow_cutlass_block_scaled_grouped_gemm = ( - self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM - ) def create_weights( self, @@ -888,16 +893,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w13_input_scale = None layer.w2_input_scale = None - self.rocm_aiter_moe_enabled = False - def process_weights_after_loading(self, layer: Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return - # Lazy import to avoid importing triton too early. - - self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - # TODO (rob): refactor block quant into separate class.
if self.block_quant: assert self.quant_config.activation_scheme == "dynamic" @@ -932,7 +931,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): replace_parameter(layer, "w13_weight_scale_inv", w13_weight_scale_inv) replace_parameter(layer, "w2_weight", w2_weight) replace_parameter(layer, "w2_weight_scale_inv", w2_weight_scale_inv) - if self.rocm_aiter_moe_enabled: + if self.fp8_backend == Fp8MoeBackend.AITER: # reshaping weights is required for aiter moe kernel. shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight.data, layer.w2_weight.data @@ -943,7 +942,7 @@ # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. - if self.allow_deep_gemm: + if self.fp8_backend == Fp8MoeBackend.DEEPGEMM: dg_w13_weight, dg_w13_weight_scale_inv = ( deepgemm_post_process_fp8_weight_block( wq=layer.w13_weight.data, @@ -1026,7 +1025,7 @@ ) start += shard_size - if self.rocm_aiter_moe_enabled: + if self.fp8_backend == Fp8MoeBackend.AITER: shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight, layer.w2_weight ) @@ -1046,7 +1045,7 @@ rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight) layer.w13_weight.data = w13_weight.data - if self.use_marlin: + if self.fp8_backend == Fp8MoeBackend.MARLIN: prepare_moe_fp8_layer_for_marlin( layer, False, input_dtype=self.marlin_input_dtype ) @@ -1054,13 +1053,94 @@ del layer.w13_input_scale del layer.w2_input_scale + # NOTE(rob): this is a WIP refactor. We are first migrating + all of the kernels in the TP case to use mk. Once this is + done, then we will initialize the TP case and DP/EP case + via the same code path (i.e. via maybe_init_modular_kernel). + NOTE(rob): in progress migrating all into this format.
+ if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, + ) + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + FlashInferAllGatherMoEPrepareAndFinalize, + ) + + config = self.get_fused_moe_quant_config(layer) + assert config is not None + self.moe_quant_config = config + + self.kernel = mk.FusedMoEModularKernel( + # TODO(rob): we can use the generic MoEPrepareAndFinalizeNoEP + # with the changes to defer input quantization + FlashInferAllGatherMoEPrepareAndFinalize( + use_dp=(self.moe.dp_size > 1), + use_deepseek_fp8_block_scale=self.block_quant, + ), + FlashInferExperts( + out_dtype=torch.get_default_dtype(), + quant_config=self.moe_quant_config, + ep_rank=self.moe.ep_rank, + ep_size=self.moe.ep_size, + tp_rank=self.moe.tp_rank, + tp_size=self.moe.tp_size, + use_dp=(self.moe.dp_size > 1), + use_deepseek_fp8_block_scale=self.block_quant, + ), + ) + self.use_inplace = False + + elif self.fp8_backend in [ + Fp8MoeBackend.DEEPGEMM, + Fp8MoeBackend.TRITON, + Fp8MoeBackend.MARLIN, + Fp8MoeBackend.AITER, + ]: + from vllm.model_executor.layers.fused_moe import ( + TritonOrDeepGemmExperts, + ) + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + MarlinExperts, + ) + from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, + ) + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + AiterExperts, + ) + + config = self.get_fused_moe_quant_config(layer) + assert config is not None + self.moe_quant_config = config + + if self.fp8_backend == Fp8MoeBackend.AITER: + self.kernel = mk.FusedMoEModularKernel( + # TODO: make defer_input_quant an attr of the AiterExperts + MoEPrepareAndFinalizeNoEP(defer_input_quant=True), + AiterExperts(quant_config=self.moe_quant_config), + ) + elif self.fp8_backend == Fp8MoeBackend.MARLIN: + self.kernel = 
mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + MarlinExperts(quant_config=self.moe_quant_config), + ) + else: + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + TritonOrDeepGemmExperts( + quant_config=self.moe_quant_config, + allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM), + ), + ) + self.use_inplace = True + def maybe_make_prepare_finalize( self, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: if ( - self.rocm_aiter_moe_enabled - or self.use_marlin + self.fp8_backend == Fp8MoeBackend.AITER + or self.fp8_backend == Fp8MoeBackend.MARLIN or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): return None @@ -1092,9 +1172,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): TritonOrDeepGemmExperts, ) - assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( - "Marlin and ROCm AITER are not supported with all2all yet." - ) + if self.fp8_backend in [Fp8MoeBackend.MARLIN, Fp8MoeBackend.AITER]: + raise NotImplementedError( + "Marlin and ROCm AITER are not supported with all2all yet." 
+ ) assert self.moe_quant_config is not None @@ -1106,7 +1187,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): assert max_num_tokens_per_rank is not None experts_impl = ( - BatchedDeepGemmExperts if self.allow_deep_gemm else BatchedTritonExperts + BatchedDeepGemmExperts + if self.fp8_backend == Fp8MoeBackend.DEEPGEMM + else BatchedTritonExperts ) logger.debug( "%s(%s): max_tokens_per_rank=%s, block_size=%s, per_act_token=%s", @@ -1141,14 +1224,18 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) return TritonOrDeepGemmExperts( quant_config=self.moe_quant_config, - allow_deep_gemm=self.allow_deep_gemm, + allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM), ) def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - if self.use_marlin: - return None + if self.fp8_backend == Fp8MoeBackend.MARLIN: + return fp8_w8a16_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + block_shape=self.weight_block_size, + ) return fp8_w8a8_moe_quant_config( w1_scale=( @@ -1179,6 +1266,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + # TODO(rob): convert this to MK. 
if layer.enable_eplb: raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.") assert layer.activation == "silu", ( @@ -1231,103 +1319,24 @@ class Fp8MoEMethod(FusedMoEMethodBase): apply_router_weight_on_input=layer.apply_router_weight_on_input, ) - select_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) + result = self.kernel( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + inplace=self.use_inplace, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + ) - topk_weights, topk_ids, zero_expert_result = select_result - - if self.rocm_aiter_moe_enabled: - from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 - rocm_aiter_fused_experts, - ) - - result = rocm_aiter_fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - activation=layer.activation, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - expert_map=layer.expert_map, - quant_config=self.moe_quant_config, - ) - elif self.use_marlin: - assert layer.activation == "silu", ( - f"{layer.activation} not supported for Marlin MoE." 
- ) - result = fused_marlin_moe( - x, - layer.w13_weight, - layer.w2_weight, - None, - None, - layer.w13_weight_scale, - layer.w2_weight_scale, - router_logits, - topk_weights, - topk_ids, - quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - global_num_experts=layer.global_num_experts, - expert_map=layer.expert_map, - input_dtype=self.marlin_input_dtype, - workspace=layer.workspace, - ) - elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert layer.activation == "silu", ( - f"Expected 'silu' activation but got {layer.activation}" - ) - if not self.block_quant: - assert ( - not layer.renormalize and layer.custom_routing_function is not None - ) - assert layer.scoring_func == "sigmoid", ( - f"Expected 'sigmoid' scoring func but got {layer.scoring_func}" - ) - # Delegate to CUTLASS FlashInfer path; function already bound with - # use_deepseek_fp8_block_scale for block-quant when applicable - result = self.flashinfer_moe_fn( - x, - layer, - topk_weights, - topk_ids, - inplace=False, - activation=layer.activation, - global_num_experts=layer.global_num_experts, - expert_map=layer.expert_map, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - ) - else: - from vllm.model_executor.layers.fused_moe import fused_experts - - result = fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=layer.activation, - global_num_experts=layer.global_num_experts, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - expert_map=layer.expert_map, - quant_config=self.moe_quant_config, - allow_deep_gemm=self.allow_deep_gemm, - allow_cutlass_block_scaled_grouped_gemm=( - self.allow_cutlass_block_scaled_grouped_gemm - ), - ) - - if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually 
exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result class Fp8OnlineMoEMethod(Fp8MoEMethod): @@ -1438,15 +1447,10 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): layer.w13_input_scale = None layer.w2_input_scale = None - self.rocm_aiter_moe_enabled = False - def process_weights_after_loading(self, layer: Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return - # Lazy import to avoid importing triton too early. - self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - # If checkpoint is fp16, quantize in place. fp8_dtype = current_platform.fp8_dtype() w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) @@ -1463,7 +1467,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): replace_parameter(layer, "w2_weight", w2_weight) # Reshuffle weights for AITER if needed. - if self.rocm_aiter_moe_enabled: + if self.fp8_backend == Fp8MoeBackend.AITER: shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight, layer.w2_weight ) @@ -1471,7 +1475,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): replace_parameter(layer, "w2_weight", shuffled_w2) # Reshuffle weights for MARLIN if needed. - if self.use_marlin: + elif self.fp8_backend == Fp8MoeBackend.MARLIN: prepare_moe_fp8_layer_for_marlin( layer, False, input_dtype=self.marlin_input_dtype ) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 9dd734f2fea6a..9600bb42295dc 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -639,7 +639,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): "fused GGUF MoE method."
) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 347c7b2008d12..d2dafca99a230 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -900,7 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert layer.activation == "silu", "Only SiLU activation is supported." - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index a5db086fb4729..7994c838ad548 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -51,7 +51,7 @@ class QuantFP8(CustomOp): self.column_major_scales = column_major_scales self.use_ue8m0 = use_ue8m0 - self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter = rocm_aiter_ops.is_linear_fp8_enabled() self.is_group_quant = group_shape.is_per_group() if self.is_group_quant: diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index f33ee43727f19..9de2924ec71b1 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -6,13 +6,8 @@ from typing import Any, Optional import torch from packaging import version from torch.nn import Module -from torch.nn.parameter import Parameter from vllm._ipex_ops import ipex_ops as ops -from vllm.model_executor.layers.fused_moe import ( - FusedMoEMethodBase, - FusedMoeWeightScaleSupported, -) from 
vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.linear import ( LinearBase, @@ -24,14 +19,14 @@ from vllm.model_executor.layers.quantization import ( QuantizationMethods, ) from vllm.model_executor.layers.quantization.awq import AWQLinearMethod -from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod +from vllm.model_executor.layers.quantization.fp8 import ( + Fp8Config, + Fp8LinearMethod, + Fp8OnlineMoEMethod, +) from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - maybe_create_device_identity, -) -from vllm.model_executor.parameter import ModelWeightParameter -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform MIN_IPEX_VERSION = "2.6.0" @@ -309,44 +304,15 @@ class XPUFp8LinearMethod(Fp8LinearMethod): def __init__(self, quant_config: Fp8Config): super().__init__(quant_config) - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - maybe_create_device_identity() - - output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") - layer.logical_widths = output_partition_sizes - layer.input_size_per_partition = input_size_per_partition - layer.output_size_per_partition = output_size_per_partition - layer.orig_dtype = params_dtype - layer.weight_block_size = None - weight = ModelWeightParameter( - data=torch.empty( - output_size_per_partition, - input_size_per_partition, - dtype=params_dtype, - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - 
layer.register_parameter("weight", weight) - def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return # If checkpoint not serialized fp8, quantize the weights. if not self.quant_config.is_checkpoint_fp8_serialized: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) # Update the layer with the new values. - layer.weight = Parameter(qweight, requires_grad=False) - layer.weight_scale = Parameter(weight_scale, requires_grad=False) + replace_parameter(layer, "weight", qweight.data) + replace_parameter(layer, "weight_scale", weight_scale.data) layer.input_scale = None def apply( @@ -363,69 +329,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod): return output -class XPUFp8MoEMethod(FusedMoEMethodBase): +class XPUFp8MoEMethod(Fp8OnlineMoEMethod): def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): - super().__init__(layer.moe_config) + super().__init__(quant_config, layer) self.quant_config = quant_config - def create_weights( - self, - layer: Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - layer.intermediate_size_per_partition = intermediate_size_per_partition - layer.hidden_size = hidden_size - layer.num_experts = num_experts - layer.orig_dtype = params_dtype - layer.weight_block_size = None - # WEIGHTS - w13_weight = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size, - dtype=params_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight", w13_weight) - set_weight_attrs(w13_weight, extra_weight_attrs) - - w2_weight = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition, - dtype=params_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight", w2_weight) - set_weight_attrs(w2_weight, extra_weight_attrs) - - # 
Allocate 2 scales for w1 and w3 respectively. - # They will be combined to a single scale after weight loading. - w13_weight_scale = torch.nn.Parameter( - torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False - ) - w2_weight_scale = torch.nn.Parameter( - torch.ones(num_experts, dtype=torch.float32), requires_grad=False - ) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} - ) - # INPUT_SCALES - layer.w13_input_scale = None - layer.w2_input_scale = None - def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return if not self.quant_config.is_checkpoint_fp8_serialized: fp8_dtype = current_platform.fp8_dtype() w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) @@ -448,8 +359,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase): w2_weight[expert, :, :], layer.w2_weight_scale[expert] = ( ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :]) ) - layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w2_weight", w2_weight) + import intel_extension_for_pytorch as ipex ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index aa3937d4c03ff..afbefe1fedc18 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -55,6 +55,9 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( select_cutlass_fp8_gemm_impl, swap_w13_to_w31, ) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + 
W8A8BlockFp8LinearOp, +) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( get_marlin_input_dtype, ) @@ -72,9 +75,15 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, + cutlass_block_fp8_supported, requantize_with_max_scale, ) -from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter +from vllm.model_executor.parameter import ( + BlockQuantScaleParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) from vllm.scalar_type import scalar_types from vllm.utils.flashinfer import ( flashinfer_scaled_fp4_mm, @@ -88,7 +97,16 @@ if TYPE_CHECKING: logger = init_logger(__name__) -QUANT_ALGOS = ["FP8", "NVFP4"] +QUANT_ALGOS = [ + # FP8 (per-tensor weight + optional static activation scale). + "FP8", + # FP8 per-channel weight scale + per-token activation scale. + "FP8_PER_CHANNEL_PER_TOKEN", + # FP8 per-block weight-only (ModelOpt may emit this as lowercase). + "FP8_PB_WO", + # FP4 + "NVFP4", +] KV_CACHE_QUANT_ALGOS = ["FP8"] @@ -255,6 +273,9 @@ class ModelOptQuantConfigBase(QuantizationConfig): if not quant_method: raise ValueError("Missing 'quant_algo' in quantization config") + # Normalize quant_algo for robust matching (ModelOpt may emit lowercase). 
+ quant_method = str(quant_method).upper() + if kv_cache_quant_method is None: # No KV cache quantization, keep this branch just to have this comment pass @@ -263,6 +284,8 @@ class ModelOptQuantConfigBase(QuantizationConfig): f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_method)}" ) + else: + kv_cache_quant_method = kv_cache_quant_method.upper() if not isinstance(exclude_modules, list): raise ValueError( @@ -302,17 +325,34 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): def __init__( self, + quant_method: str, is_checkpoint_fp8_serialized: bool, kv_cache_quant_method: str | None, exclude_modules: list[str], ) -> None: super().__init__(exclude_modules) + self.quant_method = quant_method self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized self.kv_cache_quant_method = kv_cache_quant_method if is_checkpoint_fp8_serialized: logger.warning( - "Detected ModelOpt fp8 checkpoint. Please note that" - " the format is experimental and could change." + "Detected ModelOpt fp8 checkpoint (quant_algo=%s). Please note " + "that the format is experimental and could change.", + quant_method, + ) + + # Select LinearMethod implementation based on quant_algo. + if self.quant_method == "FP8": + self.LinearMethodCls = ModelOptFp8LinearMethod + elif self.quant_method == "FP8_PER_CHANNEL_PER_TOKEN": + self.LinearMethodCls = ModelOptFp8PcPtLinearMethod + elif self.quant_method == "FP8_PB_WO": + self.LinearMethodCls = ModelOptFp8PbWoLinearMethod + else: + raise ValueError( + "Unsupported ModelOpt FP8 quant_algo for vLLM: " + f"{self.quant_method}. Supported: FP8 / " + "FP8_PER_CHANNEL_PER_TOKEN / FP8_PB_WO." 
) def get_name(self) -> QuantizationMethods: @@ -346,13 +386,13 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): if "quantization" in hf_quant_cfg: quant_config = hf_quant_cfg["quantization"] if isinstance(quant_config, dict): - quant_algo = quant_config.get("quant_algo", "") - if "FP8" in quant_algo: + quant_algo = str(quant_config.get("quant_algo", "")) + if "FP8" in quant_algo.upper(): return "modelopt" else: # Check for compressed-tensors style config with specific quant_algo - quant_algo = hf_quant_cfg.get("quant_algo", "") - if isinstance(quant_algo, str) and "FP8" in quant_algo: + quant_algo = str(hf_quant_cfg.get("quant_algo", "")) + if "FP8" in quant_algo.upper(): return "modelopt" return None @@ -369,7 +409,12 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): ) -> "ModelOptFp8Config": is_checkpoint_fp8_serialized = "FP8" in quant_method - return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules) + return cls( + quant_method, + is_checkpoint_fp8_serialized, + kv_cache_quant_method, + exclude_modules, + ) class ModelOptFp8LinearMethod(LinearMethodBase): @@ -464,6 +509,203 @@ class ModelOptFp8LinearMethod(LinearMethodBase): ) +class ModelOptFp8PcPtLinearMethod(LinearMethodBase): + """Linear method for ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoints. 
+ + Expected checkpoint structure (per Linear): + - weight: fp8-e4m3fn, shape [out, in] + - weight_scale: fp32, shape [out] (per-output-channel) + - no input_scale (activations are dynamically quantized per-token) + """ + + def __init__(self, quant_config: ModelOptFp8Config) -> None: + self.quant_config = quant_config + self.fp8_linear = Fp8LinearOp( + act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + + if not self.quant_config.is_checkpoint_fp8_serialized: + raise ValueError( + "FP8_PER_CHANNEL_PER_TOKEN currently only supports " + "FP8-serialized checkpoints." + ) + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + weight_scale = ChannelQuantScaleParameter( + data=torch.empty(output_size_per_partition, dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + def process_weights_after_loading(self, layer: Module) -> None: + layer.weight = Parameter(layer.weight.t(), requires_grad=False) + layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> 
torch.Tensor: + return self.fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + bias=bias, + ) + + +class ModelOptFp8PbWoLinearMethod(LinearMethodBase): + """Linear method for ModelOpt FP8_PB_WO checkpoints. + + ModelOpt exports `weight_scale` as a 4D tensor: + [out_blk, 1, in_blk, 1] + where block size is typically 128 for both dims. + + vLLM executes it as FP8 GEMM with *dynamic per-token* activation quant. + """ + + _WEIGHT_BLOCK_SIZE: tuple[int, int] = (128, 128) + + def __init__(self, quant_config: ModelOptFp8Config) -> None: + self.quant_config = quant_config + block_n, block_k = self._WEIGHT_BLOCK_SIZE + self.weight_block_size = list(self._WEIGHT_BLOCK_SIZE) + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(block_n, block_k), + act_quant_group_shape=GroupShape(1, block_k), + cutlass_block_fp8_supported=cutlass_block_fp8_supported(), + use_aiter_and_is_supported=False, + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + + if not self.quant_config.is_checkpoint_fp8_serialized: + raise ValueError( + "FP8_PB_WO currently only supports FP8-serialized checkpoints." + ) + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + # Expose block size so the v2 weight loaders can translate offsets from + # element-space -> block-space for BlockQuantScaleParameter. 
+ layer.weight_block_size = self.weight_block_size + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + block_n, block_k = self._WEIGHT_BLOCK_SIZE + if output_size_per_partition % block_n != 0: + raise ValueError( + "ModelOpt FP8_PB_WO requires out_features divisible by " + f"{block_n}, got {output_size_per_partition}." + ) + if input_size_per_partition % block_k != 0: + raise ValueError( + "ModelOpt FP8_PB_WO requires in_features divisible by " + f"{block_k}, got {input_size_per_partition}." + ) + + out_blks = output_size_per_partition // block_n + in_blks = input_size_per_partition // block_k + + # Match ModelOpt's exported shape so weight loading works without a + # custom loader: [out_blk, 1, in_blk, 1] + weight_scale = BlockQuantScaleParameter( + data=torch.empty((out_blks, 1, in_blks, 1), dtype=torch.float32), + input_dim=2, + output_dim=0, + weight_loader=weight_loader, + ) + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + def process_weights_after_loading(self, layer: Module) -> None: + # Keep weight in [out, in] layout for W8A8BlockFp8LinearOp. + layer.weight = Parameter(layer.weight.data, requires_grad=False) + + scale = layer.weight_scale + if scale.dim() == 4: + # [out_blk, 1, in_blk, 1] -> [out_blk, in_blk] + scale = scale.squeeze(1).squeeze(-1) + elif scale.dim() != 2: + raise ValueError( + "Unexpected ModelOpt FP8_PB_WO weight_scale shape: " + f"{tuple(scale.shape)}." 
+ ) + + layer.weight_scale = Parameter(scale.contiguous(), requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.w8a8_block_fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + bias=bias, + ) + + class ModelOptFp8MoEMethod(FusedMoEMethodBase): """MoE method for ModelOpt FP8. Supports loading FP8 checkpoints with static weight scale and @@ -796,7 +1038,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) # Expert selection - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1599,7 +1841,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): x_routing, _ = x else: x_routing = x - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x_routing, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 4bedb951a33f5..513f6f7b21abc 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -370,7 +370,7 @@ class MoeWNA16Method(FusedMoEMethodBase): from vllm.model_executor.layers.fused_moe import fused_experts assert layer.activation == "silu", "Only SiLU activation is supported." 
- topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 832925825c453..dc0fbfa7df35a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -896,7 +896,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): raise NotImplementedError("EPLB is not supported for mxfp4") if self.mxfp4_backend == Mxfp4Backend.MARLIN: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -990,7 +990,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): ): from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 0b9b098afb1f6..81970480319ab 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -338,7 +338,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -530,7 +530,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -738,7 +738,7 @@ class 
QuarkOCP_MX_MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index b2ecb0b175f81..dce9c661ec332 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -359,7 +359,7 @@ class RTNMoEMethod(FusedMoEMethodBase): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 13e813952b30a..15ea9f7d60fff 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -31,6 +31,7 @@ from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import ( + DeepGemmQuantScaleFMT, fp8_gemm_nt, is_deep_gemm_e8m0_used, is_deep_gemm_supported, @@ -247,7 +248,6 @@ class W8A8BlockFp8LinearOp: self.act_quant_group_shape = act_quant_group_shape self.is_deep_gemm_supported = is_deep_gemm_supported() self.is_hopper = current_platform.is_device_capability(90) - self.is_blackwell = current_platform.is_device_capability_family(100) self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used() # Get the correct blockscale mul and input quant operations. 
@@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp: weight: torch.Tensor, weight_scale: torch.Tensor, ) -> torch.Tensor: - if self.use_deep_gemm_e8m0 and self.is_blackwell: + if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0: q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm( input_2d, group_size=self.act_quant_group_shape.col, diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 66e979b505f0d..3de2b6509e460 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -594,9 +594,15 @@ def apply_awq_marlin_linear( a_scales = None if input_dtype == torch.int8: + assert quant_type == scalar_types.uint4, ( + "W8A8-INT8 is not supported by marlin kernel." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) a_scales = a_scales * input_global_scale elif input_dtype == torch.float8_e4m3fn: + assert quant_type == scalar_types.uint4, ( + "INT8 weight + FP8 activation is not supported." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) output = ops.gptq_marlin_gemm( @@ -649,9 +655,15 @@ def apply_rtn_marlin_linear( a_scales = None if input_dtype == torch.int8: + assert quant_type == scalar_types.uint4b8, ( + "W8A8-INT8 is not supported by marlin kernel." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) a_scales = a_scales * input_global_scale elif input_dtype == torch.float8_e4m3fn: + assert quant_type == scalar_types.uint4b8, ( + "INT8 weight + FP8 activation is not supported." 
+ ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) output = ops.gptq_marlin_gemm( diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index 876c724bf972d..4d0a34c3be119 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -154,6 +154,12 @@ def prepare_fp4_layer_for_marlin( ) is_nvfp4 = hasattr(layer, "weight_scale_2") + if input_dtype is not None and input_dtype.itemsize == 1: + if is_nvfp4: + raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.") + elif input_dtype != torch.float8_e4m3fn: + raise RuntimeError("MXFP4 weight + INT8 activation is not supported.") + group_size = 16 if is_nvfp4 else 32 part_size_n = layer.output_size_per_partition @@ -231,6 +237,12 @@ def prepare_moe_fp4_layer_for_marlin( ) is_nvfp4 = hasattr(layer, "w13_weight_scale_2") + if input_dtype is not None and input_dtype.itemsize == 1: + if is_nvfp4: + raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.") + elif input_dtype != torch.float8_e4m3fn: + raise RuntimeError("MXFP4 weight + INT8 activation is not supported.") + group_size = 16 if is_nvfp4 else 32 e = layer.num_experts diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 93d238a0524d8..4d2f2fd71ad36 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -99,6 +99,8 @@ def prepare_fp8_layer_for_marlin( "be used leveraging the Marlin kernel. This may degrade " "performance for compute-heavy workloads." 
) + if input_dtype is not None and input_dtype.itemsize == 1: + raise RuntimeError("Marlin W8A8 is not supported.") part_size_n = layer.output_size_per_partition part_size_k = layer.input_size_per_partition @@ -142,10 +144,20 @@ def prepare_fp8_layer_for_marlin( # marlin kernel only support channel-wise and group-wise quantization # we need to convert the scales if weight_block_size is None: + logical_widths = getattr(layer, "logical_widths", []) if scales.nelement() == 1: # tensor-wise quantization -> channel-wise quantization # (1, 1) =>(repeat)=> (1, size_n) scales = scales.view(1, 1).repeat_interleave(part_size_n, 1) + elif scales.nelement() == len(logical_widths): + # tensor-wise quantization with logical_widths -> + # channel-wise quantization + assert sum(logical_widths) == part_size_n, ( + f"Sum of logical_widths ({sum(logical_widths)}) must be equal " + f"to part_size_n ({part_size_n})" + ) + lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64) + scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1) elif scales.nelement() > 1 and scales.nelement() != part_size_n: assert part_size_n % scales.nelement() == 0 s_size = scales.nelement() @@ -196,6 +208,8 @@ def prepare_moe_fp8_layer_for_marlin( "be used leveraging the Marlin kernel. This may degrade " "performance for compute-heavy workloads." 
) + if input_dtype is not None and input_dtype.itemsize == 1: + raise RuntimeError("Marlin W8A8 is not supported.") e = layer.num_experts k = layer.hidden_size diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 74b02e4c62583..08d7a851ac9ab 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -189,7 +189,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], ) convert_type = model_config.convert_type - if convert_type != "none" and supports_multimodal(model_cls): + if convert_type not in ["none", "mm_encoder_only"] and supports_multimodal( + model_cls + ): logger.debug_once("Detected conversion of Multi Modal model.") converted = try_create_mm_pooling_model_cls(model_cls) if converted is not None: @@ -200,6 +202,11 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], if convert_type == "none": pass + elif convert_type == "mm_encoder_only": + logger.debug_once("Converting to mm encoder only model.") + from vllm.model_executor.models.adapters import as_mm_encoder_only_model + + model_cls = as_mm_encoder_only_model(model_cls) elif convert_type == "embed": logger.debug_once("Converting to embedding model.") model_cls = as_embedding_model(model_cls) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 504de9fe10871..acf1e57a59a97 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -520,3 +520,64 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]): method = getattr(text_config, "method", None) assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported" return SEQ_CLS_LOAD_METHODS[method](model, weights) + + +def as_mm_encoder_only_model(cls: _T) -> _T: + """ + Subclass an existing vLLM vl model to support mm encoder only for + EPD encoder instances. 
+ """ + if not hasattr(cls, "embed_multimodal"): + # Submodel case: return the original class. + return cls + + if not hasattr(cls, "get_language_model_spec"): + raise TypeError(f"{cls} need to implement `get_language_model_spec` method.") + + lm_model_cls, lm_attr = cls.get_language_model_spec() + + if lm_model_cls is None or lm_attr is None: + raise TypeError( + f"{cls}.get_language_model_spec() must return (lm_model_cls, lm_attr)" + ) + + class DummyLM(nn.Module): + def __init__(self, *args, **kwargs): + self.make_empty_intermediate_tensors = None + + class ModelForMMEncoderOnly(cls): + def __init__( + self, + *, + vllm_config: "VllmConfig", + prefix: str = "", + **kwargs: Any, + ) -> None: + self.is_mm_encoder_only_model = True + origin_init = lm_model_cls.__init__ + try: + lm_model_cls.__init__ = DummyLM.__init__ + super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) + + if hasattr(self, lm_attr): + delattr(self, lm_attr) + finally: + lm_model_cls.__init__ = origin_init + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + from .utils import AutoWeightsLoader + + origin_init_ = AutoWeightsLoader.__init__ + + def _new_init_(self, *args, **kwargs): + origin_init_(self, *args, **kwargs) + self.skip_prefixes = (self.skip_prefixes or []) + [f"{lm_attr}."] + + try: + AutoWeightsLoader.__init__ = _new_init_ + result = super().load_weights(weights) + finally: + AutoWeightsLoader.__init__ = origin_init_ + return result + + return ModelForMMEncoderOnly # type: ignore diff --git a/vllm/model_executor/models/audioflamingo3.py b/vllm/model_executor/models/audioflamingo3.py index 0ca5f2c4e0a75..3609cc26a4c6b 100644 --- a/vllm/model_executor/models/audioflamingo3.py +++ b/vllm/model_executor/models/audioflamingo3.py @@ -111,7 +111,7 @@ class AudioFlamingo3EmbeddingInputs(TensorSchema): audio_embeds: Annotated[ list[torch.Tensor], - TensorShape("bn", "naf", "hs"), + TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}), ] diff 
--git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py index 98229c6d4ca1b..cf45fb9fe8370 100644 --- a/vllm/model_executor/models/bagel.py +++ b/vllm/model_executor/models/bagel.py @@ -487,7 +487,7 @@ class BagelForConditionalGeneration( # Split by image return tuple(vision_embeds) - def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: """Get multimodal embeddings from input.""" image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index a3624b1cfa5f2..d33b3fdf47467 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -88,6 +88,26 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig): } +class LlamaBidirectionalConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + from vllm.config.pooler import PoolingTypeStr + + hf_config = vllm_config.model_config.hf_config + hf_config.is_causal = False + + pooling_type_map: dict[str, PoolingTypeStr] = { + "avg": "MEAN", + "cls": "CLS", + "last": "LAST", + } + + pooling_type = pooling_type_map.get(hf_config.pooling, None) + if pooling_type is None: + raise ValueError(f"pool_type {hf_config.pooling} not supported") + vllm_config.model_config.pooler_config.pooling_type = pooling_type + + class NomicBertModelConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: @@ -401,7 +421,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): # of attention tokens that would fit mamba_page_size: # e.g. 
for mamba page size = 788kB # attn_1_token = 2kB -> fits ~394 tokens - # then round up to a mulitple of 256 -> 512 tokens + # then round up to a multiple of 256 -> 512 tokens # End result: # attn_block_size = 512 # mamba_block_size = 512 (aligned to a multiple of chunk_size) @@ -509,6 +529,8 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "GteNewModel": GteNewModelConfig, "GteNewForSequenceClassification": GteNewModelConfig, "Gemma3TextModel": Gemma3TextModelConfig, + "LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig, + "LlamaBidirectionalModel": LlamaBidirectionalConfig, "NomicBertModel": NomicBertModelConfig, "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, "Qwen2ForRewardModel": Qwen2ForRewardModelConfig, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 6670143cda250..4899f5476f955 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -878,8 +878,14 @@ class Indexer(nn.Module): ) q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1)) - q = torch.cat([q_pe.squeeze(0), q_nope], dim=-1) - k = torch.cat([k_pe.squeeze((0, 2)), k_nope], dim=-1) + # Note: RoPE (NeoX) can introduce extra leading dimensions during compilation + # so we need to reshape back to token-flattened shapes + q_pe = q_pe.reshape(-1, self.n_head, self.rope_dim) + k_pe = k_pe.reshape(-1, 1, self.rope_dim) + + q = torch.cat([q_pe, q_nope], dim=-1) + # `k_pe` is [num_tokens, 1, rope_dim] (MQA). 
+ k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1) # we only quant q here since k quant is fused with cache insertion q = q.view(-1, self.head_dim) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 541d3b2beff83..6fb09be7c67f2 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -21,7 +21,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Inference-only GLM-4.5, GLM-4.6 model compatible with HuggingFace weights.""" +"""Inference-only GLM-4.5, GLM-4.6, GLM-4.7 model +compatible with HuggingFace weights.""" import typing from collections.abc import Callable, Iterable diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index cb99d57e8b8c7..f8288b92ebfae 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -94,6 +94,12 @@ class SupportsMultiModal(Protocol): `multimodal_config.mm_encoder_tp_mode="data"`. """ + requires_raw_input_tokens: ClassVar[bool] = False + """ + A flag that indicates this model processes input id tokens + in their raw form and not input embeddings. + """ + merge_by_field_config: ClassVar[bool | None] = None """ [DEPRECATED] A flag that indicates which implementation of @@ -141,6 +147,14 @@ class SupportsMultiModal(Protocol): """ ... + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return None, None + @overload def embed_input_ids(self, input_ids: Tensor) -> Tensor: ... 
@@ -298,10 +312,18 @@ def supports_multimodal_raw_input_only(model: type[object] | object) -> bool: return getattr(model, "supports_multimodal_raw_input_only", False) +def requires_raw_input_tokens(model: type[object] | object) -> bool: + return getattr(model, "requires_raw_input_tokens", False) + + def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool: return getattr(model, "supports_encoder_tp_data", False) +def supports_mm_encoder_only(model: type[object] | object) -> bool: + return getattr(model, "is_mm_encoder_only_model", False) + + @overload def supports_multimodal_pruning( model: type[object], diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index 01e75338a8ced..aacc4abd43e61 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -48,7 +48,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -167,7 +166,6 @@ class Jais2Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=getattr(config, "rope_parameters", None), is_neox_style=is_neox_style, @@ -304,17 +302,12 @@ class Jais2Model(nn.Module): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config self.quant_config = quant_config self.padding_idx = config.pad_token_id - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab + + self.vocab_size = config.vocab_size self.org_vocab_size = config.vocab_size if 
get_pp_group().is_first_rank or ( config.tie_word_embeddings and get_pp_group().is_last_rank @@ -456,29 +449,15 @@ class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config - self.lora_config = lora_config - self.model = self._init_model( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=( - DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size - ), quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) @@ -487,7 +466,7 @@ class Jais2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size, logit_scale + config.vocab_size, scale=logit_scale ) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 3507a2bc66c17..84f4211df4c20 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,7 +57,14 @@ from vllm.model_executor.model_loader.weight_utils import ( ) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsEagle, SupportsEagle3, SupportsLoRA, SupportsPP +from .adapters import as_embedding_model, as_seq_cls_model +from .interfaces import ( + SupportsEagle, + SupportsEagle3, + SupportsLoRA, + SupportsPP, +) +from .interfaces_base import attn_type from .utils import ( AutoWeightsLoader, 
PPMissingLayer, @@ -698,3 +705,17 @@ class LlamaForCausalLM( name = name.replace(item, mapping[item]) return name, loaded_weight + + +@attn_type("encoder_only") +class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)): + # This class sets the correct attention type and pooling type + # through LlamaBidirectionalConfig. + pass + + +@attn_type("encoder_only") +class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)): + # This class sets the correct attention type and pooling type + # through LlamaBidirectionalConfig. + pass diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index c5441283f9711..774737387639b 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -46,7 +46,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, ZeroExpertFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -179,7 +179,7 @@ class FlashConfig(PretrainedConfig): self.intermediate_size = ( self.ffn_hidden_size if hasattr(self, "ffn_hidden_size") - else self.intermediate_size + else intermediate_size ) if hasattr(self, "moe_intermediate_size"): self.moe_intermediate_size = self.moe_intermediate_size @@ -280,10 +280,6 @@ class LongcatMoe(nn.Module): ): super().__init__() self.hidden_size = hidden_size - self.zero_expert_num = config.zero_expert_num - self.zero_expert_type = config.zero_expert_type - self.routed_scaling_factor = config.routed_scaling_factor - self.enable_eplb = enable_eplb # Gate always runs at half / full precision for now. 
self.rounter_params_dtype = params_dtype if config.router_dtype == "float32": @@ -291,25 +287,27 @@ class LongcatMoe(nn.Module): self.router = LongcatRouter( config=config, - zero_expert_num=self.zero_expert_num, + zero_expert_num=config.zero_expert_num, rounter_params_dtype=self.rounter_params_dtype, prefix=f"{prefix}.gate", ) - self.experts = FusedMoE( + assert config.zero_expert_num is not None + assert config.zero_expert_type is not None + self.experts = ZeroExpertFusedMoE( + zero_expert_num=config.zero_expert_num, + zero_expert_type=config.zero_expert_type, + router=self.router, num_experts=num_experts, top_k=top_k, hidden_size=hidden_size, intermediate_size=intermediate_size, reduce_results=True, params_dtype=params_dtype, - e_score_correction_bias=self.router.e_score_correction_bias, renormalize=False, quant_config=quant_config, prefix=f"{prefix}.experts", - zero_expert_num=self.zero_expert_num, - zero_expert_type=self.zero_expert_type, - enable_eplb=self.enable_eplb, + enable_eplb=enable_eplb, routed_scaling_factor=config.routed_scaling_factor, ) @@ -317,11 +315,34 @@ class LongcatMoe(nn.Module): num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - router_logits = self.router(hidden_states.to(self.rounter_params_dtype)) - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=router_logits + # Align to FusedMoE padded hidden size to avoid dim mismatch + padded_hidden = self.experts.hidden_size + if hidden_dim < padded_hidden: + hidden_states_padded = torch.nn.functional.pad( + hidden_states, + (0, padded_hidden - hidden_dim), + mode="constant", + value=0.0, + ) + else: + hidden_states_padded = hidden_states + + router_logits_full = self.router( + hidden_states_padded.to(self.rounter_params_dtype) ) + # ZeroExpertFusedMoE handles routing memoization and zero expert computation + # internally. 
Pass full router_logits (including zero experts) so that + # zero experts can be properly identified in routing. + final_hidden_states = self.experts( + hidden_states=hidden_states_padded, + router_logits=router_logits_full, # Full logits (includes zero experts) + ) + + # Crop back to original hidden dimension if padded earlier + if padded_hidden != hidden_dim: + final_hidden_states = final_hidden_states[..., :hidden_dim] + return final_hidden_states.view(num_tokens, hidden_dim) @@ -419,6 +440,7 @@ class FlashDecoderLayer(nn.Module): hidden_states = self.self_attn[0]( positions=positions, hidden_states=hidden_states, + llama_4_scaling=None, ) hidden_states, residual = self.post_attention_layernorm[0]( @@ -438,6 +460,7 @@ class FlashDecoderLayer(nn.Module): hidden_states = self.self_attn[1]( positions=positions, hidden_states=hidden_states, + llama_4_scaling=None, ) hidden_states, residual = self.post_attention_layernorm[1]( hidden_states, residual diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index c45bdf95e7487..930ff737bcdac 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -139,7 +139,7 @@ class MiniCPMVImageEmbeddingInputs(TensorSchema): type: Literal["image_embeds"] image_embeds: Annotated[ torch.Tensor | list[torch.Tensor], - TensorShape("bn", "ns", "hs"), + TensorShape("bn", "ns", "hs", dynamic_dims={"ns"}), ] diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index b730ac0315893..0b44ff622f05b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -34,7 +34,7 @@ import einops import torch import torch.nn as nn import torch.nn.functional as F -from transformers import BatchFeature +from transformers import BatchFeature, Qwen2ForCausalLM from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( 
Qwen2_5_VLConfig, @@ -1567,3 +1567,11 @@ class Qwen2_5_VLForConditionalGeneration( connector="visual.merger.", tower_model="visual.", ) + + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return Qwen2ForCausalLM, "language_model" diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index f84ddfa84f6ab..c97e6873e0d17 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -101,7 +101,7 @@ class Qwen2AudioEmbeddingInputs(TensorSchema): audio_embeds: Annotated[ list[torch.Tensor], - TensorShape("bn", "naf", "hs"), + TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}), ] diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 089129e443c01..5ca6b3d852ac3 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -323,7 +323,7 @@ class Qwen3Omni_VisionTransformer(nn.Module): hidden_size=self.hidden_size, ) - # vit pos embeding, TODO: spatial_patch_size vs patch_size + # vit pos embedding, TODO: spatial_patch_size vs patch_size if self.apply_vit_abs_pos_embed: self.pos_embed = nn.Embedding(self.num_grid_per_side**2, self.hidden_size) else: diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4838f68e06f70..fea73557f1e82 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -2090,3 +2090,11 @@ class Qwen3VLForConditionalGeneration( connector="visual.merger", tower_model="visual.", ) + + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return Qwen3LLMForCausalLM, "language_model" diff 
--git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3ba61b52cfdf1..fd39afe259ae3 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -46,6 +46,7 @@ from .interfaces import ( has_noops, is_attention_free, is_hybrid, + requires_raw_input_tokens, supports_cross_encoding, supports_mamba_prefix_caching, supports_multimodal, @@ -203,6 +204,7 @@ _EMBEDDING_MODELS = { "GteNewModel": ("bert_with_rope", "GteNewModel"), "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"), "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"), # noqa: E501 + "LlamaBidirectionalModel": ("llama", "LlamaBidirectionalModel"), "LlamaModel": ("llama", "LlamaForCausalLM"), **{ # Multiple models share the same architecture, so we include them all @@ -246,6 +248,11 @@ _CROSS_ENCODER_MODELS = { "bert_with_rope", "GteNewForSequenceClassification", ), + "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), + "LlamaBidirectionalForSequenceClassification": ( + "llama", + "LlamaBidirectionalForSequenceClassification", + ), "ModernBertForSequenceClassification": ( "modernbert", "ModernBertForSequenceClassification", @@ -259,8 +266,6 @@ _CROSS_ENCODER_MODELS = { "roberta", "RobertaForSequenceClassification", ), - # [Auto-converted (see adapters.py)] - "JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501, } _MULTIMODAL_MODELS = { @@ -418,6 +423,7 @@ _MULTIMODAL_MODELS = { ), "UltravoxModel": ("ultravox", "UltravoxModel"), "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 + "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501 # [Encoder-decoder] "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 } @@ -535,6 +541,7 @@ class _ModelInfo: supports_cross_encoding: bool supports_multimodal: bool 
supports_multimodal_raw_input_only: bool + requires_raw_input_tokens: bool supports_multimodal_encoder_tp_data: bool supports_pp: bool has_inner_state: bool @@ -558,6 +565,7 @@ class _ModelInfo: supports_multimodal_raw_input_only=supports_multimodal_raw_input_only( model ), + requires_raw_input_tokens=requires_raw_input_tokens(model), supports_multimodal_encoder_tp_data=supports_multimodal_encoder_tp_data( model ), diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index efdee255ab5eb..15d0ff30ed9bb 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -163,8 +163,10 @@ def apply_rotary_pos_emb( enable_fp32_compute=True, ) - if is_flash_attn_backend and not current_platform.is_cuda(): + if is_flash_attn_backend and current_platform.is_cuda(): apply_rotary_emb_func = apply_rotary_emb.forward_cuda + elif is_flash_attn_backend and current_platform.is_rocm(): + apply_rotary_emb_func = apply_rotary_emb.forward_hip else: apply_rotary_emb_func = apply_rotary_emb.forward_native diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index b807f45b5d52b..c7844381eb633 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -22,7 +22,6 @@ from typing import TYPE_CHECKING, Literal import torch from torch import nn -from transformers.configuration_utils import ALLOWED_LAYER_TYPES from vllm.config.utils import getattr_iter from vllm.logger import init_logger @@ -32,6 +31,7 @@ from vllm.model_executor.layers.linear import ( ReplicatedLinear, RowParallelLinear, ) +from vllm.transformers_utils.config import is_rope_parameters_nested if TYPE_CHECKING: from vllm.config import VllmConfig @@ -207,7 +207,7 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool: rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {} if 
rope_parameters: # Nest rope_parameters if not nested already to simplify logic - if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + if not is_rope_parameters_nested(rope_parameters): rope_parameters = {"": rope_parameters} return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values()) return True diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 331f0c54ecfbc..cbba1af89190c 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import inspect import math from collections.abc import Iterable, Mapping, Sequence from functools import cached_property @@ -116,10 +117,7 @@ class VoxtralProcessorAdapter: self, audio_length: int, ) -> int: - pad_audio_length = self._audio_processor.next_multiple_of_chunk_frames( - audio_length, self.sampling_rate - ) - return ceil(pad_audio_length / (self.sampling_rate // self.frame_rate)) + return ceil(audio_length / (self.sampling_rate // self.frame_rate)) def __call__( self, @@ -158,7 +156,14 @@ class VoxtralProcessorAdapter: assert audio.ndim == 1 # pad if necessary - audio = self._audio_processor.pad(audio, self.sampling_rate) + # TODO(Patrick) - remove once mistral-common is bumped + sig = inspect.signature(self._audio_processor.pad) + if "is_online_streaming" in sig.parameters: + audio = self._audio_processor.pad( + audio, self.sampling_rate, is_online_streaming=False + ) + else: + audio = self._audio_processor.pad(audio, self.sampling_rate) audio_tokens = [self.begin_audio_token_id] + [ self.audio_token_id @@ -510,6 +515,7 @@ class VoxtralForConditionalGeneration( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: remapping_rules = [ + (r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"), (r"mm_whisper_embeddings\.(.*)", r"\1"), 
(r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"), ( @@ -535,13 +541,16 @@ class VoxtralForConditionalGeneration( def llm_weights_generator(): nonlocal loaded_weights for name, w in weights: - is_encoder = ( - name.startswith("mm_whisper_embeddings") - and not name.startswith("mm_whisper_embeddings.tok_embeddings") - and not name.startswith( - "mm_whisper_embeddings.audio_language_projection" + is_encoder = False + for k in [ + "mm_whisper_embeddings", + "mm_streams_embeddings.embedding_module", + ]: + is_encoder |= ( + name.startswith(k) + and not name.startswith(f"{k}.tok_embeddings") + and not name.startswith(f"{k}.audio_language_projection") ) - ) for pattern, repl in remapping_rules: if re.fullmatch(pattern, name): @@ -676,6 +685,7 @@ class VoxtralEncoderModel(nn.Module): packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} mistral_remapping = [ + (r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"), ( r"whisper_encoder\.conv_layers\.0\.(weight|bias)", r"whisper_encoder.conv1.\1", @@ -684,6 +694,14 @@ class VoxtralEncoderModel(nn.Module): r"whisper_encoder\.conv_layers\.1\.(weight|bias)", r"whisper_encoder.conv2.\1", ), + ( + r"whisper_encoder\.conv_layers\.0\.conv\.(weight|bias)", + r"whisper_encoder.conv1.\1", + ), # noqa: E501 + ( + r"whisper_encoder\.conv_layers\.1\.conv\.(weight|bias)", + r"whisper_encoder.conv2.\1", + ), # noqa: E501 ( r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)", # noqa: E501 r"whisper_encoder.layers.\1.self_attn.\2_proj.\3", diff --git a/vllm/model_executor/models/voxtral_streaming.py b/vllm/model_executor/models/voxtral_streaming.py new file mode 100644 index 0000000000000..2e79e24e6f194 --- /dev/null +++ b/vllm/model_executor/models/voxtral_streaming.py @@ -0,0 +1,243 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from collections.abc import Mapping + +import torch + +from 
vllm.config.vllm import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.models.interfaces import MultiModalEmbeddings +from vllm.model_executor.models.voxtral import ( + VoxtralDummyInputsBuilder, + VoxtralForConditionalGeneration, + VoxtralMultiModalProcessor, + VoxtralProcessingInfo, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import _I, BaseMultiModalProcessorCache +from vllm.multimodal.inputs import ( + MultiModalKwargsOptionalItems, +) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import ( + MultiModalPromptUpdates, + PlaceholderFeaturesInfo, +) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .utils import ( + _flatten_embeddings, +) + +logger = init_logger(__name__) + + +class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): + def __init__( + self, + info: _I, + dummy_inputs: BaseDummyInputsBuilder[_I], + *, + cache: BaseMultiModalProcessorCache | None = None, + ) -> None: + # streaming can't make use of a cache yet + super().__init__(info, dummy_inputs, cache=None) + + def _maybe_apply_prompt_updates( + self, + mm_items: MultiModalDataItems, + prompt_ids: list[int], + mm_kwargs: MultiModalKwargsOptionalItems, + mm_prompt_updates: MultiModalPromptUpdates, + is_update_applied: bool, + ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]: + # there are no placeholder audio tokens for streaming + # so we need to build the placeholder positions manually + + # in streaming there is always only one audio input + audios = mm_kwargs.get("audio", []) + assert len(audios) == 1, ( + f"Expected only one audio input for streaming, got {mm_kwargs=}" + ) + tokenizer = self.info.get_tokenizer() + audio_config = tokenizer.instruct.audio_encoder.audio_config + + num_audio_samples = audios[0]["audio_arrays"].data.shape[0] + length =
audio_config.num_audio_tokens(num_audio_samples) + + features_info = PlaceholderFeaturesInfo( + modality="audio", + item_idx=0, + start_idx=0, + tokens=length + * [0], # only used for length computation, so we can take dummy inputs + is_embed=None, + ) + return prompt_ids, {"audio": [features_info]} + + +class TimeEmbedding(torch.nn.Module): + """Sinusoidal Embedding for encoding time""" + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + inv_freq = torch.exp( + -math.log(self.theta) + * torch.arange(self.dim // 2).float() + / (self.dim // 2) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, t: torch.Tensor) -> torch.Tensor: + t = t[..., None] # (B,) -> (B, 1) or (B, T) -> (B, T, 1) + inv_freq = self.inv_freq.to(device=t.device, dtype=t.dtype) + emb = ( + t * inv_freq + ) # (B, 1) x (D/2,) -> (B, D/2) or (B, T, 1) x (D/2,) -> (B, T, D/2) + return torch.cat((emb.cos(), emb.sin()), dim=-1) # (B, D) or (B, T, D) + + +@MULTIMODAL_REGISTRY.register_processor( + VoxtralStreamingMultiModalProcessor, + info=VoxtralProcessingInfo, + dummy_inputs=VoxtralDummyInputsBuilder, +) +class VoxtralStreamingGeneration(VoxtralForConditionalGeneration): + requires_raw_input_tokens = True + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.time_embedding: TimeEmbedding = TimeEmbedding( + dim=self.config.text_config.hidden_size + ) + + audio_config = self.tokenizer.instruct.audio_encoder.audio_config + _n_delay_tokens = ( + audio_config.frame_rate * audio_config.transcription_delay_ms / 1000 + ) + assert _n_delay_tokens.is_integer(), ( + f"n_delay_tokens must be integer, got {_n_delay_tokens}" + ) + + self.n_delay_tokens = int(_n_delay_tokens) + + @property + def audio_config(self): + return self.tokenizer.instruct.audio_encoder.audio_config + + def embed_input_ids( + self, + input_ids: 
torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + # Multi-modal token ID may exceed vocab size + handle_oov_mm_token: bool = True, + ) -> torch.Tensor: + """Pass post-conv embeddings directly as input""" + # for streaming we simply flatten the multimodal embeddings + # to be in tensor format, we treat the input ids later + assert multimodal_embeddings is not None + assert len(multimodal_embeddings) > 0, ( + "For streaming you must provide a multimodal_embedding at every step." + ) + mm_embeds_flat = _flatten_embeddings(multimodal_embeddings) + return mm_embeds_flat + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ) -> torch.Tensor | IntermediateTensors: + assert inputs_embeds is not None + assert input_ids is not None + + pool_size = self.config.audio_config.block_pool_size + inputs_embeds = inputs_embeds.view( + inputs_embeds.shape[0] * pool_size, inputs_embeds.shape[1] // pool_size + ) + + audio_hidden_states = self.whisper_encoder.whisper_encoder.forward_layers( + inputs_embeds + ) + + num_tokens, audio_hidden_size = audio_hidden_states.shape + assert num_tokens % self.downsample_factor == 0 + audio_hidden_states = audio_hidden_states.reshape( + num_tokens // self.downsample_factor, + audio_hidden_size * self.downsample_factor, + ) + audio_text_embeds = self.audio_language_adapter(audio_hidden_states) + + text_embeds = self.language_model.embed_input_ids(input_ids) + + # sum pool text and audio embeddings + inputs_embeds = audio_text_embeds + text_embeds + + time_tensor = torch.tensor( + [self.n_delay_tokens], + device=inputs_embeds.device, + dtype=inputs_embeds.dtype, + ) + inputs_embeds = inputs_embeds + self.time_embedding(time_tensor) + + hidden_states = self.language_model.model( + input_ids, positions, intermediate_tensors, 
inputs_embeds=inputs_embeds + ) + + return hidden_states + + def embed_multimodal( + self, **kwargs + ) -> list[torch.Tensor] | torch.Tensor | tuple[torch.Tensor, ...] | None: + """Transform audio waveforms -> initial whisper post-conv embeddings""" + audio_inputs = self._parse_and_validate_audio_arrays(**kwargs) + + assert audio_inputs is not None, ( + "For streaming you must provide an audio input at every step." + ) + + multiple_of = self.audio_config.raw_audio_length_per_tok + assert all( + (this_audio := audio.shape[0]) % multiple_of == 0 for audio in audio_inputs + ), ( + f"Every input audio waveform has to be a multiple of {multiple_of}, but" + f" one is {this_audio} with {(this_audio / multiple_of)=}." + ) + + mel_features = [ + self.whisper_encoder.compute_whisper_melspec(audio).to( + self.whisper_encoder.dtype + ) + for audio in audio_inputs + ] + seq_lens = [mel.shape[1] for mel in mel_features] + # [total_num_20ms_frames, hidden_size] + audio_embeddings = self.whisper_encoder.whisper_encoder.forward_conv( + mel_features + )[0] + conv_stride = self.whisper_encoder.whisper_encoder.total_stride + audio_embeddings_per_sample = audio_embeddings.split( + [s // conv_stride for s in seq_lens], dim=0 + ) + + # audio_embeddings per sample need to be divisible by 4 + pool_size = self.config.audio_config.block_pool_size + assert all( + (this_shape := sample.shape[0]) % pool_size == 0 + for sample in audio_embeddings_per_sample + ), f"Every audio embedding has to be a multiple of 4, but one is {this_shape}." 
+ + audio_embeddings_per_sample = [ + e.view(e.shape[0] // pool_size, e.shape[1] * pool_size) + for e in audio_embeddings_per_sample + ] + return audio_embeddings_per_sample diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index f5a1e75d99617..f1bae28debad2 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -1,9 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import enum import math from collections.abc import Iterable, Mapping, Sequence from contextlib import nullcontext +from functools import partial from typing import Annotated, Literal, cast import numpy as np @@ -16,7 +18,10 @@ from transformers import ( ) from transformers.models.whisper.modeling_whisper import sinusoids -from vllm.attention.layer import Attention, AttentionType +from vllm.attention.backends.abstract import ( + AttentionType, +) +from vllm.attention.layer import Attention from vllm.attention.layers.cross_attention import CrossAttention from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig @@ -34,6 +39,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.whisper_utils import ( + ISO639_1_SUPPORTED_LANGS, + WhisperAttentionWithBlockPooling, + WhisperCausalConv1d, +) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, @@ -64,67 +74,11 @@ from .utils import ( logger = init_logger(__name__) -# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages -ISO639_1_SUPPORTED_LANGS = { - "af": 
"Afrikaans", - "ar": "Arabic", - "hy": "Armenian", - "az": "Azerbaijani", - "be": "Belarusian", - "bs": "Bosnian", - "bg": "Bulgarian", - "ca": "Catalan", - "zh": "Chinese", - "hr": "Croatian", - "cs": "Czech", - "da": "Danish", - "nl": "Dutch", - "en": "English", - "et": "Estonian", - "fi": "Finnish", - "fr": "French", - "gl": "Galician", - "de": "German", - "el": "Greek", - "he": "Hebrew", - "hi": "Hindi", - "hu": "Hungarian", - "is": "Icelandic", - "id": "Indonesian", - "it": "Italian", - "ja": "Japanese", - "kn": "Kannada", - "kk": "Kazakh", - "ko": "Korean", - "lv": "Latvian", - "lt": "Lithuanian", - "mk": "Macedonian", - "ms": "Malay", - "mr": "Marathi", - "mi": "Maori", - "ne": "Nepali", - "no": "Norwegian", - "fa": "Persian", - "pl": "Polish", - "pt": "Portuguese", - "ro": "Romanian", - "ru": "Russian", - "sr": "Serbian", - "sk": "Slovak", - "sl": "Slovenian", - "es": "Spanish", - "sw": "Swahili", - "sv": "Swedish", - "tl": "Tagalog", - "ta": "Tamil", - "th": "Thai", - "tr": "Turkish", - "uk": "Ukrainian", - "ur": "Urdu", - "vi": "Vietnamese", - "cy": "Welsh", -} +class WhisperPosEmbedType(enum.Enum): + SINUSOIDAL = "sinusoidal" + NOPE = "nope" + LEARNED = "learned" class WhisperAudioInputs(TensorSchema): @@ -184,6 +138,8 @@ class WhisperAttention(nn.Module): num_heads: int, bias: bool = True, attn_type: AttentionType = AttentionType.DECODER, + per_layer_sliding_window: int | None = None, + block_pool_size: int = 1, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -242,7 +198,14 @@ class WhisperAttention(nn.Module): attn_type=self.attn_type, ) else: # AttentionType.DECODER (regular decoder self-attention) - self.attn = Attention( + if block_pool_size > 1: + attn_cls = partial( + WhisperAttentionWithBlockPooling, block_pool_size=block_pool_size + ) + else: + attn_cls = Attention + + self.attn = attn_cls( self.num_heads, self.head_dim, self.scaling, @@ -251,6 +214,7 @@ class 
WhisperAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.attn", attn_type=self.attn_type, + per_layer_sliding_window=per_layer_sliding_window, ) def _init_qkv( @@ -386,6 +350,9 @@ class WhisperEncoderLayer(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config + is_causal = getattr(config, "is_causal", False) + sliding_window = getattr(config, "sliding_window", None) + block_pool_size = getattr(config, "block_pool_size", 1) cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config @@ -393,7 +360,9 @@ class WhisperEncoderLayer(nn.Module): self.self_attn = WhisperAttention( embed_dim=self.embed_dim, num_heads=config.encoder_attention_heads, - attn_type=AttentionType.ENCODER, + attn_type=AttentionType.DECODER if is_causal else AttentionType.ENCODER, + block_pool_size=block_pool_size, + per_layer_sliding_window=sliding_window, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", @@ -492,12 +461,21 @@ class WhisperEncoder(nn.Module): super().__init__() config = vllm_config.model_config.hf_config embed_dim = config.d_model + + self.pos_embed_type = WhisperPosEmbedType( + getattr(config, "pos_embed", "sinusoidal") + ) self.num_mel_bins = config.num_mel_bins self.max_source_positions = config.max_source_positions self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1) - self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1) + is_causal = getattr(config, "is_causal", False) + Conv1d = WhisperCausalConv1d if is_causal else partial(nn.Conv1d, padding=1) + + self.conv1 = Conv1d(self.num_mel_bins, embed_dim, kernel_size=3) + self.conv2 = Conv1d(embed_dim, embed_dim, stride=2, kernel_size=3) + + self.total_stride = self.conv1.stride[0] * self.conv2.stride[0] self.start_layer, self.end_layer, 
self.layers = make_layers( config.encoder_layers, lambda prefix: WhisperEncoderLayer( @@ -507,29 +485,54 @@ class WhisperEncoder(nn.Module): ) self.layer_norm = nn.LayerNorm(config.d_model) - maybe_fp32_init_ctx = ( - set_default_torch_dtype(torch.float32) if init_in_fp32 else nullcontext() - ) - - with ( - torch.no_grad(), - maybe_fp32_init_ctx, + if is_causal and self.pos_embed_type != WhisperPosEmbedType.NOPE: + raise ValueError( + "Only NOPE position embeddings are supported " + f"for causal models, but got {self.pos_embed_type}" + ) + elif self.pos_embed_type in ( + WhisperPosEmbedType.SINUSOIDAL, + WhisperPosEmbedType.LEARNED, ): - self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim) - self.embed_positions.weight.copy_( - sinusoids(*self.embed_positions.weight.shape) + maybe_fp32_init_ctx = ( + set_default_torch_dtype(torch.float32) + if init_in_fp32 + else nullcontext() ) - def forward(self, input_features: torch.Tensor | list[torch.Tensor]): + with ( + torch.no_grad(), + maybe_fp32_init_ctx, + ): + self.embed_positions = nn.Embedding( + self.max_source_positions, embed_dim + ) + self.embed_positions.weight.copy_( + sinusoids(*self.embed_positions.weight.shape) + ) + + def forward_conv( + self, input_features: torch.Tensor | list[torch.Tensor] + ) -> torch.Tensor: hidden_states = [] input_is_batched = False for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) embeds = nn.functional.gelu(self.conv2(embeds)) - embeds = embeds.transpose(-1, -2) - embeds = (embeds + self.embed_positions.weight[: embeds.size(-2), :]).to( - embeds.dtype - ) + + if self.pos_embed_type in ( + WhisperPosEmbedType.SINUSOIDAL, + WhisperPosEmbedType.LEARNED, + ): + embeds = embeds.transpose(-1, -2) + embeds = ( + embeds + self.embed_positions.weight[: embeds.size(-2), :] + ).to(embeds.dtype) + elif self.pos_embed_type == WhisperPosEmbedType.NOPE: + embeds = embeds.transpose(-1, -2).to(embeds.dtype) + else: + raise ValueError(f"Unknown 
pos_embed_type: {self.pos_embed_type}") + hidden_states.append(embeds) input_is_batched = embeds.ndim > 2 # Input to MHA must be B x T x D @@ -539,12 +542,19 @@ class WhisperEncoder(nn.Module): else: hidden_states = torch.stack(hidden_states, dim=0) + return hidden_states + + def forward_layers(self, hidden_states: torch.Tensor) -> torch.Tensor: for encoder_layer in self.layers: hidden_states = encoder_layer(hidden_states) hidden_states = self.layer_norm(hidden_states) return hidden_states + def forward(self, input_features: torch.Tensor | list[torch.Tensor]): + hidden_states = self.forward_conv(input_features) + return self.forward_layers(hidden_states) + class WhisperDecoder(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/whisper_utils.py b/vllm/model_executor/models/whisper_utils.py new file mode 100644 index 0000000000000..077b4aff6fec9 --- /dev/null +++ b/vllm/model_executor/models/whisper_utils.py @@ -0,0 +1,299 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy +import functools +import math +from dataclasses import replace + +import torch +import torch.nn.functional as F +from torch import nn + +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionMetadata, + AttentionType, +) +from vllm.attention.layer import Attention +from vllm.attention.selector import get_attn_backend +from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend +from vllm.v1.attention.backends.utils import ( + CommonAttentionMetadata, + subclass_attention_backend_with_overrides, +) +from vllm.v1.kv_cache_interface import AttentionSpec + +# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages +ISO639_1_SUPPORTED_LANGS = { + "af": "Afrikaans", + "ar": "Arabic", + 
"hy": "Armenian", + "az": "Azerbaijani", + "be": "Belarusian", + "bs": "Bosnian", + "bg": "Bulgarian", + "ca": "Catalan", + "zh": "Chinese", + "hr": "Croatian", + "cs": "Czech", + "da": "Danish", + "nl": "Dutch", + "en": "English", + "et": "Estonian", + "fi": "Finnish", + "fr": "French", + "gl": "Galician", + "de": "German", + "el": "Greek", + "he": "Hebrew", + "hi": "Hindi", + "hu": "Hungarian", + "is": "Icelandic", + "id": "Indonesian", + "it": "Italian", + "ja": "Japanese", + "kn": "Kannada", + "kk": "Kazakh", + "ko": "Korean", + "lv": "Latvian", + "lt": "Lithuanian", + "mk": "Macedonian", + "ms": "Malay", + "mr": "Marathi", + "mi": "Maori", + "ne": "Nepali", + "no": "Norwegian", + "fa": "Persian", + "pl": "Polish", + "pt": "Portuguese", + "ro": "Romanian", + "ru": "Russian", + "sr": "Serbian", + "sk": "Slovak", + "sl": "Slovenian", + "es": "Spanish", + "sw": "Swahili", + "sv": "Swedish", + "tl": "Tagalog", + "ta": "Tamil", + "th": "Thai", + "tr": "Turkish", + "uk": "Ukrainian", + "ur": "Urdu", + "vi": "Vietnamese", + "cy": "Welsh", +} + + +def _pad1d( + x: torch.Tensor, + paddings: tuple[int, int], + mode: str = "constant", + value: float = 0.0, +) -> torch.Tensor: + """Tiny wrapper around F.pad, just to allow for + reflect padding on small input. + If this is the case, we insert extra 0 padding + to the right before the reflection happen. 
+ """ + length = x.shape[-1] + padding_left, padding_right = paddings + assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) + if mode == "reflect": + max_pad = max(padding_left, padding_right) + extra_pad = 0 + if length <= max_pad: + extra_pad = max_pad - length + 1 + x = F.pad(x, (0, extra_pad)) + padded = F.pad(x, paddings, mode, value) + end = padded.shape[-1] - extra_pad + return padded[..., :end] + else: + return F.pad(x, paddings, mode, value) + + +class WhisperCausalConv1d(nn.Conv1d): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + bias: bool = True, + ) -> None: + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=bias, + ) + self._stride = self.stride[0] + self._effective_kernel_size = (kernel_size - 1) * self.dilation[0] + 1 + self._padding_total = self._effective_kernel_size - self._stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + n_frames = ( + x.shape[-1] - self._effective_kernel_size + self._padding_total + ) / self._stride + 1 + target_length = (math.ceil(n_frames) - 1) * self._stride + ( + self._effective_kernel_size - self._padding_total + ) + extra_padding = target_length - x.shape[-1] + x = _pad1d(x, (self._padding_total, extra_padding), mode="constant") + return super().forward(x) + + +@functools.lru_cache +def create_whisper_attention_backend_with_block_pooling( + underlying_attn_backend: AttentionBackend, block_pool_size: int +) -> type[AttentionBackend]: + prefix = "WhisperAttentionWithBlockPooling_" + underlying_builder = underlying_attn_backend.get_builder_cls() + + class WhisperAttentionWithBlockPoolingBuilder(underlying_builder): # type: ignore + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + assert kv_cache_spec.num_kv_heads % block_pool_size == 0 + kv_cache_spec = replace( 
+ kv_cache_spec, + block_size=kv_cache_spec.block_size * block_pool_size, + num_kv_heads=kv_cache_spec.num_kv_heads // block_pool_size, + ) + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> AttentionMetadata: + new_common_attn_metadata = copy.deepcopy(common_attn_metadata) + new_common_attn_metadata.query_start_loc *= block_pool_size + new_common_attn_metadata.query_start_loc_cpu *= block_pool_size + new_common_attn_metadata.seq_lens *= block_pool_size + new_common_attn_metadata._seq_lens_cpu *= block_pool_size + new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size + new_common_attn_metadata.num_actual_tokens *= block_pool_size + new_common_attn_metadata.max_query_len *= block_pool_size + new_common_attn_metadata.max_seq_len *= block_pool_size + original_slot_mapping = common_attn_metadata.slot_mapping + common_prefix_len *= block_pool_size + new_common_attn_metadata.slot_mapping = ( + ( + original_slot_mapping.unsqueeze(1) * block_pool_size + + torch.arange(block_pool_size, device=original_slot_mapping.device) + ) + .flatten() + .clamp(min=-1) + ) + return super().build( + common_prefix_len, new_common_attn_metadata, fast_build + ) + + if not issubclass(underlying_attn_backend, FlashAttentionBackend): + raise NotImplementedError( + f"{underlying_attn_backend} is not yet supported." + "Contributions to support more backends are much " + "appreciated." 
+ ) + + attn_backend = subclass_attention_backend_with_overrides( + name_prefix=prefix, + attention_backend_cls=underlying_attn_backend, + overrides={ + "get_builder_cls": lambda: WhisperAttentionWithBlockPoolingBuilder, + "get_kv_cache_shape": lambda num_blocks, + block_size, + num_kv_heads, + head_size, + cache_dtype_str: ( + 2, + num_blocks, + # we stretch each block by `block_pool_size` + block_size * block_pool_size, + num_kv_heads // block_pool_size, + head_size, + ), # TODO: generalize to other backends + }, + ) + + return attn_backend + + +class WhisperAttentionWithBlockPooling(Attention): + """Attention layer with block pooling.""" + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int | None = None, + alibi_slopes: list[float] | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + logits_soft_cap: float | None = None, + per_layer_sliding_window: int | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: str | None = None, + block_pool_size: int = 1, + attn_backend: type[AttentionBackend] | None = None, + **extra_impl_args, + ) -> None: + self.block_pool_size = block_pool_size + dtype = torch.get_default_dtype() + + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + else: + kv_cache_dtype = "auto" + block_size = 16 + + underlying_attn_backend = get_attn_backend( + head_size, + dtype, + kv_cache_dtype, + block_size, + attn_type=attn_type, + ) + attn_backend = create_whisper_attention_backend_with_block_pooling( + underlying_attn_backend, block_pool_size + ) + + super().__init__( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=alibi_slopes, + cache_config=cache_config, + quant_config=quant_config, + logits_soft_cap=logits_soft_cap, + per_layer_sliding_window=per_layer_sliding_window, + 
prefix=prefix, + attn_type=attn_type, + kv_sharing_target_layer_name=kv_sharing_target_layer_name, + attn_backend=attn_backend, + **extra_impl_args, + ) + + def get_kv_cache_spec(self, vllm_config: VllmConfig): + kv_cache_spec = super().get_kv_cache_spec(vllm_config) + assert isinstance(kv_cache_spec, AttentionSpec) + kv_cache_spec = replace( + kv_cache_spec, + num_kv_heads=self.block_pool_size * kv_cache_spec.num_kv_heads, + ) + return kv_cache_spec diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 51b8f77f29088..57e7be6344cd1 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -111,11 +111,16 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: return librosa.load(filepath, sr=None) - def encode_base64(self, media: tuple[npt.NDArray, int]) -> str: + def encode_base64( + self, + media: tuple[npt.NDArray, int], + *, + audio_format: str = "WAV", + ) -> str: audio, sr = media with BytesIO() as buffer: - soundfile.write(buffer, audio, sr, format="WAV") + soundfile.write(buffer, audio, sr, format=audio_format) data = buffer.getvalue() return base64.b64encode(data).decode("utf-8") diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 1506ecb8c7aa0..8e1178bc7ea44 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -8,8 +8,12 @@ import pybase64 import torch from PIL import Image +from vllm.logger import init_logger + from .base import MediaIO, MediaWithBytes +logger = init_logger(__file__) + def rescale_image_size( image: Image.Image, size_factor: float, transpose: int = -1 @@ -104,8 +108,17 @@ class ImageMediaIO(MediaIO[Image.Image]): self, media: Image.Image, *, - image_format: str = "JPEG", + image_format: str | None = None, ) -> str: + if image_format is None: + logger.warning_once( + "The default format of `ImageMediaIO.encode_base64` will be changed " + 'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. 
' + "To continue using the old default, " + 'pass `format="JPEG"` explicitly to silence this warning.' + ) + image_format = "JPEG" + image = media with BytesIO() as buffer: diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 7fd05af583b0a..b2b0d1734727c 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,6 +3,7 @@ import asyncio import atexit +import mimetypes from collections.abc import Generator, Set from concurrent.futures import ThreadPoolExecutor from itertools import groupby @@ -357,17 +358,31 @@ class MediaConnector: def encode_audio_base64( audio: np.ndarray, sampling_rate: int, + *, + format: str = "WAV", ) -> str: """Encode audio as base64.""" audio_io = AudioMediaIO() - return audio_io.encode_base64((audio, sampling_rate)) + return audio_io.encode_base64((audio, sampling_rate), audio_format=format) + + +def encode_audio_url( + audio: np.ndarray, + sampling_rate: int, + *, + format: str = "WAV", +) -> str: + """Encode audio as a data URL.""" + audio_b64 = encode_audio_base64(audio, sampling_rate, format=format) + mimetype = mimetypes.types_map.get("." + format.lower(), "audio") + return f"data:{mimetype};base64,{audio_b64}" def encode_image_base64( image: Image.Image, *, image_mode: str = "RGB", - format: str = "JPEG", + format: str | None = None, ) -> str: """ Encode a pillow image to base64 format. @@ -378,10 +393,45 @@ def encode_image_base64( return image_io.encode_base64(image, image_format=format) -def encode_video_base64(frames: npt.NDArray) -> str: +def encode_image_url( + image: Image.Image, + *, + image_mode: str = "RGB", + format: str = "PNG", +) -> str: + """ + Encode a pillow image as a data URL. + + By default, the image is converted into RGB format before being encoded. + """ + image_b64 = encode_image_base64(image, image_mode=image_mode, format=format) + mimetype = mimetypes.types_map.get("." 
+ format.lower(), "image") + return f"data:{mimetype};base64,{image_b64}" + + +def encode_video_base64( + frames: npt.NDArray, + *, + format: str = "JPEG", +) -> str: image_io = ImageMediaIO() video_io = VideoMediaIO(image_io) - return video_io.encode_base64(frames) + return video_io.encode_base64(frames, video_format=format) + + +def encode_video_url( + frames: npt.NDArray, + *, + format: str = "JPEG", +) -> str: + video_b64 = encode_video_base64(frames, format=format) + + if format.lower() == "jpeg": + mimetype = "video/jpeg" + else: + mimetype = mimetypes.types_map.get("." + format.lower(), "video") + + return f"data:{mimetype};base64,{video_b64}" def argsort_mm_positions( diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 5892639eba406..b95287906c1fe 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -408,7 +408,7 @@ class RocmPlatform(Platform): parallel_config = vllm_config.parallel_config is_eager_execution = compilation_config == CUDAGraphMode.NONE use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() - use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled() + use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enabled() if compilation_config.cudagraph_mode.has_full_cudagraphs(): # decode context parallel does not support full cudagraphs diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 2d67551eed9f6..2e39a216a10a0 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -156,7 +156,9 @@ class XPUPlatform(Platform): if vllm_config.lora_config is not None: compilation_config.mode = CompilationMode.NONE - + # decrease triton kernel compilation scratch space for speculative decoding + if vllm_config.speculative_config is not None: + os.environ["IGC_ForceOCLSIMDWidth"] = "16" # noqa: SIM112 # check and update parallel config parallel_config = vllm_config.parallel_config # Only override worker_cls if it's still the default "auto" diff --git a/vllm/reasoning/mistral_reasoning_parser.py 
b/vllm/reasoning/mistral_reasoning_parser.py index de3d1296ec734..48a36b4c6634c 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -104,7 +104,7 @@ class MistralReasoningParser(BaseThinkingReasoningParser): # 3. Both BOT and EOT have been outputted. elif has_bot_token and has_eot_token: return input_ids[:bot_token_index] + input_ids[eot_token_index + 1 :] - # 4. Only EOT has been outputted => this should not have occured for a model + # 4. Only EOT has been outputted => this should not have occurred for a model # well prompted and trained. else: return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :] diff --git a/vllm/tool_parsers/__init__.py b/vllm/tool_parsers/__init__.py index 181d8bcba9553..ee92727e1c9a4 100644 --- a/vllm/tool_parsers/__init__.py +++ b/vllm/tool_parsers/__init__.py @@ -42,6 +42,10 @@ _TOOL_PARSERS_TO_REGISTER = { "glm4_moe_tool_parser", "Glm4MoeModelToolParser", ), + "glm47": ( + "glm47_moe_tool_parser", + "Glm47MoeModelToolParser", + ), "granite-20b-fc": ( "granite_20b_fc_tool_parser", "Granite20bFCToolParser", diff --git a/vllm/tool_parsers/glm47_moe_tool_parser.py b/vllm/tool_parsers/glm47_moe_tool_parser.py new file mode 100644 index 0000000000000..ae42a640d9413 --- /dev/null +++ b/vllm/tool_parsers/glm47_moe_tool_parser.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import regex as re + +from vllm.logger import init_logger +from vllm.tokenizers import TokenizerLike +from vllm.tool_parsers.glm4_moe_tool_parser import Glm4MoeModelToolParser + +logger = init_logger(__name__) + + +class Glm47MoeModelToolParser(Glm4MoeModelToolParser): + def __init__(self, tokenizer: TokenizerLike): + super().__init__(tokenizer) + self.func_detail_regex = re.compile( + r"(.*?)(.*?)?", re.DOTALL + ) + self.func_arg_regex = re.compile( + r"(.*?)(?:\\n|\s)*(.*?)", + re.DOTALL, + ) diff --git 
a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py index a1ab75f548bfc..67bd0e61620da 100644 --- a/vllm/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/tool_parsers/minimax_m2_tool_parser.py @@ -138,37 +138,167 @@ class MinimaxM2ToolParser(ToolParser): return name_str def _convert_param_value(self, value: str, param_type: str) -> Any: - """Convert parameter value to the correct type.""" + """Convert parameter value to the correct type (legacy single-type version).""" + return self._convert_param_value_with_types(value, [param_type]) + + def _extract_types_from_schema(self, schema: Any) -> list[str]: + """ + Extract all possible types from a JSON schema definition. + Handles anyOf, oneOf, allOf, type arrays, and enum fields. + + Args: + schema: The JSON schema definition for a parameter + + Returns: + List of type strings (e.g., ["string", "integer", "null"]) + """ + if schema is None: + return ["string"] + + if not isinstance(schema, dict): + return ["string"] + + types: set[str] = set() + + # Handle direct "type" field + if "type" in schema: + type_value = schema["type"] + if isinstance(type_value, str): + types.add(type_value) + elif isinstance(type_value, list): + for t in type_value: + if isinstance(t, str): + types.add(t) + + # Handle enum - infer types from enum values + if "enum" in schema and isinstance(schema["enum"], list) and schema["enum"]: + for value in schema["enum"]: + if value is None: + types.add("null") + elif isinstance(value, bool): + types.add("boolean") + elif isinstance(value, int): + types.add("integer") + elif isinstance(value, float): + types.add("number") + elif isinstance(value, str): + types.add("string") + elif isinstance(value, list): + types.add("array") + elif isinstance(value, dict): + types.add("object") + + # Handle anyOf, oneOf, allOf - recursively extract types + for choice_field in ("anyOf", "oneOf", "allOf"): + if choice_field in schema and isinstance(schema[choice_field], list): + 
for choice in schema[choice_field]: + extracted = self._extract_types_from_schema(choice) + types.update(extracted) + + # If no types found, default to string + if not types: + return ["string"] + + return list(types) + + def _convert_param_value_with_types( + self, value: str, param_types: list[str] + ) -> Any: + """ + Convert parameter value to the correct type based on a list of possible types. + Tries each type in order until one succeeds. + + Args: + value: The string value to convert + param_types: List of possible type strings + + Returns: + The converted value + """ if value.lower() == "null": return None - param_type = param_type.lower() - if param_type in ["string", "str", "text"]: + # Normalize types + normalized_types = [t.lower() for t in param_types] + + # Try null first if it's in the list + if "null" in normalized_types or value.lower() in ("null", "none", "nil"): + return None + + # Try each type in order of preference (most specific first, string as fallback) + # Priority: integer > number > boolean > object > array > string + type_priority = [ + "integer", + "int", + "number", + "float", + "boolean", + "bool", + "object", + "array", + "string", + "str", + "text", + ] + + for param_type in type_priority: + if param_type not in normalized_types: + continue + + if param_type in ["string", "str", "text"]: + return value + elif param_type in ["integer", "int"]: + try: + return int(value) + except (ValueError, TypeError): + continue + elif param_type in ["number", "float"]: + try: + val = float(value) + return val if val != int(val) else int(val) + except (ValueError, TypeError): + continue + elif param_type in ["boolean", "bool"]: + lower_val = value.lower().strip() + if lower_val in ["true", "1", "yes", "on"]: + return True + elif lower_val in ["false", "0", "no", "off"]: + return False + continue + elif param_type in ["object", "array"]: + try: + return json.loads(value) + except json.JSONDecodeError: + continue + + # Fallback: try JSON parse, then 
return as string + try: + return json.loads(value) + except json.JSONDecodeError: return value - elif param_type in ["integer", "int"]: - try: - return int(value) - except (ValueError, TypeError): - return value - elif param_type in ["number", "float"]: - try: - val = float(value) - return val if val != int(val) else int(val) - except (ValueError, TypeError): - return value - elif param_type in ["boolean", "bool"]: - return value.lower() in ["true", "1"] - elif param_type in ["object", "array"]: - try: - return json.loads(value) - except json.JSONDecodeError: - return value - else: - # Try JSON parse first, fallback to string - try: - return json.loads(value) - except json.JSONDecodeError: - return value + + def _get_param_types_from_config( + self, param_name: str, param_config: dict + ) -> list[str]: + """ + Get parameter types from parameter configuration. + Handles anyOf, oneOf, allOf, and direct type definitions. + + Args: + param_name: The name of the parameter + param_config: The properties dict from the tool schema + + Returns: + List of type strings + """ + if param_name not in param_config: + return ["string"] + + param_schema = param_config[param_name] + if not isinstance(param_schema, dict): + return ["string"] + + return self._extract_types_from_schema(param_schema) def _parse_single_invoke( self, invoke_str: str, tools: list | None @@ -207,17 +337,11 @@ class MinimaxM2ToolParser(ToolParser): if param_value.endswith("\n"): param_value = param_value[:-1] - # Get parameter type - param_type = "string" - if ( - param_name in param_config - and isinstance(param_config[param_name], dict) - and "type" in param_config[param_name] - ): - param_type = param_config[param_name]["type"] + # Get parameter types (supports anyOf/oneOf/allOf) + param_type = self._get_param_types_from_config(param_name, param_config) # Convert value - param_dict[param_name] = self._convert_param_value( + param_dict[param_name] = self._convert_param_value_with_types( param_value, 
param_type ) @@ -593,7 +717,7 @@ class MinimaxM2ToolParser(ToolParser): # Store raw value for later processing self.accumulated_params[self.current_param_name] = param_value - # Get parameter configuration for type conversion + # Get parameter configuration with anyOf support param_config = {} if self.streaming_request and self.streaming_request.tools: for tool in self.streaming_request.tools: @@ -610,17 +734,12 @@ class MinimaxM2ToolParser(ToolParser): param_config = params["properties"] break - # Get parameter type - param_type = "string" - if ( - self.current_param_name in param_config - and isinstance(param_config[self.current_param_name], dict) - and "type" in param_config[self.current_param_name] - ): - param_type = param_config[self.current_param_name]["type"] + # Get parameter types (supports anyOf/oneOf/allOf) + param_type = self._get_param_types_from_config( + self.current_param_name, param_config + ) - # Convert param value to appropriate type - converted_value = self._convert_param_value( + converted_value = self._convert_param_value_with_types( param_value, param_type ) diff --git a/vllm/tool_parsers/mistral_tool_parser.py b/vllm/tool_parsers/mistral_tool_parser.py index 49a175f69f434..35b853b0ad7e1 100644 --- a/vllm/tool_parsers/mistral_tool_parser.py +++ b/vllm/tool_parsers/mistral_tool_parser.py @@ -131,78 +131,105 @@ class MistralToolParser(ToolParser): request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: """ - Extract the tool calls from a complete model response. Requires - find-and-replacing single quotes with double quotes for JSON parsing, - make sure your tool call arguments don't ever include quotes! + Extract the tool calls from a complete model response. + + Content and tool calls formatting depends on the Mistral's tokenizer version + used to train the model: + + - < v11: `content[BOT] [{tool_call1},{tool_call2}]` + - >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}` + + with [BOT] the tool call token. 
+ + Note: + For tokenizer versions >= v11, tool calls with arguments wrongly formatted + are still returned as tool calls. This is to allow the model to know it + tried to make a tool call. It reduces chance of another failure and + prevents that the context is filled with tool calls wrongly placed in + assistant message contents. """ - # case -- if a tool call token is not present, return a text response + # If the tool call token is not present, return a text response if self.bot_token not in model_output: return ExtractedToolCallInformation( tools_called=False, tool_calls=[], content=model_output ) - # first remove the BOT token - tool_content = model_output.replace(self.bot_token, "").strip() + content_and_raw_tool_calls = model_output.split(self.bot_token) + content = content_and_raw_tool_calls[0] + raw_tool_calls = content_and_raw_tool_calls[1:] - try: + # >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2} + if not self._is_pre_v11: + tool_calls = [] + for raw_tool_call in raw_tool_calls: + if "{" not in raw_tool_call: + continue + + end_name = raw_tool_call.find("{") + tool_name, args = ( + raw_tool_call[:end_name], + raw_tool_call[end_name:], + ) + + tool_calls.append({"name": tool_name, "arguments": args}) + + # < v11: content[BOT] [{tool_call1},{tool_call2}] + else: + if len(raw_tool_calls) != 1: + raise ValueError( + "Only one BOT token should have been outputted, " + f"but got {model_output}." 
+ ) + stringified_tool_calls = raw_tool_calls[0].strip() try: - if not self._is_pre_v11: - function_call_arr = [] - for single_tool_content in model_output.split(self.bot_token): - if "{" not in single_tool_content: - continue - - end_name = single_tool_content.find("{") - fn_name, args = ( - single_tool_content[:end_name], - single_tool_content[end_name:], - ) - - # fn_name is encoded outside serialized json dump - # only arguments are serialized - function_call_arr.append( - {"name": fn_name, "arguments": json.loads(args)} - ) - else: - function_call_arr = json.loads(tool_content) + tool_calls = json.loads(stringified_tool_calls) except json.JSONDecodeError: # use a regex to find the part corresponding to the tool call. # NOTE: This use case should not happen if the model is trained # correctly. It's an easy possible fix so it's included, but # can be brittle for very complex / highly nested tool calls - raw_tool_call = self.tool_call_regex.findall(tool_content)[0] - function_call_arr = json.loads(raw_tool_call) - - # Tool Call - tool_calls: list[MistralToolCall] = [ - MistralToolCall( - type="function", - function=FunctionCall( - name=raw_function_call["name"], - # function call args are JSON but as a string - arguments=json.dumps( - raw_function_call["arguments"], ensure_ascii=False + try: + raw_tool_call = self.tool_call_regex.findall( + stringified_tool_calls + )[0] + tool_calls = json.loads(raw_tool_call) + except (IndexError, json.JSONDecodeError): + logger.exception("Error in extracting tool call from response: {e}") + # If raw decoding and decoding post regex rule fails, then just + # return content. 
+ return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=stringified_tool_calls, + ) + else: + tool_calls = [ + { + "name": tool_call["name"], + "arguments": json.dumps( + tool_call["arguments"], ensure_ascii=False ), - ), - ) - for raw_function_call in function_call_arr - ] + } + for tool_call in tool_calls + ] - # get any content before the tool call - content = model_output.split(self.bot_token)[0] - return ExtractedToolCallInformation( - tools_called=True, - tool_calls=tool_calls, - content=content if len(content) > 0 else None, + mistral_tool_calls: list[MistralToolCall] = [ + MistralToolCall( + type="function", + function=FunctionCall( + name=tool_call["name"], + arguments=tool_call["arguments"], + ), ) + for tool_call in tool_calls + ] - except Exception: - logger.exception("Error in extracting tool call from response.") - # return information to just treat the tool call as regular JSON - return ExtractedToolCallInformation( - tools_called=False, tool_calls=[], content=tool_content - ) + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=mistral_tool_calls, + content=content if len(content) > 0 else None, + ) def extract_tool_calls_streaming( self, diff --git a/vllm/tool_parsers/openai_tool_parser.py b/vllm/tool_parsers/openai_tool_parser.py index db92ea8982d70..da1a9c773f78f 100644 --- a/vllm/tool_parsers/openai_tool_parser.py +++ b/vllm/tool_parsers/openai_tool_parser.py @@ -79,6 +79,15 @@ class OpenAIToolParser(ToolParser): elif msg.channel == "commentary" and not msg.recipient: commentary_content = msg_text + # Extract partial content from the parser state if the generation was truncated + if parser.current_content: + if parser.current_channel == "final": + final_content = parser.current_content + elif ( + parser.current_channel == "commentary" and not parser.current_recipient + ): + commentary_content = parser.current_content + return ExtractedToolCallInformation( tools_called=len(tool_calls) > 0, 
tool_calls=tool_calls, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 887f936a2d8ae..ecb9849bb3b5e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -15,7 +15,6 @@ from huggingface_hub import ( ) from packaging.version import Version from transformers import GenerationConfig, PretrainedConfig -from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -44,6 +43,16 @@ from .repo_utils import ( with_retry, ) +try: + # Transformers v5 + from transformers.configuration_utils import ALLOWED_ATTENTION_LAYER_TYPES +except ImportError: + # Transformers v4 + from transformers.configuration_utils import ( + ALLOWED_LAYER_TYPES as ALLOWED_ATTENTION_LAYER_TYPES, + ) + + if envs.VLLM_USE_MODELSCOPE: from modelscope import AutoConfig else: @@ -104,6 +113,14 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = { } +def is_rope_parameters_nested(rope_parameters: dict[str, Any]) -> bool: + """Check if rope_parameters is nested by layer types.""" + # Cannot be nested if rope_parameters is empty + if not rope_parameters: + return False + return set(rope_parameters.keys()).issubset(ALLOWED_ATTENTION_LAYER_TYPES) + + class HFConfigParser(ConfigParserBase): def parse( self, @@ -313,19 +330,25 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: rope_theta = getattr_iter(config, names, None, warn=True) names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"] partial_rotary_factor = getattr_iter(config, names, None, warn=True) + ompe = getattr(config, "original_max_position_embeddings", None) if Version(version("transformers")) < Version("5.0.0.dev0"): # Transformers v4 installed, legacy config fields may be present if (rope_scaling := getattr(config, "rope_scaling", None)) is not None: 
config.rope_parameters = rope_scaling if ( - rope_theta is not None or partial_rotary_factor is not None + rope_theta is not None + or partial_rotary_factor is not None + or ompe is not None ) and not getattr(config, "rope_parameters", None): config.rope_parameters = {"rope_type": "default"} + # Patch legacy fields into rope_parameters if rope_theta is not None: config.rope_parameters["rope_theta"] = rope_theta if partial_rotary_factor is not None: config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor + if ompe is not None: + config.rope_parameters["original_max_position_embeddings"] = ompe elif rope_theta is not None or getattr(config, "rope_parameters", None): # Transformers v5 installed # Patch these fields in case they used non-standard names @@ -341,12 +364,8 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: if getattr(config, "rope_parameters", None) is None: return - # Add original_max_position_embeddings if present - if ompe := getattr(config, "original_max_position_embeddings", None): - config.rope_parameters["original_max_position_embeddings"] = ompe - # Handle nested rope_parameters in interleaved sliding attention models - if set(config.rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + if is_rope_parameters_nested(config.rope_parameters): for rope_parameters_layer_type in config.rope_parameters.values(): patch_rope_parameters_dict(rope_parameters_layer_type) else: diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index d59169d95f0c9..4776c892eb722 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -184,18 +184,42 @@ def _remap_mistral_audio_args(config: dict) -> dict: whisper_args = config["multimodal"].pop("whisper_model_args") encoder_args = whisper_args["encoder_args"] downsample_args = whisper_args["downsample_args"] + downsample_factor = downsample_args["downsample_factor"] + + # make sure that k/v 
blocks can be allocated with + # unified k/v cache class and pool whisper k/v cache blocks + # with downsample_factor:1 ratio + if encoder_args.get("causal"): + block_pool_size = downsample_factor + config["projection_size"] = downsample_factor * encoder_args["dim"] + else: + block_pool_size = 1 + + _maybe_sliding_window = encoder_args.get("ragged_attention", None) + if _maybe_sliding_window is None: + sliding_window = None + elif _maybe_sliding_window.isdigit(): + sliding_window = int(_maybe_sliding_window) + else: + raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}") + + architecture = ( + "VoxtralStreamingGeneration" + if encoder_args.get("causal") + else "VoxtralForConditionalGeneration" + ) quant_config = config.get("quantization_config") config = { - "model_type": "whixtral", - "architectures": ["VoxtralForConditionalGeneration"], + "model_type": "voxtral", + "architectures": [architecture], "text_config": PretrainedConfig.from_dict(config), "audio_config": WhisperConfig( num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"], window_size=encoder_args["audio_encoding_args"]["window_size"], sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"], hop_length=encoder_args["audio_encoding_args"]["hop_length"], - downsample_factor=downsample_args["downsample_factor"], + downsample_factor=downsample_factor, d_model=encoder_args["dim"], encoder_layers=encoder_args["n_layers"], encoder_ffn_dim=encoder_args["hidden_dim"], @@ -203,6 +227,10 @@ def _remap_mistral_audio_args(config: dict) -> dict: vocab_size=encoder_args["vocab_size"], max_source_positions=encoder_args["max_source_positions"], is_encoder_decoder=False, # Override WhisperConfig default + is_causal=encoder_args.get("causal", False), + sliding_window=sliding_window, + block_pool_size=block_pool_size, + pos_embed=encoder_args.get("pos_embed", "sinusoidal"), ), } if quant_config: diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 
3d4f8449ad3b6..56c9ca361eaef 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -32,15 +32,34 @@ class DeepGemmQuantScaleFMT(Enum): # element contains 4 scale values. UE8M0 = 2 - @staticmethod - def from_oracle() -> "DeepGemmQuantScaleFMT": - if not is_deep_gemm_e8m0_used(): - return DeepGemmQuantScaleFMT.FLOAT32 - return ( - DeepGemmQuantScaleFMT.UE8M0 - if current_platform.is_device_capability_family(100) - else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0 + @classmethod + def init_oracle_cache(cls) -> None: + """Initialize the oracle decision and store it in the class cache""" + cached = getattr(cls, "_oracle_cache", None) + if cached is not None: + return + + use_e8m0 = ( + envs.VLLM_USE_DEEP_GEMM_E8M0 + and is_deep_gemm_supported() + and (_fp8_gemm_nt_impl is not None) ) + if not use_e8m0: + cls._oracle_cache = cls.FLOAT32 # type: ignore + return + + cls._oracle_cache = ( # type: ignore + cls.UE8M0 + if current_platform.is_device_capability_family(100) + else cls.FLOAT32_CEIL_UE8M0 + ) + + @classmethod + def from_oracle(cls) -> "DeepGemmQuantScaleFMT": + """Return the pre-initialized oracle decision""" + cached = getattr(cls, "_oracle_cache", None) + assert cached is not None, "DeepGemmQuantScaleFMT oracle cache not initialized" + return cached @functools.cache @@ -149,6 +168,7 @@ def _lazy_init() -> None: _transform_sf_into_required_layout_impl = getattr( _dg, "transform_sf_into_required_layout", None ) + DeepGemmQuantScaleFMT.init_oracle_cache() def get_num_sms() -> int: @@ -369,7 +389,7 @@ def should_use_deepgemm_for_fp8_linear( # Verify DeepGEMM N/K dims requirements # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul - # test inside kernels/quatization/test_block_fp8.py + # test inside kernels/quantization/test_block_fp8.py N_MULTIPLE = 64 K_MULTIPLE = 128 diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index ace2cbb0564c8..fcde986f48d46 100644 --- 
a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -143,7 +143,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] query_start_loc = m.query_start_loc context_lens = m.num_computed_tokens_cpu - context_lens_tensor = context_lens.to(query_start_loc.device) + context_lens_tensor = context_lens.to(query_start_loc.device, non_blocking=True) nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None if ( diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index fcda6134016ba..47dd44601377b 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -3,17 +3,11 @@ from dataclasses import dataclass -import torch - from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.utils import PAD_SLOT_ID -from vllm.config import VllmConfig -from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder -from vllm.v1.attention.backends.utils import ( - CommonAttentionMetadata, - split_decodes_and_prefills, +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadata, + BaseMambaAttentionMetadataBuilder, ) -from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec class Mamba1AttentionBackend(AttentionBackend): @@ -23,137 +17,12 @@ class Mamba1AttentionBackend(AttentionBackend): @dataclass -class Mamba1AttentionMetadata: - query_start_loc_p: torch.Tensor - state_indices_tensor: torch.Tensor - has_initial_states_p: torch.Tensor | None - num_prefills: int - num_prefill_tokens: int - num_decodes: int - num_decode_tokens: int - - block_idx_last_scheduled_token: torch.Tensor # shape: [batch,] - block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,] - block_idx_last_computed_token: torch.Tensor # shape: [batch,] - num_computed_tokens_p: torch.Tensor # shape: [batch,] +class Mamba1AttentionMetadata(BaseMambaAttentionMetadata): + 
pass class Mamba1AttentionMetadataBuilder( BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata] ): - def __init__( - self, - kv_cache_spec: AttentionSpec, - layer_names: list[str], - vllm_config: VllmConfig, - device: torch.device, - ): - super().__init__(kv_cache_spec, layer_names, vllm_config, device) - assert isinstance(kv_cache_spec, MambaSpec) - - def build( - self, - common_prefix_len: int, - common_attn_metadata: CommonAttentionMetadata, - fast_build: bool = False, - ) -> Mamba1AttentionMetadata: - num_reqs = common_attn_metadata.num_reqs - - num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills( - common_attn_metadata, decode_threshold=self.reorder_batch_threshold - ) - ) - - has_initial_states_p = None - query_start_loc_p = None - num_computed_tokens, num_computed_tokens_p = None, None - block_idx_first_scheduled_token = None - block_idx_first_scheduled_token_p = None - - # TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here. 
- # We should consolidate this code - if self.vllm_config.cache_config.enable_prefix_caching: - # Return a tensor of shape (#requests, #max blocks) - state_indices_tensor = common_attn_metadata.block_table_tensor - mamba_block_size = self.kv_cache_spec.block_size - num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( - self.device - ) - ( - block_idx_last_computed_token, - block_idx_first_scheduled_token, - block_idx_last_scheduled_token, - ) = self._compute_prefix_caching_block_indices( - common_attn_metadata, mamba_block_size - ) - else: - # Always return just a single block per each request: - state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] - block_idx_last_scheduled_token = None - block_idx_last_computed_token = None - - if num_prefills > 0: - query_start_loc_p = ( - common_attn_metadata.query_start_loc[-num_prefills - 1 :] - - num_decode_tokens - ) - has_initial_states_cpu = ( - common_attn_metadata.num_computed_tokens_cpu[ - num_reqs - num_prefills : num_reqs - ] - > 0 - ) - has_initial_states_p = has_initial_states_cpu.to( - common_attn_metadata.query_start_loc.device - ) - - if self.vllm_config.cache_config.enable_prefix_caching: - assert num_computed_tokens is not None - num_computed_tokens_p = num_computed_tokens[ - num_reqs - num_prefills : num_reqs - ] - assert block_idx_first_scheduled_token is not None - block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[ - num_reqs - num_prefills : num_reqs - ] - - elif ( - num_decodes > 0 - and num_decodes <= self.decode_cudagraph_max_bs - and self.compilation_config.cudagraph_mode.has_full_cudagraphs() - ): - self.state_indices_tensor[:num_decodes].copy_( - state_indices_tensor, non_blocking=True - ) - state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] - state_indices_tensor[num_decodes:] = PAD_SLOT_ID - - if self.vllm_config.cache_config.enable_prefix_caching: - self.block_idx_last_scheduled_token[:num_decodes].copy_( - 
block_idx_last_scheduled_token, non_blocking=True - ) - block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[ - :num_decode_tokens - ] - - self.block_idx_last_computed_token[:num_decodes].copy_( - block_idx_last_computed_token, non_blocking=True - ) - block_idx_last_computed_token = self.block_idx_last_computed_token[ - :num_decode_tokens - ] - - return Mamba1AttentionMetadata( - query_start_loc_p=query_start_loc_p, - has_initial_states_p=has_initial_states_p, - state_indices_tensor=state_indices_tensor, - num_prefills=num_prefills, - num_prefill_tokens=num_prefill_tokens, - num_decodes=num_decodes, - num_decode_tokens=num_decode_tokens, - block_idx_last_scheduled_token=block_idx_last_scheduled_token, - block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p, - block_idx_last_computed_token=block_idx_last_computed_token, - num_computed_tokens_p=num_computed_tokens_p, - ) + metadata_cls = Mamba1AttentionMetadata + supports_update_block_table: bool = False diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index f923371283aa0..b526f0a329972 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -1,19 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import copy import itertools -from dataclasses import dataclass +from dataclasses import dataclass, replace import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.utils.math_utils import cdiv -from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadata, + BaseMambaAttentionMetadataBuilder, +) from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, - compute_causal_conv1d_metadata, - split_decodes_and_prefills, ) from vllm.v1.kv_cache_interface 
import AttentionSpec @@ -94,48 +94,26 @@ class Mamba2AttentionBackend(AttentionBackend): @dataclass -class Mamba2AttentionMetadata: - num_prefills: int - num_prefill_tokens: int - num_decodes: int - num_decode_tokens: int - query_start_loc_p: torch.Tensor - seq_lens: torch.Tensor - - prep_initial_states: bool - chunk_size: int - - # The following tensors only contain prefill requests and will be None if - # the batch has no prefill request. - has_initial_states_p: torch.Tensor | None - seq_idx_p: torch.Tensor | None +class Mamba2AttentionMetadata(BaseMambaAttentionMetadata): + prep_initial_states: bool = False + chunk_size: int = 0 + # Chunk-related metadata (only for prefill) + seq_idx_p: torch.Tensor | None = None # cu_chunk_seqlen_p is a tensor of shape (nchunks+1,) that contains, for # each chunk, its offests into the varlen sequence dimension. It is defined # such that the i-th chunk contains tokens from cu_chunk_seqlen_p[i] to # cu_chunk_seqlen_p[i+1]. - cu_chunk_seqlen_p: torch.Tensor | None - + cu_chunk_seqlen_p: torch.Tensor | None = None # last_chunk_indices_p is a tensor of shape (batch,) that contains the # index of the last chunk for every sequence in the (prefill) batch. 
- last_chunk_indices_p: torch.Tensor | None - - state_indices_tensor: torch.Tensor # shape: [batch,] - block_idx_last_scheduled_token: torch.Tensor # shape: [batch,] - block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,] - block_idx_last_computed_token: torch.Tensor # shape: [batch,] - num_computed_tokens_p: torch.Tensor # shape: [batch,] - - # The following attributes are for triton implementation of causal_conv1d - nums_dict: dict | None = None - batch_ptr: torch.Tensor | None = None - token_chunk_offset_ptr: torch.Tensor | None = None + last_chunk_indices_p: torch.Tensor | None = None class Mamba2AttentionMetadataBuilder( BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata] ): - supports_update_block_table: bool = True + metadata_cls = Mamba2AttentionMetadata def __init__( self, @@ -150,87 +128,93 @@ class Mamba2AttentionMetadataBuilder( "chunk_size needs to be set in the model config for Mamba2 models" ) + def _compute_chunk_metadata( + self, + num_prefills: int, + num_computed_tokens_p_cpu: torch.Tensor, + query_start_loc_p_cpu: torch.Tensor, + ) -> tuple[list[int], list[int], list[int]]: + """ + Compute chunk-specific metadata for Mamba2. + + The code below carefully constructs the chunks such that: + 1. Chunks contain tokens from a *single* sequence only. + 2. For every sequence, we are guaranteed that we can + retrieve the mamba state *every* chunk_size tokens. + Constraint (1) dramatically simplifies the mamba2 kernels. + Constraint (2) dramatically simplifies the implementation + of prefix caching for mamba2 (wip). We need to take care + of the interaction with chunked prefill in order to + satisfy constraint (2). + """ + # TODO (tdoublep): This code could probably be optimized. 
+ cu_chunk_seqlen = [] + seq_idx = [] + last_chunk_indices = [] + seqlen_pos = 0 + + for req_idx in range(num_prefills): + this_num_computed = num_computed_tokens_p_cpu[req_idx].item() + this_new_tokens = ( + query_start_loc_p_cpu[req_idx + 1].item() + - query_start_loc_p_cpu[req_idx].item() + ) + + # if computed tokens are not chunk-aligned, use the first + # chunk to finish it off + if this_num_computed % self.chunk_size != 0: + seq_idx.append(req_idx) + cu_chunk_seqlen.append(seqlen_pos) + # how many tokens to finish the chunk? + chunk_len = ( + cdiv(this_num_computed, self.chunk_size) * self.chunk_size + - this_num_computed + ) + # we can only use at most this_new_tokens + chunk_len = min(chunk_len, this_new_tokens) + seqlen_pos += chunk_len + this_new_tokens -= chunk_len + + n_chunks = cdiv(this_new_tokens, self.chunk_size) + for chunk in range(n_chunks): + seq_idx.append(req_idx) + cu_chunk_seqlen.append(seqlen_pos) + chunk_len = min(self.chunk_size, this_new_tokens) + seqlen_pos += chunk_len + this_new_tokens -= chunk_len + + assert this_new_tokens == 0 + last_chunk_indices.append(len(cu_chunk_seqlen) - 1) + + cu_chunk_seqlen.append(seqlen_pos) + + return cu_chunk_seqlen, seq_idx, last_chunk_indices + def build( self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, fast_build: bool = False, ) -> Mamba2AttentionMetadata: - num_reqs = common_attn_metadata.num_reqs - seq_lens = common_attn_metadata.seq_lens + common = self._compute_common_metadata(common_attn_metadata) - query_start_loc_p = None seq_idx_p = None cu_chunk_seqlen_p = None last_chunk_indices_p = None - - # Need flags to indicate if there are initial states - has_initial_states_p = None prep_initial_states = False - # for causal_conv1d - nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None - - num_computed_tokens, num_computed_tokens_p = None, None - block_idx_first_scheduled_token = None - block_idx_first_scheduled_token_p = None - - if 
self.vllm_config.cache_config.enable_prefix_caching: - # Return a tensor of shape (#requests, #max blocks) - state_indices_tensor = common_attn_metadata.block_table_tensor - # Additional cache-related varaiables: - mamba_block_size = self.kv_cache_spec.block_size - num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( - self.device - ) - ( - block_idx_last_computed_token, - block_idx_first_scheduled_token, - block_idx_last_scheduled_token, - ) = self._compute_prefix_caching_block_indices( - common_attn_metadata, mamba_block_size - ) - else: - # Always return just a single block per each request: - state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] - # Additional cache-related varaiables: - block_idx_last_scheduled_token = None - block_idx_last_computed_token = None - - num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills( - common_attn_metadata, decode_threshold=self.reorder_batch_threshold - ) - ) - # Compute seq_idx for prefill only - if num_prefills > 0: - # [batch,] - has_initial_states_cpu = ( - common_attn_metadata.num_computed_tokens_cpu[ - num_reqs - num_prefills : num_reqs - ] - > 0 - ) - prep_initial_states = torch.any(has_initial_states_cpu).item() - has_initial_states_p = has_initial_states_cpu.to( - common_attn_metadata.query_start_loc.device + if common.num_prefills > 0: + prep_initial_states = ( + torch.any(common.has_initial_states_p).item() + if common.has_initial_states_p is not None + else False ) - query_start_loc_p = ( - common_attn_metadata.query_start_loc[-num_prefills - 1 :] - - num_decode_tokens - ) + num_reqs = common.num_reqs + num_prefills = common.num_prefills + num_decode_tokens = common.num_decode_tokens - if self.vllm_config.cache_config.enable_prefix_caching: - assert num_computed_tokens is not None - num_computed_tokens_p = num_computed_tokens[ - num_reqs - num_prefills : num_reqs - ] - assert block_idx_first_scheduled_token is not None - 
block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[ - num_reqs - num_prefills : num_reqs - ] num_computed_tokens_p_cpu = common_attn_metadata.num_computed_tokens_cpu[ num_reqs - num_prefills : num_reqs ] @@ -239,137 +223,33 @@ class Mamba2AttentionMetadataBuilder( - num_decode_tokens ) - # The code below carefully constructs the chunks such that: - # 1. Chunks contain tokens from a *single* sequence only. - # 2. For every sequence, we are guaranteed that we can - # retrieve the mamba state *every* chunk_size tokens. - # Constraint (1) dramatically simplifies the mamba2 kernels. - # Constraint (2) dramatically simplifies the implementation - # of prefix caching for mamba2 (wip). We need to take care - # of the interaction with chunked prefill in order to - # satisfy constraint (2). - # TODO (tdoublep): This code could probably be optimized. - cu_chunk_seqlen = [] - seq_idx = [] - last_chunk_indices = [] - seqlen_pos = 0 - for req_idx in range(num_prefills): - this_num_computed = num_computed_tokens_p_cpu[req_idx].item() - this_new_tokens = ( - query_start_loc_p_cpu[req_idx + 1].item() - - query_start_loc_p_cpu[req_idx].item() - ) - - # if computed tokens are not chunk-aligned, use the first - # chunk to finish it off - if this_num_computed % self.chunk_size != 0: - seq_idx.append(req_idx) - cu_chunk_seqlen.append(seqlen_pos) - # how many tokens to finish the chunk? 
- chunk_len = ( - cdiv(this_num_computed, self.chunk_size) * self.chunk_size - - this_num_computed - ) - # we can only use at most this_new_tokens - chunk_len = min(chunk_len, this_new_tokens) - seqlen_pos += chunk_len - this_new_tokens -= chunk_len - - n_chunks = cdiv(this_new_tokens, self.chunk_size) - for chunk in range(n_chunks): - seq_idx.append(req_idx) - cu_chunk_seqlen.append(seqlen_pos) - chunk_len = min(self.chunk_size, this_new_tokens) - seqlen_pos += chunk_len - this_new_tokens -= chunk_len - - assert this_new_tokens == 0 - last_chunk_indices.append(len(cu_chunk_seqlen) - 1) - - cu_chunk_seqlen.append(seqlen_pos) + cu_chunk_seqlen, seq_idx, last_chunk_indices = self._compute_chunk_metadata( + num_prefills, + num_computed_tokens_p_cpu, + query_start_loc_p_cpu, + ) seq_idx_p = torch.as_tensor( - seq_idx, device=query_start_loc_p.device, dtype=torch.int32 + seq_idx, + device=common_attn_metadata.query_start_loc.device, + dtype=torch.int32, ) cu_chunk_seqlen_p = torch.as_tensor( - cu_chunk_seqlen, device=query_start_loc_p.device, dtype=torch.int32 + cu_chunk_seqlen, + device=common_attn_metadata.query_start_loc.device, + dtype=torch.int32, ) last_chunk_indices_p = torch.as_tensor( - last_chunk_indices, device=query_start_loc_p.device, dtype=torch.int32 + last_chunk_indices, + device=common_attn_metadata.query_start_loc.device, + dtype=torch.int32, ) - nums_dict, batch_ptr, token_chunk_offset_ptr = ( - compute_causal_conv1d_metadata(query_start_loc_p) - ) - - elif ( - num_decodes <= self.decode_cudagraph_max_bs - and self.compilation_config.cudagraph_mode.has_full_cudagraphs() - ): - self.state_indices_tensor[:num_decodes].copy_( - state_indices_tensor, non_blocking=True - ) - state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] - - if self.vllm_config.cache_config.enable_prefix_caching: - self.block_idx_last_scheduled_token[:num_decodes].copy_( - block_idx_last_scheduled_token, non_blocking=True - ) - block_idx_last_scheduled_token = 
self.block_idx_last_scheduled_token[ - :num_decode_tokens - ] - - self.block_idx_last_computed_token[:num_decodes].copy_( - block_idx_last_computed_token, non_blocking=True - ) - block_idx_last_computed_token = self.block_idx_last_computed_token[ - :num_decode_tokens - ] - - attn_metadata = Mamba2AttentionMetadata( - num_prefills=num_prefills, - num_prefill_tokens=num_prefill_tokens, - num_decodes=num_decodes, - num_decode_tokens=num_decode_tokens, - query_start_loc_p=query_start_loc_p, - seq_lens=seq_lens, + return replace( + common, prep_initial_states=prep_initial_states, chunk_size=self.chunk_size, - has_initial_states_p=has_initial_states_p, seq_idx_p=seq_idx_p, - state_indices_tensor=state_indices_tensor, cu_chunk_seqlen_p=cu_chunk_seqlen_p, last_chunk_indices_p=last_chunk_indices_p, - nums_dict=nums_dict, - batch_ptr=batch_ptr, - token_chunk_offset_ptr=token_chunk_offset_ptr, - block_idx_last_scheduled_token=block_idx_last_scheduled_token, - block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p, - block_idx_last_computed_token=block_idx_last_computed_token, - num_computed_tokens_p=num_computed_tokens_p, ) - return attn_metadata - - def update_block_table( - self, - metadata: Mamba2AttentionMetadata, - blk_table: torch.Tensor, - slot_mapping: torch.Tensor, - ) -> Mamba2AttentionMetadata: - new_metadata = copy.copy(metadata) - prefix_caching = self.vllm_config.cache_config.enable_prefix_caching - state_indices_t = blk_table if prefix_caching else blk_table[:, 0] - num_reqs = blk_table.shape[0] - - # For CUDA graphs, copy to persistent buffer - if ( - metadata.num_prefills == 0 - and num_reqs <= self.decode_cudagraph_max_bs - and self.compilation_config.cudagraph_mode.has_full_cudagraphs() - ): - persistent_state_indices_t = self.state_indices_tensor[:num_reqs] - persistent_state_indices_t.copy_(state_indices_t, non_blocking=True) - state_indices_t = persistent_state_indices_t - - new_metadata.state_indices_tensor = state_indices_t - return 
new_metadata diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index a9705db59f19d..4f876d66da147 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import abc +import copy +from dataclasses import dataclass from typing import ClassVar, TypeVar import torch @@ -9,20 +11,52 @@ import torch from vllm.config import VllmConfig from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.utils import ( + PAD_SLOT_ID, AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + compute_causal_conv1d_metadata, + split_decodes_and_prefills, ) from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec -M = TypeVar("M") +M = TypeVar("M", bound="BaseMambaAttentionMetadata") + + +@dataclass +class BaseMambaAttentionMetadata: + num_prefills: int + num_prefill_tokens: int + num_decodes: int + num_decode_tokens: int + num_reqs: int + + # The following tensors only contain prefill requests and will be None if + # the batch has no prefill request. 
+ has_initial_states_p: torch.Tensor | None + query_start_loc_p: torch.Tensor | None + num_computed_tokens_p: torch.Tensor | None + + state_indices_tensor: torch.Tensor + + # The following tensors are only used for prefix caching and are None if disabled + block_idx_last_scheduled_token: torch.Tensor | None + block_idx_first_scheduled_token_p: torch.Tensor | None + block_idx_last_computed_token: torch.Tensor | None + + # The following attributes are for triton implementation of causal_conv1d + nums_dict: dict | None = None + batch_ptr: torch.Tensor | None = None + token_chunk_offset_ptr: torch.Tensor | None = None class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): + metadata_cls: type[M] reorder_batch_threshold: int = 1 _cudagraph_support: ClassVar[AttentionCGSupport] = ( AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE ) + supports_update_block_table: bool = True def __init__( self, @@ -87,6 +121,18 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): return self.build(0, m) + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> M: + """ + Default build implementation for Mamba-like attention backends. + Subclasses (e.g., Mamba2) can override to add additional metadata. + """ + return self._compute_common_metadata(common_attn_metadata) + def _compute_prefix_caching_block_indices( self, common_attn_metadata: CommonAttentionMetadata, @@ -115,3 +161,147 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): block_idx_first_scheduled_token, block_idx_last_scheduled_token, ) + + def _compute_common_metadata( + self, + common_attn_metadata: CommonAttentionMetadata, + ) -> M: + """ + Compute metadata common to both Mamba1 and Mamba2. 
+ """ + num_reqs = common_attn_metadata.num_reqs + + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + split_decodes_and_prefills( + common_attn_metadata, decode_threshold=self.reorder_batch_threshold + ) + ) + + # Need flags to indicate if there are initial states + has_initial_states_p = None + query_start_loc_p = None + num_computed_tokens = None + num_computed_tokens_p = None + + # for prefix caching + block_idx_first_scheduled_token = None + block_idx_first_scheduled_token_p = None + block_idx_last_computed_token = None + block_idx_last_scheduled_token = None + + # for causal_conv1d + nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None + + if self.vllm_config.cache_config.enable_prefix_caching: + # Return a tensor of shape (#requests, #max blocks) + state_indices_tensor = common_attn_metadata.block_table_tensor + # Additional cache-related varaiables: + mamba_block_size = self.kv_cache_spec.block_size + num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( + self.device + ) + ( + block_idx_last_computed_token, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + ) = self._compute_prefix_caching_block_indices( + common_attn_metadata, mamba_block_size + ) + else: + # Always return just a single block per each request: + state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] + + if num_prefills > 0: + query_start_loc_p = ( + common_attn_metadata.query_start_loc[-num_prefills - 1 :] + - num_decode_tokens + ) + has_initial_states_cpu = ( + common_attn_metadata.num_computed_tokens_cpu[ + num_reqs - num_prefills : num_reqs + ] + > 0 + ) + has_initial_states_p = has_initial_states_cpu.to( + common_attn_metadata.query_start_loc.device + ) + + nums_dict, batch_ptr, token_chunk_offset_ptr = ( + compute_causal_conv1d_metadata(query_start_loc_p) + ) + + if self.vllm_config.cache_config.enable_prefix_caching: + assert num_computed_tokens is not None + num_computed_tokens_p = 
num_computed_tokens[ + num_reqs - num_prefills : num_reqs + ] + assert block_idx_first_scheduled_token is not None + block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[ + num_reqs - num_prefills : num_reqs + ] + elif ( + num_decodes <= self.decode_cudagraph_max_bs + and self.compilation_config.cudagraph_mode.has_full_cudagraphs() + ): + self.state_indices_tensor[:num_decodes].copy_( + state_indices_tensor, non_blocking=True + ) + state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] + state_indices_tensor[num_decodes:] = PAD_SLOT_ID + + if self.vllm_config.cache_config.enable_prefix_caching: + self.block_idx_last_scheduled_token[:num_decodes].copy_( + block_idx_last_scheduled_token, non_blocking=True + ) + block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[ + :num_decode_tokens + ] + + self.block_idx_last_computed_token[:num_decodes].copy_( + block_idx_last_computed_token, non_blocking=True + ) + block_idx_last_computed_token = self.block_idx_last_computed_token[ + :num_decode_tokens + ] + + return self.metadata_cls( + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, + num_decodes=num_decodes, + num_decode_tokens=num_decode_tokens, + query_start_loc_p=query_start_loc_p, + has_initial_states_p=has_initial_states_p, + state_indices_tensor=state_indices_tensor, + block_idx_last_scheduled_token=block_idx_last_scheduled_token, + block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p, + block_idx_last_computed_token=block_idx_last_computed_token, + num_computed_tokens_p=num_computed_tokens_p, + num_reqs=num_reqs, + nums_dict=nums_dict, + batch_ptr=batch_ptr, + token_chunk_offset_ptr=token_chunk_offset_ptr, + ) + + def update_block_table( + self, + metadata: M, + blk_table: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> M: + new_metadata = copy.copy(metadata) + prefix_caching = self.vllm_config.cache_config.enable_prefix_caching + state_indices_t = blk_table if prefix_caching else 
blk_table[:, 0] + num_reqs = blk_table.shape[0] + + # For CUDA graphs, copy to persistent buffer + if ( + metadata.num_prefills == 0 + and num_reqs <= self.decode_cudagraph_max_bs + and self.compilation_config.cudagraph_mode.has_full_cudagraphs() + ): + persistent_state_indices_t = self.state_indices_tensor[:num_reqs] + persistent_state_indices_t.copy_(state_indices_t, non_blocking=True) + state_indices_t = persistent_state_indices_t + + new_metadata.state_indices_tensor = state_indices_t + return new_metadata diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index fea482493635f..e9ec96835f277 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -355,6 +355,8 @@ class MLACommonPrefillMetadata: max_query_len: int chunked_context: ChunkedContextMetadata | None = None query_seq_lens: torch.Tensor | None = None + workspace_buffer: torch.Tensor | None = None + q_data_type: torch.dtype | None = None @dataclass @@ -558,6 +560,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.dcp_rank = 0 self.dcp_local_block_size = parallel_config.cp_kv_cache_interleave_size self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size + self.cp_kv_cache_interleave_size = parallel_config.cp_kv_cache_interleave_size # Don't try to access the runner on AMD if self.aot_schedule: @@ -722,8 +725,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): def _build_decode( self, block_table_tensor: torch.Tensor, - seq_lens_cpu: torch.Tensor, seq_lens_device: torch.Tensor, + max_seq_len: int, query_start_loc_cpu: torch.Tensor, query_start_loc_device: torch.Tensor, num_decode_tokens: int, @@ -773,13 +776,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): query_start_loc = common_attn_metadata.query_start_loc query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu seq_lens = common_attn_metadata.seq_lens - seq_lens_cpu = 
common_attn_metadata.seq_lens_cpu dcp_local_seq_lens = common_attn_metadata.dcp_local_seq_lens - dcp_local_seq_lens_cpu = common_attn_metadata.dcp_local_seq_lens_cpu - - query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] - - num_computed_tokens_cpu = common_attn_metadata.seq_lens_cpu - query_seq_lens_cpu num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( split_decodes_and_prefills( @@ -794,6 +791,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): prefill_metadata = None if num_prefills > 0: + num_computed_tokens_cpu = common_attn_metadata.num_computed_tokens_cpu + reqs_start = num_decodes # prefill_start context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs] @@ -983,19 +982,29 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): prefill_metadata.query_seq_lens = ( prefill_query_start_loc[1:] - prefill_query_start_loc[:-1] ) + prefill_metadata.workspace_buffer = self._workspace_buffer decode_metadata = None if num_decodes > 0: dcp_tot_seq_lens_device = None if self.dcp_world_size > 1: dcp_tot_seq_lens_device = seq_lens[:num_decodes] - seq_lens_cpu = dcp_local_seq_lens_cpu seq_lens = dcp_local_seq_lens + # After DCP distribution, the maximum number of tokens for any rank is + # ceil(L / (N * I)) * I, where L is max_seq_len, N is dcp_world_size, + # and I is cp_kv_cache_interleave_size. + # This eliminates GPU->CPU sync while minimizing workspace + # over-allocation. 
+ num_partitions = self.dcp_world_size * self.cp_kv_cache_interleave_size + max_seq_len = ( + (max_seq_len + num_partitions - 1) // num_partitions + ) * self.cp_kv_cache_interleave_size + decode_metadata = self._build_decode( block_table_tensor=block_table_tensor[:num_decodes, ...], - seq_lens_cpu=seq_lens_cpu[:num_decodes], seq_lens_device=seq_lens[:num_decodes], + max_seq_len=max_seq_len, query_start_loc_cpu=query_start_loc_cpu[: num_decodes + 1], query_start_loc_device=query_start_loc[: num_decodes + 1], num_decode_tokens=num_decode_tokens, @@ -1491,12 +1500,13 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): from flashinfer.prefill import trtllm_ragged_attention_deepseek assert prefill.query_seq_lens is not None + assert prefill.workspace_buffer is not None ret = trtllm_ragged_attention_deepseek( query=q, key=k, value=v, - workspace_buffer=self._workspace_buffer, + workspace_buffer=prefill.workspace_buffer, seq_lens=prefill.query_seq_lens, max_q_len=prefill.max_query_len, max_kv_len=prefill.max_query_len, @@ -1525,6 +1535,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): assert prefill.chunked_context is not None assert prefill.chunked_context.seq_lens[chunk_idx] is not None + assert prefill.workspace_buffer is not None out = torch.zeros( q.shape[0], @@ -1533,13 +1544,13 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): device=q.device, dtype=q.dtype, ) - self._workspace_buffer.fill_(0) + prefill.workspace_buffer.fill_(0) attn_out, lse = trtllm_ragged_attention_deepseek( query=q, key=k, value=v, - workspace_buffer=self._workspace_buffer, + workspace_buffer=prefill.workspace_buffer, seq_lens=prefill.chunked_context.seq_lens[chunk_idx], max_q_len=prefill.max_query_len, max_kv_len=prefill.chunked_context.max_seq_lens[chunk_idx], diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index b28814aceada9..b4a68f472e9c1 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ 
b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -169,8 +169,8 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] def _build_decode( self, block_table_tensor: torch.Tensor, - seq_lens_cpu: torch.Tensor, seq_lens_device: torch.Tensor, + max_seq_len: int, query_start_loc_cpu: torch.Tensor, query_start_loc_device: torch.Tensor, num_decode_tokens: int, @@ -178,7 +178,6 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] ) -> FlashAttnMLADecodeMetadata: query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] max_query_len = query_lens_cpu.max().item() - max_seq_len = seq_lens_cpu.max().item() # For Flash Attention MLA + full cudagraph max_num_splits = 0 @@ -193,7 +192,7 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] max_num_splits = 1 scheduler_metadata = self._schedule_decode( - num_reqs=seq_lens_cpu.numel(), + num_reqs=seq_lens_device.shape[0], cu_query_lens=query_start_loc_device, max_query_len=max_query_len, seqlens=seq_lens_device, diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 74a4cd8430250..913503ce44944 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -143,8 +143,8 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): def _build_decode( self, block_table_tensor: torch.Tensor, - seq_lens_cpu: torch.Tensor, seq_lens_device: torch.Tensor, + max_seq_len: int, query_start_loc_cpu: torch.Tensor, query_start_loc_device: torch.Tensor, num_decode_tokens: int, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 589d6ef2f6348..e8921f8a1c403 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -106,8 +106,8 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): 
def _build_decode( self, block_table_tensor: torch.Tensor, - seq_lens_cpu: torch.Tensor, seq_lens_device: torch.Tensor, + max_seq_len: int, query_start_loc_cpu: torch.Tensor, query_start_loc_device: torch.Tensor, num_decode_tokens: int, diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index c8fe0faf71088..e2fae37f5619d 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -2,15 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -import torch - from vllm.attention.backends.abstract import AttentionBackend -from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder -from vllm.v1.attention.backends.utils import ( - PAD_SLOT_ID, - CommonAttentionMetadata, - compute_causal_conv1d_metadata, - split_decodes_and_prefills, +from vllm.v1.attention.backends.mamba_attn import ( + BaseMambaAttentionMetadata, + BaseMambaAttentionMetadataBuilder, ) @@ -21,84 +16,11 @@ class ShortConvAttentionBackend(AttentionBackend): @dataclass -class ShortConvAttentionMetadata: - num_prefills: int - num_prefill_tokens: int - num_decodes: int - num_decode_tokens: int - - query_start_loc: torch.Tensor - state_indices_tensor: torch.Tensor - has_initial_states_p: torch.Tensor | None - - # For causal_conv1d - nums_dict: dict | None = None - batch_ptr: torch.Tensor | None = None - token_chunk_offset_ptr: torch.Tensor | None = None +class ShortConvAttentionMetadata(BaseMambaAttentionMetadata): + pass class ShortConvAttentionMetadataBuilder( BaseMambaAttentionMetadataBuilder[ShortConvAttentionMetadata] ): - def build( - self, - common_prefix_len: int, - common_attn_metadata: CommonAttentionMetadata, - fast_build: bool = False, - ) -> ShortConvAttentionMetadata: - num_reqs = common_attn_metadata.num_reqs - query_start_loc = common_attn_metadata.query_start_loc - state_indices_tensor = 
common_attn_metadata.block_table_tensor[:, 0] - - # for causal_conv1d - nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None - - num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills( - common_attn_metadata, decode_threshold=self.reorder_batch_threshold - ) - ) - - has_initial_states_p = None - if num_prefills > 0: - has_initial_states_cpu = ( - common_attn_metadata.num_computed_tokens_cpu[ - num_reqs - num_prefills : num_reqs - ] - > 0 - ) - has_initial_states_p = has_initial_states_cpu.to(query_start_loc.device) - - query_start_loc_p = ( - common_attn_metadata.query_start_loc[-num_prefills - 1 :] - - num_decode_tokens - ) - - nums_dict, batch_ptr, token_chunk_offset_ptr = ( - compute_causal_conv1d_metadata(query_start_loc_p) - ) - - elif ( - num_decodes > 0 - and num_decodes <= self.decode_cudagraph_max_bs - and self.compilation_config.cudagraph_mode.has_full_cudagraphs() - ): - self.state_indices_tensor[:num_decodes].copy_( - state_indices_tensor, non_blocking=True - ) - state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] - state_indices_tensor[num_decodes:] = PAD_SLOT_ID - - attn_metadata = ShortConvAttentionMetadata( - query_start_loc=query_start_loc, - state_indices_tensor=state_indices_tensor, - has_initial_states_p=has_initial_states_p, - num_prefills=num_prefills, - num_prefill_tokens=num_prefill_tokens, - num_decodes=num_decodes, - num_decode_tokens=num_decode_tokens, - nums_dict=nums_dict, - batch_ptr=batch_ptr, - token_chunk_offset_ptr=token_chunk_offset_ptr, - ) - return attn_metadata + metadata_cls = ShortConvAttentionMetadata diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 56763f4b52539..6b94f786a26b2 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -835,6 +835,15 @@ def subclass_attention_backend( ) +def subclass_attention_backend_with_overrides( + name_prefix: str, + attention_backend_cls: 
type[AttentionBackend], + overrides: dict[str, Any], +) -> type[AttentionBackend]: + name: str = name_prefix + attention_backend_cls.__name__ # type: ignore + return type(name, (attention_backend_cls,), overrides) + + def split_decodes_prefills_and_extends( common_attn_metadata: CommonAttentionMetadata, decode_threshold: int = 1, diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index c779e3d34b3ed..a6f06d1b16a34 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -270,10 +270,8 @@ class BlockPool: if num_cached_blocks == 0: parent_block_hash: ExternalBlockHash | None = None else: - parent_block = blocks[num_cached_blocks - 1] - assert parent_block.block_hash is not None parent_block_hash = maybe_convert_block_hash( - get_block_hash(parent_block.block_hash) + block_hashes[num_cached_blocks - 1] ) self.kv_event_queue.append( diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 4f54d12f4b8d0..27d34f1c60da8 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -75,6 +75,12 @@ class EngineCoreRequest( trace_headers: Mapping[str, str] | None = None + # The user-provided request ID. This field is set internally, + # copied from the provided request_id that's originally assigned + # to the request_id field, see InputProcessor.assign_request_id(). + # Used in outputs and to support abort(req_id, internal=False). 
+ external_req_id: str | None = None + @property def params(self) -> SamplingParams | PoolingParams: """Return the processed params (sampling or pooling).""" diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a6ee241c41151..87b700d13e9d8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -4,6 +4,7 @@ import asyncio import os import socket import time +import warnings from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, cast @@ -289,12 +290,15 @@ class AsyncLLM(EngineClient): is_pooling = isinstance(params, PoolingParams) - # Create a new output collector for the request. - queue = RequestOutputCollector(output_kind=params.output_kind) - # Convert Input --> Request. if isinstance(prompt, EngineCoreRequest): request = prompt + if request_id != request.request_id: + logger.warning_once( + "AsyncLLM.add_request() was passed a request_id parameter that " + "does not match the EngineCoreRequest.request_id attribute. The " + "latter will be used, and the former will be ignored." + ) else: assert prompt_text is None request = self.input_processor.process_inputs( @@ -313,6 +317,11 @@ class AsyncLLM(EngineClient): elif isinstance(prompt, Mapping): prompt_text = cast(str | None, prompt.get("prompt")) + self.input_processor.assign_request_id(request) + + # Create a new output collector for the request. + queue = RequestOutputCollector(params.output_kind, request.request_id) + # Use cloned params that may have been updated in process_inputs() params = request.params @@ -324,7 +333,7 @@ class AsyncLLM(EngineClient): assert isinstance(parent_params, SamplingParams) # Fan out child requests (for n>1). 
- parent_request = ParentRequest(request_id, parent_params) + parent_request = ParentRequest(request) for idx in range(parent_params.n): request_id, child_params = parent_request.get_child_info(idx) child_request = request if idx == parent_params.n - 1 else copy(request) @@ -395,6 +404,7 @@ class AsyncLLM(EngineClient): "prompt logprobs" ) + q: RequestOutputCollector | None = None try: # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us @@ -445,7 +455,8 @@ class AsyncLLM(EngineClient): # is cancelled or the generator is garbage collected. So, # we abort the request if we end up here. except (asyncio.CancelledError, GeneratorExit): - await self.abort(request_id) + if q is not None: + await self.abort(q.request_id, internal=True) if self.log_requests: logger.info("Request %s aborted.", request_id) raise @@ -464,7 +475,8 @@ class AsyncLLM(EngineClient): # Unexpected error in the generate() task (possibly recoverable). 
except Exception as e: - await self.abort(request_id) + if q is not None: + await self.abort(q.request_id, internal=True) if self.log_requests: logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e @@ -540,13 +552,15 @@ class AsyncLLM(EngineClient): self.output_handler = asyncio.create_task(output_handler()) - async def abort(self, request_id: str | Iterable[str]) -> None: + async def abort( + self, request_id: str | Iterable[str], internal: bool = False + ) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" request_ids = ( (request_id,) if isinstance(request_id, str) else as_list(request_id) ) - all_request_ids = self.output_processor.abort_requests(request_ids) + all_request_ids = self.output_processor.abort_requests(request_ids, internal) await self.engine_core.abort_requests_async(all_request_ids) if self.log_requests: @@ -580,7 +594,7 @@ class AsyncLLM(EngineClient): if not wait_for_inflight_requests: request_ids = list(self.output_processor.request_states.keys()) if request_ids: - await self.abort(request_ids) + await self.abort(request_ids, internal=True) # Wait for running requests to drain before clearing cache. if self.output_processor.has_unfinished_requests(): @@ -627,8 +641,12 @@ class AsyncLLM(EngineClient): The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. + + NOTE: truncate_prompt_tokens is deprecated in v0.14. + TODO: Remove truncate_prompt_tokens in v0.15. """ + q: RequestOutputCollector | None = None try: # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us @@ -641,9 +659,19 @@ class AsyncLLM(EngineClient): if tokenization_kwargs is None: tokenization_kwargs = {} + + if truncate_prompt_tokens is not None: + warnings.warn( + "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` " + "is deprecated and will be removed in v0.15. 
" + "Please use `pooling_params.truncate_prompt_tokens` instead.", + DeprecationWarning, + stacklevel=2, + ) + _validate_truncation_size( self.model_config.max_model_len, - truncate_prompt_tokens, + pooling_params.truncate_prompt_tokens, tokenization_kwargs, ) @@ -673,7 +701,8 @@ class AsyncLLM(EngineClient): # If the request is disconnected by the client, generate() # is cancelled. So, we abort the request if we end up here. except asyncio.CancelledError: - await self.abort(request_id) + if q is not None: + await self.abort(q.request_id, internal=True) if self.log_requests: logger.info("Request %s aborted.", request_id) raise @@ -692,7 +721,8 @@ class AsyncLLM(EngineClient): # Unexpected error in the generate() task (possibly recoverable). except Exception as e: - await self.abort(request_id) + if q is not None: + await self.abort(q.request_id, internal=True) if self.log_requests: logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 807db8275fbf5..cacbc805e84f8 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -20,6 +20,7 @@ import zmq import zmq.asyncio from vllm.config import VllmConfig +from vllm.envs import VLLM_ENGINE_READY_TIMEOUT_S from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask @@ -528,7 +529,9 @@ class MPClient(EngineCoreClient): identities = set(self.core_engines) sync_input_socket = zmq.Socket.shadow(self.input_socket) while identities: - if not sync_input_socket.poll(timeout=600_000): + if not sync_input_socket.poll( + timeout=VLLM_ENGINE_READY_TIMEOUT_S * 1000 # convert to ms + ): raise TimeoutError( "Timed out waiting for engines to send" "initial message on input socket." 
@@ -1340,7 +1343,9 @@ class DPLBAsyncMPClient(DPAsyncMPClient): # Wait for ready messages from new engines on the input socket sync_input_socket = zmq.Socket.shadow(self.input_socket) while new_engine_identities: - if not sync_input_socket.poll(timeout=600_000): + if not sync_input_socket.poll( + timeout=VLLM_ENGINE_READY_TIMEOUT_S * 1000 # convert to ms + ): raise TimeoutError( "Timed out waiting for new engines to send initial " "message on input socket." ) diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 29293877cb69d..1d43a8253843f 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -21,7 +21,7 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tokenizers import TokenizerLike from vllm.tokenizers.mistral import MistralTokenizer -from vllm.utils import length_from_prompt_token_ids_or_embeds +from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats from vllm.v1.structured_output.backend_guidance import ( @@ -406,6 +406,19 @@ class InputProcessor: mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)] return mm_uuids + @staticmethod + def assign_request_id(request: EngineCoreRequest): + """Replace the externally supplied request ID with an internal request ID + that adds 8 random characters in order to ensure uniqueness. + """ + if request.external_req_id is not None: + raise ValueError( + "The external_req_id field should not be set on EngineCoreRequests" + " passed to vLLM; use the request_id field." 
+ request.external_req_id = request.request_id + request.request_id = f"{request.external_req_id}-{random_uuid():.8}" + def process_inputs( self, request_id: str, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1011317b706d3..33fc34b67af6f 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -213,10 +213,10 @@ class LLMEngine: def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return self.engine_core.get_supported_tasks() - def abort_request(self, request_ids: list[str]) -> None: + def abort_request(self, request_ids: list[str], internal: bool = False) -> None: """Remove request_ids from EngineCore and Detokenizer.""" - request_ids = self.output_processor.abort_requests(request_ids) + request_ids = self.output_processor.abort_requests(request_ids, internal) self.engine_core.abort_requests(request_ids) def add_request( @@ -238,6 +238,12 @@ class LLMEngine: # Process raw inputs into the request. if isinstance(prompt, EngineCoreRequest): request = prompt + if request_id != request.request_id: + logger.warning_once( + "LLMEngine.add_request() was passed a request_id parameter that " + "does not match the EngineCoreRequest.request_id attribute. The " + "latter will be used, and the former will be ignored." + ) else: assert prompt_text is None request = self.input_processor.process_inputs( @@ -255,6 +261,8 @@ class LLMEngine: elif isinstance(prompt, Mapping): prompt_text = cast(str | None, prompt.get("prompt")) + self.input_processor.assign_request_id(request) + # Use cloned params that may have been updated in process_inputs() params = request.params @@ -268,7 +276,7 @@ class LLMEngine: return # Fan out child requests (for n>1). 
- parent_req = ParentRequest(request_id, params) + parent_req = ParentRequest(request) for idx in range(n): request_id, child_params = parent_req.get_child_info(idx) child_request = request if idx == n - 1 else copy(request) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 8f7d8a71f1a2e..e8717e15198a7 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass from typing import Any, cast @@ -40,8 +41,9 @@ class RequestOutputCollector: producer gets ahead of the consumer. """ - def __init__(self, output_kind: RequestOutputKind): + def __init__(self, output_kind: RequestOutputKind, request_id: str): self.aggregate = output_kind == RequestOutputKind.DELTA + self.request_id = request_id self.output: RequestOutput | PoolingRequestOutput | Exception | None = None self.ready = asyncio.Event() @@ -92,6 +94,7 @@ class RequestState: def __init__( self, request_id: str, + external_req_id: str, parent_req: ParentRequest | None, request_index: int, lora_request: LoRARequest | None, @@ -111,6 +114,7 @@ class RequestState: temperature: float | None = None, ): self.request_id = request_id + self.external_req_id = external_req_id self.parent_req = parent_req self.request_index = request_index self.lora_request = lora_request @@ -176,8 +180,10 @@ class RequestState: assert request.pooling_params is not None output_kind = request.pooling_params.output_kind + assert request.external_req_id is not None return cls( request_id=request.request_id, + external_req_id=request.external_req_id, parent_req=parent_req, request_index=request_index, lora_request=request.lora_request, @@ -235,10 +241,13 @@ class RequestState: ] self.sent_tokens_offset = len(self.detokenizer.output_token_ids) - request_id = self.request_id + 
external_req_id = self.external_req_id + if pooling_output is not None: return self._new_request_output( - request_id, [self._new_pooling_output(pooling_output)], finished + external_req_id, + [self._new_pooling_output(pooling_output)], + finished, ) output = self._new_completion_output(new_token_ids, finish_reason, stop_reason) @@ -246,19 +255,18 @@ class RequestState: if self.parent_req is None: outputs = [output] else: - request_id, outputs, finished = self.parent_req.get_outputs( - request_id, output - ) + outputs, finished = self.parent_req.get_outputs(self.request_id, output) if not outputs: return None + external_req_id = self.parent_req.external_req_id return self._new_request_output( - request_id, outputs, finished, kv_transfer_params + external_req_id, outputs, finished, kv_transfer_params ) def _new_request_output( self, - request_id: str, + external_req_id: str, outputs: list[CompletionOutput] | list[PoolingOutput], finished: bool, kv_transfer_params: dict[str, Any] | None = None, @@ -269,7 +277,7 @@ class RequestState: # Prompt embeddings are currently not supported by pooling requests. 
assert self.prompt_token_ids is not None return PoolingRequestOutput( - request_id=request_id, + request_id=external_req_id, outputs=first_output, num_cached_tokens=self.num_cached_tokens, prompt_token_ids=self.prompt_token_ids, @@ -288,7 +296,7 @@ class RequestState: prompt_token_ids = [0] * len(self.prompt_embeds) return RequestOutput( - request_id=request_id, + request_id=external_req_id, # request_id is what was provided externally lora_request=self.lora_request, prompt=self.prompt, prompt_token_ids=prompt_token_ids, @@ -352,6 +360,7 @@ class OutputProcessor: self.stream_interval = stream_interval self.request_states: dict[str, RequestState] = {} self.parent_requests: dict[str, ParentRequest] = {} + self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list) self.lora_states = LoRARequestStates(log_stats) self.tracer: Tracer | None = None self._requests_drained = asyncio.Event() @@ -375,12 +384,41 @@ class OutputProcessor: assert state.queue is not None state.queue.put(e) - def abort_requests( - self, - request_ids: Iterable[str], - ) -> list[str]: - request_ids_to_abort = [] + def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]: + """Abort a list of requests. + + The request_ids may be either external request IDs (those passed to + InputProcessor.process_inputs()) or internal request IDs (those randomly + generated when creating the EngineCoreRequest). + + If an external request ID is provided, and that external request ID + was used for multiple requests, all requests associated with that external + request ID are aborted. + + In the case of parallel sampling, a request ID may be used to identify + a parent request, in which case the associated child requests are aborted + also. 
+ """ + + internal_req_ids = [] for request_id in request_ids: + if internal: + # Internal ID - this may be a parent request + internal_req_ids.append(request_id) + + # Remove internal ID from the external->internal mapping + if req_state := self.request_states.get(request_id): + external_req_id = req_state.external_req_id + internal_ids = self.external_req_ids[external_req_id] + internal_ids.remove(request_id) + if not internal_ids: + del self.external_req_ids[external_req_id] + elif internal_ids := self.external_req_ids.pop(request_id, []): + # External ID - abort all requests in the external->internal mapping + internal_req_ids.extend(internal_ids) + + request_ids_to_abort = [] + for request_id in internal_req_ids: req_state = self.request_states.pop(request_id, None) if req_state is not None: self.lora_states.request_finished(request_id, req_state.lora_name) @@ -404,7 +442,7 @@ class OutputProcessor: # Abort children prior to removing the parent. if parent.child_requests: child_reqs = list(parent.child_requests) - child_reqs = self.abort_requests(child_reqs) + child_reqs = self.abort_requests(child_reqs, internal=True) request_ids_to_abort.extend(child_reqs) self.parent_requests.pop(request_id, None) if not self.request_states: @@ -439,6 +477,9 @@ class OutputProcessor: if parent_req: self.parent_requests[parent_req.request_id] = parent_req + # Track the external_req_id -> [internal_req_id, ...] mapping + self.external_req_ids[req_state.external_req_id].append(request_id) + def process_outputs( self, engine_core_outputs: list[EngineCoreOutput], @@ -522,6 +563,12 @@ class OutputProcessor: # Free completed requests. if finish_reason is not None: self.request_states.pop(req_id) + + internal_ids = self.external_req_ids[req_state.external_req_id] + internal_ids.remove(req_id) + if not internal_ids: + del self.external_req_ids[req_state.external_req_id] + # Remove parent request if applicable. 
parent_req = req_state.parent_req if parent_req and not parent_req.child_requests: @@ -597,7 +644,9 @@ class OutputProcessor: ) # meta - span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, req_state.request_id) + span.set_attribute( + SpanAttributes.GEN_AI_REQUEST_ID, req_state.external_req_id + ) if req_state.top_p: span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, req_state.top_p) if req_state.max_tokens_param: diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 59aacd1963076..b7761970ba92f 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -6,6 +6,7 @@ from typing import Optional, cast from vllm.outputs import CompletionOutput from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import IterationStats @@ -17,6 +18,7 @@ class ParentRequest: """ request_id: str + external_req_id: str sampling_params: SamplingParams # To track the completion of child requests @@ -31,8 +33,11 @@ class ParentRequest: # To efficiently obtain child sampling params cached_child_sampling_params: SamplingParams | None - def __init__(self, request_id: str, sampling_params: SamplingParams) -> None: - self.request_id = request_id + def __init__(self, request: EngineCoreRequest) -> None: + assert request.external_req_id is not None + sampling_params = request.params + self.request_id = request.request_id + self.external_req_id = request.external_req_id self.sampling_params = sampling_params self.child_requests = set() @@ -96,7 +101,7 @@ class ParentRequest: self, child_request_id: str, completion_output: CompletionOutput, - ) -> tuple[str, list[CompletionOutput], bool]: + ) -> tuple[list[CompletionOutput], bool]: already_finished_and_returned: bool = False if completion_output.finished(): if child_request_id in self.child_requests: @@ -118,7 +123,7 @@ class ParentRequest: outputs = [] if self.child_requests else 
self.output_aggregator finished = not self.child_requests - return self.request_id, outputs, finished + return outputs, finished def observe_num_generation_tokens(self, num_generation_tokens: int): self.max_num_generation_tokens = max( diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 751862aa9c767..7370f0aefafb4 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -80,17 +80,20 @@ class AttentionSpec(KVCacheSpec): @dataclass(frozen=True) class FullAttentionSpec(AttentionSpec): - sliding_window: int | None = None - attention_chunk_size: int | None = None """ - When hybrid allocator is disabled and the model contains both full - attention layers and sliding window attention layers, sliding - window attention are regarded as full attention in KV cache manager - (blocks are allocated for all tokens), while computed as sliding window + When hybrid allocator is disabled and the model contains both full + attention layers and sliding window attention layers, sliding + window attention are regarded as full attention in KV cache manager + (blocks are allocated for all tokens), while computed as sliding window attention in model runner. In this case, we use FullAttentionSpec and record the sliding window size. + """ + + sliding_window: int | None = None + """ Default to None for not using sliding window attention. """ + attention_chunk_size: int | None = None def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len @@ -390,10 +393,11 @@ class KVCacheConfig: The KV cache configuration of a model. 
""" - """The number of KV cache blocks""" num_blocks: int - """How should model runner initialize the KV cache tensors for each layer""" + """The number of KV cache blocks""" kv_cache_tensors: list[KVCacheTensor] + """How should model runner initialize the KV cache tensors for each layer""" + kv_cache_groups: list[KVCacheGroupSpec] """ The kv cache groups of the model. For models with only one type of attention, there is only one group that @@ -401,4 +405,3 @@ class KVCacheConfig: For models with multiple types of attention, there will be multiple groups, see `_get_kv_cache_config_uniform_page_size` for more details. """ - kv_cache_groups: list[KVCacheGroupSpec] diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 65a0a88ec0f5d..66697132b365c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -236,6 +236,7 @@ class EagleProposer: common_attn_metadata: CommonAttentionMetadata, sampling_metadata: SamplingMetadata, mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None, + num_rejected_tokens_gpu: torch.Tensor | None = None, ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] batch_size = next_token_ids.shape[0] @@ -414,6 +415,17 @@ class EagleProposer: common_attn_metadata.query_start_loc_cpu = torch.from_numpy( self.token_arange_np[: batch_size + 1] ).clone() + + # In padded drafter batch, we need to adjust the sequence lengths + # to remove the "padding" (i.e. rejected tokens). + # Only apply this adjustment when we have rejected tokens + # (i.e., not the first proposal). + if self.num_speculative_tokens > 1 and num_rejected_tokens_gpu is not None: + common_attn_metadata.seq_lens -= num_rejected_tokens_gpu + # Invalidate the CPU-side shadows to avoid H<>D sync. + common_attn_metadata._seq_lens_cpu = None + common_attn_metadata._num_computed_tokens_cpu = None + for token_index in range(self.num_speculative_tokens - 1): # Update the inputs. 
# cast to int32 is crucial when eagle model is compiled. @@ -628,13 +640,14 @@ class EagleProposer: common_attn_metadata: CommonAttentionMetadata, spec_decode_metadata: SpecDecodeMetadata, valid_sampled_tokens_count: torch.Tensor, - ) -> tuple[CommonAttentionMetadata, torch.Tensor]: + ) -> tuple[CommonAttentionMetadata, torch.Tensor, torch.Tensor]: """ This function is used to prepare the inputs for speculative decoding It updates the common_attn_metadata for speculative decoding, but does not consider the rejected tokens. Instead, all tokens are included as inputs to the speculator, with the rejected tokens used as padding and filtered out later by `token_indices_to_sample`. + No blocking CPU operations should be introduced in this function. """ num_reqs = common_attn_metadata.num_reqs device = valid_sampled_tokens_count.device @@ -642,14 +655,17 @@ class EagleProposer: token_indices_to_sample = torch.empty( (num_reqs,), dtype=torch.int32, device=device ) + num_rejected_tokens_gpu = torch.empty( + (num_reqs,), dtype=torch.int32, device=device + ) - # Kernel grid: one program per request (row) grid = (num_reqs,) eagle_prepare_inputs_padded_kernel[grid]( spec_decode_metadata.cu_num_draft_tokens, valid_sampled_tokens_count, common_attn_metadata.query_start_loc, token_indices_to_sample, + num_rejected_tokens_gpu, num_reqs, ) @@ -674,7 +690,11 @@ class EagleProposer: dcp_local_seq_lens=common_attn_metadata.dcp_local_seq_lens, ) - return spec_common_attn_metadata, token_indices_to_sample + return ( + spec_common_attn_metadata, + token_indices_to_sample, + num_rejected_tokens_gpu, + ) def propose_tree( self, diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py index 9d4399d00487a..783b6ed5961bc 100644 --- a/vllm/v1/spec_decode/utils.py +++ b/vllm/v1/spec_decode/utils.py @@ -23,6 +23,7 @@ def eagle_prepare_inputs_padded_kernel( valid_sampled_tokens_count_ptr, # [num_reqs] query_start_loc_gpu_ptr, # [num_reqs + 1] token_indices_to_sample_ptr, # 
[num_reqs] (output) + num_rejected_tokens_gpu_ptr, # [num_reqs] (output) num_reqs, # tl.int32 ): """ @@ -56,6 +57,7 @@ def eagle_prepare_inputs_padded_kernel( index_to_sample = q_last_tok_idx - num_rejected_tokens tl.store(token_indices_to_sample_ptr + req_idx, index_to_sample) + tl.store(num_rejected_tokens_gpu_ptr + req_idx, num_rejected_tokens) @triton.jit diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 33be4dccfc710..16fc9fd7cb4d8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -61,11 +61,13 @@ from vllm.model_executor.layers.rotary_embedding import ( ) from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import ( + MultiModalEmbeddings, SupportsMRoPE, SupportsMultiModal, SupportsXDRoPE, is_mixture_of_experts, supports_eagle3, + supports_mm_encoder_only, supports_mrope, supports_multimodal_pruning, supports_transcription, @@ -77,11 +79,7 @@ from vllm.model_executor.models.interfaces_base import ( is_text_generation_model, ) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import ( - BatchedTensorInputs, - MultiModalKwargsItem, - PlaceholderRange, -) +from vllm.multimodal.inputs import BatchedTensorInputs, MultiModalKwargsItem from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType @@ -1641,7 +1639,10 @@ class GPUModelRunner( ) -> None: attn_group = self.attn_groups[kv_cache_gid][attn_gid] builder = attn_group.get_metadata_builder(ubid or 0) - cache_key = (kv_cache_groups[kv_cache_gid].kv_cache_spec, type(builder)) + kv_cache_spec = kv_cache_groups[kv_cache_gid].kv_cache_spec + if isinstance(kv_cache_spec, UniformTypeKVCacheSpecs): + kv_cache_spec = kv_cache_spec.kv_cache_specs[attn_group.layer_names[0]] + cache_key = (kv_cache_spec, type(builder)) cascade_attn_prefix_len = ( 
cascade_attn_prefix_lens[kv_cache_gid][attn_gid] @@ -2093,28 +2094,27 @@ class GPUModelRunner( ] return logits_indices_padded - def _batch_mm_kwargs_from_scheduler( + def _batch_mm_inputs_from_scheduler( self, scheduler_output: "SchedulerOutput", - ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]: - """Batch multimodal kwargs from scheduled encoder inputs. + ) -> tuple[list[str], list[MultiModalKwargsItem]]: + """Batch multimodal inputs from scheduled encoder inputs. Args: scheduler_output: The scheduler output containing scheduled encoder inputs. Returns: - A tuple of (mm_kwargs, req_ids_pos) where: - - mm_kwargs: List of multimodal kwargs items to be batched - - mm_hashes_pos: List of (mm_hash, position_info) tuples + A tuple of (mm_hashes, mm_kwargs) where: + - mm_hashes: List of multimodal hashes for each item + - mm_kwargs: List of multimodal kwargs for each item """ scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs if not scheduled_encoder_inputs: return [], [] - # Batch the multi-modal inputs. + + mm_hashes = list[str]() mm_kwargs = list[MultiModalKwargsItem]() - # list of tuple (mm_hash, position_info) - mm_hashes_pos = list[tuple[str, PlaceholderRange]]() for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): req_state = self.requests[req_id] @@ -2122,19 +2122,16 @@ class GPUModelRunner( mm_feature = req_state.mm_features[mm_input_id] if mm_feature.data is None: continue - mm_hash = mm_feature.identifier - mm_kwargs.append(mm_feature.data) - mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) - return mm_kwargs, mm_hashes_pos + mm_hashes.append(mm_feature.identifier) + mm_kwargs.append(mm_feature.data) + + return mm_hashes, mm_kwargs def _execute_mm_encoder( self, scheduler_output: "SchedulerOutput" ) -> list[torch.Tensor]: - # Batch the multi-modal inputs using the helper method. 
- mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler( - scheduler_output - ) + mm_hashes, mm_kwargs = self._batch_mm_inputs_from_scheduler(scheduler_output) if not mm_kwargs: return [] @@ -2153,7 +2150,7 @@ class GPUModelRunner( device=self.device, pin_memory=self.pin_memory, ): - curr_group_outputs: list[torch.Tensor] = [] + curr_group_outputs: MultiModalEmbeddings # EVS-related change. # (ekhvedchenia): Temporary hack to limit peak memory usage when @@ -2169,6 +2166,7 @@ class GPUModelRunner( and modality == "video" and num_items > 1 ): + curr_group_outputs_lst = list[torch.Tensor]() for video_mm_kwargs_item in filter( lambda item: item.modality == "video", mm_kwargs ): @@ -2184,7 +2182,9 @@ class GPUModelRunner( **micro_batch_mm_inputs ) - curr_group_outputs.extend(micro_batch_outputs) + curr_group_outputs_lst.extend(micro_batch_outputs) + + curr_group_outputs = curr_group_outputs_lst else: # Run the encoder. # `curr_group_outputs` is either of the following: @@ -2193,7 +2193,7 @@ class GPUModelRunner( # 2. A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. 
- curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) # type: ignore[assignment] + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -2202,7 +2202,7 @@ class GPUModelRunner( encoder_outputs.extend(curr_group_outputs) # Cache the encoder outputs by mm_hash - for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs): + for mm_hash, output in zip(mm_hashes, encoder_outputs): self.encoder_cache[mm_hash] = output logger.debug("Finish execute for mm hash %s", mm_hash) self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash) @@ -2453,6 +2453,17 @@ class GPUModelRunner( return round_up(num_scheduled_tokens, tp_size) return num_scheduled_tokens + def _prepare_mm_inputs( + self, num_tokens: int + ) -> tuple[torch.Tensor | None, torch.Tensor]: + if self.model.requires_raw_input_tokens: + input_ids = self.input_ids.gpu[:num_tokens] + else: + input_ids = None + + inputs_embeds = self.inputs_embeds.gpu[:num_tokens] + return input_ids, inputs_embeds + def _preprocess( self, scheduler_output: "SchedulerOutput", @@ -2495,8 +2506,7 @@ class GPUModelRunner( # TODO(woosuk): Avoid the copy. Optimize. self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(inputs_embeds_scheduled) - input_ids = None - inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] + input_ids, inputs_embeds = self._prepare_mm_inputs(num_input_tokens) model_kwargs = { **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), @@ -3530,6 +3540,7 @@ class GPUModelRunner( next_token_ids, valid_sampled_tokens_count ) + num_rejected_tokens_gpu = None if spec_decode_metadata is None: token_indices_to_sample = None # input_ids can be None for multimodal models. 
@@ -3560,12 +3571,14 @@ class GPUModelRunner( else: target_hidden_states = hidden_states[token_indices] else: - common_attn_metadata, token_indices_to_sample = ( - self.drafter.prepare_inputs_padded( - common_attn_metadata, - spec_decode_metadata, - valid_sampled_tokens_count, - ) + ( + common_attn_metadata, + token_indices_to_sample, + num_rejected_tokens_gpu, + ) = self.drafter.prepare_inputs_padded( + common_attn_metadata, + spec_decode_metadata, + valid_sampled_tokens_count, ) total_num_tokens = common_attn_metadata.num_actual_tokens # When padding the batch, token_indices is just a range @@ -3596,6 +3609,7 @@ class GPUModelRunner( sampling_metadata=sampling_metadata, common_attn_metadata=common_attn_metadata, mm_embed_inputs=mm_embed_inputs, + num_rejected_tokens_gpu=num_rejected_tokens_gpu, ) return draft_token_ids @@ -4064,6 +4078,11 @@ class GPUModelRunner( remove_lora: If False, dummy LoRAs are not destroyed after the run activate_lora: If False, dummy_run is performed without LoRAs. """ + if supports_mm_encoder_only(self.model): + # The current dummy run only covers LM execution, so we can skip it. + # mm encoder dummy run may need to add in the future. + return torch.tensor([]), torch.tensor([]) + assert ( cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() @@ -4207,8 +4226,8 @@ class GPUModelRunner( assert num_tokens_padded <= self.max_num_tokens model_kwargs = self._init_model_kwargs(num_tokens_padded) if self.supports_mm_inputs and not self.model_config.is_encoder_decoder: - input_ids = None - inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] + input_ids, inputs_embeds = self._prepare_mm_inputs(num_tokens_padded) + model_kwargs = { **model_kwargs, **self._dummy_mm_kwargs(num_reqs), @@ -4341,6 +4360,11 @@ class GPUModelRunner( # The dummy hidden states may contain special values, # like `inf` or `nan`. # To avoid breaking the sampler, we use a random tensor here instead. 
+ + if supports_mm_encoder_only(self.model): + # MM Encoder only model no need to run sampler. + return torch.tensor([]) + hidden_states = torch.rand_like(hidden_states) logits = self.model.compute_logits(hidden_states) @@ -4469,6 +4493,10 @@ class GPUModelRunner( self, hidden_states: torch.Tensor, ) -> PoolerOutput: + if supports_mm_encoder_only(self.model): + # MM Encoder only model not need to run pooler. + return torch.tensor([]) + # Find the task that has the largest output for subsequent steps supported_pooling_tasks = self.get_supported_pooling_tasks() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index bc71351d2cc55..4747388e22b3d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -634,7 +634,12 @@ class Worker(WorkerBase): def profile(self, is_start: bool = True): if self.profiler is None: - raise RuntimeError("Profiling is not enabled.") + raise RuntimeError( + "Profiling is not enabled. Please set --profiler-config to enable " + "profiling. Example: " + "'--profiler-config.profiler=torch --profiler-config.torch_profiler_dir" + "=YOUR_DIR_PATH_TO_DUMP_TRACE'" + ) if is_start: self.profiler.start() else: