diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml index 46f1a9fbf6ff9..6c0b5540cbb6a 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml @@ -8,3 +8,4 @@ tasks: value: 0.80 limit: 250 # will run on 250 * 14 subjects = 3500 samples num_fewshot: 5 +rtol: 0.05 diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt new file mode 100644 index 0000000000000..4fb0b84bc4d81 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 3627b760eddcf..f94d681197d2d 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \ --tp-size=1 """ +import os +from contextlib import contextmanager + import lm_eval import numpy as np import yaml -RTOL = 0.08 +DEFAULT_RTOL = 0.08 + + +@contextmanager +def scoped_env_vars(new_env: dict[str, str]): + if not new_env: + # Fast path: nothing to do + yield + return + + old_values = {} + new_keys = [] + + try: + for key, value in new_env.items(): + if key in os.environ: + old_values[key] = os.environ[key] + else: + new_keys.append(key) + os.environ[key] = str(value) + yield + finally: + # Restore / clean up + for key, value in old_values.items(): + os.environ[key] = value + for key in new_keys: + os.environ.pop(key, None) def launch_lm_eval(eval_config, tp_size): @@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size): f"trust_remote_code={trust_remote_code}," 
f"max_model_len={max_model_len}," ) - results = lm_eval.simple_evaluate( - model=backend, - model_args=model_args, - tasks=[task["name"] for task in eval_config["tasks"]], - num_fewshot=eval_config["num_fewshot"], - limit=eval_config["limit"], - # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help - # text models. however, this is regressing measured strict-match for - # existing text models in CI, so only apply it for mm, or explicitly set - apply_chat_template=eval_config.get( - "apply_chat_template", backend == "vllm-vlm" - ), - fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), - # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) - gen_kwargs=eval_config.get("gen_kwargs"), - batch_size=batch_size, - ) + + env_vars = eval_config.get("env_vars", None) + with scoped_env_vars(env_vars): + results = lm_eval.simple_evaluate( + model=backend, + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help + # text models. however, this is regressing measured strict-match for + # existing text models in CI, so only apply it for mm, or explicitly set + apply_chat_template=eval_config.get( + "apply_chat_template", backend == "vllm-vlm" + ), + fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), + # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) 
+ gen_kwargs=eval_config.get("gen_kwargs"), + batch_size=batch_size, + ) return results @@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size): results = launch_lm_eval(eval_config, tp_size) + rtol = eval_config.get("rtol", DEFAULT_RTOL) + success = True for task in eval_config["tasks"]: for metric in task["metrics"]: @@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size): measured_value = results["results"][task["name"]][metric["name"]] print( f"{task['name']} | {metric['name']}: " - f"ground_truth={ground_truth} | measured={measured_value}" + f"ground_truth={ground_truth:.3f} | " + f"measured={measured_value:.3f} | rtol={rtol}" ) - success = success and np.isclose(ground_truth, measured_value, rtol=RTOL) + success = success and np.isclose(ground_truth, measured_value, rtol=rtol) assert success diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index 4d28ec9619de9..f10cb2f0b6e21 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -9,6 +9,7 @@ import argparse import json import sys from dataclasses import asdict, dataclass +from datetime import datetime from pathlib import Path from typing import Any from urllib.parse import quote @@ -20,6 +21,7 @@ if not sys.version_info >= (3, 12): INDEX_HTML_TEMPLATE = """ + {items} @@ -90,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo: ) -def generate_project_list(subdir_names: list[str]) -> str: +def generate_project_list(subdir_names: list[str], comment: str = "") -> str: """ Generate project list HTML content linking to each project & variant sub-directory. """ @@ -98,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str: for name in sorted(subdir_names): name = name.strip("/").strip(".") href_tags.append(f' {name}/
') - return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) + return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment) def generate_package_index_and_metadata( - wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path + wheel_files: list[WheelFileInfo], + wheel_base_dir: Path, + index_base_dir: Path, + comment: str = "", ) -> tuple[str, str]: """ Generate package index HTML content for a specific package, linking to actual wheel files. @@ -120,7 +125,7 @@ def generate_package_index_and_metadata( file_meta = asdict(file) file_meta["path"] = file_path_quoted metadata.append(file_meta) - index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) + index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment) metadata_str = json.dumps(metadata, indent=2) return index_str, metadata_str @@ -131,6 +136,7 @@ def generate_index_and_metadata( index_base_dir: Path, default_variant: str | None = None, alias_to_default: str | None = None, + comment: str = "", ): """ Generate index for all wheel files. @@ -141,6 +147,7 @@ def generate_index_and_metadata( index_base_dir (Path): Base directory to store index files. default_variant (str | None): The default variant name, if any. alias_to_default (str | None): Alias variant name for the default variant, if any. + comment (str | None): Optional comment to include in the generated HTML files. First, parse all wheel files to extract metadata. We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). 
@@ -234,6 +241,10 @@ def generate_index_and_metadata( variant_to_files[alias_to_default] = variant_to_files["default"].copy() print(f"Alias variant '{alias_to_default}' created for default variant.") + # Generate comment in HTML header + comment_str = f" ({comment})" if comment else "" + comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}" + # Generate index for each variant subdir_names = set() for variant, files in variant_to_files.items(): @@ -253,7 +264,7 @@ def generate_index_and_metadata( subdir_names = subdir_names.union(packages) else: # generate project list for this variant directly - project_list_str = generate_project_list(sorted(packages)) + project_list_str = generate_project_list(sorted(packages), comment_tmpl) with open(variant_dir / "index.html", "w") as f: f.write(project_list_str) @@ -263,7 +274,7 @@ def generate_index_and_metadata( package_dir = variant_dir / package package_dir.mkdir(parents=True, exist_ok=True) index_str, metadata_str = generate_package_index_and_metadata( - package_files, wheel_base_dir, package_dir + package_files, wheel_base_dir, package_dir, comment ) with open(package_dir / "index.html", "w") as f: f.write(index_str) @@ -271,7 +282,7 @@ def generate_index_and_metadata( f.write(metadata_str) # Generate top-level project list index - project_list_str = generate_project_list(sorted(subdir_names)) + project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl) with open(index_base_dir / "index.html", "w") as f: f.write(project_list_str) @@ -283,6 +294,7 @@ if __name__ == "__main__": --current-objects : path to JSON file containing current S3 objects listing in this version directory --output-dir : directory to store generated index files --alias-to-default : (optional) alias variant name for the default variant + --comment : (optional) comment string to include in generated HTML files """ parser = argparse.ArgumentParser( @@ -312,6 +324,12 @@ if __name__ == "__main__": default=None, 
help="Alias variant name for the default variant", ) + parser.add_argument( + "--comment", + type=str, + default="", + help="Optional comment string to include in generated HTML files", + ) args = parser.parse_args() @@ -366,5 +384,6 @@ if __name__ == "__main__": index_base_dir=index_base_dir, default_variant=None, alias_to_default=args.alias_to_default, + comment=args.comment.strip(), ) print(f"Successfully generated index and metadata in {output_dir}") diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 0ac8fdd45bd0a..8e38ace0bfbc2 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -81,7 +81,10 @@ else alias_arg="" fi -$PYTHON pip install regex && .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg +# HACK: we do not need regex module here, but it is required by pre-commit hook +# To avoid any external dependency, we simply replace it back to the stdlib re module +sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py +$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg # copy indices to // unconditionally echo "Uploading indices to $S3_COMMIT_PREFIX" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 022b6ea236d54..6950ad774edd8 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -718,17 +718,6 @@ steps: - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - source_file_dependencies: - - csrc/ - - 
vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - - label: OpenAI API correctness # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] @@ -974,19 +963,6 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work -- label: Multi-Modal Accuracy Eval (Small Models) # 10min - timeout_in_minutes: 70 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - - label: Multi-Modal Models Test (Extended) 1 # 60min timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] @@ -1162,21 +1138,6 @@ steps: # Run all e2e fusion tests - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -- label: ROCm GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - agent_pool: mi325_1 - mirror_hardwares: [amdexperimental, amdproduction] - optional: true # run on nightlies - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 
working_dir: "/vllm-workspace/" @@ -1194,16 +1155,6 @@ steps: commands: - pytest -s -v tests/quantization/test_blackwell_moe.py -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1380,7 +1331,7 @@ steps: - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - # Disabled for now because MXFP4 backend on non-cuda platform + # Disabled for now because MXFP4 backend on non-cuda platform # doesn't support LoRA yet #- pytest -v -s -x lora/test_gptoss_tp.py @@ -1446,37 +1397,6 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py -- label: LM Eval Large Models # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: a100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -##### H100 test ##### -- label: LM Eval Large Models (H100) # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py 
--config-list-file=configs/models-large-hopper.txt --tp-size=4 - ##### H200 test ##### - label: Distributed Tests (H200) # optional mirror_hardwares: [amdexperimental] @@ -1508,20 +1428,94 @@ steps: - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/v1/distributed/test_dbo.py -##### RL Integration Tests ##### -- label: Prime-RL Integration Test # 15min - mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 +##### E2E Eval Tests ##### +- label: LM Eval Small Models (1 Card) # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 # grade: Blocking - timeout_in_minutes: 30 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh + - csrc/ + - vllm/model_executor/layers/quantization commands: - - bash .buildkite/scripts/run-prime-rl-test.sh + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + +- label: Multi-Modal Accuracy Eval (Small Models) # 10min + timeout_in_minutes: 70 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + +- label: LM Eval Large Models (4 Card) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: a100 + optional: true + num_gpus: 4 + working_dir: 
"/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (H100) # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + +- label: ROCm LM Eval Large Models (8 Card) + mirror_hardwares: [amdproduction] + agent_pool: mi325_8 + num_gpus: 8 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 + +- label: ROCm GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + agent_pool: mi325_1 + mirror_hardwares: [amdexperimental, amdproduction] + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - label: DeepSeek V2-Lite Accuracy mirror_hardwares: [amdexperimental, amdproduction] @@ -1554,4 +1548,19 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash 
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + +##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a79f0b0c6bbdf..0a99994e243ae 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -350,7 +350,8 @@ steps: timeout_in_minutes: 25 gpu: h100 source_file_dependencies: - - vllm/ + - vllm/v1/attention + - vllm/model_executor/layers - tests/v1/determinism/ commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn diff --git a/docs/contributing/ci/nightly_builds.md b/docs/contributing/ci/nightly_builds.md new file mode 100644 index 0000000000000..a07b9c1c2fa4a --- /dev/null +++ b/docs/contributing/ci/nightly_builds.md @@ -0,0 +1,160 @@ +# Nightly Builds of vLLM Wheels + +vLLM maintains a per-commit wheel repository (commonly referred to as "nightly") at `https://wheels.vllm.ai` that provides pre-built wheels for every commit on the `main` branch since `v0.5.3`. This document explains how the nightly wheel index mechanism works. + +## Build and Upload Process on CI + +### Wheel Building + +Wheels are built in the `Release` pipeline (`.buildkite/release-pipeline.yaml`) after a PR is merged into the main branch, with multiple variants: + +- **Backend variants**: `cpu` and `cuXXX` (e.g., `cu129`, `cu130`). +- **Architecture variants**: `x86_64` and `aarch64`. + +Each build step: + +1. Builds the wheel in a Docker container. +2. 
Renames the wheel filename to use the correct manylinux tag (currently `manylinux_2_31`) for PEP 600 compliance. +3. Uploads the wheel to S3 bucket `vllm-wheels` under `/{commit_hash}/`. + +### Index Generation + +After uploading each wheel, the `.buildkite/scripts/upload-wheels.sh` script: + +1. **Lists all existing wheels** in the commit directory from S3 +2. **Generates indices** using `.buildkite/scripts/generate-nightly-index.py`: + - Parses wheel filenames to extract metadata (version, variant, platform tags). + - Creates HTML index files (`index.html`) for PyPI compatibility. + - Generates machine-readable `metadata.json` files. +3. **Uploads indices** to multiple locations (overriding existing ones): + - `/{commit_hash}/` - Always uploaded for commit-specific access. + - `/nightly/` - Only for commits on `main` branch (not PRs). + - `/{version}/` - Only for release wheels (no `dev` in its version). + +!!! tip "Handling Concurrent Builds" + The index generation script can handle multiple variants being built concurrently by always listing all wheels in the commit directory before generating indices, avoiding race conditions. 
+ +## Directory Structure + +The S3 bucket structure follows this pattern: + +```text +s3://vllm-wheels/ +├── {commit_hash}/ # Commit-specific wheels and indices +│ ├── vllm-*.whl # All wheel files +│ ├── index.html # Project list (default variant) +│ ├── vllm/ +│ │ ├── index.html # Package index (default variant) +│ │ └── metadata.json # Metadata (default variant) +│ ├── cu129/ # Variant subdirectory +│ │ ├── index.html # Project list (cu129 variant) +│ │ └── vllm/ +│ │ ├── index.html # Package index (cu129 variant) +│ │ └── metadata.json # Metadata (cu129 variant) +│ ├── cu130/ # Variant subdirectory +│ ├── cpu/ # Variant subdirectory +│ └── .../ # More variant subdirectories +├── nightly/ # Latest main branch wheels (mirror of latest commit) +└── {version}/ # Release version indices (e.g., 0.11.2) +``` + +All built wheels are stored in `/{commit_hash}/`, while different indices are generated and reference them. +This avoids duplication of wheel files. + +For example, you can specify the following URLs to use different indices: + +- `https://wheels.vllm.ai/nightly/cu130` for the latest main branch wheels built with CUDA 13.0. +- `https://wheels.vllm.ai/{commit_hash}` for wheels built at a specific commit (default variant). +- `https://wheels.vllm.ai/0.12.0/cpu` for 0.12.0 release wheels built for CPU variant. + +Please note that not all variants are present on every commit. The available variants are subject to change over time, e.g., changing cu130 to cu131. + +### Variant Organization + +Indices are organized by variant: + +- **Default variant**: Wheels without variant suffix (i.e., built with the current `VLLM_MAIN_CUDA_VERSION`) are placed in the root. +- **Variant subdirectories**: Wheels with variant suffixes (e.g., `+cu130`, `.cpu`) are organized in subdirectories. +- **Alias to default**: The default variant can have an alias (e.g., `cu129` for now) for consistency and convenience. 
+ +The variant is extracted from the wheel filename (as described in the [file name convention](https://packaging.python.org/en/latest/specifications/binary-distribution-format/#file-name-convention)): + +- The variant is encoded in the local version identifier (e.g. `+cu129` or `dev+g.cu130`). +- Examples: + - `vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl` → default variant + - `vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl` → `cu129` variant + - `vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl` → `cu130` variant + +## Index Generation Details + +The `generate-nightly-index.py` script performs the following: + +1. **Parses wheel filenames** using regex to extract: + - Package name + - Version (with variant extracted) + - Python tag, ABI tag, platform tag + - Build tag (if present) +2. **Groups wheels by variant**, then by package name: + - Currently only `vllm` is built, but the structure supports multiple packages in the future. +3. **Generates HTML indices** (compliant with the [Simple repository API](https://packaging.python.org/en/latest/specifications/simple-repository-api/#simple-repository-api)): + - Top-level `index.html`: Lists all packages and variant subdirectories + - Package-level `index.html`: Lists all wheel files for that package + - Uses relative paths to wheel files for portability +4. **Generates metadata.json**: + - Machine-readable JSON containing all wheel metadata + - Includes `path` field with URL-encoded relative path to wheel file + - Used by `setup.py` to locate compatible pre-compiled wheels during Python-only builds + +### Special Handling for AWS Services + +The wheels and indices are directly stored on AWS S3, and we use AWS CloudFront as a CDN in front of the S3 bucket. 
+ +Since S3 does not provide proper directory listing, to support PyPI-compatible simple repository API behavior, we deploy a CloudFront Function that: + +- redirects any URL that does not end with `/` and does not look like a file (i.e., does not contain a dot `.` in the last path segment) to the same URL with a trailing `/` +- appends `/index.html` to any URL that ends with `/` + +For example, the following requests would be handled as: + +- `/nightly` -> `/nightly/index.html` +- `/nightly/cu130/` -> `/nightly/cu130/index.html` +- `/nightly/index.html` or `/nightly/vllm.whl` -> unchanged + +!!! note "AWS S3 Filename Escaping" + + S3 will automatically escape filenames upon upload according to its [naming rule](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html). The direct impact on vllm is that `+` in filenames will be converted to `%2B`. We take special care in the index generation script to escape filenames properly when generating the HTML indices and JSON metadata, to ensure the URLs are correct and can be directly used. + +## Usage of precompiled wheels in `setup.py` {#precompiled-wheels-usage} + +When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script: + +1. **Determines wheel location** via `precompiled_wheel_utils.determine_wheel_url()`: + - Env var `VLLM_PRECOMPILED_WHEEL_LOCATION` (user-specified URL/path) always takes precedence and skips all other steps. + - Determines the variant from `VLLM_MAIN_CUDA_VERSION` (can be overridden with env var `VLLM_PRECOMPILED_WHEEL_VARIANT`); the default variant will also be tried as a fallback. + - Determines the _base commit_ (explained later) of this branch (can be overridden with env var `VLLM_PRECOMPILED_WHEEL_COMMIT`). +2. **Fetches metadata** from `https://wheels.vllm.ai/{commit}/vllm/metadata.json` (for the default variant) or `https://wheels.vllm.ai/{commit}/{variant}/vllm/metadata.json` (for a specific variant). +3. 
**Selects compatible wheel** based on: + - Package name (`vllm`) + - Platform tag (architecture match) +4. **Downloads and extracts** precompiled binaries from the wheel: + - C++ extension modules (`.so` files) + - Flash Attention Python modules + - Triton kernel Python files +5. **Patches package_data** to include extracted files in the installation + +!!! note "What is the base commit?" + + The base commit is determined by finding the merge-base + between the current branch and upstream `main`, ensuring + compatibility between source code and precompiled binaries. + +_Note: it's users' responsibility to ensure there is no native code (e.g., C++ or CUDA) changes before using precompiled wheels._ + +## Implementation Files + +Key files involved in the nightly wheel mechanism: + +- **`.buildkite/release-pipeline.yaml`**: CI pipeline that builds wheels +- **`.buildkite/scripts/upload-wheels.sh`**: Script that uploads wheels and generates indices +- **`.buildkite/scripts/generate-nightly-index.py`**: Python script that generates PyPI-compatible indices +- **`setup.py`**: Contains `precompiled_wheel_utils` class for fetching and using precompiled wheels diff --git a/docs/deployment/integrations/kthena.md b/docs/deployment/integrations/kthena.md new file mode 100644 index 0000000000000..483dd7474440b --- /dev/null +++ b/docs/deployment/integrations/kthena.md @@ -0,0 +1,333 @@ +# Kthena + +[**Kthena**](https://github.com/volcano-sh/kthena) is a Kubernetes-native LLM inference platform that transforms how organizations deploy and manage Large Language Models in production. Built with declarative model lifecycle management and intelligent request routing, it provides high performance and enterprise-grade scalability for LLM inference workloads. + +This guide shows how to deploy a production-grade, **multi-node vLLM** service on Kubernetes. + +We’ll: + +- Install the required components (Kthena + Volcano). +- Deploy a multi-node vLLM model via Kthena’s `ModelServing` CR. 
+- Validate the deployment. + +--- + +## 1. Prerequisites + +You need: + +- A Kubernetes cluster with **GPU nodes**. +- `kubectl` access with cluster-admin or equivalent permissions. +- **Volcano** installed for gang scheduling. +- **Kthena** installed with the `ModelServing` CRD available. +- A valid **Hugging Face token** if loading models from Hugging Face Hub. + +### 1.1 Install Volcano + +```bash +helm repo add volcano-sh https://volcano-sh.github.io/helm-charts +helm repo update +helm install volcano volcano-sh/volcano -n volcano-system --create-namespace +``` + +This provides the gang-scheduling and network topology features used by Kthena. + +### 1.2 Install Kthena + +```bash +helm install kthena oci://ghcr.io/volcano-sh/charts/kthena --version v0.1.0 --namespace kthena-system --create-namespace +``` + +- The `kthena-system` namespace is created. +- Kthena controllers and CRDs, including `ModelServing`, are installed and healthy. + +Validate: + +```bash +kubectl get crd | grep modelserving +``` + +You should see: + +```text +modelservings.workload.serving.volcano.sh ... +``` + +--- + +## 2. The Multi-Node vLLM `ModelServing` Example + +Kthena provides an example manifest to deploy a **multi-node vLLM cluster running Llama**. Conceptually this is equivalent to the vLLM production stack Helm deployment, but expressed with `ModelServing`. + +A simplified version of the example (`llama-multinode`) looks like: + +- `spec.replicas: 1` – one `ServingGroup` (one logical model deployment). +- `roles`: + - `entryTemplate` – defines **leader** pods that run: + - vLLM’s **multi-node cluster bootstrap script** (Ray cluster). + - vLLM **OpenAI-compatible API server**. + - `workerTemplate` – defines **worker** pods that join the leader’s Ray cluster. + +Key points from the example YAML: + +- **Image**: `vllm/vllm-openai:latest` (matches upstream vLLM images). 
+- **Command** (leader): + + ```yaml + command: + - sh + - -c + - > + bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2; + python3 -m vllm.entrypoints.openai.api_server + --port 8080 + --model meta-llama/Llama-3.1-405B-Instruct + --tensor-parallel-size 8 + --pipeline-parallel-size 2 + ``` + +- **Command** (worker): + + ```yaml + command: + - sh + - -c + - > + bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS) + ``` + +--- + +## 3. Deploying Multi-Node llama vLLM via Kthena + +### 3.1 Prepare the Manifest + +**Recommended**: use a Secret instead of a raw env var: + +```bash +kubectl create secret generic hf-token \ + -n default \ + --from-literal=HUGGING_FACE_HUB_TOKEN='' +``` + +### 3.2 Apply the `ModelServing` + +```bash +cat <---`. + +The first number indicates `ServingGroup`. The second (`405b`) is the `Role`. The remaining indices identify the pod within the role. + +--- + +## 6. Accessing the vLLM OpenAI-Compatible API + +Expose the entry via a Service: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: llama-multinode-openai + namespace: default +spec: + selector: + modelserving.volcano.sh/name: llama-multinode + modelserving.volcano.sh/entry: "true" + # optionally further narrow to leader role if you label it + ports: + - name: http + port: 80 + targetPort: 8080 + type: ClusterIP +``` + +Port-forward from your local machine: + +```bash +kubectl port-forward svc/llama-multinode-openai 30080:80 -n default +``` + +Then: + +- List models: + + ```bash + curl -s http://localhost:30080/v1/models + ``` + +- Send a completion request (mirroring vLLM production stack docs): + + ```bash + curl -X POST http://localhost:30080/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-405B-Instruct", + "prompt": "Once upon a time,", + "max_tokens": 10 + }' + ``` + +You should see an OpenAI-style response from vLLM. 
+ +--- + +## 7. Clean Up + +To remove the deployment and its resources: + +```bash +kubectl delete modelserving llama-multinode -n default +``` + +If you’re done with the entire stack: + +```bash +helm uninstall kthena -n kthena-system # or your Kthena release name +helm uninstall volcano -n volcano-system +``` diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index abffb7bc5f948..05814cbad9bfc 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -14,6 +14,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: - [InftyAI/llmaz](integrations/llmaz.md) - [KAITO](integrations/kaito.md) - [KServe](integrations/kserve.md) +- [Kthena](integrations/kthena.md) - [KubeRay](integrations/kuberay.md) - [kubernetes-sigs/lws](frameworks/lws.md) - [meta-llama/llama-stack](integrations/llamastack.md) diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md index e565f17da62ad..731e542a0307b 100644 --- a/docs/design/debug_vllm_compile.md +++ b/docs/design/debug_vllm_compile.md @@ -86,7 +86,7 @@ LLM(model, enforce_eager=True) ``` To turn off just torch.compile, pass `mode = NONE` to the compilation config. -(`-cc` is short for `--compilation_config`; `-O.*` dotted syntax is deprecated): +(`-cc` is short for `--compilation_config`): ```sh # Online diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 13264f6861b0c..28b5405871ac2 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -62,7 +62,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of - `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds. - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds. - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states. -- `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM. 
+- `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM. - `vllm:request_prompt_tokens` - Request prompt length. - `vllm:request_generation_tokens` - Request generation length. - `vllm:request_success` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached. diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 2b25dc7666c37..0adb32a7ac33c 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -443,6 +443,8 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd print(generated_text) ``` +For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features. + #### Audio Embeddings You can pass pre-computed audio embeddings similar to image embeddings: diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 08a0dd69efa90..3315c0949afca 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -18,6 +18,7 @@ vLLM currently supports the following reasoning models: | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ | | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ | | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ | +| [Holo2 series](https://huggingface.co/collections/Hcompany/holo2) | `holo2` | `json`, `regex` | ✅ | | [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | | [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) 
| `minimax_m2_append_think` | `json`, `regex` | ✅ | @@ -28,6 +29,7 @@ vLLM currently supports the following reasoning models: IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. DeepSeek-V3.1 tool calling is supported in non-thinking mode. + Holo2 reasoning is enabled by default. To disable it, you must also pass `thinking=False` in your `chat_template_kwargs`. ## Quickstart diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 394728b67eaa4..9d3d711c33797 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -58,10 +58,14 @@ schemathesis==3.39.15 # Evaluation and benchmarking lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d +jiwer==4.0.0 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test multiprocess==0.70.16 +# Required for v1/metrics/test_engine_logger_apis.py +ray[cgraph,default]>=2.48.0 + # Plugins test terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e torchgeo==0.7.0 diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index dc9c69bf58b95..3bd0b6609d88d 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -260,13 +260,18 @@ def test_deep_sleep_fp8_kvcache(): llm.sleep(level=2) used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline - assert used_bytes < 3 * GiB_bytes + + # Rocm uses more memory for CudaGraphs, so we add 2 GiB more for the threshold + rocm_extra_mem_bytes = 2 * GiB_bytes if current_platform.is_rocm() else 0 + mem_threshold_after_sleep = 3 * GiB_bytes + rocm_extra_mem_bytes + assert used_bytes < 
mem_threshold_after_sleep llm.wake_up(tags=["weights"]) llm.collective_rpc("reload_weights") used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline - assert used_bytes < 4 * GiB_bytes + mem_threshold_after_wake_up = 4 * GiB_bytes + rocm_extra_mem_bytes + assert used_bytes < mem_threshold_after_wake_up # now allocate kv cache and cuda graph memory llm.wake_up(tags=["kv_cache"]) diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index c65e5a25934d2..8fa305d6d72f5 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools +import multiprocessing import tempfile from contextlib import contextmanager @@ -137,3 +139,67 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch): artifacts = compiled_mod.aot_compiled_fn._artifacts guards_string = artifacts.compiled_fn.shape_env.format_guards() assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)" + + +@pytest.mark.skipif( + not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" +) +@use_vllm_config(make_vllm_config()) +def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch): + """ + Test that compiling gpt2 twice results in a cache hit and + capture torch dynamic symbol creations to ensure make_symbol + not called on cache hit. 
+ """ + + import torch.fx.experimental.symbolic_shapes as symbolic_shapes_module + from torch.utils._sympy.symbol import make_symbol + + from vllm import LLM + + create_symbol_counter = multiprocessing.Value("i", 0) + original_make_symbol = make_symbol + + @functools.wraps(original_make_symbol) + def counting_make_symbol(prefix, idx, **kwargs): + with create_symbol_counter.get_lock(): + create_symbol_counter.value += 1 + return original_make_symbol(prefix, idx, **kwargs) + + symbolic_shapes_module.make_symbol = counting_make_symbol + try: + with monkeypatch.context() as m, tempfile.TemporaryDirectory() as tmpdirname: + m.setenv("VLLM_CACHE_ROOT", tmpdirname) + m.setenv("VLLM_USE_AOT_COMPILE", "1") + # First compilation - initialize model and generate + llm_model = LLM( + model="gpt2", + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + ), + max_model_len=256, + ) + + llm_model.generate("Hello, my name is") + assert create_symbol_counter.value == 2 + create_symbol_counter.value = 0 + + # Clean up first model + del llm_model + + # Second compilation - should hit cache + m.setenv("VLLM_FORCE_AOT_LOAD", "1") + llm_model = LLM( + model="gpt2", + compilation_config=CompilationConfig( + mode=CompilationMode.VLLM_COMPILE, + ), + max_model_len=256, + ) + llm_model.generate("Hello, my name is") + + assert create_symbol_counter.value == 0 + + finally: + # Restore original method + symbolic_shapes_module.make_symbol = original_make_symbol diff --git a/tests/conftest.py b/tests/conftest.py index b20c9efef542a..204452b5835ce 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,7 +27,7 @@ import threading from collections.abc import Generator from contextlib import nullcontext from enum import Enum -from typing import Any, Callable, TypedDict, TypeVar, cast +from typing import Any, Callable, TypedDict, TypeVar, cast, TYPE_CHECKING import numpy as np import pytest @@ -67,6 +67,11 @@ from vllm.transformers_utils.utils import maybe_model_redirect 
from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_num_threads +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + from transformers.generation.utils import GenerateOutput + + logger = init_logger(__name__) _TEST_DIR = os.path.dirname(__file__) @@ -202,10 +207,7 @@ def dynamo_reset(): @pytest.fixture def example_prompts() -> list[str]: - prompts = [] - for filename in _TEST_PROMPTS: - prompts += _read_prompts(filename) - return prompts + return [prompt for filename in _TEST_PROMPTS for prompt in _read_prompts(filename)] @pytest.fixture @@ -224,10 +226,7 @@ class DecoderPromptType(Enum): @pytest.fixture def example_long_prompts() -> list[str]: - prompts = [] - for filename in _LONG_PROMPTS: - prompts += _read_prompts(filename) - return prompts + return [prompt for filename in _LONG_PROMPTS for prompt in _read_prompts(filename)] @pytest.fixture(scope="session") @@ -353,10 +352,13 @@ class HfRunner: trust_remote_code=trust_remote_code, ) else: - model = auto_cls.from_pretrained( - model_name, - trust_remote_code=trust_remote_code, - **model_kwargs, + model = cast( + nn.Module, + auto_cls.from_pretrained( + model_name, + trust_remote_code=trust_remote_code, + **model_kwargs, + ), ) # in case some unquantized custom models are not in same dtype @@ -374,10 +376,12 @@ class HfRunner: self.model = model if not skip_tokenizer_init: - self.tokenizer = AutoTokenizer.from_pretrained( - model_name, - dtype=dtype, - trust_remote_code=trust_remote_code, + self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = ( + AutoTokenizer.from_pretrained( + model_name, + dtype=dtype, + trust_remote_code=trust_remote_code, + ) ) # don't put this import at the top level @@ -495,7 +499,7 @@ class HfRunner: outputs: list[tuple[list[list[int]], list[str]]] = [] for inputs in all_inputs: - output_ids = self.model.generate( + output_ids: torch.Tensor = self.model.generate( 
**self.wrap_device(inputs), use_cache=True, **kwargs, @@ -505,8 +509,7 @@ class HfRunner: skip_special_tokens=True, clean_up_tokenization_spaces=False, ) - output_ids = output_ids.cpu().tolist() - outputs.append((output_ids, output_str)) + outputs.append((output_ids.cpu().tolist(), output_str)) return outputs def generate_greedy( @@ -574,7 +577,7 @@ class HfRunner: all_logprobs: list[list[torch.Tensor]] = [] for inputs in all_inputs: - output = self.model.generate( + output: "GenerateOutput" = self.model.generate( **self.wrap_device(inputs), use_cache=True, do_sample=False, @@ -656,7 +659,7 @@ class HfRunner: all_output_strs: list[str] = [] for inputs in all_inputs: - output = self.model.generate( + output: "GenerateOutput" = self.model.generate( **self.wrap_device(inputs), use_cache=True, do_sample=False, diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 7e4713b8aece0..3cb533dccd62c 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -16,16 +16,35 @@ from typing import Literal, NamedTuple import pytest import torch +from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k +from tests.utils import RemoteOpenAIServer, create_new_process_for_each_test from vllm.config.model import RunnerOption from vllm.logger import init_logger from ..models.registry import HF_EXAMPLE_MODELS -from ..utils import compare_two_settings, create_new_process_for_each_test logger = init_logger("test_context_parallel") VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" +CP_TEST_MODELS = [ + # TODO support other models + # [LANGUAGE GENERATION] + "deepseek-ai/DeepSeek-V2-Lite-Chat", + "Qwen/Qwen2.5-1.5B-Instruct", +] + +# GSM8K eval configuration +NUM_QUESTIONS = 256 # Fast eval for CI +NUM_SHOTS = 5 # Few-shot examples +# tp accuracy with 2% buffer +MIN_ACCURACY = { + # .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml + "deepseek-ai/DeepSeek-V2-Lite-Chat": 
0.64, + # .buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml + "Qwen/Qwen2.5-1.5B-Instruct": 0.52, +} + class ParallelSetup(NamedTuple): tp_size: int @@ -38,7 +57,6 @@ class ParallelSetup(NamedTuple): class CPTestOptions(NamedTuple): multi_node_only: bool - load_format: str | None = None attn_backend: str | None = None @@ -54,17 +72,20 @@ class CPTestSettings: *, tp_base: int = 4, pp_base: int = 1, - dcp_base: int = 1, + dcp_multipliers: list[float] | None = None, cp_kv_cache_interleave_size: int = 1, multi_node_only: bool = False, runner: RunnerOption = "auto", - load_format: str | None = None, attn_backend: str | None = None, ): parallel_setups = [] + if dcp_multipliers is None: + dcp_multipliers = [ + 0.5, + ] for eager_mode_val in [False]: for pp_multiplier in [1]: - for dcp_multiplier in [0.5, 1]: + for dcp_multiplier in dcp_multipliers: for chunked_prefill_val in [True]: parallel_setups.append( ParallelSetup( @@ -82,7 +103,6 @@ class CPTestSettings: runner=runner, test_options=CPTestOptions( multi_node_only=multi_node_only, - load_format=load_format, attn_backend=attn_backend, ), ) @@ -101,7 +121,24 @@ class CPTestSettings: ) -def _compare_cp_with_tp( +CP_TEXT_GENERATION_MODELS = { + "deepseek-ai/DeepSeek-V2-Lite-Chat": [ + CPTestSettings.detailed( + dcp_multipliers=[0.5, 1], cp_kv_cache_interleave_size=64 + ), + ], + "Qwen/Qwen2.5-1.5B-Instruct": [ + CPTestSettings.detailed( + cp_kv_cache_interleave_size=16, attn_backend="FLASH_ATTN" + ), + CPTestSettings.detailed( + cp_kv_cache_interleave_size=16, attn_backend="FLASHINFER" + ), + ], +} + + +def _test_cp_gsm8k( model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, @@ -121,7 +158,7 @@ def _compare_cp_with_tp( chunked_prefill, ) = parallel_setup - multi_node_only, load_format, attn_backend = test_options + multi_node_only, attn_backend = test_options model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info.check_transformers_version(on_fail="skip") @@ -130,22 +167,7 @@ def 
_compare_cp_with_tp( tokenizer_mode = model_info.tokenizer_mode hf_overrides = model_info.hf_overrides - if load_format == "dummy": - # Avoid OOM - text_overrides = { - "num_hidden_layers": 4, - "hidden_size": 512, - "intermediate_size": 800, - "num_attention_heads": 4, - "num_key_value_heads": 1, - } - - if is_multimodal: - hf_overrides.update({"text_config": text_overrides}) - else: - hf_overrides.update(text_overrides) - else: - model_info.check_available_online(on_fail="skip") + model_info.check_available_online(on_fail="skip") if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") @@ -157,90 +179,70 @@ def _compare_cp_with_tp( if multi_node_only and not VLLM_MULTI_NODE: pytest.skip("Not in multi-node setting") - common_args = [ + server_args = [ # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", "--max-model-len", - "2048", + "4096", "--max-num-seqs", - "8", + "64", ] if chunked_prefill: - common_args.append("--enable-chunked-prefill") + server_args.append("--enable-chunked-prefill") if eager_mode: - common_args.append("--enforce-eager") + server_args.append("--enforce-eager") if runner != "auto": - common_args.extend(["--runner", runner]) + server_args.extend(["--runner", runner]) if trust_remote_code: - common_args.append("--trust-remote-code") + server_args.append("--trust-remote-code") if tokenizer_mode: - common_args.extend(["--tokenizer-mode", tokenizer_mode]) - if load_format: - common_args.extend(["--load-format", load_format]) + server_args.extend(["--tokenizer-mode", tokenizer_mode]) if hf_overrides: - common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) + server_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) - if not attn_backend: - cp_env = tp_env = {} - else: - cp_env = tp_env = { - "VLLM_ATTENTION_BACKEND": attn_backend, - } - - cp_args = [ - *common_args, - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - 
str(pp_size), - "--decode-context-parallel-size", - str(dcp_size), - "--dcp-kv-cache-interleave-size", - str(cp_kv_cache_interleave_size), - "--distributed-executor-backend", - distributed_backend, - ] - - tp_args = [ - *common_args, - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - "--distributed-executor-backend", - distributed_backend, - ] - - compare_two_settings( - model_id, - cp_args, - tp_args, - cp_env, - tp_env, - method=method, - max_wait_seconds=720, + server_args.extend( + [ + "--tensor-parallel-size", + str(tp_size), + "--pipeline-parallel-size", + str(pp_size), + "--decode-context-parallel-size", + str(dcp_size), + "--dcp-kv-cache-interleave-size", + str(cp_kv_cache_interleave_size), + "--distributed-executor-backend", + distributed_backend, + ] ) + server_env = {} + if attn_backend: + server_env["VLLM_ATTENTION_BACKEND"] = attn_backend -CP_TEXT_GENERATION_MODELS = { - "deepseek-ai/DeepSeek-V2-Lite-Chat": [ - CPTestSettings.detailed(), - CPTestSettings.detailed(tp_base=2), - CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64), - ], - "bigcode/gpt_bigcode-santacoder": [ - CPTestSettings.detailed(), - CPTestSettings.detailed(tp_base=2), - ], -} + with RemoteOpenAIServer( + model_id, + server_args, + env_dict=server_env, + max_wait_seconds=720, + ) as remote_server: + host = f"http://{remote_server.host}" + port = remote_server.port -CP_TEST_MODELS = [ - # TODO support other models - # [LANGUAGE GENERATION] - "deepseek-ai/DeepSeek-V2-Lite-Chat", - "bigcode/gpt_bigcode-santacoder", -] + # Run GSM8K evaluation + results = evaluate_gsm8k( + num_questions=NUM_QUESTIONS, + num_shots=NUM_SHOTS, + host=host, + port=port, + ) + + # Validate accuracy is reasonable + accuracy = results["accuracy"] + min_accuracy = MIN_ACCURACY[model_id] + assert accuracy >= min_accuracy, ( + f"TP+DCP accuracy too low: {accuracy:.3f} < {min_accuracy:.3f}" + ) @pytest.mark.parametrize( @@ -274,12 +276,12 @@ def 
test_cp_generation( ): pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher") if ( - model_id == "bigcode/gpt_bigcode-santacoder" + model_id == "Qwen/Qwen2.5-1.5B-Instruct" and torch.cuda.get_device_capability() != (9, 0) ): pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0") - _compare_cp_with_tp( + _test_cp_gsm8k( model_id, parallel_setup, distributed_backend, diff --git a/tests/distributed/test_eplb_algo.py b/tests/distributed/test_eplb_algo.py index 79805a7cce53b..a53a61840e79e 100644 --- a/tests/distributed/test_eplb_algo.py +++ b/tests/distributed/test_eplb_algo.py @@ -4,7 +4,7 @@ import pytest import torch -from vllm.distributed.eplb.rebalance_algo import rebalance_experts +from vllm.distributed.eplb.policy.default import DefaultEplbPolicy def test_basic_rebalance(): @@ -23,7 +23,7 @@ def test_basic_rebalance(): num_nodes = 2 num_gpus = 8 - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( weight, num_replicas, num_groups, num_nodes, num_gpus ) @@ -77,7 +77,7 @@ def test_single_gpu_case(): num_nodes = 1 num_gpus = 1 - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( weight, num_replicas, num_groups, num_nodes, num_gpus ) @@ -99,7 +99,7 @@ def test_equal_weights(): num_nodes = 2 num_gpus = 4 - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( weight, num_replicas, num_groups, num_nodes, num_gpus ) @@ -122,7 +122,7 @@ def test_extreme_weight_imbalance(): num_nodes = 2 num_gpus = 4 - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( weight, num_replicas, num_groups, num_nodes, num_gpus ) @@ -150,7 +150,7 @@ def test_multiple_layers(): num_nodes = 2 num_gpus = 4 - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( 
weight, num_replicas, num_groups, num_nodes, num_gpus ) @@ -175,14 +175,14 @@ def test_parameter_validation(): # Test non-divisible case - this should handle normally without throwing # errors because the function will fall back to global load balancing # strategy - phy2log, log2phy, logcnt = rebalance_experts(weight, 8, 3, 2, 4) + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4) assert phy2log.shape == (1, 8) assert logcnt.shape == (1, 4) # Test cases that will actually cause errors: # num_physical_experts not divisible by num_gpus with pytest.raises(AssertionError): - rebalance_experts(weight, 7, 2, 2, 4) # 7 not divisible by 4 + DefaultEplbPolicy.rebalance_experts(weight, 7, 2, 2, 4) # 7 not divisible by 4 def test_small_scale_hierarchical(): @@ -197,7 +197,7 @@ def test_small_scale_hierarchical(): num_nodes = 2 # 2 nodes num_gpus = 4 # 4 GPUs - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( weight, num_replicas, num_groups, num_nodes, num_gpus ) @@ -224,7 +224,7 @@ def test_global_load_balance_fallback(): num_nodes = 2 num_gpus = 4 - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( weight, num_replicas, num_groups, num_nodes, num_gpus ) @@ -246,7 +246,7 @@ def test_device_compatibility(device): num_nodes = 1 num_gpus = 2 - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( weight, num_replicas, num_groups, num_nodes, num_gpus ) @@ -263,7 +263,9 @@ def test_additional_cases(): weight1 = torch.tensor( [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]] ) - phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8) + phy2log1, log2phy1, logcnt1 = DefaultEplbPolicy.rebalance_experts( + weight1, 24, 8, 4, 8 + ) assert phy2log1.shape == (1, 24) assert logcnt1.shape == (1, 16) @@ -276,7 +278,9 @@ def 
test_additional_cases(): [12, 25, 50, 100, 150, 200], # Increasing weights ] ) - phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2) + phy2log2, log2phy2, logcnt2 = DefaultEplbPolicy.rebalance_experts( + weight2, 10, 3, 1, 2 + ) assert phy2log2.shape == (2, 10) assert logcnt2.shape == (2, 6) @@ -300,7 +304,7 @@ if __name__ == "__main__": num_nodes = 2 num_gpus = 8 - phy2log, log2phy, logcnt = rebalance_experts( + phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts( weight, num_replicas, num_groups, num_nodes, num_gpus ) print(phy2log) diff --git a/tests/entrypoints/test_harmony_utils.py b/tests/entrypoints/test_harmony_utils.py index 6fa051a678d68..82ff562d5c6d2 100644 --- a/tests/entrypoints/test_harmony_utils.py +++ b/tests/entrypoints/test_harmony_utils.py @@ -1,11 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from openai_harmony import Role +from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem +from openai_harmony import Author, Message, Role, TextContent from vllm.entrypoints.harmony_utils import ( has_custom_tools, parse_input_to_harmony_message, + parse_output_message, ) @@ -257,6 +259,191 @@ class TestParseInputToHarmonyMessage: assert messages[0].content[1].text == "actual text" +class TestParseOutputMessage: + """Tests for parse_output_message function.""" + + def test_commentary_with_no_recipient_creates_reasoning(self): + """Test that commentary with recipient=None (preambles) creates reasoning items. + + Per Harmony format, commentary channel can contain preambles to calling + multiple functions - explanatory text with no recipient. + """ + message = Message.from_role_and_content( + Role.ASSISTANT, "I will now search for the weather information." 
+ ) + message = message.with_channel("commentary") + # recipient is None by default, representing a preamble + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].type == "reasoning" + assert ( + output_items[0].content[0].text + == "I will now search for the weather information." + ) + assert output_items[0].content[0].type == "reasoning_text" + + def test_commentary_with_function_recipient_creates_function_call(self): + """Test commentary with recipient='functions.X' creates function calls.""" + message = Message.from_role_and_content( + Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}' + ) + message = message.with_channel("commentary") + message = message.with_recipient("functions.get_weather") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseFunctionToolCall) + assert output_items[0].type == "function_call" + assert output_items[0].name == "get_weather" + assert ( + output_items[0].arguments + == '{"location": "San Francisco", "units": "celsius"}' + ) + assert output_items[0].call_id.startswith("call_") + assert output_items[0].id.startswith("fc_") + + def test_commentary_with_python_recipient_creates_reasoning(self): + """Test that commentary with recipient='python' creates reasoning items.""" + message = Message.from_role_and_content( + Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))" + ) + message = message.with_channel("commentary") + message = message.with_recipient("python") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].type == "reasoning" + assert ( + output_items[0].content[0].text + == "import numpy as np\nprint(np.array([1, 2, 3]))" + ) + + def test_commentary_with_browser_recipient_creates_reasoning(self): + 
"""Test that commentary with recipient='browser' creates reasoning items.""" + message = Message.from_role_and_content( + Role.ASSISTANT, "Navigating to the specified URL" + ) + message = message.with_channel("commentary") + message = message.with_recipient("browser") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].type == "reasoning" + assert output_items[0].content[0].text == "Navigating to the specified URL" + + def test_commentary_with_container_recipient_creates_reasoning(self): + """Test that commentary with recipient='container' creates reasoning items.""" + message = Message.from_role_and_content( + Role.ASSISTANT, "Running command in container" + ) + message = message.with_channel("commentary") + message = message.with_recipient("container") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].type == "reasoning" + assert output_items[0].content[0].text == "Running command in container" + + def test_commentary_with_empty_content_and_no_recipient(self): + """Test edge case: empty commentary with recipient=None.""" + message = Message.from_role_and_content(Role.ASSISTANT, "") + message = message.with_channel("commentary") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].content[0].text == "" + + def test_commentary_with_multiple_contents_and_no_recipient(self): + """Test multiple content items in commentary with no recipient.""" + contents = [ + TextContent(text="Step 1: Analyze the request"), + TextContent(text="Step 2: Prepare to call functions"), + ] + message = Message.from_role_and_contents(Role.ASSISTANT, contents) + message = message.with_channel("commentary") + + output_items = 
parse_output_message(message) + + assert len(output_items) == 2 + assert all(isinstance(item, ResponseReasoningItem) for item in output_items) + assert output_items[0].content[0].text == "Step 1: Analyze the request" + assert output_items[1].content[0].text == "Step 2: Prepare to call functions" + + def test_commentary_with_multiple_function_calls(self): + """Test multiple function calls in commentary channel.""" + contents = [ + TextContent(text='{"location": "San Francisco"}'), + TextContent(text='{"location": "New York"}'), + ] + message = Message.from_role_and_contents(Role.ASSISTANT, contents) + message = message.with_channel("commentary") + message = message.with_recipient("functions.get_weather") + + output_items = parse_output_message(message) + + assert len(output_items) == 2 + assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items) + assert output_items[0].name == "get_weather" + assert output_items[1].name == "get_weather" + assert output_items[0].arguments == '{"location": "San Francisco"}' + assert output_items[1].arguments == '{"location": "New York"}' + + def test_commentary_with_unknown_recipient_raises_error(self): + """Test that commentary with unknown recipient raises ValueError.""" + message = Message.from_role_and_content(Role.ASSISTANT, "some content") + message = message.with_channel("commentary") + message = message.with_recipient("unknown_recipient") + + try: + parse_output_message(message) + raise AssertionError("Expected ValueError to be raised") + except ValueError as e: + assert "Unknown recipient: unknown_recipient" in str(e) + + def test_analysis_channel_creates_reasoning(self): + """Test that analysis channel creates reasoning items.""" + message = Message.from_role_and_content( + Role.ASSISTANT, "Analyzing the problem step by step..." 
+ ) + message = message.with_channel("analysis") + + output_items = parse_output_message(message) + + assert len(output_items) == 1 + assert isinstance(output_items[0], ResponseReasoningItem) + assert output_items[0].type == "reasoning" + assert ( + output_items[0].content[0].text == "Analyzing the problem step by step..." + ) + + def test_non_assistant_message_returns_empty(self): + """Test that non-assistant messages return empty list. + + Per the implementation, tool messages to assistant (e.g., search results) + are not included in final output to align with OpenAI behavior. + """ + message = Message.from_author_and_content( + Author.new(Role.TOOL, "functions.get_weather"), + "The weather is sunny, 72°F", + ) + + output_items = parse_output_message(message) + + assert len(output_items) == 0 + + def test_has_custom_tools() -> None: assert not has_custom_tools(set()) assert not has_custom_tools({"web_search_preview", "code_interpreter", "container"}) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 43b242ab2d586..4e1559a049bf9 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -113,12 +113,10 @@ def test_mrope( is_neox_style = True max_position = config.max_position_embeddings - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) - rotary_dim = int(head_dim * partial_rotary_factor) mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=rotary_dim, + rotary_dim=head_dim, max_position=max_position, is_neox_style=is_neox_style, rope_parameters=config.rope_parameters, @@ -184,12 +182,10 @@ def test_mrope_torch_compile_tracing( ) is_neox_style = True max_position = config.max_position_embeddings - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) - rotary_dim = int(head_dim * partial_rotary_factor) mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=rotary_dim, + rotary_dim=head_dim, max_position=max_position, 
is_neox_style=is_neox_style, rope_parameters=config.rope_parameters, diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py deleted file mode 100644 index a61ccef700624..0000000000000 --- a/tests/kv_transfer/test_lookup_buffer.py +++ /dev/null @@ -1,160 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import random - -import torch -from tqdm import tqdm - -from vllm.config import KVTransferConfig -from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import SimpleBuffer -from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe - -# TODO: the test depends on a lot of fields in the current implementation. -# We should have standard interface instead direct field access - - -def test_run(my_rank, buffer, device): - # buffer should be empty in the beginning - if my_rank == 0: - assert buffer.buffer_size == 0 - assert len(buffer.buffer) == 0 - - print(f"My rank: {my_rank}, device: {device}") - - # insert - tokens = torch.tensor([1, 2, 3]).to(device) - roi = tokens > 0 - if my_rank == 0: - key = 2.0 * torch.ones([5, 6]).to(device) - value = 3.0 * torch.ones([5, 6]).to(device) - - placeholder = torch.tensor([1]).to(device) - - buffer.insert(tokens, roi, key, value, placeholder) - - torch.distributed.barrier() - - # drop_select - if my_rank == 1: - tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi) - assert torch.allclose(tokens, tok) - assert torch.allclose(roi, roi_) - assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device)) - assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device)) - torch.distributed.barrier() - - if my_rank == 0: - assert buffer.buffer_size == 0 - assert len(buffer.buffer) == 0 - - print(f"My rank: {my_rank}, Test run passed!") - - -def stress_test(my_rank, buf, device): - torch.distributed.barrier() - torch.manual_seed(100) - - reqs = [ - ( - 
torch.rand(100).to(device), # tokens - torch.ones(100).bool().to(device), # roi - torch.rand(100).to(device), # key - torch.rand(100).to(device), # value - torch.rand(100).to(device), # hidden - ) - for i in tqdm(range(200)) - ] - - random.seed(my_rank) - random.shuffle(reqs) - - torch.distributed.barrier() - - n = 0 - - # the buffer size can only store 100 reqs - # so the sender will occasionally block to wait for the receiver. - for req in tqdm(reqs): - if my_rank == 0: - buf.insert(*req) - else: - tok, roi, k, v, h = req - tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi) - - if tok_ is None: - assert roi_ is None - assert k_ is None - assert v_ is None - assert h_ is None - n += 1 - else: - assert torch.allclose(tok, tok_) - assert torch.allclose(roi, roi_) - assert torch.allclose(k, k_) - assert torch.allclose(v, v_) - assert torch.allclose(h, h_) - print(f"Rank {my_rank} done") - torch.distributed.barrier() - - if my_rank == 0: - x = torch.tensor([0]) - torch.distributed.recv(x, 1) - # the # of None received is the kv that are not selected - assert x.item() == len(buf.buffer) - # and the size of the buffer should be 2000 * buffer len - print(buf.buffer_size) - assert buf.buffer_size == 1700 * len(buf.buffer) - else: - torch.distributed.send(torch.tensor([n]), 0) - - print(f"My rank: {my_rank}, Passed stress test!") - - -if __name__ == "__main__": - my_rank = int(os.environ["RANK"]) - - torch.distributed.init_process_group( - backend="gloo", - init_method="tcp://localhost:12398", - world_size=2, - rank=my_rank, - ) - - print(f"initialized! 
My rank is {my_rank}") - - config = KVTransferConfig( - kv_connector="P2pNcclConnector", - kv_buffer_device="cuda", - kv_buffer_size=1e9, - kv_rank=my_rank, - kv_role="kv_both", # this arg doesn't matter in this test - kv_parallel_size=2, - kv_ip="127.0.0.1", - kv_port=12345, - ) - - data_pipe = PyNcclPipe( - local_rank=my_rank, - config=config, - device="cuda", - port_offset=0, - ) - cpu_pipe = PyNcclPipe( - local_rank=my_rank, - config=config, - device="cpu", - port_offset=1, - ) - - buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000) - - test_run(my_rank, buffer, data_pipe.device) - - stress_test(my_rank, buffer, data_pipe.device) - - buffer.close() - data_pipe.close() - cpu_pipe.close() - print("Done") diff --git a/tests/kv_transfer/test_lookup_buffer.sh b/tests/kv_transfer/test_lookup_buffer.sh deleted file mode 100644 index f2aeaee9ca6d5..0000000000000 --- a/tests/kv_transfer/test_lookup_buffer.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -RANK=0 python3 test_lookup_buffer.py & -PID0=$! -RANK=1 python3 test_lookup_buffer.py & -PID1=$! 
- -wait $PID0 -wait $PID1 diff --git a/tests/kv_transfer/test_module.py b/tests/kv_transfer/test_module.py deleted file mode 100644 index b9a28e4bceb7c..0000000000000 --- a/tests/kv_transfer/test_module.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import subprocess -import sys - -import pytest -import torch - - -def run_python_script(script_name, timeout): - script_name = f"kv_transfer/{script_name}" - try: - # Start both processes asynchronously using Popen - process0 = subprocess.Popen( - [sys.executable, script_name], - env={"RANK": "0"}, # Set the RANK environment variable for process 0 - stdout=sys.stdout, # Pipe stdout to current stdout - stderr=sys.stderr, # Pipe stderr to current stderr - ) - - process1 = subprocess.Popen( - [sys.executable, script_name], - env={"RANK": "1"}, # Set the RANK environment variable for process 1 - stdout=sys.stdout, # Pipe stdout to current stdout - stderr=sys.stderr, # Pipe stderr to current stderr - ) - - # Wait for both processes to complete, with a timeout - process0.wait(timeout=timeout) - process1.wait(timeout=timeout) - - # Check the return status of both processes - if process0.returncode != 0: - pytest.fail(f"Test {script_name} failed for RANK=0, {process0.returncode}") - if process1.returncode != 0: - pytest.fail(f"Test {script_name} failed for RANK=1, {process1.returncode}") - - except subprocess.TimeoutExpired: - # If either process times out, terminate both and fail the test - process0.terminate() - process1.terminate() - pytest.fail(f"Test {script_name} timed out") - except Exception as e: - pytest.fail(f"Test {script_name} failed with error: {str(e)}") - - -# Define the test cases using pytest's parametrize -@pytest.mark.parametrize( - "script_name,timeout", - [ - ("test_lookup_buffer.py", 60), # Second test case with a 60-second timeout - ("test_send_recv.py", 120), # First test case with a 120-second timeout - 
], -) -def test_run_python_script(script_name, timeout): - # Check the number of GPUs - if torch.cuda.device_count() < 2: - pytest.skip(f"Skipping test {script_name} because <2 GPUs are available") - - # Run the test if there are at least 2 GPUs - run_python_script(script_name, timeout) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py deleted file mode 100644 index 5762224eff76d..0000000000000 --- a/tests/kv_transfer/test_send_recv.py +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import time - -import torch -from tqdm import tqdm - -from vllm.config import KVTransferConfig -from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe - - -def test_run(my_rank, pipe): - print(f"rank {my_rank} test_run starts....") - # test run - x = torch.tensor([1]).to(pipe.device) - y = torch.tensor([[2.0, 3.0, 4.0, 8.0]]).to(pipe.device) - if my_rank == 0: - pipe.send_tensor(x) - print(f"rank {my_rank} sent tensor x") - pipe.send_tensor(y) - print(f"rank {my_rank} sent tensor y") - x2 = pipe.recv_tensor() - print(f"rank {my_rank} received x2 = ", x2) - y2 = pipe.recv_tensor() - print(f"rank {my_rank} received y2 = ", y2) - - else: - x2 = pipe.recv_tensor() - print(f"rank {my_rank} received x2 = ", x2) - y2 = pipe.recv_tensor() - print(f"rank {my_rank} received y2 = ", y2) - pipe.send_tensor(x) - print(f"rank {my_rank} sent tensor x") - pipe.send_tensor(y) - print(f"rank {my_rank} sent tensor y") - - assert torch.allclose(x, x2) - assert torch.allclose(y, y2) - - print(f"rank {my_rank} test_run passed!") - - -def stress_test(my_rank, pipe): - print(f"rank {my_rank} stress_test starts....") - - tensors: list[torch.Tensor] = [] - - torch.distributed.barrier() - torch.manual_seed(0) - - for i in tqdm(range(500)): - mean = torch.rand(1).item() * 100 - std = torch.rand(1).item() * 100 - size = torch.randint(900, 1000, (2,)) - x = 
torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device) - - # 5% probability of sending a None - if torch.rand(1).item() < 0.05: - tensors.append(None) - tensors.append(None) - tensors.append(None) - else: - tensors.append(x) - tensors.append(x.mean().unsqueeze(0)) - tensors.append(x.std().unsqueeze(0)) - - torch.distributed.barrier() - - for i in tqdm(range(500)): - if my_rank == int((i % 10) > 3): - pipe.send_tensor(tensors[3 * i]) - pipe.send_tensor(tensors[3 * i + 1]) - pipe.send_tensor(tensors[3 * i + 2]) - else: - x = pipe.recv_tensor() - mean = pipe.recv_tensor() - std = pipe.recv_tensor() - - if x is None: - assert mean is None - assert std is None - else: - assert torch.allclose(x, tensors[3 * i]) - assert x.mean() == mean[0] - assert x.std() == std[0] - - torch.distributed.barrier() - - -def latency_test(my_rank, pipe, nelement, ntensor): - latencies = [] - - torch.distributed.barrier() - - for i in tqdm(range(500)): - tensors = [] - - if my_rank == 0: - # create tensor - tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)] - - torch.distributed.barrier() - - if my_rank == 0: - t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device) - for tensor in tensors: - pipe.send_tensor(tensor) - pipe.send_tensor(t) - else: - for _ in range(ntensor): - pipe.recv_tensor() - t = pipe.recv_tensor() - latencies.append(time.time() - t.item()) - - torch.distributed.barrier() - - print("Latency test passed.") - print("Latency:", torch.tensor(latencies).mean().item() * 1000, "ms") - - -if __name__ == "__main__": - my_rank = int(os.environ["RANK"]) - - torch.distributed.init_process_group( - backend="gloo", - init_method="tcp://localhost:12398", - world_size=2, - rank=my_rank, - ) - - config = KVTransferConfig( - kv_connector="P2pNcclConnector", - kv_buffer_device="cuda", - kv_buffer_size=1e9, - kv_rank=my_rank, - kv_role="kv_both", # this arg doesn't matter in this test - kv_parallel_size=2, - kv_ip="127.0.0.1", - 
kv_port=12345, - ) - - pipe = PyNcclPipe( - local_rank=my_rank, - config=config, - ) - - test_run(my_rank, pipe) - - stress_test(my_rank, pipe) - - # Use this function if you want to test the latency of pipe impl. - # latency_test(my_rank, pipe, 1024 * 8 * 128, 80) diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh deleted file mode 100644 index 54e0604806841..0000000000000 --- a/tests/kv_transfer/test_send_recv.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -RANK=0 python3 test_send_recv.py & -PID0=$! -RANK=1 python3 test_send_recv.py & -PID1=$! - -wait $PID0 -wait $PID1 diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 6b9d388f2b9b4..2e032ac4ca526 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -20,7 +20,7 @@ from vllm.config.multimodal import ( ) from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.cache import MultiModalProcessorOnlyCache -from vllm.multimodal.inputs import MultiModalInputs +from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.tokenizers import ( MistralTokenizer, @@ -418,4 +418,4 @@ def _assert_inputs_equal( a_data.pop(key, None) b_data.pop(key, None) - assert a_data == b_data, msg + assert batched_tensors_equal(a_data, b_data), msg diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index 553a5f719bd35..51071c93531de 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -5,6 +5,7 @@ import pytest from vllm.assets.video import VideoAsset from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import batched_tensors_equal from vllm.multimodal.video import 
OpenCVDynamicVideoBackend, OpenCVVideoBackend from ...utils import build_model_context @@ -103,7 +104,7 @@ def test_video_loader_consistency( dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs) assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"] - assert ( - static_outputs["mm_kwargs"].get_data() - == dynamic_outputs["mm_kwargs"].get_data() + assert batched_tensors_equal( + static_outputs["mm_kwargs"].get_data(), + dynamic_outputs["mm_kwargs"].get_data(), ) diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 7628ab4fe2349..5d489549c5b46 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -130,10 +130,9 @@ def create_batched_mm_kwargs( hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, tokenization_kwargs=processor_inputs.tokenization_kwargs, )["mm_kwargs"].require_data() - items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]] + return group_mm_kwargs_by_modality( - items, - merge_by_field_config=model_cls.merge_by_field_config, + [item for modality in supported_mm_limits for item in mm_kwargs[modality]] ) diff --git a/tests/models/registry.py b/tests/models/registry.py index b9f9945eb5fb8..352abdd2da9a0 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -416,7 +416,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True, ), "Qwen2ForCausalLM": _HfExamplesInfo( - "Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"} + "Qwen/Qwen2-0.5B-Instruct", + extras={ + "2.5": "Qwen/Qwen2.5-0.5B-Instruct", + "2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct", + }, ), "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 
2ddc93f8daf7b..e4fcc34740edb 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -85,12 +85,6 @@ def _dummy_items( (_dummy_item("a", {"a1": 100}), 100), (_dummy_item("a", {"a1": 100, "a2": 110}), 210), (_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501 - ( - _dummy_items( - {"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}} - ).get_data(), - 460, - ), # noqa: E501 ], ) def test_cache_item_size(item, expected_size): @@ -107,6 +101,9 @@ def test_cache_item_size(item, expected_size): cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update]) assert cache.currsize == expected_size + cache[""] = item.get_data() + assert cache.currsize == expected_size + def _create_vllm_config( *, diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py deleted file mode 100644 index 88e92bee3a292..0000000000000 --- a/tests/multimodal/test_inputs.py +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors - -pytestmark = pytest.mark.cpu_test - - -def assert_nested_tensors_equal(expected: NestedTensors, actual: NestedTensors): - assert type(expected) == type(actual) # noqa: E721 - if isinstance(expected, torch.Tensor): - assert torch.equal(expected, actual) - else: - for expected_item, actual_item in zip(expected, actual): - assert_nested_tensors_equal(expected_item, actual_item) - - -def assert_multimodal_inputs_equal( - expected: MultiModalKwargs, actual: MultiModalKwargs -): - assert set(expected.keys()) == set(actual.keys()) - for key in expected: - assert_nested_tensors_equal(expected[key], actual[key]) - - -def test_multimodal_input_batch_single_tensor(): - t = torch.rand([1, 2]) - result = MultiModalKwargs.batch([{"image": t}]) - assert_multimodal_inputs_equal(result, {"image": 
t.unsqueeze(0)}) - - -def test_multimodal_input_batch_multiple_tensors(): - a = torch.rand([1, 1, 2]) - b = torch.rand([1, 1, 2]) - c = torch.rand([1, 1, 2]) - result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) - assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])}) - - -def test_multimodal_input_batch_multiple_heterogeneous_tensors(): - a = torch.rand([1, 2, 2]) - b = torch.rand([1, 3, 2]) - c = torch.rand([1, 4, 2]) - result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) - assert_multimodal_inputs_equal(result, {"image": [a, b, c]}) - - -def test_multimodal_input_batch_nested_tensors(): - a = torch.rand([2, 3]) - b = torch.rand([2, 3]) - c = torch.rand([2, 3]) - result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b]}, {"image": [c]}]) - assert_multimodal_inputs_equal( - result, {"image": torch.stack([a.unsqueeze(0), b.unsqueeze(0), c.unsqueeze(0)])} - ) - - -def test_multimodal_input_batch_heterogeneous_lists(): - a = torch.rand([1, 2, 3]) - b = torch.rand([1, 2, 3]) - c = torch.rand([1, 2, 3]) - result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) - assert_multimodal_inputs_equal( - result, {"image": [torch.stack([a, b]), c.unsqueeze(0)]} - ) - - -def test_multimodal_input_batch_multiple_batchable_lists(): - a = torch.rand([1, 2, 3]) - b = torch.rand([1, 2, 3]) - c = torch.rand([1, 2, 3]) - d = torch.rand([1, 2, 3]) - result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}]) - assert_multimodal_inputs_equal( - result, {"image": torch.stack([torch.stack([a, b]), torch.stack([c, d])])} - ) - - -def test_multimodal_input_batch_mixed_stacking_depths(): - a = torch.rand([1, 2, 3]) - b = torch.rand([1, 3, 3]) - c = torch.rand([1, 4, 3]) - - result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) - assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]}) - - result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}]) - 
assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]}) diff --git a/tests/reasoning/test_holo2_reasoning_parser.py b/tests/reasoning/test_holo2_reasoning_parser.py new file mode 100644 index 0000000000000..438bb2e957b85 --- /dev/null +++ b/tests/reasoning/test_holo2_reasoning_parser.py @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +from transformers import AutoTokenizer + +from tests.reasoning.utils import run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager +from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser +from vllm.reasoning.holo2_reasoning_parser import Holo2ReasoningParser +from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser + +REASONING_MODEL_NAME = "HCompany/Holo2-4B" + + +@pytest.fixture(scope="module") +def tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +@pytest.mark.parametrize( + "thinking,expected_parser_type", + [ + (True, DeepSeekR1ReasoningParser), + (False, IdentityReasoningParser), + ], +) +def test_parser_selection(tokenizer, thinking, expected_parser_type): + parser = Holo2ReasoningParser( + tokenizer, + chat_template_kwargs={ + "thinking": thinking, + }, + ) + + assert isinstance(parser._parser, expected_parser_type) + + +def test_holo2_default_parser_is_deepseekr1(tokenizer): + parser = Holo2ReasoningParser(tokenizer) + + assert isinstance(parser._parser, DeepSeekR1ReasoningParser) + + +def test_holo2_supports_structured_output(tokenizer): + # Structured output manager uses the reasoning parser to check if the + # reasoning content is ended before applying the grammar. The main function + # used is is_reasoning_end. This test checks if the parser is able to + # correctly identify the end of the reasoning content. 
+ + # important to not pass chat_template_kwargs here as it is done in the + # StructuredOutputManager + parser = Holo2ReasoningParser(tokenizer) + + end_token_id = tokenizer.encode("", add_special_tokens=False)[0] + + assert parser.is_reasoning_end([1, 2, 4, end_token_id]) + assert not parser.is_reasoning_end([1, 2, 4]) + assert parser.is_reasoning_end([1, 2, 4, end_token_id, 5]) + + +# thinking is True, non-streaming +WITH_THINK = { + "output": "This is a reasoning sectionThis is the rest", + "reasoning": "This is a reasoning section", + "content": "This is the rest", +} +# thinking is True, streaming +WITH_THINK_STREAM = { + "output": "This is a reasoning sectionThis is the rest", + "reasoning": "This is a reasoning section", + "content": "This is the rest", +} +# thinking is False, non-streaming +THINKING_DISABLED = { + "output": "This is the rest", + "reasoning": None, + "content": "This is the rest", +} +# thinking is False, streaming +THINKING_DISABLED_STREAM = { + "output": "This is the rest", + "reasoning": None, + "content": "This is the rest", +} +# thinking is False but the model output , non-streaming +THINKING_DISABLED_WITH_CLOSE_TAG = { + "output": "This is the rest", + "reasoning": None, + "content": "This is the rest", +} +# thinking is False but the model output , streaming +THINKING_DISABLED_WITH_CLOSE_TAG_STREAM = { + "output": "some textThis is the rest", + "reasoning": None, + "content": "some textThis is the rest", +} +COMPLETE_REASONING = { + "output": "This is a reasoning section", + "reasoning": "This is a reasoning section", + "content": None, +} + +TEST_CASES = [ + pytest.param( + False, + WITH_THINK, + None, + id="with_think", + ), + pytest.param( + True, + WITH_THINK_STREAM, + None, + id="with_think_stream", + ), + pytest.param( + False, + WITH_THINK, + {"thinking": True}, + id="with_think_enabled", + ), + pytest.param( + True, + WITH_THINK_STREAM, + {"thinking": True}, + id="with_think_stream_enabled", + ), + pytest.param( + False, + 
THINKING_DISABLED, + {"thinking": False}, + id="thinking_disabled", + ), + pytest.param( + True, + THINKING_DISABLED_STREAM, + {"thinking": False}, + id="thinking_disabled_stream", + ), + pytest.param( + False, + THINKING_DISABLED_WITH_CLOSE_TAG, + {"thinking": False}, + id="thinking_disabled_with_close_tag", + ), + pytest.param( + True, + THINKING_DISABLED_WITH_CLOSE_TAG_STREAM, + {"thinking": False}, + id="thinking_disabled_with_close_tag_stream", + ), + pytest.param( + False, + COMPLETE_REASONING, + None, + id="complete_reasoning", + ), + pytest.param( + True, + COMPLETE_REASONING, + None, + id="complete_reasoning_stream", + ), +] + + +@pytest.mark.parametrize("streaming, param_dict, chat_template_kwargs", TEST_CASES) +def test_reasoning( + streaming: bool, + param_dict: dict, + chat_template_kwargs: dict | None, + tokenizer, +): + output = tokenizer.tokenize(param_dict["output"]) + output_tokens: list[str] = [ + tokenizer.convert_tokens_to_string([token]) for token in output + ] + parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser("holo2")( + tokenizer, + chat_template_kwargs=chat_template_kwargs, + ) + + reasoning, content = run_reasoning_extraction( + parser, output_tokens, streaming=streaming + ) + + assert reasoning == param_dict["reasoning"] + assert content == param_dict["content"] diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py index 2d969b8c9347d..6f24c77e066a4 100644 --- a/tests/utils_/test_argparse_utils.py +++ b/tests/utils_/test_argparse_utils.py @@ -460,23 +460,20 @@ def test_flat_product(): ] -def test_o_legacy_syntax_deprecation(caplog_vllm): - """Test that -O.* dotted syntax emits warnings and converts correctly to -cc syntax.""" +def test_o_dotted_syntax_error(): + """Test that -O.* dotted syntax raises a clear error message.""" parser = FlexibleArgumentParser() parser.add_argument("-cc", "--compilation-config", type=json.loads) - # Test that -O.backend gets converted correctly AND emits 
warning - args = parser.parse_args(["-O.backend=eager"]) - assert args.compilation_config == {"backend": "eager"} + # Test that -O.* syntax raises a clear ValueError + with pytest.raises(ValueError, match=r"The -O\.\* syntax is no longer supported"): + parser.parse_args(["-O.backend=eager"]) - # Check that deprecation warning was logged - assert len(caplog_vllm.records) >= 1 - assert ( - "The -O.* dotted syntax for --compilation-config is deprecated" - in caplog_vllm.text - ) + with pytest.raises(ValueError, match=r"Please use -cc\.\* instead"): + parser.parse_args(["-O.mode=2"]) - # Test that -O.mode gets converted correctly - # Note: warning_once won't emit again in same session - args = parser.parse_args(["-O.mode=2"]) - assert args.compilation_config == {"mode": 2} + with pytest.raises( + ValueError, + match=r"replace '-O\.cudagraph_mode=NONE' with '-cc\.cudagraph_mode=NONE'", + ): + parser.parse_args(["-O.cudagraph_mode=NONE"]) diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py index 1cbd0fe56be6d..f60861e3489d6 100644 --- a/tests/v1/attention/test_attention_splitting.py +++ b/tests/v1/attention/test_attention_splitting.py @@ -13,7 +13,7 @@ from vllm.v1.attention.backends.utils import ( split_attn_metadata, split_decodes_and_prefills, ) -from vllm.v1.worker.ubatch_utils import create_ubatch_slices +from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices @pytest.fixture @@ -294,8 +294,14 @@ def test_prefill_split_across_ubatches( qsl_np = common.query_start_loc_cpu.numpy() num_tokens = common.num_actual_tokens - ubatch_slices = create_ubatch_slices(num_scheduled_tokens, split_point) - assert len(ubatch_slices) == 2 + ubatch_slices, _ = maybe_create_ubatch_slices( + True, + num_scheduled_tokens, + num_tokens, + batch_spec.batch_size, + split_point=split_point, + ) + assert ubatch_slices is not None and len(ubatch_slices) == 2 first_meta = _make_metadata_with_slice(ubatch_slices[0], common) 
second_meta = _make_metadata_with_slice(ubatch_slices[1], common) diff --git a/tests/v1/core/test_reset_prefix_cache_e2e.py b/tests/v1/core/test_reset_prefix_cache_e2e.py index 083fc3f34f545..b80789945d2fc 100644 --- a/tests/v1/core/test_reset_prefix_cache_e2e.py +++ b/tests/v1/core/test_reset_prefix_cache_e2e.py @@ -21,6 +21,7 @@ def test_reset_prefix_cache_e2e(monkeypatch): max_num_batched_tokens=32, max_model_len=2048, compilation_config={"mode": 0}, + dtype="float16", ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams( diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py index 16f154d196ba5..f3a159762ea54 100644 --- a/tests/v1/distributed/test_dbo.py +++ b/tests/v1/distributed/test_dbo.py @@ -9,10 +9,22 @@ correctly with the DeepSeek-V2-Lite model using GSM8K evaluation. """ import pytest +import torch from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k from tests.utils import RemoteOpenAIServer +# Detect Blackwell / B200 (compute capability 10.x) +try: + if torch.cuda.is_available(): + cap = torch.cuda.get_device_capability(0) + IS_BLACKWELL = cap[0] >= 10 + else: + IS_BLACKWELL = False +except Exception: + # Be conservative: if we can't detect, don't xfail by default + IS_BLACKWELL = False + MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat" DP_SIZE = 2 @@ -33,6 +45,13 @@ DEEPEP_BACKENDS = [ @pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS) +@pytest.mark.xfail( + IS_BLACKWELL, + reason=( + "Temporary: DBO accuracy unstable on Blackwell " + "(doesn't meet expectation of MIN_ACCURACY = 0.62)" + ), +) def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available): """ Test DBO with DP+EP using GSM8K evaluation. 
diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py index 771076186a3b4..4c01560fc88c3 100644 --- a/tests/v1/structured_output/test_backend_guidance.py +++ b/tests/v1/structured_output/test_backend_guidance.py @@ -1,9 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import time +from concurrent.futures import Future + +import pytest from transformers import AutoTokenizer from vllm.config import StructuredOutputsConfig, VllmConfig from vllm.config.model import ModelConfig +from vllm.config.parallel import ParallelConfig from vllm.config.speculative import SpeculativeConfig from vllm.sampling_params import SamplingParams, StructuredOutputsParams from vllm.v1.request import Request @@ -116,3 +121,72 @@ def test_grammar_bitmask_with_specdec(): ) # EOS not the final token grammar_bitmask(request, prompt[i:]) # EOS not present grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id]) + + +@pytest.mark.parametrize("async_grammar", [True, False]) +def test_grammar_init_async_and_sync(async_grammar): + """Test grammar initialization works correctly in both async and sync modes. + + This test validates that the distributed_executor_backend config option + correctly controls whether grammar compilation happens asynchronously + (via executor.submit) or synchronously. When set to "external_launcher", + grammar compilation is synchronous to avoid deadlocks. 
+ """ + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER) + prompt = tokenizer.encode('{"a": "b"}') + + # Use "external_launcher" for sync mode, None for async mode + executor_backend = None if async_grammar else "external_launcher" + vllm_config = VllmConfig( + model_config=ModelConfig(tokenizer=TOKENIZER), + structured_outputs_config=StructuredOutputsConfig(backend="guidance"), + parallel_config=ParallelConfig(distributed_executor_backend=executor_backend), + ) + structured_output_manager = StructuredOutputManager(vllm_config) + + sampling_params = SamplingParams( + structured_outputs=StructuredOutputsParams( + json='{"type": "object"}', + ), + ) + sampling_params.structured_outputs._backend = "guidance" + + request = Request( + "test_request", + prompt_token_ids=prompt, + sampling_params=sampling_params, + pooling_params=None, + eos_token_id=tokenizer.eos_token_id, + ) + + structured_output_manager.grammar_init(request) + + # Check the internal _grammar type immediately after init + # Before _check_grammar_completion is called, async mode should have a Future + raw_grammar = request.structured_output_request._grammar + if async_grammar: + assert isinstance(raw_grammar, Future), ( + "Async mode should store a Future before completion" + ) + else: + assert not isinstance(raw_grammar, Future), ( + "Sync mode should store the grammar directly, not a Future" + ) + + # Wait for grammar to be ready (handles both async and sync cases) + start_time = time.time() + while not request.structured_output_request._check_grammar_completion(): + if time.time() - start_time > 5: # 5-second timeout + pytest.fail("Grammar compilation timed out") + time.sleep(0.01) + + # After completion, _grammar should no longer be a Future + assert not isinstance(request.structured_output_request._grammar, Future) + + # Verify grammar is properly initialized and functional + grammar = request.structured_output_request.grammar + assert grammar is not None + assert not grammar.is_terminated() + 
+ # Verify the grammar can accept valid tokens + assert grammar.accept_tokens(request.request_id, prompt) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index ec9b0fd6e969c..638ece26071ef 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1842,6 +1842,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: random_seed=args.seed, dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle, + prefix_len=args.common_prefix_len, ).sample( tokenizer=tokenizer, num_requests=args.num_prompts, diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 2933f5d01b274..890cd7e089fd6 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1221,6 +1221,12 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Repetition penalty sampling parameter. Only has effect on " "openai-compatible backends.", ) + sampling_group.add_argument( + "--common-prefix-len", + type=int, + default=None, + help="Common prefix length shared by all prompts (used by random dataset)", + ) parser.add_argument( "--tokenizer-mode", diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 1773913d0b6c6..b5b7fe2b76c27 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -402,6 +402,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): self.extra_traceback = False def run(self, *args): + # maybe instead just assert inputs are fake? 
fake_args = [ self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t for t in args @@ -416,11 +417,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): kwargs: dict[str, Any], ) -> Any: assert isinstance(target, str) + output = super().call_module(target, args, kwargs) if target in self.compile_submod_names: index = self.compile_submod_names.index(target) submod = self.fetch_attr(target) + sym_shape_indices = [ i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] @@ -746,11 +749,21 @@ class VllmBackend: if not item.is_splitting_graph ] + # Extract fake values from the graph to use them when needed. + all_fake_values = [] + for i in graph.graph.find_nodes(op="placeholder"): + all_fake_values.append(i.meta["example_value"]) + + fake_args = [ + all_fake_values[i] if isinstance(t, torch.Tensor) else t + for i, t in enumerate(example_inputs) + ] + # propagate the split graph to the piecewise backend, # compile submodules with symbolic shapes PiecewiseCompileInterpreter( self.split_gm, submod_names_to_compile, self.vllm_config, self - ).run(*example_inputs) + ).run(*fake_args) graph_path = os.path.join(local_cache_dir, "computation_graph.py") if not os.path.exists(graph_path): @@ -780,14 +793,7 @@ class VllmBackend: ) # if we need to copy input buffers for cudagraph - from torch._guards import detect_fake_mode - - fake_mode = detect_fake_mode() - fake_args = [ - fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t - for t in example_inputs - ] - + # # index of tensors that have symbolic shapes (batch size) # for weights and static buffers, they will have concrete shapes. # symbolic shape only happens for input tensors. 
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 6d9da1c488c6d..eed7795cdb349 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -433,7 +433,6 @@ def _support_torch_compile( return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs) # This is the path for the first compilation. - # the first compilation needs to have dynamic shapes marked _mark_dynamic_inputs( self, diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py index 42b8d3daac985..06e1771bac960 100644 --- a/vllm/compilation/noop_elimination.py +++ b/vllm/compilation/noop_elimination.py @@ -5,6 +5,7 @@ from collections.abc import Iterable import torch.fx from torch import SymInt +from torch.fx.experimental.symbolic_shapes import statically_known_true from vllm.logger import init_logger @@ -116,12 +117,7 @@ class NoOpEliminationPass(VllmInductorPass): 2. The dimensions both correspond to the same SymInt """ # Case 1 - if isinstance(i_dim, int) and isinstance(dim, int): - return dim == i_dim - # Case 2 - if isinstance(i_dim, SymInt) and isinstance(dim, SymInt): - return dim == i_dim - return False + return statically_known_true(dim == i_dim) def all_dims_equivalent( self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt] diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index b120c85bf232e..69e1ed37a5beb 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -14,6 +14,7 @@ import torch._C._dynamo.guards import vllm.envs as envs from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config from vllm.logger import init_logger +from vllm.utils.nvtx_pytorch_hooks import layerwise_nvtx_marker_context logger = init_logger(__name__) @@ -92,12 +93,29 @@ class TorchCompileWithNoGuardsWrapper: return self.forward(*args, **kwargs) + def _call_with_optional_nvtx_range(self, callable_fn, *args, **kwargs): + if 
self.layerwise_nvtx_tracing_enabled: + args_list = list(args) + kwargs_dict = dict(kwargs) + with layerwise_nvtx_marker_context( + "Torch Compiled Module (input):{}".format(self.__class__.__name__), + self, + in_tensor=args_list, + kwargs=kwargs_dict, + ) as ctx: + ctx.result = callable_fn(*args, **kwargs) + return ctx.result + return callable_fn(*args, **kwargs) + def __init__(self): self.compiled = False vllm_config = get_current_vllm_config() self.vllm_config = vllm_config mode = vllm_config.compilation_config.mode + self.layerwise_nvtx_tracing_enabled = ( + vllm_config.observability_config.enable_layerwise_nvtx_tracing + ) if mode is None: raise RuntimeError("Compilation mode cannot be NO_COMPILATION") @@ -168,13 +186,19 @@ class TorchCompileWithNoGuardsWrapper: # Make sure a compilation is triggered by clearing dynamo # cache. torch._dynamo.eval_frame.remove_from_cache(self.original_code_object()) - return self._compiled_callable(*args, **kwargs) + return self._call_with_optional_nvtx_range( + self._compiled_callable, *args, **kwargs + ) else: with self._dispatch_to_compiled_code(): - return self.forward(*args, **kwargs) + return self._call_with_optional_nvtx_range( + self.forward, *args, **kwargs + ) else: with _compilation_context(): - return self._compiled_callable(*args, **kwargs) + return self._call_with_optional_nvtx_range( + self._compiled_callable, *args, **kwargs + ) @abstractmethod def forward(self, *args, **kwargs): ... 
diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 91f083a5534ba..067799a44db30 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -29,7 +29,7 @@ CacheDType = Literal[ "fp8_inc", "fp8_ds_mla", ] -MambaDType = Literal["auto", "float32"] +MambaDType = Literal["auto", "float32", "float16"] PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"] KVOffloadingBackend = Literal["native", "lmcache"] diff --git a/vllm/config/observability.py b/vllm/config/observability.py index fdc27aee380ef..e40bf18a00ce2 100644 --- a/vllm/config/observability.py +++ b/vllm/config/observability.py @@ -59,6 +59,11 @@ class ObservabilityConfig: """Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph dispatch modes, and their observed frequencies at every logging interval).""" + enable_layerwise_nvtx_tracing: bool = False + """Enable layerwise NVTX tracing. This traces the execution of each layer or + module in the model and attach informations such as input/output shapes to + nvtx range markers. Noted that this doesn't work with CUDA graphs enabled.""" + @cached_property def collect_model_forward_time(self) -> bool: """Whether to collect model forward time for the request.""" diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 20de672257107..3a768bcd4f2ce 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -35,6 +35,7 @@ logger = init_logger(__name__) ExpertPlacementStrategy = Literal["linear", "round_robin"] DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"] DataParallelBackend = Literal["ray", "mp"] +EPLBPolicyOption = Literal["default"] @config @@ -65,6 +66,9 @@ class EPLBConfig: Whether to use non-blocking EPLB. 
""" + policy: EPLBPolicyOption = "default" + """The policy type for expert parallel load balancing (EPLB).""" + @config @dataclass diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 735b0afbaaeb3..823bd96db9ac9 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -671,36 +671,22 @@ class VllmConfig: if current_platform.support_static_graph_mode(): # if cudagraph_mode has full cudagraphs, we need to check support - if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): - # decode context parallel does not support full cudagraphs - if self.parallel_config.decode_context_parallel_size > 1: + if ( + self.compilation_config.cudagraph_mode.has_full_cudagraphs() + and self.model_config is not None + ): + if self.model_config.pooler_config is not None: logger.warning_once( - "Decode context parallel (DCP) is enabled, which is " - "incompatible with full CUDA graphs. " + "Pooling models do not support full cudagraphs. " "Overriding cudagraph_mode to PIECEWISE." ) self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - # prefill context parallel do not support full cudagraphs - elif self.parallel_config.prefill_context_parallel_size > 1: + elif self.model_config.is_encoder_decoder: logger.warning_once( - "Prefill context parallel (PCP) is enabled, which is " - "incompatible with full CUDA graphs. " + "Encoder-decoder models do not support full cudagraphs. " "Overriding cudagraph_mode to PIECEWISE." ) self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - elif self.model_config is not None: - if self.model_config.pooler_config is not None: - logger.warning_once( - "Pooling models do not support full cudagraphs. " - "Overriding cudagraph_mode to PIECEWISE." - ) - self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - elif self.model_config.is_encoder_decoder: - logger.warning_once( - "Encoder-decoder models do not support full cudagraphs. " - "Overriding cudagraph_mode to PIECEWISE." 
- ) - self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE # disable cudagraph when enforce eager execution if self.model_config is not None and self.model_config.enforce_eager: diff --git a/vllm/distributed/eplb/__init__.py b/vllm/distributed/eplb/__init__.py index 4cd51dd384ad2..12e6cd417c50d 100644 --- a/vllm/distributed/eplb/__init__.py +++ b/vllm/distributed/eplb/__init__.py @@ -1,8 +1,3 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Expert parallelism load balancer (EPLB). -""" - -from .eplb_state import * -from .rebalance_algo import * +"""Expert parallelism load balancer (EPLB).""" diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 9f8798a96a2fc..c5654659b79d6 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -45,7 +45,7 @@ from vllm.logger import init_logger from vllm.model_executor.models.interfaces import MixtureOfExperts from .async_worker import start_async_worker -from .rebalance_algo import rebalance_experts +from .policy import EPLB_POLICIES, AbstractEplbPolicy, DefaultEplbPolicy from .rebalance_execute import move_from_buffer, rearrange_expert_weights_inplace logger = init_logger(__name__) @@ -213,18 +213,23 @@ class EplbState: self.parallel_config = parallel_config self.device = device self.model_states: dict[str, EplbModelState] = {} + self.policy: type[AbstractEplbPolicy] = DefaultEplbPolicy + """ + Selected EPLB algorithm class + """ + self.expert_load_window_step: int = 0 """ Current step in the sliding window. Different from `expert_rearrangement_step`, each EP rank may have its own `expert_load_window_step`. """ - self.expert_load_window_step: int = 0 + self.expert_load_window_size: int = 0 """ Size of the expert load sliding window. This is a constant and is taken from the config. 
""" - self.expert_load_window_size: int = 0 + self.expert_rearrangement_step: int = 0 """ Steps after last rearrangement. Will trigger a rearrangement if it exceeds the threshold. @@ -415,6 +420,10 @@ class EplbState: ) self.expert_rearrangement_step_interval = eplb_step_interval + # Set the policy based on the selected eplb algorithm type. + policy_type = self.parallel_config.eplb_config.policy + self.policy = EPLB_POLICIES[policy_type] + logger.debug("Selected EPLB policy: %d", policy_type) if global_expert_load is not None: ep_group = get_ep_group().device_group assert global_expert_load.shape == ( @@ -441,7 +450,7 @@ class EplbState: new_physical_to_logical_map, new_logical_to_physical_map, new_logical_replica_count, - ) = rebalance_experts( + ) = self.policy.rebalance_experts( global_expert_load, num_replicas, num_groups, @@ -776,6 +785,7 @@ class EplbState: f"{num_gpus=}, {num_nodes=}" ) + # Get new expert mappings for eplb_model_state, global_expert_load_window in zip( self.model_states.values(), global_expert_load_windows ): @@ -784,7 +794,7 @@ class EplbState: new_physical_to_logical_map, new_logical_to_physical_map, new_logical_replica_count, - ) = rebalance_experts( + ) = self.policy.rebalance_experts( global_expert_load_window, num_replicas, num_groups, diff --git a/vllm/distributed/eplb/policy/__init__.py b/vllm/distributed/eplb/policy/__init__.py new file mode 100644 index 0000000000000..8e78d7bac0e35 --- /dev/null +++ b/vllm/distributed/eplb/policy/__init__.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import get_args + +from vllm.config.parallel import EPLBPolicyOption + +from .abstract import AbstractEplbPolicy +from .default import DefaultEplbPolicy + +EPLB_POLICIES = {"default": DefaultEplbPolicy} + +# Ensure that the EPLB_POLICIES keys match the EPLBPolicyOption values +assert set(EPLB_POLICIES.keys()) == set(get_args(EPLBPolicyOption)) + +__all__ = [ 
+ "AbstractEplbPolicy", + "DefaultEplbPolicy", + "EPLB_POLICIES", +] diff --git a/vllm/distributed/eplb/policy/abstract.py b/vllm/distributed/eplb/policy/abstract.py new file mode 100644 index 0000000000000..40ed621c84892 --- /dev/null +++ b/vllm/distributed/eplb/policy/abstract.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from abc import ABC, abstractmethod + +import torch + + +class AbstractEplbPolicy(ABC): + @classmethod + @abstractmethod + def rebalance_experts( + cls, + weight: torch.Tensor, + num_replicas: int, + num_groups: int, + num_nodes: int, + num_ranks: int, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Entry point for expert-parallelism load balancer. + + Parameters: + weight: [layers, num_logical_experts], the load statistics + for all logical experts + num_replicas: number of physical experts, must be a multiple of + `num_ranks` + num_groups: number of expert groups + num_nodes: number of server nodes + num_ranks: number of ranks, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map: [layers, num_replicas], the expert + index of each replica + logical_to_physical_map: [layers, num_logical_experts, X], + the replica indices for each expert + expert_count: [layers, num_logical_experts], number of + physical replicas for each logical expert + """ + raise NotImplementedError diff --git a/vllm/distributed/eplb/policy/default.py b/vllm/distributed/eplb/policy/default.py new file mode 100644 index 0000000000000..6127ec703184a --- /dev/null +++ b/vllm/distributed/eplb/policy/default.py @@ -0,0 +1,267 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Expert parallelism load balancer (EPLB) for vLLM. + +This module implements the core rearrangement algorithm. + +The rearrangement algorithm is adapted from +[DeepSeek EPLB](https://github.com/deepseek-ai/eplb). 
+ +Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example +on how the EPLB algorithm works. +""" + +import numpy as np +import torch + +from .abstract import AbstractEplbPolicy + + +class DefaultEplbPolicy(AbstractEplbPolicy): + @classmethod + def balanced_packing( + cls, weight: torch.Tensor, num_packs: int + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Pack n weighted objects to m packs, such that each bin contains exactly + n/m objects and the weights of all packs are as balanced as possible. + + Parameters: + weight: [X, n], the weight of each item + num_packs: number of packs + + Returns: + pack_index: [X, n], the pack index of each item + rank_in_pack: [X, n], the rank of the item in the pack + """ + num_layers, num_groups = weight.shape + assert num_groups % num_packs == 0 + groups_per_pack = num_groups // num_packs + + device = weight.device + + if groups_per_pack == 1: + pack_index = torch.arange( + weight.size(-1), dtype=torch.int64, device=device + ).expand(weight.shape) + rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device) + return pack_index, rank_in_pack + + weight_np = weight.cpu().numpy() + + # Sort and get indices in decending order + indices_np = np.argsort(-weight_np, axis=-1) + + pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64) + rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64) + + # Run the packing algorithm + for i in range(num_layers): + pack_weights = [0.0] * num_packs + pack_items = [0] * num_packs + + for group in indices_np[i]: + # Find a pack with capacity that has the lowest weight + pack = min( + (j for j in range(num_packs) if pack_items[j] < groups_per_pack), + key=pack_weights.__getitem__, + ) + + assert pack_items[pack] < groups_per_pack + pack_index_np[i, group] = pack + rank_in_pack_np[i, group] = pack_items[pack] + pack_weights[pack] += weight_np[i, group] + pack_items[pack] += 1 + + pack_index = 
torch.from_numpy(pack_index_np).to(device) + rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device) + + return pack_index, rank_in_pack + + @classmethod + def replicate_experts( + cls, weight: torch.Tensor, num_phy: int + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Replicate `num_log` experts to `num_phy` replicas, such that the maximum + load of all replicas is minimized. + + Parameters: + weight: [X, num_log] + num_phy: total number of experts after replication + + Returns: + phy2log: [X, num_phy], logical expert id of each physical expert + rank: [X, num_phy], the replica rank + logcnt: [X, num_log], number of replicas for each logical expert + """ + n, num_log = weight.shape + num_redundant = num_phy - num_log + assert num_redundant >= 0 + device = weight.device + phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1) + rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device) + logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device) + arangen = torch.arange(n, dtype=torch.int64, device=device) + for i in range(num_log, num_phy): + redundant_indices = (weight / logcnt).max(dim=-1).indices + phy2log[:, i] = redundant_indices + rank[:, i] = logcnt[arangen, redundant_indices] + logcnt[arangen, redundant_indices] += 1 + return phy2log, rank, logcnt + + @classmethod + def rebalance_experts_hierarchical( + cls, + weight: torch.Tensor, + num_physical_experts: int, + num_groups: int, + num_nodes: int, + num_gpus: int, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + weight: [num_moe_layers, num_logical_experts] + num_physical_experts: number of physical experts after replication + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + phy2log: [layers, num_replicas], the expert + index of each replica + log2phy: [layers, 
num_logical_experts, X], + the replica indices for each expert + logcnt: [layers, num_logical_experts], number of + physical replicas for each logical expert + """ + num_layers, num_logical_experts = weight.shape + assert num_logical_experts % num_groups == 0 + group_size = num_logical_experts // num_groups + assert num_groups % num_nodes == 0 + groups_per_node = num_groups // num_nodes + assert num_gpus % num_nodes == 0 + assert num_physical_experts % num_gpus == 0 + phy_experts_per_gpu = num_physical_experts // num_gpus + + def inverse(perm: torch.Tensor) -> torch.Tensor: + inv = torch.empty_like(perm) + inv.scatter_( + 1, + perm, + torch.arange( + perm.size(1), dtype=torch.int64, device=perm.device + ).expand(perm.shape), + ) + return inv + + # Step 1: pack groups to nodes + tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1) + group_pack_index, group_rank_in_pack = cls.balanced_packing( + tokens_per_group, num_nodes + ) + log2mlog = ( + ( + (group_pack_index * groups_per_node + group_rank_in_pack) * group_size + ).unsqueeze(-1) + + torch.arange( + group_size, dtype=torch.int64, device=group_pack_index.device + ) + ).flatten(-2) + mlog2log = inverse(log2mlog) + + # Step 2: construct redundant experts within nodes + # [num_layers * num_nodes, num_logical_experts // num_nodes] + tokens_per_mlog = weight.gather(-1, mlog2log).view( + -1, num_logical_experts // num_nodes + ) + phy2mlog, phyrank, mlogcnt = cls.replicate_experts( + tokens_per_mlog, num_physical_experts // num_nodes + ) + + # Step 3: pack physical_experts to GPUs + # [num_layers * num_nodes, num_physical_experts // num_nodes] + tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog) + pack_index, rank_in_pack = cls.balanced_packing( + tokens_per_phy, num_gpus // num_nodes + ) + phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack + pphy2phy = inverse(phy2pphy) + + pphy2mlog = phy2mlog.gather( + -1, pphy2phy + ) # [num_layers * num_nodes, num_log_per_nodes] + 
pphy2mlog = ( + pphy2mlog.view(num_layers, num_nodes, -1) + + torch.arange( + 0, + num_logical_experts, + num_logical_experts // num_nodes, + device=group_pack_index.device, + ).view(1, -1, 1) + ).flatten(-2) + pphy2log = mlog2log.gather(-1, pphy2mlog) + pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1) + logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog) + return pphy2log, pphyrank, logcnt + + @classmethod + def rebalance_experts( + cls, + weight: torch.Tensor, + num_replicas: int, + num_groups: int, + num_nodes: int, + num_ranks: int, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Entry point for expert-parallelism load balancer. + + Parameters: + weight: [layers, num_logical_experts], the load statistics for all + logical experts + num_replicas: number of physical experts, must be a multiple of + `num_gpus` + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster + num_ranks: number of ranks, must be a multiple of `num_nodes` + + Returns: + phy2log: [layers, num_replicas], the expert + index of each replica + log2phy: [layers, num_logical_experts, X], + the replica indices for each expert + logcnt: [layers, num_logical_experts], number of + physical replicas for each logical expert + """ + num_layers, num_logical_experts = weight.shape + weight = weight.float() + if num_groups % num_nodes == 0: + # use hierarchical load-balance policy + phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical( + weight, num_replicas, num_groups, num_nodes, num_ranks + ) + else: + # use global load-balance policy + phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical( + weight, num_replicas, 1, 1, num_ranks + ) + num_redundant_experts = num_replicas - num_logical_experts + maxlogcnt = num_redundant_experts + 1 + log2phy: torch.Tensor = torch.full( + (num_layers, num_logical_experts, maxlogcnt), + -1, + dtype=torch.int64, + device=logcnt.device, + ) + 
log2phy.view(num_layers, -1).scatter_( + -1, + phy2log * maxlogcnt + phyrank, + torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand( + num_layers, -1 + ), + ) + return phy2log, log2phy, logcnt diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py deleted file mode 100644 index e6645e524cc3e..0000000000000 --- a/vllm/distributed/eplb/rebalance_algo.py +++ /dev/null @@ -1,260 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Expert parallelism load balancer (EPLB) for vLLM. - -This module implements the core rearrangement algorithm. - -The rearrangement algorithm is adapted from -[DeepSeek EPLB](https://github.com/deepseek-ai/eplb). - -Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example -on how the EPLB algorithm works. -""" - -import numpy as np -import torch - - -def balanced_packing( - weight: torch.Tensor, num_packs: int -) -> tuple[torch.Tensor, torch.Tensor]: - """ - Pack n weighted objects to m packs, such that each bin contains exactly - n/m objects and the weights of all packs are as balanced as possible. 
- - Parameters: - weight: [X, n], the weight of each item - num_packs: number of packs - - Returns: - pack_index: [X, n], the pack index of each item - rank_in_pack: [X, n], the rank of the item in the pack - """ - num_layers, num_groups = weight.shape - assert num_groups % num_packs == 0 - groups_per_pack = num_groups // num_packs - - device = weight.device - - if groups_per_pack == 1: - pack_index = torch.arange( - weight.size(-1), dtype=torch.int64, device=device - ).expand(weight.shape) - rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device) - return pack_index, rank_in_pack - - weight_np = weight.cpu().numpy() - - # Sort and get indices in decending order - indices_np = np.argsort(-weight_np, axis=-1) - - pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64) - rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64) - - # Run the packing algorithm - for i in range(num_layers): - pack_weights = [0.0] * num_packs - pack_items = [0] * num_packs - - for group in indices_np[i]: - # Find a pack with capacity that has the lowest weight - pack = min( - (j for j in range(num_packs) if pack_items[j] < groups_per_pack), - key=pack_weights.__getitem__, - ) - - assert pack_items[pack] < groups_per_pack - pack_index_np[i, group] = pack - rank_in_pack_np[i, group] = pack_items[pack] - pack_weights[pack] += weight_np[i, group] - pack_items[pack] += 1 - - pack_index = torch.from_numpy(pack_index_np).to(device) - rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device) - - return pack_index, rank_in_pack - - -def replicate_experts( - weight: torch.Tensor, num_phy: int -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Replicate `num_log` experts to `num_phy` replicas, such that the maximum - load of all replicas is minimized. 
- - Parameters: - weight: [X, num_log] - num_phy: total number of experts after replication - - Returns: - phy2log: [X, num_phy], logical expert id of each physical expert - rank: [X, num_phy], the replica rank - logcnt: [X, num_log], number of replicas for each logical expert - """ - n, num_log = weight.shape - num_redundant = num_phy - num_log - assert num_redundant >= 0 - device = weight.device - phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1) - rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device) - logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device) - arangen = torch.arange(n, dtype=torch.int64, device=device) - for i in range(num_log, num_phy): - redundant_indices = (weight / logcnt).max(dim=-1).indices - phy2log[:, i] = redundant_indices - rank[:, i] = logcnt[arangen, redundant_indices] - logcnt[arangen, redundant_indices] += 1 - return phy2log, rank, logcnt - - -def rebalance_experts_hierarchical( - weight: torch.Tensor, - num_physical_experts: int, - num_groups: int, - num_nodes: int, - num_gpus: int, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Parameters: - weight: [num_moe_layers, num_logical_experts] - num_physical_experts: number of physical experts after replication - num_groups: number of expert groups - num_nodes: number of server nodes, where the intra-node network - (e.g., NVLink) is faster - num_gpus: number of GPUs, must be a multiple of `num_nodes` - - Returns: - physical_to_logical_map (torch.Tensor): - [num_moe_layers, num_physical_experts] - logical_to_physical_map (torch.Tensor): - [num_moe_layers, num_logical_experts, X] - logical_count (torch.Tensor): - [num_moe_layers, num_logical_experts] - """ - num_layers, num_logical_experts = weight.shape - assert num_logical_experts % num_groups == 0 - group_size = num_logical_experts // num_groups - assert num_groups % num_nodes == 0 - groups_per_node = num_groups // num_nodes - assert num_gpus % num_nodes == 0 - assert 
num_physical_experts % num_gpus == 0 - phy_experts_per_gpu = num_physical_experts // num_gpus - - def inverse(perm: torch.Tensor) -> torch.Tensor: - inv = torch.empty_like(perm) - inv.scatter_( - 1, - perm, - torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand( - perm.shape - ), - ) - return inv - - # Step 1: pack groups to nodes - tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1) - group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes) - log2mlog = ( - ( - (group_pack_index * groups_per_node + group_rank_in_pack) * group_size - ).unsqueeze(-1) - + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device) - ).flatten(-2) - mlog2log = inverse(log2mlog) - - # Step 2: construct redundant experts within nodes - # [num_layers * num_nodes, num_logical_experts // num_nodes] - tokens_per_mlog = weight.gather(-1, mlog2log).view( - -1, num_logical_experts // num_nodes - ) - phy2mlog, phyrank, mlogcnt = replicate_experts( - tokens_per_mlog, num_physical_experts // num_nodes - ) - - # Step 3: pack physical_experts to GPUs - # [num_layers * num_nodes, num_physical_experts // num_nodes] - tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog) - pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes) - phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack - pphy2phy = inverse(phy2pphy) - - pphy2mlog = phy2mlog.gather( - -1, pphy2phy - ) # [num_layers * num_nodes, num_log_per_nodes] - pphy2mlog = ( - pphy2mlog.view(num_layers, num_nodes, -1) - + torch.arange( - 0, - num_logical_experts, - num_logical_experts // num_nodes, - device=group_pack_index.device, - ).view(1, -1, 1) - ).flatten(-2) - pphy2log = mlog2log.gather(-1, pphy2mlog) - pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1) - logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog) - return pphy2log, pphyrank, logcnt - - -def rebalance_experts( - weight: torch.Tensor, - 
num_replicas: int, - num_groups: int, - num_nodes: int, - num_gpus: int, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Entry point for expert-parallelism load balancer. - - Parameters: - weight: [layers, num_logical_experts], the load statistics for all - logical experts - num_replicas: number of physical experts, must be a multiple of - `num_gpus` - num_groups: number of expert groups - num_nodes: number of server nodes, where the intra-node network - (e.g, NVLink) is faster - num_gpus: number of GPUs, must be a multiple of `num_nodes` - - Returns: - physical_to_logical_map: - [layers, num_replicas], the expert index of each replica - logical_to_physical_map: - [layers, num_logical_experts, X], the replica indices for each - expert - expert_count: - [layers, num_logical_experts], number of physical - replicas for each logical expert - """ - num_layers, num_logical_experts = weight.shape - weight = weight.float() - if num_groups % num_nodes == 0: - # use hierarchical load-balance policy - phy2log, phyrank, logcnt = rebalance_experts_hierarchical( - weight, num_replicas, num_groups, num_nodes, num_gpus - ) - else: - # use global load-balance policy - phy2log, phyrank, logcnt = rebalance_experts_hierarchical( - weight, num_replicas, 1, 1, num_gpus - ) - num_redundant_experts = num_replicas - num_logical_experts - maxlogcnt = num_redundant_experts + 1 - log2phy: torch.Tensor = torch.full( - (num_layers, num_logical_experts, maxlogcnt), - -1, - dtype=torch.int64, - device=logcnt.device, - ) - log2phy.view(num_layers, -1).scatter_( - -1, - phy2log * maxlogcnt + phyrank, - torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand( - num_layers, -1 - ), - ) - return phy2log, log2phy, logcnt - - -__all__ = ["rebalance_experts"] diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git 
a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py deleted file mode 100644 index f48d03d0b0cd5..0000000000000 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This file contains a new class `KVLookupBufferBase` that allows developers to -think of KV cache operations as inserting new KV cache entries (`insert`) -into the lookup buffer and querying existing KV caches (`drop_select`) -from the lookup buffer. - -This file also contains a new class `KVStoreBufferBase` that allows developers -to manage the KVCache buffer as a simple key-value storage buffer with basic -put/get operations. - -These classes above are abstracted behind class `KVCacheBufferBase`. -""" - -from abc import ABC, abstractmethod - -import torch - - -class KVCacheBufferBase(ABC): - """ - Abstract base class for a KVCache buffer. - """ - - @abstractmethod - def close(self) -> None: - """Close the buffer and release resources. - - This method is responsible for cleaning up resources related to the - KVCache buffer when it is no longer needed. - - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError - - -class KVLookupBufferBase(KVCacheBufferBase): - """ - Abstract base class for a KVCache lookup buffer. - - This class provides an abstraction for a key-value (KV) cache lookup buffer. - - The key of the lookup buffer: - - input_tokens: token IDs of the request - - roi: a binary mask on top of input_tokens. - - Purpose of roi: Since KV cache may only be available for a subset of - tokens in the input (for example, when vLLM is connected to an external - KV cache service), roi specifies the subset of tokens that the KV cache - is associated with. 
- - NOTE: roi can be further extended to describe which part of KV the - current process is holding (each process may only hold a part of KV - due to TP and PP). This is not implemented for now. - - The value of the lookup buffer: - - key: the key tensor in the KV cache - - value: the value tensor in the KV cache - - hidden: the final hidden state generated by model forwarding. This allows - vLLM to bypass further model forwarding by transmitting the hidden state. - """ - - @abstractmethod - def insert( - self, - input_tokens: torch.Tensor, - roi: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - hidden: torch.Tensor, - ) -> None: - """Insert into the lookup buffer. - - The functionality is similar to the following python statement - ``` - buffer[input_tokens, roi] = [key, value, hidden] - ``` - - FIXME: in the future, we should only have two arguments, key and value, - where key is a tensor dict and value is a tensor dict. - - FIXME: we should transmit both sampler outputs and the hidden states. - - Args: - input_tokens (torch.Tensor): token IDs. - roi (torch.Tensor): A binary mask on top of the input tokens - key (torch.Tensor): The key tensor in the KV cache. - value (torch.Tensor): The value tensor in the KV cache. - hidden (torch.Tensor): The final hidden state tensor generated - during model forwarding to bypass model - forwarding. - - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError - - @abstractmethod - def drop_select( - self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None - ) -> list[torch.Tensor | None]: - """Select and *drop* KV cache entries from the lookup buffer. 
- - The functionality is similar to the following python statements - ``` - ret = buffer.pop(input_tokens, roi) - return ret - ``` - - If `input_tokens` and `roi` is `None`, it means selecting any of the - KV caches in the buffer, return, and remove it from the buffer, useful - when offloading KV cache to KV cache storage service. - - Args: - input_tokens (torch.Tensor): token IDs. - roi (torch.Tensor): A binary mask on top of the input tokens - - Returns: - list[Optional[torch.Tensor]]: A list of tensors. Can be None. - - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError - - -class KVStoreBufferBase(KVCacheBufferBase): - """ - Abstract base class for a KVCache storage buffer with key-value semantics. - This class provides a simple key-value storage buffer abstract with basic - put/get operations, which enables flexible KVCache transfer granular - control. - - The functionality is similar to a distributed key-value store, where: - - Key: A unique string identifier for the cached entry - - Value: - - Tensor to be stored and retrieved - - None (indicating deletion or empty value) - """ - - @abstractmethod - def put( - self, - key: str, - value: torch.Tensor | None, - ) -> None: - """Store a key-value pair in the buffer. - - Args: - key (str): Unique identifier for a tensor, this tensor could be the - key cache tensor, value cache tensor, or hidden state tensor - generated during model forwarding. - - value (Optional[torch.Tensor]): Tensor to be stored. - - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError - - @abstractmethod - def get( - self, - key: str, - ) -> torch.Tensor | None: - """Retrieve a value from the buffer by key. - - Args: - key (str): Unique identifier for a tensor, this tensor could be the - key cache tensor, value cache tensor, or hidden state tensor - generated during model forwarding. 
- - Returns: - Optional[torch.Tensor]: Stored tensor if exists, None otherwise. - - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py deleted file mode 100644 index 7861bea1f9c54..0000000000000 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +++ /dev/null @@ -1,164 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This file contains a new class `MooncakeStore` that allows developers to -think of KV cache transfer operations as putting new KV cache entries -into a remote KVStore-based lookup buffer and getting existing KV caches -from this remote lookup buffer. -""" - -import json -import os -from dataclasses import dataclass - -import torch -from safetensors.torch import load as safetensors_load -from safetensors.torch import save as safetensors_save - -from vllm.config import VllmConfig -from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVStoreBufferBase -from vllm.logger import init_logger - -DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB -DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB - -logger = init_logger(__name__) - - -@dataclass -class MooncakeStoreConfig: - local_hostname: str - metadata_server: str - global_segment_size: int - local_buffer_size: int - protocol: str - device_name: str - master_server_address: str - - @staticmethod - def from_file(file_path: str) -> "MooncakeStoreConfig": - """Load the config from a JSON file.""" - with open(file_path) as fin: - config = json.load(fin) - return MooncakeStoreConfig( - local_hostname=config.get("local_hostname"), - metadata_server=config.get("metadata_server"), - global_segment_size=config.get( - "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE - ), - local_buffer_size=config.get( - 
"local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE - ), - protocol=config.get("protocol", "tcp"), - device_name=config.get("device_name", ""), - master_server_address=config.get("master_server_address"), - ) - - @staticmethod - def load_from_env() -> "MooncakeStoreConfig": - """Load config from a file specified in the environment variable.""" - config_file_path = os.getenv("MOONCAKE_CONFIG_PATH") - if config_file_path is None: - raise ValueError( - "The environment variable 'MOONCAKE_CONFIG_PATH' is not set." - ) - return MooncakeStoreConfig.from_file(config_file_path) - - -class MooncakeStore(KVStoreBufferBase): - def __init__( - self, - config: VllmConfig, - ): - try: - from mooncake.store import MooncakeDistributedStore - except ImportError as e: - raise ImportError( - "Please install mooncake by following the instructions at " - "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501 - "to run vLLM with MooncakeConnector." - ) from e - - try: - self.store = MooncakeDistributedStore() - self.config = MooncakeStoreConfig.load_from_env() - logger.info("Mooncake Configuration loaded successfully.") - - self.store.setup( - self.config.local_hostname, - self.config.metadata_server, - self.config.global_segment_size, - self.config.local_buffer_size, - self.config.protocol, - self.config.device_name, - self.config.master_server_address, - ) - - except ValueError as e: - logger.error("Configuration loading failed: %s", e) - raise - except Exception as exc: - logger.error("An error occurred while loading the configuration: %s", exc) - raise - - def close(self): - # MooncakeDistributedStore will automatically call the destructor, so - # it is unnecessary to close it manually. - pass - - def put( - self, - key: str, - value: torch.Tensor | None, - ) -> None: - # A message queue needs to be introduced before making it asynchronous. 
- if value is not None: - self._put_impl(key, value) - - def get( - self, - key: str, - ) -> torch.Tensor | None: - # A message queue needs to be introduced before making it asynchronous. - value = self._get_impl(key) - return value - - def _put_impl( - self, - key: str, - value: torch.Tensor, - ) -> None: - """Put KVCache to Mooncake Store""" - device_id = value.device.index if value.device.type == "cuda" else -1 - device_tensor = torch.tensor(device_id, dtype=torch.int32) - value_bytes = safetensors_save({"tensor": value, "device_id": device_tensor}) - try: - self.store.put(key, value_bytes) - except TypeError as err: - logger.error("Failed to put value into Mooncake Store: %s", err) - raise TypeError("Mooncake Store Put Type Error.") from err - - def _get_impl( - self, - key: str, - ) -> torch.Tensor | None: - """Get KVCache from Mooncake Store""" - try: - data = self.store.get(key) - except TypeError as err: - logger.error("Failed to get value from Mooncake Store: %s", err) - raise TypeError("Mooncake Store Get Type Error.") from err - - if data: - loaded_tensors = safetensors_load(data) - tensor = loaded_tensors["tensor"] - device_id_tensor = loaded_tensors["device_id"] - device_id = int(device_id_tensor.item()) - device = ( - torch.device("cuda", device_id) - if device_id >= 0 - else torch.device("cpu") - ) - return tensor.to(device) - - return None diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py deleted file mode 100644 index f046a349874e6..0000000000000 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ /dev/null @@ -1,242 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -Implements a distributed key-value (KV) cache transfer mechanism. - -Key Features: -- Distributed KV cache transmission using PyNccl pipes. -- Non-blocking `insert`, blocking `drop_select`. 
-- Use CPU signal pipe to avoid racing condition -- Handles buffer size constraints and provide backpressure mechanism to - stop the prefill instance when the decode instance is slow. -""" - -import threading -from collections import deque - -import torch - -from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVLookupBufferBase -from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class SimpleBuffer(KVLookupBufferBase): - def __init__( - self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: float - ): - """ - signal_pipe: on CPU - - NOTE: on-device recv will block all threads in the process, making the - KV cache producer unable to listen to new request while transmitting - KV cache. Luckily CPU recv only blocks the current thread so we use - CPU recv to listen to new request. - - data_pipe: on device (e.g. GPU) - """ - - self.buffer: deque[list[torch.Tensor]] = deque() - - self.buffer_size = 0 - self.buffer_size_threshold = buffer_size_thresh - self.buffer_cv = threading.Condition() - self.signal_pipe = signal_pipe - self.data_pipe = data_pipe - self.request_handling_thread: threading.Thread | None = None - - self.normal_signal = torch.tensor([0], device="cpu") - self.end_signal = None - - def _matches( - self, - tokens_roi_sender: list[torch.Tensor], - tokens_roi_recver: list[torch.Tensor], - ): - # tokens_roi_sender: tokens and roi of the producer (in the buffer) - # tokens_roi_recver: tokens and roi of the consumer (query) - - tokens_sender = tokens_roi_sender[0] - tokens_recver = tokens_roi_recver[0] - roi_sender = tokens_roi_sender[1] - roi_recver = tokens_roi_recver[1] - - if tokens_recver is None: - # consumer sends an empty request - # semantics: DROP SELECT * LIMIT 1 - # so any of the data in the buffer can be drop-selected - return True - - # Assuming that roi is a binary mask on tokens - tokens_sender = tokens_sender[roi_sender] - 
tokens_recver = tokens_recver[roi_recver] - - # simple common prefix matching - min_length = min(len(tokens_sender), len(tokens_recver)) - if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]): - return min_length - - return 0 - - def _send_tensor_and_dec_size(self, tensor: torch.Tensor | None) -> None: - assert tensor is not None, "Use self.data_pipe.send(None) instead" - self.buffer_size -= tensor.element_size() * tensor.numel() - if tensor.dtype == torch.bool: - tensor = tensor.float() - self.data_pipe.send_tensor(tensor) - - def _get_element_size(self, data: list | torch.Tensor | None): - if isinstance(data, torch.Tensor): - return data.element_size() * data.numel() - if not data: - # cannot perform `not data` on a tensor - # so this check needs to go after the check above - return 0 - - raise AssertionError(f"Unknown data type {type(data)}") - - def _add_to_buffer( - self, - input_tokens: torch.Tensor, - roi: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - hidden: torch.Tensor, - ): - if isinstance(input_tokens, torch.Tensor): - input_tokens = input_tokens.clone() - if isinstance(roi, torch.Tensor): - roi = roi.clone() - if isinstance(key, torch.Tensor): - key = key.clone() - if isinstance(value, torch.Tensor): - value = value.clone() - if isinstance(hidden, torch.Tensor): - hidden = hidden.clone() - - buffer_item = [input_tokens, roi, key, value, hidden] - data_size = sum([self._get_element_size(data) for data in buffer_item]) - - with self.buffer_cv: - if self.buffer_size + data_size > self.buffer_size_threshold: - # log outside the while loop to avoid this message being logged - # repeatedly. - logger.debug("KV transfer buffer is full. 
Handling...") - while self.buffer_size + data_size > self.buffer_size_threshold: - self.buffer_cv.wait() - - self.buffer_size += data_size - self.buffer.append(buffer_item) - self.buffer_cv.notify() - - def _is_end_signal(self, signal): - return signal is None - - def drop_select_handler(self): - try: - while True: - signal = self.signal_pipe.recv_tensor() - if self._is_end_signal(signal): - logger.info("Received end signal!") - break - - input_tokens = self.data_pipe.recv_tensor() - - roi = self.data_pipe.recv_tensor() - assert roi is not None, ( - "Please provide the roi when sending drop-select request" - ) - roi = roi > 0.5 - tokens_roi_recver = [input_tokens, roi] - - def is_buffer_available( - tokens_roi_recver: list[torch.Tensor], - ) -> bool: - # perform input tokens and roi matching - # FIXME: this matching is O(n), ideally it should be O(1) - # but this buffer size won't (and shouldn't) be too large so - # the fix is not urgent. - for _ in range(len(self.buffer)): - if self._matches(self.buffer[0], tokens_roi_recver) > 0: - return True - # rotate the element we just accessed to the end - self.buffer.rotate(-1) - return False - - with self.buffer_cv: - while not is_buffer_available(tokens_roi_recver): - logger.debug("KV transfer buffer is not available. Waiting...") - self.buffer_cv.wait() - # need to clone the tensor - # in case the tensor is freed before sending finishes - matched_item = self.buffer.popleft() - for tensor in matched_item: - self._send_tensor_and_dec_size(tensor) - self.buffer_cv.notify() - - except RuntimeError as e: - if "Connection closed by peer" not in str(e): - raise e - - logger.debug("Closing drop_select_handler") - - def drop_select( - self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None - ) -> list[torch.Tensor | None]: - assert self.request_handling_thread is None, ( - "drop_select should be called by the KV cache consumer " - "(e.g. 
the decode vLLM instance)" - ) - - if isinstance(input_tokens, torch.Tensor): - input_tokens = input_tokens.clone() - if isinstance(roi, torch.Tensor): - roi = roi.clone().float() - - self.signal_pipe.send_tensor(self.normal_signal) - self.data_pipe.send_tensor(input_tokens) - self.data_pipe.send_tensor(roi) - - input_tokens = self.data_pipe.recv_tensor() - roi = self.data_pipe.recv_tensor() - if roi is not None: - # convert from float tensor to bool tensor - # as PyNccl does not support sending bool tensor - roi = roi > 0.5 - key = self.data_pipe.recv_tensor() - value = self.data_pipe.recv_tensor() - hidden = self.data_pipe.recv_tensor() - - return [input_tokens, roi, key, value, hidden] - - def insert( - self, - input_tokens: torch.Tensor, - roi: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - hidden: torch.Tensor, - ) -> None: - self._add_to_buffer(input_tokens, roi, key, value, hidden) - - # when calling the insert, the current process is a sender - # need to launch the request handler and start listening to request. 
- if self.request_handling_thread is None: - self.request_handling_thread = threading.Thread( - target=self.drop_select_handler - ) - self.request_handling_thread.start() - - def close(self): - if ( - hasattr(self, "request_handling_thread") - and self.request_handling_thread is not None - ): - self.request_handling_thread.join() - - else: - # TODO: have a explicit close signal and have a explicit way to - # check if it's requester - self.signal_pipe.send_tensor(self.end_signal) diff --git a/vllm/distributed/kv_transfer/kv_pipe/__init__.py b/vllm/distributed/kv_transfer/kv_pipe/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py deleted file mode 100644 index 1fe7a90e9a712..0000000000000 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This file defines an interface `KVPipeBase` -that provides an abstraction for sending and receiving tensors, or None, via -distributed communications. - -All classes instantiated from this interface are assumed to be a FIFO pipe. - -If your distributed communication platform already supports key-value lookup, -you can bypass this interface and directly start from `kv_lookup_buffer`. -""" - -from abc import ABC, abstractmethod - -import torch - - -class KVPipeBase(ABC): - """ - This class provides an interface for sending and receiving tensors, or - None, by distributed communications. - """ - - @abstractmethod - def send_tensor(self, tensor: torch.Tensor | None) -> None: - """Send a tensor, or None, via the pipe. - - Need to support sending None -- important for error handling. - - TODO: add a `key` argument so that we can use traditional - key-value database as the distributed communication mechanism behind - the pipe. 
- - Args: - tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None. - - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError - - @abstractmethod - def recv_tensor(self) -> torch.Tensor | None: - """Receive a tensor (can be None) from the pipeline. - - Returns: - Optional[torch.Tensor]: The tensor received from the pipeline. Can - be None. - - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError - - @abstractmethod - def close(self) -> None: - """Close the pipeline and release resources. - - This method is responsible for closing the communication pipeline - and releasing any resources associated with it. - - Raises: - NotImplementedError: This method must be implemented in subclasses. - """ - raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py deleted file mode 100644 index 542dde09abad4..0000000000000 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +++ /dev/null @@ -1,295 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import json -import os -import struct -from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass - -import torch -import zmq -from safetensors.torch import load as safetensors_load -from safetensors.torch import save as safetensors_save - -from vllm.config.kv_transfer import KVTransferConfig -from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase -from vllm.logger import init_logger -from vllm.utils.network_utils import join_host_port, make_zmq_path, split_host_port - -logger = init_logger(__name__) -NONE_INT = -150886311 - - -@dataclass -class MooncakeTransferEngineConfig: - prefill_url: str - decode_url: str - metadata_backend: str | None - metadata_server: str - protocol: str - device_name: str - - 
@staticmethod - def from_file(file_path: str) -> "MooncakeTransferEngineConfig": - """Load the config from a JSON file.""" - with open(file_path) as fin: - config = json.load(fin) - return MooncakeTransferEngineConfig( - prefill_url=config.get("prefill_url"), - decode_url=config.get("decode_url"), - metadata_backend=config.get("metadata_backend", None), - metadata_server=config.get("metadata_server"), - protocol=config.get("protocol", "tcp"), - device_name=config.get("device_name", ""), - ) - - @staticmethod - def load_from_env() -> "MooncakeTransferEngineConfig": - """Load config from a file specified in the environment variable.""" - config_file_path = os.getenv("MOONCAKE_CONFIG_PATH") - if config_file_path is None: - raise ValueError( - "The environment variable 'MOONCAKE_CONFIG_PATH' is not set." - ) - return MooncakeTransferEngineConfig.from_file(config_file_path) - - -class MooncakeTransferEngine: - """Handles the transfer of data using mooncake_vllm_adaptor and ZeroMQ.""" - - def __init__(self, kv_rank: int, local_rank: int): - try: - from mooncake.engine import TransferEngine - except ImportError as e: - raise ImportError( - "Please install mooncake by following the instructions at " - "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501 - "to run vLLM with MooncakeConnector." 
- ) from e - - self.engine = TransferEngine() - self.local_rank = local_rank - - try: - self.config = MooncakeTransferEngineConfig.load_from_env() - logger.info("Mooncake Configuration loaded successfully.") - except ValueError as e: - logger.error(e) - raise - except Exception as exc: - logger.error("An error occurred while loading the configuration: %s", exc) - raise - prefill_host, base_prefill_port = split_host_port(self.config.prefill_url) - decode_host, base_decode_port = split_host_port(self.config.decode_url) - - # Avoid ports conflict when running prefill and decode on the same node - if prefill_host == decode_host and base_prefill_port == base_decode_port: - base_decode_port = base_decode_port + 100 - - prefill_port = base_prefill_port + self.local_rank - decode_port = base_decode_port + self.local_rank - self.prefill_url = join_host_port(prefill_host, prefill_port) - self.decode_url = join_host_port(decode_host, decode_port) - - self.initialize( - self.prefill_url if kv_rank == 0 else self.decode_url, - self.config.metadata_server, - self.config.protocol, - self.config.device_name, - self.config.metadata_backend, - ) - - self.remote_url = self.decode_url if kv_rank == 0 else self.prefill_url - - # Initialize ZeroMQ context and sockets - self.context = zmq.Context() # type: ignore[attr-defined] - self.sender_socket = self.context.socket(zmq.constants.PUSH) - self.receiver_socket = self.context.socket(zmq.constants.PULL) - self.sender_ack = self.context.socket(zmq.constants.PULL) - self.receiver_ack = self.context.socket(zmq.constants.PUSH) - - self.buffer_cleaner = ThreadPoolExecutor(max_workers=1) - self._setup_metadata_sockets( - kv_rank, prefill_host, base_prefill_port, decode_host, base_decode_port - ) - - def _setup_metadata_sockets( - self, kv_rank: int, p_host: str, p_port: int, d_host: str, d_port: int - ) -> None: - """Set up ZeroMQ sockets for sending and receiving data.""" - # Offsets < 8 are left for initialization in case tp and pp are 
enabled - p_rank_offset = p_port + 8 + self.local_rank * 2 - d_rank_offset = d_port + 8 + self.local_rank * 2 - if kv_rank == 0: - self.sender_socket.bind(make_zmq_path("tcp", p_host, p_rank_offset + 1)) - self.receiver_socket.connect( - make_zmq_path("tcp", d_host, d_rank_offset + 1) - ) - self.sender_ack.connect(make_zmq_path("tcp", d_host, d_rank_offset + 2)) - self.receiver_ack.bind(make_zmq_path("tcp", p_host, p_rank_offset + 2)) - else: - self.receiver_socket.connect( - make_zmq_path("tcp", p_host, p_rank_offset + 1) - ) - self.sender_socket.bind(make_zmq_path("tcp", d_host, d_rank_offset + 1)) - self.receiver_ack.bind(make_zmq_path("tcp", d_host, d_rank_offset + 2)) - self.sender_ack.connect(make_zmq_path("tcp", p_host, p_rank_offset + 2)) - - def initialize( - self, - local_hostname: str, - metadata_server: str, - protocol: str, - device_name: str, - metadata_backend: str | None, - ) -> None: - """Initialize the mooncake instance.""" - if metadata_backend is None: - self.engine.initialize( - local_hostname, metadata_server, protocol, device_name - ) - else: - supported_backend = ["etcd", "redis"] - metadata_backend = metadata_backend.lower() - if metadata_backend not in supported_backend: - raise ValueError( - "Mooncake Configuration error. `metadata_backend`" - f" should be one of {supported_backend}." 
- ) - - self.engine.initialize_ext( - local_hostname, metadata_server, protocol, device_name, metadata_backend - ) - - def allocate_managed_buffer(self, length: int) -> int: - """Allocate a managed buffer of the specified length.""" - ret = self.engine.allocate_managed_buffer(length) - if ret <= 0: - logger.error("Allocation Return Error") - raise Exception("Allocation Return Error") - return ret - - def free_managed_buffer(self, buffer: int, length: int) -> int: - """Free a previously allocated managed buffer.""" - return self.engine.free_managed_buffer(buffer, length) - - def transfer_sync(self, buffer: int, peer_buffer_address: int, length: int) -> int: - """Synchronously transfer data to the specified address.""" - ret = self.engine.transfer_sync_read( - self.remote_url, buffer, peer_buffer_address, length - ) - if ret < 0: - logger.error("Transfer Return Error") - raise Exception("Transfer Return Error") - return ret - - def write_bytes_to_buffer(self, buffer: int, user_data: bytes, length: int) -> int: - """Write bytes to the allocated buffer.""" - return self.engine.write_bytes_to_buffer(buffer, user_data, length) - - def read_bytes_from_buffer(self, buffer: int, length: int) -> bytes: - """Read bytes from the allocated buffer.""" - return self.engine.read_bytes_from_buffer(buffer, length) - - def wait_for_ack(self, src_ptr: int, length: int) -> None: - """Asynchronously wait for ACK from the receiver.""" - ack = self.sender_ack.recv() - if ack != b"ACK": - logger.error("Failed to receive ACK from the receiver") - - self.free_managed_buffer(src_ptr, length) - - def send_bytes(self, user_data: bytes) -> None: - """Send bytes to the remote process.""" - length = len(user_data) - src_ptr = self.allocate_managed_buffer(length) - self.write_bytes_to_buffer(src_ptr, user_data, length) - self.sender_socket.send_multipart( - [struct.pack("!Q", src_ptr), struct.pack("!Q", length)] - ) - self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length) - - def 
recv_bytes(self) -> bytes: - """Receive bytes from the remote process.""" - data = self.receiver_socket.recv_multipart() - src_ptr = struct.unpack("!Q", data[0])[0] - length = struct.unpack("!Q", data[1])[0] - dst_ptr = self.allocate_managed_buffer(length) - self.transfer_sync(dst_ptr, src_ptr, length) - ret = self.read_bytes_from_buffer(dst_ptr, length) - - # Buffer cleanup - self.receiver_ack.send(b"ACK") - self.free_managed_buffer(dst_ptr, length) - - return ret - - -class MooncakePipe(KVPipeBase): - """MooncakeTransferEngine based Pipe implementation.""" - - def __init__( - self, local_rank: int, config: KVTransferConfig, device: str | None = None - ): - """Initialize the mooncake pipe and set related parameters.""" - self.config = config - self.local_rank = local_rank - self.kv_rank = self.config.kv_rank - assert self.kv_rank is not None - if device is None: - self.device = self._select_device(self.config.kv_buffer_device) - else: - self.device = self._select_device(device) - - self.transfer_engine = MooncakeTransferEngine(self.kv_rank, self.local_rank) - self.transport_thread: ThreadPoolExecutor | None = None - self.none_tensor = torch.tensor([NONE_INT], device=self.device) - - def _select_device(self, device: str) -> torch.device: - """Select available device (CUDA or CPU).""" - logger.info("Selecting device: %s", device) - if device == "cuda": - return torch.device(f"cuda:{self.local_rank}") - else: - return torch.device("cpu") - - def tensor_hash(self, tensor: torch.Tensor) -> int: - """Calculate the hash value of the tensor.""" - return hash(tensor.data_ptr()) - - def _send_impl(self, tensor: torch.Tensor) -> None: - """Implement the tensor sending logic using safetensors.""" - self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor})) - - def _recv_impl(self) -> torch.Tensor: - """Implement the tensor receiving logic using safetensors.""" - data = self.transfer_engine.recv_bytes() - return safetensors_load(data)["tensor"].to(self.device) - - 
def send_tensor(self, tensor: torch.Tensor | None) -> None: - """Send tensor to the target process.""" - if self.transport_thread is None: - self.transport_thread = ThreadPoolExecutor(max_workers=1) - tensor = tensor if tensor is not None else self.none_tensor - assert len(tensor.shape) > 0 - self.transport_thread.submit(self._send_impl, tensor) - - def recv_tensor(self) -> torch.Tensor | None: - """Receive tensor from other processes.""" - if self.transport_thread is None: - self.transport_thread = ThreadPoolExecutor(max_workers=1) - tensor = self.transport_thread.submit(self._recv_impl).result() - if tensor.numel() == 1 and tensor.item() == NONE_INT: - return None - else: - return tensor - - def close(self) -> None: - """Cleanup logic when closing the pipe.""" - self.transfer_engine.sender_socket.close() - self.transfer_engine.receiver_socket.close() - self.transfer_engine.sender_ack.close() - self.transfer_engine.receiver_ack.close() - self.transfer_engine.context.term() # Terminate the ZMQ context - logger.info("Closed the transfer engine and cleaned up resources.") diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py deleted file mode 100644 index 526c5cd1d5278..0000000000000 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ /dev/null @@ -1,285 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This module implements a PyNccl pipe for sending and receiving -Optional[torch.Tensor] between distributed ranks with advanced -communication features. 
- -Key Features: -- Supports sending and receiving tensors with metadata -- Handles both CUDA and CPU device communications -- Implements a non-blocking tensor transfer mechanism -- Manages buffer size and provides backpressure control -- Supports distributed process groups with configurable parameters -""" - -import threading -import time -from collections.abc import Callable -from concurrent.futures import ThreadPoolExecutor - -import torch - -from vllm.config.kv_transfer import KVTransferConfig -from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator -from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase -from vllm.distributed.utils import StatelessProcessGroup -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class BrokenPipeException(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) - - -Metadata = dict[str, torch.Tensor | None] - - -class PyNcclPipe(KVPipeBase): - METADATA_LENGTH = 16 - MAX_TENSOR_DIMENSIONS = 14 - METADATA_DTYPE = torch.int64 - - def __init__( - self, - local_rank: int, - config: KVTransferConfig, - device: str | None = None, - port_offset: int = 0, - ): - self.config = config - self.local_rank = local_rank - self.kv_rank = self.config.kv_rank - assert self.kv_rank is not None - self.kv_parallel_size = self.config.kv_parallel_size - if device is None: - self.device = self._select_device(self.config.kv_buffer_device) - else: - self.device = self._select_device(device) - - # build distributed connection and send/recv implementation - store_timeout = self.config.get_from_extra_config("store_timeout", 300) - self.group = StatelessProcessGroup.create( - host=self.config.kv_ip, - port=self.config.kv_port + port_offset, - rank=self.kv_rank, - world_size=self.kv_parallel_size, - store_timeout=store_timeout, - ) - # add a barrier to make sure the connection is initiated properly - self.group.barrier() - impl = 
self._get_device_send_recv_impl(self.group) - self.device_send_func, self.device_recv_func = impl - # set target rank - self.target_rank_for_send = (self.kv_rank + 1) % self.kv_parallel_size - self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size - - # transportation-related variables - self.transport_thread: ThreadPoolExecutor | None = None - self.buffer_size = 0 - self.buffer_size_lock = threading.Lock() - self.buffer_size_thresh = self.config.kv_buffer_size - - def _get_device_send_recv_impl( - self, group: StatelessProcessGroup - ) -> tuple[ - Callable[[torch.Tensor, int], None], Callable[[torch.Tensor, int], None] - ]: - send: Callable[[torch.Tensor, int], None] - recv: Callable[[torch.Tensor, int], None] - if self.device.type == "cuda": - # use PyNCCL for send / recv - comm = PyNcclCommunicator(group, device=self.local_rank) - comm.disabled = False - send, recv = comm.send, comm.recv # type: ignore - else: - # This send / recv implementation here is NOT intended to transfer - # KV caches (and should NOT be repurposed to transfer KV caches). - # Currently it is only used to transmit control-plane messages - # for PyNcclBuffer. - send = group.send_obj - - def my_recv(x, src): - x[...] = group.recv_obj(src) - - recv = my_recv - - return send, recv - - def _select_device(self, device: str): - logger.info("Selecting device: %s", device) - if device == "cuda": - return torch.device(f"cuda:{self.local_rank}") - else: - return torch.device("cpu") - - def _make_metadata(self, tensor: torch.Tensor | None) -> Metadata: - """ - Create the metadata as a dictionary based on the input tensor. - - Args: - tensor: The input tensor or None if no tensor is provided. - - Returns: - metadata: A dictionary with the following keys: - - "dtype": The data type of the tensor or None. - - "shape": The shape of the tensor or None. 
- """ - if tensor is None: - return {"dtype": None, "shape": None} - else: - return {"dtype": tensor.dtype, "shape": tensor.shape} - - def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor: - """ - Create a buffer to receive the tensor based on the provided metadata. - - Args: - metadata: A dictionary with keys "dtype" and "shape", - describing the tensor's data type and shape. - - Returns: - buffer: A tensor of the specified type and shape, - allocated on `self.device`. - """ - return torch.empty( - metadata["shape"], dtype=metadata["dtype"], device=self.device - ) - - def _send_metadata(self, metadata: Metadata): - """ - Send the metadata dictionary to the target rank. - - Args: - metadata: A dictionary with keys "dtype" and "shape". - """ - self.group.send_obj(metadata, self.target_rank_for_send) - - def _recv_metadata(self) -> Metadata: - """ - Receive the metadata dictionary from the target rank. - - Returns: - metadata: A dictionary with keys "dtype" and "shape" - describing the tensor. - """ - return self.group.recv_obj(self.target_rank_for_recv) - - def _send_impl(self, tensor: torch.Tensor | None) -> None: - """ - The actual implementation of sending the tensor and its metadata to the - target rank. - - Args: - tensor: The input tensor to be sent, or `None` if no tensor is - being sent. - """ - metadata = self._make_metadata(tensor) - self._send_metadata(metadata) - if tensor is not None: - self.device_send_func(tensor.to(self.device), self.target_rank_for_send) - - def _recv_impl(self) -> torch.Tensor | None: - """ - The actual implementation of receiving a tensor and its metadata from - the target rank. - - Returns: - buffer: The received tensor, or `None` if no tensor is received. 
- """ - metadata = self._recv_metadata() - if metadata["dtype"] is None: - return None - buffer = self._prepare_recv_buffer(metadata) - self.device_recv_func(buffer, self.target_rank_for_recv) - - return buffer - - def send_tensor_wrapper( - self, tensor: torch.Tensor | None, tensor_size: int - ) -> None: - """ - Wrapper for _send_impl to handle exceptions and update buffer size. - """ - try: - self._send_impl(tensor) - - with self.buffer_size_lock: - self.buffer_size -= tensor_size - except Exception as e: - logger.error( - "[rank%d]: Exception when trying to send %s, msg: %s", - torch.distributed.get_rank(), - str(tensor), - str(e), - ) - import traceback - - traceback.print_exc() - - def block_if_full(self): - """ - Block the current thread if the buffer size is larger than the - threshold. - """ - while self.buffer_size > self.buffer_size_thresh: - logger.debug("KV cache transfer pipe is full. Waiting...") - time.sleep(0.05) - - def send_tensor(self, tensor: torch.Tensor | None) -> None: - """ - Sends a tensor and its metadata to the destination rank in a - non-blocking way. - - Args: - tensor: The tensor to send, or `None` if no tensor is being sent. - """ - if self.transport_thread is None: - self.transport_thread = ThreadPoolExecutor(max_workers=1) - - if tensor is not None: - tensor_size = tensor.element_size() * tensor.numel() - else: - tensor_size = 0 - - self.block_if_full() - - with self.buffer_size_lock: - self.buffer_size += tensor_size - - self.transport_thread.submit(self.send_tensor_wrapper, tensor, tensor_size) - - def recv_tensor(self) -> torch.Tensor | None: - """ - Receives a tensor and its metadata from the source rank. Blocking call. - - Returns: - The received tensor, or `None` if no tensor is received. 
- """ - if self.transport_thread is None: - self.transport_thread = ThreadPoolExecutor(max_workers=1) - - future = self.transport_thread.submit(self._recv_impl) - - try: - tensor = future.result() - except Exception as e: - logger.error("Encountering exception in KV receiving thread") - logger.error("%s", e) - logger.error("My device: %s", self.device) - import traceback - - traceback.print_exc() - raise e - - return tensor - - def close(self): - """ - Close the pipe and release associated resources. - """ - if hasattr(self, "transport_thread") and self.transport_thread is not None: - self.transport_thread.shutdown() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 978f553d7b8a1..8065b609ca5b8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -520,6 +520,9 @@ class EngineArgs: ObservabilityConfig, "kv_cache_metrics_sample" ) cudagraph_metrics: bool = ObservabilityConfig.cudagraph_metrics + enable_layerwise_nvtx_tracing: bool = ( + ObservabilityConfig.enable_layerwise_nvtx_tracing + ) scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls @@ -1032,6 +1035,10 @@ class EngineArgs: "--cudagraph-metrics", **observability_kwargs["cudagraph_metrics"], ) + observability_group.add_argument( + "--enable-layerwise-nvtx-tracing", + **observability_kwargs["enable_layerwise_nvtx_tracing"], + ) # Scheduler arguments scheduler_kwargs = get_kwargs(SchedulerConfig) @@ -1711,6 +1718,7 @@ class EngineArgs: kv_cache_metrics=self.kv_cache_metrics, kv_cache_metrics_sample=self.kv_cache_metrics_sample, cudagraph_metrics=self.cudagraph_metrics, + enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing, ) # Compilation config overrides diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index bb932e39e0472..7da0914ce3d3e 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -455,11 +455,13 @@ def 
parse_output_message(message: Message) -> list[ResponseOutputItem]: output_items.extend(_parse_function_call(message, recipient)) # Built-in tools on commentary channel are treated as reasoning for now - elif recipient is not None and ( - recipient.startswith("python") - or recipient.startswith("browser") - or recipient.startswith("container") + elif ( + recipient is None # Preambles: explanatory text before tool calls + or recipient.startswith(("python", "browser", "container")) ): + # Per Harmony format, commentary channel can contain preambles to calling + # multiple functions - explanatory text with no recipient. Built-in tool + # recipients (python/browser/container) also generate reasoning output. output_items.extend(_parse_reasoning_content(message)) else: raise ValueError(f"Unknown recipient: {recipient}") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index cecd1da1e5548..9b7bc461e4511 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1072,10 +1072,15 @@ class OpenAIServingChat(OpenAIServing): # wasn't ready to send a token, then # get the next token without streaming a chunk if delta_message is None: - if output.finish_reason is None: + # NOTE: If return_token_ids is enabled, we still need to + # send a chunk with token_ids even if delta_message is None + # to ensure all tokens are included in the response + if ( + output.finish_reason is None + and not request.return_token_ids + ): continue - else: - delta_message = DeltaMessage() + delta_message = DeltaMessage() # Log streaming delta if output logging is enabled if self.enable_log_outputs and self.request_logger: diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 1826fafa8c4f5..e52845dfa246d 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -345,6 +345,10 @@ class 
FusedMoEQuantConfig: def use_mxfp4_w4a16(self) -> bool: return self._a1.dtype is None and self._w1.dtype == "mxfp4" + @property + def use_mxfp4_w4a4(self) -> bool: + return self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4" + @property def use_nvfp4_w4a4(self) -> bool: return self.quant_dtype == "nvfp4" diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 902a77987d61a..6001b6d83c398 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -863,7 +863,8 @@ class FusedMoE(CustomOp): use_chunked_impl: bool, ) -> tuple[bool, torch.Tensor | None]: use_shared_experts_stream = ( - has_separate_shared_experts + current_platform.is_cuda() + and has_separate_shared_experts and not use_chunked_impl and self.shared_experts_stream is not None and ( diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 8f05828d74f5f..882ad0a537cd5 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -221,8 +221,8 @@ def rocm_aiter_fused_experts( else: quant_method = QuantMethod.NO.value - # quark moe for mxfp4 w_dtype - if quant_config.use_mxfp4_w4a16: + # quark moe for mxfp4 w_dtype mxfp4 a_dtype + if quant_config.use_mxfp4_w4a4: quant_method = QuantMethod.BLOCK_1X32.value # w8a8 block-scaled if quant_config.block_shape is not None and quant_config.use_fp8_w8a8: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 48223c9f103ea..0e3e13f5945ea 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -124,12 +124,16 @@ class Fp8MoeBackend(Enum): def get_fp8_moe_backend( - block_quant: bool, moe_parallel_config: FusedMoEParallelConfig + block_quant: bool, + moe_parallel_config: 
FusedMoEParallelConfig, + with_lora_support: bool, ) -> Fp8MoeBackend: """ Select the primary FP8 MoE backend Note: Shape-specific fallbacks may still occur at runtime. """ + if with_lora_support: + return Fp8MoeBackend.TRITON # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100. if ( current_platform.is_cuda() @@ -665,7 +669,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): self.weight_block_size = self.quant_config.weight_block_size self.block_quant: bool = self.weight_block_size is not None self.fp8_backend = get_fp8_moe_backend( - self.block_quant, layer.moe_parallel_config + self.block_quant, layer.moe_parallel_config, self.moe.is_lora_enabled ) self.marlin_input_dtype = None @@ -1084,6 +1088,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): from vllm.model_executor.layers.fused_moe import ( BatchedDeepGemmExperts, BatchedTritonExperts, + TritonExperts, TritonOrDeepGemmExperts, ) @@ -1116,7 +1121,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): num_dispatchers=prepare_finalize.num_dispatchers(), quant_config=self.moe_quant_config, ) - + elif self.moe.is_lora_enabled: + return TritonExperts(quant_config=self.moe_quant_config) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: # Select GEMM experts with block-scale when weights are block-quantized experts = select_cutlass_fp8_gemm_impl( diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index aa6ece30026d3..4dff984f92be6 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -30,7 +30,6 @@ def get_rope( is_neox_style: bool = True, rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype | None = None, - partial_rotary_factor: float = 1.0, dual_chunk_attention_config: dict[str, Any] | None = None, ) -> RotaryEmbedding: if dtype is None: @@ -55,6 +54,10 @@ def get_rope( else: dual_chunk_attention_args = None + partial_rotary_factor = 
1.0 + if rope_parameters is not None: + partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) + if partial_rotary_factor < 1.0: rotary_dim = int(rotary_dim * partial_rotary_factor) key = ( diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 4a69787af55e2..2a8be29d8d306 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -148,8 +148,6 @@ class ApertusAttention(nn.Module): if head_dim is None: head_dim = self.hidden_size // self.total_num_heads self.head_dim = head_dim - # Phi models introduced a partial_rotary_factor parameter in the config - self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1) self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -228,11 +226,10 @@ class ApertusAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=int(self.partial_rotary_factor * self.head_dim), + rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, - partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index f7a5d4e7889e5..0143e140af265 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -127,8 +127,6 @@ class BailingAttention(nn.Module): prefix=f"{prefix}.dense", ) - self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) - self.rotary_dim = getattr(config, "rotary_dim", self.head_dim) self.rotary_emb = get_rope( @@ -137,7 +135,6 @@ class BailingAttention(nn.Module): max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, - partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( diff --git a/vllm/model_executor/models/bamba.py 
b/vllm/model_executor/models/bamba.py index 1d6493b18c343..00d742f84ef79 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -178,9 +178,7 @@ class BambaAttentionDecoderLayer(nn.Module): self.scaling = self.head_dim**-0.5 self.max_position_embeddings = max_position_embeddings - if hasattr(config, "partial_rotary_factor"): - rotary_dim = int(self.head_dim * config.partial_rotary_factor) - elif hasattr(config, "attn_rotary_emb"): + if hasattr(config, "attn_rotary_emb"): rotary_dim = config.attn_rotary_emb # for backward compatibility else: rotary_dim = self.head_dim # default diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index d7e802ba1aca0..fbeb28a1c0b36 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -8,7 +8,6 @@ import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform -from vllm.transformers_utils.config import set_default_rope_theta from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec @@ -78,8 +77,6 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig): if not model_config.enforce_eager: max_position = round_up(max_position, 8) - set_default_rope_theta(config, default_theta=config.rotary_emb_base) - config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), @@ -119,8 +116,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) - set_default_rope_theta(config, default_theta=config.rotary_emb_base) - config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, @@ -490,6 +485,26 @@ class 
DeepseekV32ForCausalLM(VerifyAndUpdateConfig): logger.info("Using bfloat16 kv-cache for DeepSeekV3.2") +class NemotronHForCausalLMConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + """Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto' + (or not explicitly set), to the value specified in the HF config, or to + float16 if not specified. + """ + cache_config = vllm_config.cache_config + if cache_config.mamba_ssm_cache_dtype == "auto": + hf_config = vllm_config.model_config.hf_config + mamba_ssm_cache_dtype = getattr( + hf_config, "mamba_ssm_cache_dtype", "float16" + ) + logger.info( + "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model", + mamba_ssm_cache_dtype, + ) + cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype + + MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "GteModel": SnowflakeGteNewModelConfig, "GteNewModel": GteNewModelConfig, @@ -507,4 +522,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "Mamba2ForCausalLM": MambaModelConfig, "FalconMambaForCausalLM": MambaModelConfig, "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM, + "NemotronHForCausalLM": NemotronHForCausalLMConfig, } diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 019fb3e29ab91..a612ebd956282 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, + MultiModalKwargsItems, NestedTensors, ) from vllm.multimodal.parse import ( @@ -305,7 +305,7 @@ class DeepseekOCRMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = 
self.info.get_hf_processor(**hf_processor_mm_kwargs) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 83ceb9303cfb5..a1c1263f8d724 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -242,9 +242,7 @@ class FalconH1AttentionDecoderLayer(nn.Module): self.scaling = self.head_dim**-0.5 self.max_position_embeddings = max_position_embeddings - if hasattr(config, "partial_rotary_factor"): - rotary_dim = self.head_dim * config.partial_rotary_factor - elif hasattr(config, "attn_rotary_emb"): + if hasattr(config, "attn_rotary_emb"): rotary_dim = config.attn_rotary_emb # for backward compatibility else: rotary_dim = self.head_dim # default diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py index a6991f8e43fef..26d7c29aae6e2 100644 --- a/vllm/model_executor/models/glm.py +++ b/vllm/model_executor/models/glm.py @@ -10,7 +10,8 @@ from .utils import PPMissingLayer class GlmForCausalLM(LlamaForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - vllm_config.model_config.hf_config.partial_rotary_factor = 0.5 + hf_config = vllm_config.model_config.hf_config + hf_config.rope_parameters["partial_rotary_factor"] = 0.5 super().__init__(vllm_config=vllm_config, prefix=prefix) # Hack Llama model to fit HF format GLM implementation # Attention difference between GLM and Llama: diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 002cdb721e1db..9adfa942b99fa 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -78,7 +78,7 @@ class Glm4Attention(nn.Module): # Number of KV heads is less than TP size, so we replicate # the KV heads across multiple tensor parallel GPUs. 
assert tp_size % self.total_num_kv_heads == 0 - partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5) + config.rope_parameters.setdefault("partial_rotary_factor", 0.5) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.head_dim = head_dim or hidden_size // self.total_num_heads self.rotary_dim = self.head_dim @@ -106,7 +106,6 @@ class Glm4Attention(nn.Module): rotary_dim=self.rotary_dim, max_position=max_position, rope_parameters=config.rope_parameters, - partial_rotary_factor=partial_rotary_factor, is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index c99f824e1bd4d..8cae5ee425e4d 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -282,13 +282,12 @@ class Glm4MoeAttention(nn.Module): prefix=f"{prefix}.o_proj", ) - partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5) + config.rope_parameters.setdefault("partial_rotary_factor", 0.5) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, - partial_rotary_factor=partial_rotary_factor, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index b9959682cbcef..212d605c17285 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -89,16 +89,14 @@ class GPTNeoXAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.dense", ) - scaling = self.head_size**-0.5 - rotary_dim = int(self.head_size * config.rotary_pct) - assert rotary_dim % 2 == 0 max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, - rotary_dim=rotary_dim, + rotary_dim=self.head_size, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) + scaling = 
self.head_size**-0.5 self.attn = Attention( self.num_heads, self.head_size, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 0f65683cf7c57..01b3e7827424d 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -78,7 +78,7 @@ class SupportsMultiModal(Protocol): `multimodal_config.mm_encoder_tp_mode="data"`. """ - merge_by_field_config: ClassVar[bool] = False + merge_by_field_config: ClassVar[bool] = True """ A flag that indicates which implementation of `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use. diff --git a/vllm/model_executor/models/lightonocr.py b/vllm/model_executor/models/lightonocr.py index 9839e4f8f707e..353ee7806b1b1 100644 --- a/vllm/model_executor/models/lightonocr.py +++ b/vllm/model_executor/models/lightonocr.py @@ -28,7 +28,7 @@ from vllm.model_executor.models.utils import ( ) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import BaseMultiModalProcessorCache -from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.processing import ( BaseMultiModalProcessor, @@ -103,7 +103,7 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8f5a967cd422a..167dfbca248ce 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -149,8 +149,6 @@ class LlamaAttention(nn.Module): if head_dim is None: head_dim = 
self.hidden_size // self.total_num_heads self.head_dim = head_dim - # Phi models introduced a partial_rotary_factor parameter in the config - self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1) self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -265,7 +263,6 @@ class LlamaAttention(nn.Module): max_position=self.max_position_embeddings, rope_parameters=getattr(config, "rope_parameters", None), is_neox_style=is_neox_style, - partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 891a9ce080233..c4198d36b392e 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -52,7 +52,6 @@ from vllm.multimodal.evs import ( from vllm.multimodal.inputs import ( MultiModalDataDict, MultiModalFieldConfig, - MultiModalKwargs, MultiModalKwargsItems, VideoItem, ) @@ -849,17 +848,18 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - if "image_num_patches" in out_mm_kwargs: - image_num_patches = out_mm_kwargs["image_num_patches"] + out_mm_data = out_mm_kwargs.get_data() + if "image_num_patches" in out_mm_data: + image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() - elif "image_embeds" in out_mm_kwargs: + elif "image_embeds" in out_mm_data: # to compute num_patches (similar to Qwen2-VL) - image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] 
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index ffba6c9dfe739..bf83ee5e42a15 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -178,7 +178,6 @@ class NemotronAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.partial_rotary_factor = config.partial_rotary_factor self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -203,7 +202,6 @@ class NemotronAttention(nn.Module): rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, - partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 9d968dee87114..734fbc60709fa 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -122,7 +122,6 @@ class DeciLMAttention(LlamaAttention): max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, - partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py index 4338918663378..b92f0c9dac32b 100644 --- a/vllm/model_executor/models/opencua.py +++ b/vllm/model_executor/models/opencua.py @@ -23,7 +23,7 @@ from vllm.config import VllmConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalFieldConfig, - MultiModalKwargs, + MultiModalKwargsItems, ) from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser from vllm.multimodal.processing import ( @@ -153,7 +153,7 @@ class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo]) self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, 
Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 5256d8ba7fd86..1df5ff62fa5b5 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -62,7 +62,7 @@ from vllm.multimodal.inputs import ( MultiModalDataDict, MultiModalFeatureSpec, MultiModalFieldConfig, - MultiModalKwargs, + MultiModalKwargsItems, ) from vllm.multimodal.parse import ( ImageProcessorItems, @@ -307,7 +307,7 @@ class PaddleOCRVLMultiModalProcessor( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) hf_config = self.info.get_hf_config() diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index ec5d0fa6226dd..9fa32f01d37a0 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -40,7 +40,6 @@ from .siglip import SiglipVisionModel from .utils import ( AutoWeightsLoader, WeightsMapper, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -252,6 +251,8 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn dummy_inputs=PaliGemmaDummyInputsBuilder, ) class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + merge_by_field_config = True + packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -327,9 +328,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP return None if pixel_values is not None: - pixel_values = flatten_bn(pixel_values, concat=True) - h = w = 
self.config.vision_config.image_size + return PaliGemmaImagePixelInputs( type="pixel_values", data=pixel_values, @@ -337,8 +337,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP ) if image_embeds is not None: - image_embeds = flatten_bn(image_embeds, concat=True) - return PaliGemmaImageEmbeddingInputs( type="image_embeds", data=image_embeds, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 795cd25f16753..8f26c68720a5c 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module): self.num_heads = self.total_num_heads // tensor_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.partial_rotary_factor = config.partial_rotary_factor self.is_causal = True assert (self.head_dim * self.total_num_heads) == self.hidden_size @@ -138,7 +137,6 @@ class PersimmonAttention(nn.Module): rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, - partial_rotary_factor=self.partial_rotary_factor, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 70016d9ed246c..253fbbc41330c 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -109,10 +109,7 @@ class PhiAttention(nn.Module): ) scaling = self.head_size**-0.5 - rotary_dim = int( - config.partial_rotary_factor - * (config.hidden_size // config.num_attention_heads) - ) + rotary_dim = config.hidden_size // config.num_attention_heads assert rotary_dim % 2 == 0 max_position_embeddings = getattr(config, "max_position_embeddings", 2048) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index d5f1db9f6e064..a2332064d17f7 100644 --- 
a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -77,7 +77,7 @@ from vllm.multimodal.evs import ( from vllm.multimodal.inputs import ( MultiModalFeatureSpec, MultiModalFieldConfig, - MultiModalKwargs, + MultiModalKwargsItems, ) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import PromptReplacement, PromptUpdate @@ -973,7 +973,7 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 661a182151d74..dd64e3983e381 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -750,7 +750,6 @@ class Qwen3NextAttention(nn.Module): rotary_dim=self.head_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, - partial_rotary_factor=config.partial_rotary_factor, dual_chunk_attention_config=self.dual_chunk_attention_config, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 88b76d2071d18..80f22d87423c7 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -103,7 +103,7 @@ from .qwen2_5_vl import ( Qwen2_5_VLVideoInputs, Qwen2_5_VLVideoPixelInputs, ) -from .qwen2_vl import Qwen2VLProcessingInfo +from .qwen2_vl import Qwen2VLMultiModalDataParser, Qwen2VLProcessingInfo from .qwen3 import Qwen3ForCausalLM, Qwen3Model from .utils import ( AutoWeightsLoader, @@ -884,7 +884,10 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): class 
Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser(video_needs_metadata=True) + return Qwen2VLMultiModalDataParser( + self.info.get_hf_config().vision_config.spatial_merge_size, + video_needs_metadata=True, + ) def _call_hf_processor( self, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 65092584edced..e879599ad3ead 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -119,9 +119,6 @@ class StablelmAttention(nn.Module): self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size) self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.partial_rotary_factor = getattr( - config, "rope_pct", getattr(config, "partial_rotary_factor", 1) - ) self.scaling = self.head_dim**-0.5 self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_key_value_heads * self.head_dim @@ -154,7 +151,6 @@ class StablelmAttention(nn.Module): rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, rope_parameters=self.config.rope_parameters, - partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( self.num_heads, diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 97f6aa461b90c..67bdf5e1557f9 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -25,7 +25,6 @@ from .inputs import ( MultiModalBatchedField, MultiModalFeatureSpec, MultiModalFieldElem, - MultiModalKwargs, MultiModalKwargsItem, MultiModalKwargsItems, NestedTensors, @@ -90,7 +89,6 @@ MultiModalCacheValue: TypeAlias = ( | MultiModalProcessorCacheItemMetadata | MultiModalKwargsItems | MultiModalKwargsItem - | MultiModalKwargs | Mapping[str, NestedTensors] ) @@ -108,12 +106,7 @@ class MultiModalCache: # These are not subclasses of dict if isinstance( leaf, - ( - 
MultiModalKwargs, - MultiModalKwargsItems, - MultiModalKwargsItem, - MultiModalFieldElem, - ), + (MultiModalKwargsItems, MultiModalKwargsItem, MultiModalFieldElem), ): return cls.get_item_size(leaf.data) # type: ignore diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 397684fa2f83c..32f15240cb7da 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from collections.abc import Mapping, Sequence +from collections.abc import Mapping, Sequence, Set from dataclasses import dataclass from functools import partial from itertools import accumulate @@ -201,8 +201,10 @@ Uses a list instead of a tensor if the dimensions of each element do not match. def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: - """Equality check between - [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.""" + """ + Equality check between + [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects. + """ if isinstance(a, torch.Tensor): return isinstance(b, torch.Tensor) and torch.equal(a, b) elif isinstance(b, torch.Tensor): @@ -224,10 +226,24 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: BatchedTensorInputs: TypeAlias = dict[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via -[`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch]. +[`MultiModalKwargsItems.get_data`][vllm.multimodal.inputs.MultiModalKwargsItems.get_data]. """ +def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> bool: + """ + Equality check between + [`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects. 
+ """ + for k in a: + if k not in b: + return False + if not nested_tensors_equal(a[k], b[k]): + return False + + return True + + @dataclass class MultiModalFeatureSpec: """ @@ -823,7 +839,14 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]): return self # type: ignore[return-value] - def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs": + def get_data( + self, + *, + device: torch.types.Device = None, + pin_memory: bool = False, + cpu_fields: Set[str] = frozenset(), + ) -> BatchedTensorInputs: + """Construct a dictionary of keyword arguments to pass to the model.""" elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) for modality, items in self.items(): for i, item in enumerate(items): @@ -835,12 +858,23 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]): for key, elem in item.items(): elems_by_key[key].append(elem) - return MultiModalKwargs( - { - key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) - for key, elems in elems_by_key.items() - } - ) + data = { + key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) + for key, elems in elems_by_key.items() + } + + if device is not None: + for k in data.keys() - cpu_fields: + data[k] = json_map_leaves( + ( + lambda x: x.to(device=device, non_blocking=True) + if isinstance(x, torch.Tensor) + else x + ), + data[k], + ) + + return data MultiModalKwargsOptionalItems: TypeAlias = ( @@ -849,6 +883,7 @@ MultiModalKwargsOptionalItems: TypeAlias = ( ) +@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.") class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to @@ -882,91 +917,6 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): ): return MultiModalKwargsItems.from_seq(items).get_data(pin_memory=pin_memory) - @staticmethod - def _try_stack( - nested_tensors: NestedTensors, pin_memory: bool = False - ) -> NestedTensors: - """ - Stack the inner dimensions that have the same shape 
in - a nested list of tensors. - - Thus, a dimension represented by a list means that the inner - dimensions are different for each element along that dimension. - """ - if isinstance(nested_tensors, torch.Tensor): - return nested_tensors - - # TODO: Remove these once all models have been migrated - if isinstance(nested_tensors, np.ndarray): - return torch.from_numpy(nested_tensors) - if isinstance(nested_tensors, (int, float)): - return torch.tensor(nested_tensors) - - stacked = [MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors] - if not is_list_of(stacked, torch.Tensor, check="all"): - # Only tensors (not lists) can be stacked. - return stacked - - tensors_ = cast(list[torch.Tensor], stacked) - if len(tensors_) == 1: - # An optimization when `tensors_` contains only one tensor: - # - produce exactly same result as `torch.stack(tensors_)` - # - will achieve zero-copy if the tensor is contiguous - return tensors_[0].unsqueeze(0).contiguous() - - if any(t.shape != tensors_[0].shape for t in tensors_): - # The tensors have incompatible shapes and can't be stacked. - return tensors_ - - outputs = torch.empty( - len(tensors_), - *tensors_[0].shape, - dtype=tensors_[0].dtype, - device=tensors_[0].device, - pin_memory=pin_memory, - ) - return torch.stack(tensors_, out=outputs) - - @staticmethod - def batch( - inputs_list: list["MultiModalKwargs"], pin_memory: bool = False - ) -> BatchedTensorInputs: - """ - Batch multiple inputs together into a dictionary. - - The resulting dictionary has the same keys as the inputs. - If the corresponding value from each input is a tensor and they all - share the same shape, the output value is a single batched tensor; - otherwise, the output value is a list containing the original value - from each input. - """ - if len(inputs_list) == 0: - return {} - - # We need to consider the case where each item in the batch - # contains different modalities (i.e. different keys). 
- item_lists = defaultdict[str, list[NestedTensors]](list) - - for inputs in inputs_list: - for k, v in inputs.items(): - item_lists[k].append(v) - - return { - k: MultiModalKwargs._try_stack(item_list, pin_memory) - for k, item_list in item_lists.items() - } - - @staticmethod - def as_kwargs( - batched_inputs: BatchedTensorInputs, - *, - device: torch.types.Device, - ) -> BatchedTensorInputs: - return json_map_leaves( - lambda x: x.to(device=device, non_blocking=True), - batched_inputs, - ) - def __getitem__(self, key: str): if key not in self: raise KeyError( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 1840220854858..f8e8847e8e609 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -19,7 +19,6 @@ from PIL import Image, UnidentifiedImageError import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection from vllm.logger import init_logger -from vllm.utils.jsontree import json_map_leaves from vllm.utils.registry import ExtensionManager from .audio import AudioEmbeddingMediaIO, AudioMediaIO @@ -427,59 +426,25 @@ def group_mm_kwargs_by_modality( Yields: A tuple `(modality, num_items, grouped_kwargs)`. """ - if merge_by_field_config is None: - raise RuntimeError( - "`group_mm_kwargs_by_modality` now requires " - "`merge_by_field_config` arg, please update your model runner " - "according to https://github.com/vllm-project/vllm/pull/25676." - ) - if merge_by_field_config is False: + # TODO: After v0.13, remove merge_by_field_config attribute from model impls + if merge_by_field_config is not None: logger.warning_once( - "The legacy code for batching multi-modal kwargs is deprecated and " - "will be removed in v0.12. Please update your model with " - "`merge_by_field_config=True` to use the new code defined by " - "`MultiModalFieldConfig`. You can refer to " - "https://github.com/vllm-project/vllm/issues/26149 " - "for some examples on how to do this." 
+ "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` " + "is deprecated and will be removed in v0.13." ) - from vllm.multimodal.inputs import MultiModalKwargs, MultiModalKwargsItems + from vllm.multimodal.inputs import MultiModalKwargsItems for modality, items in groupby(mm_kwargs, key=lambda item: item.modality): items_lst = list(items) + mm_kwargs_items = MultiModalKwargsItems.from_seq(items_lst) + mm_kwargs_data = mm_kwargs_items.get_data( + device=device, + pin_memory=pin_memory, + cpu_fields=multimodal_cpu_fields, + ) - if merge_by_field_config: - mm_kwargs_group: BatchedTensorInputs = dict( - MultiModalKwargsItems.from_seq(items_lst).get_data( - pin_memory=pin_memory - ) - ) - - if device is not None: - mm_kwargs_group = { - k: json_map_leaves( - lambda x: x.to(device=device, non_blocking=True) - if isinstance(x, torch.Tensor) - else x, - v, - ) - if k not in multimodal_cpu_fields - else v - for k, v in mm_kwargs_group.items() - } - else: - mm_kwargs_group = MultiModalKwargs.as_kwargs( - MultiModalKwargs.batch( - [ - MultiModalKwargsItems.from_seq([item]).get_data() - for item in items_lst - ], - pin_memory=pin_memory, - ), - device=device, - ) - - yield modality, len(items_lst), mm_kwargs_group + yield modality, len(items_lst), mm_kwargs_data def fetch_audio( diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 4bf9401b6b051..1467ca71efec1 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -233,6 +233,23 @@ class CudaPlatformBase(Platform): from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config + if compilation_config.cudagraph_mode.has_full_cudagraphs(): + # decode context parallel does not support full cudagraphs + if parallel_config.decode_context_parallel_size > 1: + logger.warning_once( + "Decode context parallel (DCP) is enabled, which is " + "incompatible with full CUDA graphs. " + "Overriding cudagraph_mode to PIECEWISE." 
+ ) + compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + # prefill context parallel do not support full cudagraphs + elif parallel_config.prefill_context_parallel_size > 1: + logger.warning_once( + "Prefill context parallel (PCP) is enabled, which is " + "incompatible with full CUDA graphs. " + "Overriding cudagraph_mode to PIECEWISE." + ) + compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE if ( parallel_config.all2all_backend == "deepep_high_throughput" and parallel_config.data_parallel_size > 1 diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index ccf3446a3a6e5..32c7f8e536639 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -381,6 +381,24 @@ class RocmPlatform(Platform): parallel_config = vllm_config.parallel_config is_eager_execution = compilation_config == CUDAGraphMode.NONE + if compilation_config.cudagraph_mode.has_full_cudagraphs(): + # decode context parallel does not support full cudagraphs + if parallel_config.decode_context_parallel_size > 1: + logger.warning_once( + "Decode context parallel (DCP) is enabled, which is " + "incompatible with full CUDA graphs. " + "Overriding cudagraph_mode to PIECEWISE." + ) + compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + # prefill context parallel do not support full cudagraphs + elif parallel_config.prefill_context_parallel_size > 1: + logger.warning_once( + "Prefill context parallel (PCP) is enabled, which is " + "incompatible with full CUDA graphs. " + "Overriding cudagraph_mode to PIECEWISE." 
+ ) + compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() if cache_config and cache_config.block_size is None: diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 36e58dba6b497..7b918d2e3b78f 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -44,6 +44,10 @@ _REASONING_PARSERS_TO_REGISTER = { "granite_reasoning_parser", "GraniteReasoningParser", ), + "holo2": ( + "holo2_reasoning_parser", + "Holo2ReasoningParser", + ), "hunyuan_a13b": ( "hunyuan_a13b_reasoning_parser", "HunyuanA13BReasoningParser", diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 4a04292be009e..5c6ac7dad9930 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -121,7 +121,7 @@ class ReasoningParser: self, original_tag: str | None, tool_server: ToolServer | None, - ) -> str: + ) -> str | None: """ Instance method that is implemented for preparing the structured tag Otherwise, None is returned diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 0c1b54d0bd359..fa45b12856c7d 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -145,7 +145,7 @@ class GptOssReasoningParser(ReasoningParser): # This function prepares the structural tag to format reasoning output def prepare_structured_tag( self, original_tag: str | None, tool_server: ToolServer | None - ) -> str: + ) -> str | None: if original_tag is None: if tool_server is None: return json.dumps(no_func_reaonsing_tag) diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py new file mode 100644 index 0000000000000..76de1c077c88b --- /dev/null +++ b/vllm/reasoning/holo2_reasoning_parser.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project + +from collections.abc import Sequence + +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.logger import init_logger +from vllm.reasoning import ( + ReasoningParser, +) +from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser +from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser +from vllm.tokenizers import TokenizerLike + +logger = init_logger(__name__) + + +class Holo2ReasoningParser(ReasoningParser): + """ + Reasoning parser for the Holo2 models which are based on Qwen3. + + The Holo2 model uses <think>...</think> tokens to denote reasoning text but + <think> is part of the chat template. This parser extracts the reasoning content until + </think> in the model's output. + + The model provides a switch to enable or disable reasoning + output via the 'thinking=False' parameter. + + Chat template args: + - thinking: Whether to enable reasoning output (default: True) + + + Parsing rules on model output: + - thinking == False + -> Model output is treated as purely the content |content| + - thinking == True + -> Model output is |reasoning_content|</think>|content| + """ + + def __init__(self, tokenizer: TokenizerLike, *args, **kwargs): + super().__init__(tokenizer, *args, **kwargs) + + chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {} + # Deepseek V3 and Holo2 are similar. However, Holo2 models think by default. + # This parser without user specified chat template args is instantiated once for + # all requests in the structured output manager. So it is important that without + # user specified chat template args, the default thinking is True.
+ + enable_thinking = bool(chat_kwargs.get("thinking", True)) + + if enable_thinking: + self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs) + else: + self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs) + + def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: + return self._parser.is_reasoning_end(input_ids) + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + return self._parser.extract_content_ids(input_ids) + + def extract_reasoning( + self, model_output: str, request: ChatCompletionRequest + ) -> tuple[str | None, str | None]: + return self._parser.extract_reasoning(model_output, request) + + def extract_reasoning_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> DeltaMessage | None: + return self._parser.extract_reasoning_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + ) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2911dcff2ab49..f926b523afdfa 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -25,6 +25,7 @@ from transformers.models.auto.tokenization_auto import get_tokenizer_config from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs +from vllm.config.utils import getattr_iter from vllm.logger import init_logger from vllm.transformers_utils.utils import parse_safetensors_file_metadata @@ -304,14 +305,26 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" + rope_theta_names = ("rope_theta", "rotary_emb_base") + rope_theta = getattr_iter(config, rope_theta_names, None) if Version(version("transformers")) < Version("5.0.0.dev0"): # 
Transformers v4 installed, legacy config fields may be present if (rope_scaling := getattr(config, "rope_scaling", None)) is not None: config.rope_parameters = rope_scaling - if (rope_theta := getattr(config, "rope_theta", None)) is not None: + if rope_theta is not None: if not hasattr(config, "rope_parameters"): config.rope_parameters = {"rope_type": "default"} config.rope_parameters["rope_theta"] = rope_theta + partial_rotary_factor_names = ("partial_rotary_factor", "rotary_pct") + partial_rotary_factor = getattr_iter(config, partial_rotary_factor_names, None) + if partial_rotary_factor is not None: + if not hasattr(config, "rope_parameters"): + config.rope_parameters = {"rope_type": "default"} + config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor + elif rope_theta is not None or hasattr(config, "rope_parameters"): + # Transformers v5 installed + config.standardize_rope_params() + config.validate_rope() # No RoPE parameters to patch if getattr(config, "rope_parameters", None) is None: diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index d112c71d7d20b..62f52703029b7 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -89,9 +89,14 @@ class NemotronConfig(PretrainedConfig): tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings rope_parameters (`dict`, *optional*): - The parameters of the RoPE embeddings. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - Percentage of the query and keys which will have rotary embedding. + The parameters of the RoPE embeddings. Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', + 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the + original RoPE implementation. 
+ `partial_rotary_factor` (`float`, *optional*, defaults to 0.5): + Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. @@ -133,7 +138,6 @@ class NemotronConfig(PretrainedConfig): eos_token_id=3, tie_word_embeddings=False, rope_parameters=None, - partial_rotary_factor=0.5, attention_bias=False, attention_dropout=0.0, mlp_bias=False, @@ -165,14 +169,16 @@ class NemotronConfig(PretrainedConfig): rope_theta = kwargs.pop("rope_theta", 10000.0) if "rope_theta" not in rope_parameters: rope_parameters["rope_theta"] = rope_theta - self.rope_parameters = rope_parameters # for backward compatibility partial_rotary_factor = ( kwargs.get("rope_percent") or kwargs.get("rope_percentage") - or partial_rotary_factor + or kwargs.get("partial_rotary_factor") + or 0.5 ) - self.partial_rotary_factor = partial_rotary_factor + if "partial_rotary_factor" not in rope_parameters: + rope_parameters["partial_rotary_factor"] = partial_rotary_factor + self.rope_parameters = rope_parameters self._rope_parameters_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py index fd36b49245f56..8230a18343c5e 100644 --- a/vllm/transformers_utils/configs/qwen3_next.py +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -103,8 +103,8 @@ class Qwen3NextConfig(PretrainedConfig): Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE `high_freq_factor` (`float`, *optional*): Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - partial_rotary_factor (`float`, *optional*, defaults to 0.25): - Percentage of the query and keys which will have rotary embedding. 
+ `partial_rotary_factor` (`float`, *optional*, defaults to 0.25): + Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -198,7 +198,6 @@ class Qwen3NextConfig(PretrainedConfig): use_cache=True, tie_word_embeddings=False, rope_parameters=None, - partial_rotary_factor=0.25, attention_bias=False, attention_dropout=0.0, head_dim=256, @@ -239,6 +238,9 @@ class Qwen3NextConfig(PretrainedConfig): rope_theta = kwargs.pop("rope_theta", 10000.0) if "rope_theta" not in rope_parameters: rope_parameters["rope_theta"] = rope_theta + partial_rotary_factor = kwargs.pop("partial_rotary_factor", 0.25) + if "partial_rotary_factor" not in rope_parameters: + rope_parameters["partial_rotary_factor"] = partial_rotary_factor self.rope_parameters = rope_parameters self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py index 555fcfea491e2..356f383cc52bd 100644 --- a/vllm/utils/argparse_utils.py +++ b/vllm/utils/argparse_utils.py @@ -244,9 +244,15 @@ class FlexibleArgumentParser(ArgumentParser): else: key = pattern.sub(repl, arg, count=1) processed_args.append(key) - elif arg.startswith("-O") and arg != "-O" and arg[2] != ".": + elif arg.startswith("-O."): + # Provide clear error for deprecated -O.* syntax + raise ValueError( + f"The -O.* syntax is no longer supported. " + f"Please use -cc.* instead. " + f"For example, replace '{arg}' with '{arg.replace('-O', '-cc', 1)}'" + ) + elif arg.startswith("-O") and arg != "-O": # allow -O flag to be used without space, e.g. 
-O3 or -Odecode - # -O.<...> handled later # also handle -O= here optimization_level = arg[3:] if arg[2] == "=" else arg[2:] processed_args += ["--optimization-level", optimization_level] @@ -257,17 +263,6 @@ class FlexibleArgumentParser(ArgumentParser): ): # Convert -O to --optimization-level processed_args.append("--optimization-level") - elif arg.startswith("-O."): - # Handle -O.* dotted syntax - ALL dotted syntax is deprecated - logger.warning_once( - "The -O.* dotted syntax for --compilation-config is " - "deprecated and will be removed in v0.13.0 or v1.0.0" - ", whichever is earlier. Please use -cc.* instead. " - "Example: -cc.backend=eager instead of " - "-O.backend=eager." - ) - converted_arg = arg.replace("-O", "-cc", 1) - processed_args.append(converted_arg) else: processed_args.append(arg) diff --git a/vllm/utils/nvtx_pytorch_hooks.py b/vllm/utils/nvtx_pytorch_hooks.py new file mode 100644 index 0000000000000..39e2a9a136e63 --- /dev/null +++ b/vllm/utils/nvtx_pytorch_hooks.py @@ -0,0 +1,286 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from contextlib import contextmanager + +import torch +import torch.cuda.nvtx as nvtx + + +def print_tensor(tensor_obj, prefix, tensor_list=None): + """Descends iterators that contains Tensors and prints the Tensor. + Recursive function that descends iterator type arguments until + it finds a Tensor object. 
+ """ + if tensor_list is None: + tensor_list = [] + + if isinstance(tensor_obj, (list, tuple)): + for ten in tensor_obj: + tensor_list = print_tensor(ten, prefix, tensor_list) + elif isinstance(tensor_obj, torch.Tensor): + tensor_dims = list(tensor_obj.size()) + tensor_list.append(tensor_dims) + return tensor_list + + +def process_layer_params(module_obj): + """Extract the static parameters from LLM and VLM relevant layer types""" + param_info = {} + # Extract parameters for layers commonly used in LLMs and VLMs + if isinstance(module_obj, (torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d)): + conv_params = {} + conv_params["in_chan"] = module_obj.in_channels + conv_params["out_chan"] = module_obj.out_channels + conv_params["filter_dim"] = module_obj.kernel_size + conv_params["stride"] = module_obj.stride + conv_params["padding"] = module_obj.padding + conv_params["dilation"] = module_obj.dilation + conv_params["transposed"] = module_obj.transposed + conv_params["output_padding"] = module_obj.output_padding + conv_params["groups"] = module_obj.groups + conv_params["padding_mode"] = module_obj.padding_mode + param_info = conv_params + elif isinstance( + module_obj, + ( + torch.nn.ConvTranspose1d, + torch.nn.ConvTranspose2d, + torch.nn.ConvTranspose3d, + ), + ): + convtranspose_params = {} + convtranspose_params["in_chan"] = module_obj.in_channels + convtranspose_params["out_chan"] = module_obj.out_channels + convtranspose_params["filter_dim"] = module_obj.kernel_size + convtranspose_params["stride"] = module_obj.stride + convtranspose_params["padding"] = module_obj.padding + convtranspose_params["dilation"] = module_obj.dilation + convtranspose_params["transposed"] = module_obj.transposed + convtranspose_params["output_padding"] = module_obj.output_padding + convtranspose_params["groups"] = module_obj.groups + convtranspose_params["padding_mode"] = module_obj.padding_mode + param_info = convtranspose_params + elif isinstance( + module_obj, (torch.nn.MaxPool1d, 
torch.nn.MaxPool2d, torch.nn.MaxPool3d) + ): + + def _handle_int_or_tuple(parameter): + if isinstance(parameter, tuple): + return list(parameter) + elif isinstance(parameter, int): + return [parameter, parameter] + + pooling_params = {} + pooling_params["filter_dim"] = _handle_int_or_tuple(module_obj.kernel_size) + pooling_params["stride"] = _handle_int_or_tuple(module_obj.stride) + pooling_params["padding"] = _handle_int_or_tuple(module_obj.padding) + pooling_params["dilation"] = _handle_int_or_tuple(module_obj.dilation) + param_info = pooling_params + elif isinstance( + module_obj, (torch.nn.AvgPool1d, torch.nn.AvgPool2d, torch.nn.AvgPool3d) + ): + pooling_params = {} + pooling_params["filter_dim"] = [ + module_obj.kernel_size, + module_obj.kernel_size, + ] + pooling_params["stride"] = [module_obj.stride, module_obj.stride] + pooling_params["padding"] = [module_obj.padding, module_obj.padding] + pooling_params["ceil_mode"] = module_obj.ceil_mode + pooling_params["count_include_pad"] = module_obj.count_include_pad + param_info = pooling_params + elif isinstance( + module_obj, + ( + torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + ), + ): + pooling_params = {} + pooling_params["output_size"] = [ + module_obj.output_size, + module_obj.output_size, + ] + param_info = pooling_params + elif isinstance(module_obj, torch.nn.Linear): + param_info["in_features"] = module_obj.in_features + param_info["out_features"] = module_obj.out_features + elif isinstance( + module_obj, + (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d), + ): + param_info["num_features"] = module_obj.num_features + param_info["epsilon"] = module_obj.eps + param_info["momentum"] = module_obj.momentum + elif isinstance(module_obj, torch.nn.ReLU): + param_info["in_place"] = module_obj.inplace + elif isinstance(module_obj, torch.nn.Dropout): + param_info["p"] = module_obj.p + param_info["in_place"] = module_obj.inplace + elif isinstance(module_obj, 
torch.nn.Embedding): + param_info["num_embeddings"] = module_obj.num_embeddings + param_info["embedding_dim"] = module_obj.embedding_dim + elif isinstance( + module_obj, + ( + torch.nn.Upsample, + torch.nn.UpsamplingNearest2d, + torch.nn.UpsamplingBilinear2d, + ), + ): + param_info["scale_factor"] = module_obj.scale_factor + + return param_info + + +def construct_marker_dict_and_push( + module_name, module_obj, in_tensor, kwargs=None, out_tensor=None +): + marker_dict = {} + marker_dict["Module"] = module_name + + ## Get trainable parameters like weights and bias + module_params = module_obj.named_parameters(recurse=False) + for idx, (param_name, param_obj) in enumerate(module_params): + if idx == 0: + marker_dict["TrainableParams"] = {} + marker_dict["TrainableParams"][param_name] = list(param_obj.size()) + + in_tensor_list = print_tensor(in_tensor, "Input") + if in_tensor_list: + marker_dict["Inputs"] = in_tensor_list + + out_tensor_list = print_tensor(out_tensor, "Output") + if out_tensor_list: + marker_dict["Outputs"] = out_tensor_list + + ## Get Kwargs like input_ids and positions for the top module + if kwargs: + for key, value in kwargs.items(): + if isinstance(value, (torch.Tensor, list, tuple)): + tensor_list = print_tensor(value, key) + if tensor_list: + marker_dict[key] = tensor_list + + param_info = process_layer_params(module_obj) + if param_info: + marker_dict["StaticParams"] = param_info + nvtx.range_push("{}".format(marker_dict)) + + +class ResultHolder: + """Holder for storing results from within a context manager.""" + + result = None + + +@contextmanager +def layerwise_nvtx_marker_context(module_name, module_obj, in_tensor=None, kwargs=None): + """Context manager for NVTX markers that automatically pushes on enter + and pops on exit. 
+ + Example: + with layerwise_nvtx_marker_context("Module:MyModule", module, in_tensor=args, + kwargs=kwargs) as ctx: + ctx.result = module(*args, **kwargs) + return ctx.result + """ + holder = ResultHolder() + + # Push input marker + construct_marker_dict_and_push( + module_name, + module_obj, + in_tensor=in_tensor, + kwargs=kwargs, + ) + try: + yield holder + finally: + # Pop input marker + nvtx.range_pop() + # Push and pop output marker + output_name = module_name.replace("(input)", "(output)") + construct_marker_dict_and_push( + output_name, + module_obj, + in_tensor=None, + kwargs=None, + out_tensor=holder.result, + ) + nvtx.range_pop() + + +class PytHooks: + """This module contains all the code needed to enable forward hooks + in a pytorch network. + + To register the hooks for a given network, the user needs to instantiate + a PytHooks object. Then call the register_hooks method. + + Example: + + my_hook = PytHooks() + my_hook.register_hooks(my_network_model) + """ + + def __init__(self): + """Initialize module variables.""" + super().__init__() + self.module_to_name_map = {} + + def _process_layer_params(self, module_obj): + return process_layer_params(module_obj) + + def module_fwd_hook(self, module_obj, in_tensor, out_tensor): + """Callback function that ends the NVTX marker. + Records the module name and tensor information. + Called after the module executes the forward method. + """ + nvtx.range_pop() + module_name = self.module_to_name_map.get(module_obj, "unknown") + construct_marker_dict_and_push( + module_name, module_obj, in_tensor=None, kwargs=None, out_tensor=out_tensor + ) + nvtx.range_pop() + return + + def module_fwd_pre_hook(self, module_obj, in_tensor, kwargs): + """Creates an NVTX marker with the module name in it. + This function is called before the module executes. 
+ """ + module_name = self.module_to_name_map.get(module_obj, "unknown") + construct_marker_dict_and_push( + module_name, module_obj, in_tensor=in_tensor, kwargs=kwargs, out_tensor=None + ) + return + + def register_hooks(self, network_model, module_prefix="top"): + """User level function that activates all the hooks. + The user needs to call this method from the network source code. + The code descends all the modules in the network and registers their + respective hooks. + """ + # Module types to skip (simple operations that don't need detailed profiling) + skip_types = ( + torch.nn.Identity, + torch.nn.Dropout, + torch.nn.Dropout1d, + torch.nn.Dropout2d, + torch.nn.Dropout3d, + ) + + for name, module in network_model.named_modules(prefix=module_prefix): + # Skip certain module types to reduce profiling overhead + if isinstance(module, skip_types): + continue + + module.register_forward_pre_hook(self.module_fwd_pre_hook, with_kwargs=True) + module.register_forward_hook(self.module_fwd_hook) + if module not in self.module_to_name_map: + self.module_to_name_map[module] = name + else: + raise ValueError("Module instance {} is not unique ".format(module)) + return diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index f5c49ac169f0c..c97efce312b56 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -28,6 +28,7 @@ else: STR_DTYPE_TO_TORCH_DTYPE = { "float32": torch.float32, "half": torch.half, + "float16": torch.float16, "bfloat16": torch.bfloat16, "float": torch.float, "fp8": torch.uint8, diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 69a6a5e5fae82..3d9640a2d4024 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -482,9 +482,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.dcp_rank = 0 self.dcp_kv_cache_interleave_size = 1 - self.num_qo_heads = ( - 
self.model_config.get_num_attention_heads(self.vllm_config.parallel_config) - * self.dcp_world_size + self.num_qo_heads = self.model_config.get_num_attention_heads( + self.vllm_config.parallel_config ) self.num_kv_heads = self.kv_cache_spec.num_kv_heads diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 0a6806390451d..14ae487f3eb13 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -27,7 +27,6 @@ from vllm.multimodal.inputs import ( MultiModalFieldConfig, MultiModalFieldElem, MultiModalFlatField, - MultiModalKwargs, MultiModalKwargsItem, MultiModalKwargsItems, MultiModalSharedField, @@ -176,9 +175,6 @@ class MsgpackEncoder: if isinstance(obj, MultiModalKwargsItems): return self._encode_mm_items(obj) - if isinstance(obj, MultiModalKwargs): - return self._encode_mm_kwargs(obj) - if isinstance(obj, UtilityResult): result = obj.result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: @@ -259,11 +255,6 @@ class MsgpackEncoder: "field": self._encode_mm_field(elem.field), } - def _encode_mm_kwargs(self, kw: MultiModalKwargs) -> dict[str, Any]: - return { - modality: self._encode_nested_tensors(data) for modality, data in kw.items() - } - def _encode_nested_tensors(self, nt: NestedTensors) -> Any: if isinstance(nt, torch.Tensor): return self._encode_tensor(nt) @@ -325,8 +316,6 @@ class MsgpackDecoder: return self._decode_mm_item(obj) if issubclass(t, MultiModalKwargsItems): return self._decode_mm_items(obj) - if issubclass(t, MultiModalKwargs): - return self._decode_mm_kwargs(obj) if t is UtilityResult: return self._decode_utility_result(obj) return obj @@ -414,14 +403,6 @@ class MsgpackDecoder: obj["field"] = factory_meth(None, *field_args).field return MultiModalFieldElem(**obj) - def _decode_mm_kwargs(self, obj: dict[str, Any]) -> MultiModalKwargs: - return MultiModalKwargs( - { - modality: self._decode_nested_tensors(data) - for modality, data in obj.items() - } - ) - def _decode_nested_tensors(self, obj: Any) -> NestedTensors: if 
isinstance(obj, (int, float)): # Although it violates NestedTensors type, MultiModalKwargs diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 1c7845a14b742..31428db2d3afc 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1258,7 +1258,7 @@ class EagleProposer: num_tokens_padded: int, ) -> tuple[int, torch.Tensor]: # TODO(Flechman): support DBO ubatching - ubatch_slices, num_toks_across_dp = coordinate_batch_across_dp( + should_ubatch, num_toks_across_dp = coordinate_batch_across_dp( num_tokens_unpadded=num_tokens_unpadded, parallel_config=self.vllm_config.parallel_config, allow_microbatching=False, @@ -1267,7 +1267,7 @@ class EagleProposer: uniform_decode=None, num_scheduled_tokens_per_request=None, ) - assert ubatch_slices is None, "DBO ubatching not implemented for EAGLE" + assert not should_ubatch, "DBO ubatching not implemented for EAGLE" num_tokens_dp_padded = num_tokens_padded if num_toks_across_dp is not None: diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index d087d28b1dae3..5ee88178cdf60 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -40,6 +40,16 @@ class StructuredOutputManager: self.reasoner: ReasoningParser | None = None self.vllm_config = vllm_config + # When in external_launcher mode, async grammar compilation causes deadlocks + # due to external_launcher mode having a scheduler for each TP rank. + # Async grammar compilation causes the WAITING_FOR_FSM → WAITING transition to + # happen at different times on different TP ranks, + # breaking the determinism assumption that external_launcher relies on. 
+ self._use_async_grammar_compilation = ( + vllm_config.parallel_config.distributed_executor_backend + != "external_launcher" + ) + self._grammar_bitmask: torch.Tensor | None = None self._full_mask = torch.tensor(-1, dtype=torch.int32) @@ -138,10 +148,13 @@ class StructuredOutputManager: else: raise ValueError(f"Unsupported structured output backend: {backend}") - grammar = self.executor.submit(self._async_create_grammar, request) + if self._use_async_grammar_compilation: + grammar = self.executor.submit(self._create_grammar, request) + else: + grammar = self._create_grammar(request) # type: ignore[assignment] request.structured_output_request.grammar = grammar # type: ignore[assignment] - def _async_create_grammar( + def _create_grammar( self, request: Request, ) -> StructuredOutputGrammar: diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 6539d72d81cb7..5da55d740c347 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import numpy as np import torch import torch.distributed as dist @@ -9,10 +10,7 @@ from vllm.config import ParallelConfig from vllm.distributed.parallel_state import get_dp_group from vllm.logger import init_logger from vllm.v1.worker.ubatch_utils import ( - UBatchSlice, - UBatchSlices, check_ubatch_thresholds, - create_ubatch_slices, is_second_ubatch_empty, ) @@ -91,20 +89,6 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch return num_tokens_across_dp.cpu() -# This just pads the second ubatch slice out to the total number of tokens -# (num_tokens + padding) since we do `create_ubatch_slices` before applying DP padding. 
-def _pad_out_ubatch_slice( - ubatch_slices: UBatchSlices, num_total_tokens: int -) -> UBatchSlices: - padded_second_token_slice = slice( - ubatch_slices[1].token_slice.start, num_total_tokens - ) - ubatch_slices[1] = UBatchSlice( - ubatch_slices[1].request_slice, padded_second_token_slice - ) - return ubatch_slices - - def _synchronize_dp_ranks( num_tokens_unpadded: int, num_tokens_padded: int, @@ -175,7 +159,7 @@ def coordinate_batch_across_dp( num_tokens_padded: int | None = None, uniform_decode: bool | None = None, num_scheduled_tokens_per_request: np.ndarray | None = None, -) -> tuple[UBatchSlices | None, torch.Tensor | None]: +) -> tuple[bool, torch.Tensor | None]: """ Coordinates amongst all DP ranks to determine if and how the full batch should be split into microbatches. @@ -204,7 +188,7 @@ def coordinate_batch_across_dp( """ if parallel_config.data_parallel_size == 1: # Early exit. - return None, None + return False, None # If the caller has explicitly enabled microbatching. should_attempt_ubatching = False @@ -228,23 +212,4 @@ def coordinate_batch_across_dp( parallel_config, ) - # Don't microbatch unless every other DP worker is also microbatching - if not should_ubatch: - return (None, num_tokens_after_padding) - - # This doesn't actually pad the ubatch slices. 
It just initializes the - # split point to the padded value so that padding can be applied - # to the second ubatch in pad_out_ubatch_slice after attention - # metadata creation - assert num_tokens_after_padding is not None - num_tokens_padded = int(num_tokens_after_padding[0].item()) - token_split_point = int(num_tokens_padded) // 2 - - assert num_scheduled_tokens_per_request is not None - ubatch_slices = create_ubatch_slices( - num_scheduled_tokens_per_request, token_split_point - ) - ubatch_slices = _pad_out_ubatch_slice(ubatch_slices, num_tokens_padded) - assert sum(s.num_tokens for s in ubatch_slices) == num_tokens_padded - - return (ubatch_slices, num_tokens_after_padding) + return (should_ubatch, num_tokens_after_padding) diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py index 8ae887fe82cfe..1b78734fba78f 100644 --- a/vllm/v1/worker/gpu/input_batch.py +++ b/vllm/v1/worker/gpu/input_batch.py @@ -354,6 +354,55 @@ def combine_sampled_and_draft_tokens( return logits_indices +@triton.jit +def _get_num_sampled_and_rejected_kernel( + num_sampled_ptr, + num_rejected_ptr, + seq_lens_ptr, + cu_num_logits_ptr, + idx_mapping_ptr, + prefill_len_ptr, +): + batch_idx = tl.program_id(0) + req_state_idx = tl.load(idx_mapping_ptr + batch_idx) + + seq_len = tl.load(seq_lens_ptr + batch_idx) + prefill_len = tl.load(prefill_len_ptr + req_state_idx) + is_chunked_prefilling = seq_len < prefill_len + + num_sampled = tl.load(num_sampled_ptr + batch_idx) + num_sampled = tl.where(is_chunked_prefilling, 0, num_sampled) + tl.store(num_sampled_ptr + batch_idx, num_sampled) + + logits_start = tl.load(cu_num_logits_ptr + batch_idx) + logits_end = tl.load(cu_num_logits_ptr + batch_idx + 1) + num_logits = logits_end - logits_start + + num_rejected = num_logits - num_sampled + num_rejected = tl.where(is_chunked_prefilling, 0, num_rejected) + tl.store(num_rejected_ptr + batch_idx, num_rejected) + + +def get_num_sampled_and_rejected( + num_sampled: torch.Tensor, 
+ seq_lens: torch.Tensor, + cu_num_logits: torch.Tensor, + idx_mapping: torch.Tensor, + prefill_len: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + num_reqs = idx_mapping.shape[0] + num_rejected = torch.empty_like(num_sampled) + _get_num_sampled_and_rejected_kernel[(num_reqs,)]( + num_sampled, + num_rejected, + seq_lens, + cu_num_logits, + idx_mapping, + prefill_len, + ) + return num_sampled, num_rejected + + @triton.jit def _post_update_kernel( idx_mapping_ptr, diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 9bf345053c30c..464f7b7bd3532 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -43,6 +43,7 @@ from vllm.v1.worker.gpu.input_batch import ( InputBatch, InputBuffers, combine_sampled_and_draft_tokens, + get_num_sampled_and_rejected, post_update, prepare_pos_seq_lens, prepare_prefill_inputs, @@ -54,10 +55,7 @@ from vllm.v1.worker.gpu.sample.metadata import ( ) from vllm.v1.worker.gpu.sample.sampler import Sampler from vllm.v1.worker.gpu.spec_decode import init_speculator -from vllm.v1.worker.gpu.spec_decode.rejection_sample import ( - get_num_rejected, - rejection_sample, -) +from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample from vllm.v1.worker.gpu.states import RequestState from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin @@ -621,16 +619,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Sample tokens and compute logprobs (if needed). sampler_output = self.sampler(logits, sampling_metadata) - # Get the number of sampled tokens. - prefill_len = self.req_states.prefill_len.gpu[input_batch.idx_mapping] - is_chunked_prefilling = input_batch.seq_lens < prefill_len if input_batch.num_draft_tokens == 0: # No draft tokens (common case). - # 0 if chunked-prefilling, 1 if not. 
- num_sampled = (~is_chunked_prefilling).int() - num_rejected = torch.zeros_like(num_sampled) + num_sampled = torch.ones( + input_batch.num_reqs, dtype=torch.int32, device=self.device + ) else: - # Draft tokens for spec decoding. + # Rejection sampling for spec decoding. input_ids = input_batch.input_ids[input_batch.logits_indices] sampled_tokens, num_sampled = rejection_sample( sampler_output.sampled_token_ids, @@ -638,13 +633,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): input_batch.cu_num_logits, self.num_speculative_steps, ) - num_sampled *= ~is_chunked_prefilling - num_rejected = get_num_rejected( - input_batch.cu_num_logits, - num_sampled, - ) sampler_output.sampled_token_ids = sampled_tokens - # TODO(woosuk): Support logprobs with spec decoding. + + # Get the number of sampled and rejected tokens. + # For chunked prefills, num_sampled and num_rejected are both 0. + num_sampled, num_rejected = get_num_sampled_and_rejected( + num_sampled, + input_batch.seq_lens, + input_batch.cu_num_logits, + input_batch.idx_mapping, + self.req_states.prefill_len.gpu, + ) return sampler_output, num_sampled, num_rejected def compute_prompt_logprobs( diff --git a/vllm/v1/worker/gpu/spec_decode/rejection_sample.py b/vllm/v1/worker/gpu/spec_decode/rejection_sample.py index 43c6ac518bccc..8a7bf28bacbd4 100644 --- a/vllm/v1/worker/gpu/spec_decode/rejection_sample.py +++ b/vllm/v1/worker/gpu/spec_decode/rejection_sample.py @@ -69,15 +69,3 @@ def rejection_sample( num_warps=1, ) return sampled, num_sampled - - -@torch.compile(dynamic=True) -def get_num_rejected( - cu_num_logits: torch.Tensor, - num_sampled: torch.Tensor, -) -> torch.Tensor: - num_logits = cu_num_logits[1:] - cu_num_logits[:-1] - num_rejected = num_logits - num_sampled - # No token is rejected for chunked prefills. 
- num_rejected *= num_sampled > 0 - return num_rejected diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9b09cd6df45fe..6b880ee5b2c8c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -89,6 +89,7 @@ from vllm.utils.jsontree import json_map_leaves from vllm.utils.math_utils import cdiv, round_up from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_utils import DeviceMemoryProfiler +from vllm.utils.nvtx_pytorch_hooks import PytHooks from vllm.utils.platform_utils import is_pin_memory_available from vllm.utils.torch_utils import ( get_dtype_size, @@ -154,6 +155,7 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from vllm.v1.worker.ubatch_utils import ( UBatchSlices, check_ubatch_thresholds, + maybe_create_ubatch_slices, ) from vllm.v1.worker.utils import is_residual_scattered_for_sp @@ -606,6 +608,7 @@ class GPUModelRunner( # Ephemeral state transferred between execute_model() and sample_tokens(). 
self.execute_model_state: ExecuteModelState | None = None self.kv_connector_output: KVConnectorOutput | None = None + self.layerwise_nvtx_hooks_registered = False def reset_mm_cache(self) -> None: if self.mm_budget: @@ -2185,7 +2188,6 @@ class GPUModelRunner( mm_kwargs, device=self.device, pin_memory=self.pin_memory, - merge_by_field_config=model.merge_by_field_config, multimodal_cpu_fields=model.multimodal_cpu_fields, ): curr_group_outputs: list[torch.Tensor] = [] @@ -2212,7 +2214,6 @@ class GPUModelRunner( [video_mm_kwargs_item], device=self.device, pin_memory=self.pin_memory, - merge_by_field_config=model.merge_by_field_config, multimodal_cpu_fields=model.multimodal_cpu_fields, ) ) @@ -2855,7 +2856,7 @@ class GPUModelRunner( ) -> tuple[ CUDAGraphMode, BatchDescriptor, - UBatchSlices | None, + bool, torch.Tensor | None, CUDAGraphStat | None, ]: @@ -2891,7 +2892,7 @@ class GPUModelRunner( # Extra coordination when running data-parallel since we need to coordinate # across ranks - ubatch_slices, num_tokens_across_dp = None, None + should_ubatch, num_tokens_across_dp = False, None if self.vllm_config.parallel_config.data_parallel_size > 1: # Disable DP padding when running eager to avoid excessive padding when # running prefills. 
This lets us set cudagraph_mode="NONE" on the prefiller @@ -2901,8 +2902,8 @@ self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE ) - ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp( - num_tokens_unpadded=num_tokens_padded, + should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp( + num_tokens_unpadded=num_tokens, parallel_config=self.parallel_config, allow_microbatching=allow_microbatching, allow_dp_padding=allow_dp_padding, @@ -2934,11 +2935,47 @@ return ( cudagraph_mode, batch_descriptor, - ubatch_slices, + should_ubatch, num_tokens_across_dp, cudagraph_stats, ) + def _register_layerwise_nvtx_hooks(self) -> None: + """ + Register layerwise NVTX hooks if --enable-layerwise-nvtx-tracing is enabled + to trace detailed information of each layer or module in the model. + """ + + if ( + self.vllm_config.observability_config.enable_layerwise_nvtx_tracing + and not self.layerwise_nvtx_hooks_registered + ): + if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + logger.debug_once( + "layerwise NVTX tracing is not supported when CUDA graph is " + "turned on; you may observe part or all of the model " + "missing NVTX markers" + ) + + # In STOCK_TORCH_COMPILE mode, after registering hooks here, + # the __call__ function of nn.module will be recompiled with + # fullgraph=True. Since nvtx.range_push/pop are not traceable + # by torch dynamo, we can't register hook functions here + # because hook functions will also be traced by torch dynamo. 
+ if ( + self.vllm_config.compilation_config.mode + == CompilationMode.STOCK_TORCH_COMPILE + ): + logger.debug_once( + "layerwise NVTX tracing is not supported when " + "CompilationMode is STOCK_TORCH_COMPILE, skipping " + "function hooks registration" + ) + else: + pyt_hooks = PytHooks() + pyt_hooks.register_hooks(self.model, self.model.__class__.__name__) + self.layerwise_nvtx_hooks_registered = True + @torch.inference_mode() def execute_model( self, @@ -3033,7 +3070,7 @@ class GPUModelRunner( ( cudagraph_mode, batch_desc, - ubatch_slices, + should_ubatch, num_tokens_across_dp, cudagraph_stats, ) = self._determine_batch_execution_and_padding( @@ -3046,10 +3083,10 @@ class GPUModelRunner( logger.debug( "Running batch with cudagraph_mode: %s, batch_descriptor: %s, " - "ubatch_slices: %s, num_tokens_across_dp: %s", + "should_ubatch: %s, num_tokens_across_dp: %s", cudagraph_mode, batch_desc, - ubatch_slices, + should_ubatch, num_tokens_across_dp, ) @@ -3057,9 +3094,17 @@ class GPUModelRunner( num_reqs_padded = ( batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs ) + ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( + should_ubatch, + num_scheduled_tokens_np, + num_tokens_padded, + num_reqs_padded, + ) + + pad_attn = cudagraph_mode == CUDAGraphMode.FULL use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 - pad_attn = cudagraph_mode == CUDAGraphMode.FULL + ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices (attn_metadata, spec_decode_common_attn_metadata) = ( self._build_attention_metadata( @@ -3068,7 +3113,7 @@ class GPUModelRunner( num_reqs=num_reqs, num_reqs_padded=num_reqs_padded if pad_attn else None, max_query_len=max_num_scheduled_tokens, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_attn, logits_indices=logits_indices, use_spec_decode=use_spec_decode, num_scheduled_tokens=scheduler_output.num_scheduled_tokens, @@ -3105,7 +3150,7 @@ class GPUModelRunner( 
num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_mode, batch_descriptor=batch_desc, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded, ), record_function_or_nullcontext("gpu_model_runner: forward"), self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output, @@ -3959,7 +4004,6 @@ class GPUModelRunner( dummy_mm_items, device=self.device, pin_memory=self.pin_memory, - merge_by_field_config=model.merge_by_field_config, multimodal_cpu_fields=model.multimodal_cpu_fields, ) ) @@ -4058,7 +4102,7 @@ class GPUModelRunner( num_sampled_tokens = np.ones(num_reqs, dtype=np.int32) - _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp, _ = ( + _cudagraph_mode, batch_desc, should_ubatch, num_tokens_across_dp, _ = ( self._determine_batch_execution_and_padding( num_tokens=num_tokens_unpadded, num_reqs=num_reqs, @@ -4092,6 +4136,9 @@ class GPUModelRunner( num_reqs_padded = ( batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs ) + ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( + should_ubatch, num_scheduled_tokens, num_tokens_padded, num_reqs_padded + ) attn_metadata: PerLayerAttnMetadata | None = None @@ -4113,11 +4160,12 @@ class GPUModelRunner( self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens self.query_start_loc.copy_to_gpu() + pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL attn_metadata, _ = self._build_attention_metadata( num_tokens=num_tokens_unpadded, num_reqs=num_reqs_padded, max_query_len=max_query_len, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded if pad_attn else ubatch_slices, for_cudagraph_capture=is_graph_capturing, ) @@ -4169,11 +4217,11 @@ class GPUModelRunner( num_tokens_padded, None, False ) - if ubatch_slices is not None: + if ubatch_slices_padded is not None: # Adjust values to reflect a single ubatch. # TODO(sage,lucas): this is cruft that should be addressed in # the padding refactor. 
- num_tokens_padded = ubatch_slices[0].num_tokens + num_tokens_padded = ubatch_slices_padded[0].num_tokens if num_tokens_across_dp is not None: num_tokens_across_dp[:] = num_tokens_padded @@ -4186,7 +4234,7 @@ class GPUModelRunner( num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_desc, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded, ), ): outputs = self.model( @@ -4222,6 +4270,17 @@ class GPUModelRunner( is_graph_capturing=is_graph_capturing, ) + # We register layerwise NVTX hooks here after the first dynamo tracing is + # done to avoid nvtx operations in hook functions being traced by + # torch dynamo and causing graph breaks. + # Note that for DYNAMO_ONCE and VLLM_COMPILE mode, + # compiled model's dynamo tracing is only done once and the compiled model's + # __call__ function is replaced by calling the compiled function. + # So it's safe to register hooks here. Hooks will be registered to + # both compiled and uncompiled models but they will never + # be called on the compiled model execution path. + self._register_layerwise_nvtx_hooks() + # This is necessary to avoid blocking DP. # For dummy runs, we typically skip EPLB since we don't have any real # requests to process. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 9f503c0b8b0b7..6f44c0b4f584c 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -970,7 +970,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): mm_kwargs, device=self.device, pin_memory=self.pin_memory, - merge_by_field_config=model.merge_by_field_config, multimodal_cpu_fields=model.multimodal_cpu_fields, ): # Run the encoder. 
@@ -2063,7 +2062,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): dummy_mm_items, device=self.device, pin_memory=self.pin_memory, - merge_by_field_config=model.merge_by_field_config, multimodal_cpu_fields=model.multimodal_cpu_fields, ) ) diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py index 33a1921d2d98e..44788476fc9c5 100644 --- a/vllm/v1/worker/ubatch_utils.py +++ b/vllm/v1/worker/ubatch_utils.py @@ -42,9 +42,37 @@ def check_ubatch_thresholds( return num_tokens >= config.dbo_prefill_token_threshold -def create_ubatch_slices( - num_scheduled_tokens: np.ndarray, split_point: int +# This just pads the second ubatch slice out to the total number of tokens +# (num_tokens + padding) since we do `create_ubatch_slices` before applying DP padding. +def _pad_out_ubatch_slices( + ubatch_slices: UBatchSlices, num_total_tokens: int, num_reqs_padded: int ) -> UBatchSlices: + # TODO(lucas): handle empty second ubatch + padded_second_request_slice = slice( + ubatch_slices[1].request_slice.start, num_reqs_padded + ) + padded_second_token_slice = slice( + ubatch_slices[1].token_slice.start, num_total_tokens + ) + return [ + ubatch_slices[0], + UBatchSlice(padded_second_request_slice, padded_second_token_slice), + ] + + +def maybe_create_ubatch_slices( + should_ubatch: bool, + num_scheduled_tokens: np.ndarray, + num_tokens_padded: int, + num_reqs_padded: int, + split_point: int | None = None, +) -> tuple[UBatchSlices | None, UBatchSlices | None]: + if not should_ubatch: + return None, None + + if split_point is None: + split_point = int(num_tokens_padded) // 2 + # TODO(lucas): Refactor the gpu_model_runner.py so we can pass # in cu_num_tokens directly (i.e. 
query_start_loc) cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32) @@ -67,7 +95,15 @@ def create_ubatch_slices( ) second_ubatch_req_slice = slice(second_ubatch_req_start, len(cu_num_tokens) - 1) - return [ + ubatch_slices = [ UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice), UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice), ] + + ubatch_slices_padded = _pad_out_ubatch_slices( + ubatch_slices, num_tokens_padded, num_reqs_padded + ) + + assert sum(s.num_tokens for s in ubatch_slices_padded) == num_tokens_padded + + return ubatch_slices, ubatch_slices_padded