mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-03 12:27:04 +08:00
Merge branch 'main' into mlm-full-lora-support
This commit is contained in:
commit
1fbd7287b8
@ -8,3 +8,4 @@ tasks:
|
||||
value: 0.80
|
||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
||||
num_fewshot: 5
|
||||
rtol: 0.05
|
||||
|
||||
1
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
Normal file
1
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
Normal file
@ -0,0 +1 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
||||
@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
|
||||
--tp-size=1
|
||||
"""
|
||||
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import yaml
|
||||
|
||||
RTOL = 0.08
|
||||
DEFAULT_RTOL = 0.08
|
||||
|
||||
|
||||
@contextmanager
def scoped_env_vars(new_env: dict[str, str]):
    """Temporarily apply *new_env* to ``os.environ``.

    Values are coerced to ``str`` before assignment. On exit, keys that
    existed beforehand are restored to their previous values and keys that
    were newly introduced are removed, even if the managed body raises.
    """
    if not new_env:
        # Nothing to apply; skip the bookkeeping entirely.
        yield
        return

    saved: dict[str, str] = {}
    added: list[str] = []

    try:
        for name, raw_value in new_env.items():
            if name in os.environ:
                saved[name] = os.environ[name]
            else:
                added.append(name)
            os.environ[name] = str(raw_value)
        yield
    finally:
        # Undo our changes: restore overwritten values, drop new keys.
        for name, previous in saved.items():
            os.environ[name] = previous
        for name in added:
            os.environ.pop(name, None)
|
||||
|
||||
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
|
||||
f"trust_remote_code={trust_remote_code},"
|
||||
f"max_model_len={max_model_len},"
|
||||
)
|
||||
results = lm_eval.simple_evaluate(
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
env_vars = eval_config.get("env_vars", None)
|
||||
with scoped_env_vars(env_vars):
|
||||
results = lm_eval.simple_evaluate(
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
||||
apply_chat_template=eval_config.get(
|
||||
"apply_chat_template", backend == "vllm-vlm"
|
||||
),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
||||
batch_size=batch_size,
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
|
||||
rtol = eval_config.get("rtol", DEFAULT_RTOL)
|
||||
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(
|
||||
f"{task['name']} | {metric['name']}: "
|
||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
||||
f"ground_truth={ground_truth:.3f} | "
|
||||
f"measured={measured_value:.3f} | rtol={rtol}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
|
||||
|
||||
assert success
|
||||
|
||||
@ -9,6 +9,7 @@ import argparse
|
||||
import json
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
@ -20,6 +21,7 @@ if not sys.version_info >= (3, 12):
|
||||
|
||||
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
|
||||
<html>
|
||||
<!-- {comment} -->
|
||||
<meta name="pypi:repository-version" content="1.0">
|
||||
<body>
|
||||
{items}
|
||||
@ -90,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
)
|
||||
|
||||
|
||||
def generate_project_list(subdir_names: list[str]) -> str:
|
||||
def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
|
||||
"""
|
||||
Generate project list HTML content linking to each project & variant sub-directory.
|
||||
"""
|
||||
@ -98,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
|
||||
for name in sorted(subdir_names):
|
||||
name = name.strip("/").strip(".")
|
||||
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
|
||||
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
|
||||
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
|
||||
|
||||
|
||||
def generate_package_index_and_metadata(
|
||||
wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
|
||||
wheel_files: list[WheelFileInfo],
|
||||
wheel_base_dir: Path,
|
||||
index_base_dir: Path,
|
||||
comment: str = "",
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Generate package index HTML content for a specific package, linking to actual wheel files.
|
||||
@ -120,7 +125,7 @@ def generate_package_index_and_metadata(
|
||||
file_meta = asdict(file)
|
||||
file_meta["path"] = file_path_quoted
|
||||
metadata.append(file_meta)
|
||||
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
|
||||
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
|
||||
metadata_str = json.dumps(metadata, indent=2)
|
||||
return index_str, metadata_str
|
||||
|
||||
@ -131,6 +136,7 @@ def generate_index_and_metadata(
|
||||
index_base_dir: Path,
|
||||
default_variant: str | None = None,
|
||||
alias_to_default: str | None = None,
|
||||
comment: str = "",
|
||||
):
|
||||
"""
|
||||
Generate index for all wheel files.
|
||||
@ -141,6 +147,7 @@ def generate_index_and_metadata(
|
||||
index_base_dir (Path): Base directory to store index files.
|
||||
default_variant (str | None): The default variant name, if any.
|
||||
alias_to_default (str | None): Alias variant name for the default variant, if any.
|
||||
comment (str | None): Optional comment to include in the generated HTML files.
|
||||
|
||||
First, parse all wheel files to extract metadata.
|
||||
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
|
||||
@ -234,6 +241,10 @@ def generate_index_and_metadata(
|
||||
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
|
||||
print(f"Alias variant '{alias_to_default}' created for default variant.")
|
||||
|
||||
# Generate comment in HTML header
|
||||
comment_str = f" ({comment})" if comment else ""
|
||||
comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
|
||||
|
||||
# Generate index for each variant
|
||||
subdir_names = set()
|
||||
for variant, files in variant_to_files.items():
|
||||
@ -253,7 +264,7 @@ def generate_index_and_metadata(
|
||||
subdir_names = subdir_names.union(packages)
|
||||
else:
|
||||
# generate project list for this variant directly
|
||||
project_list_str = generate_project_list(sorted(packages))
|
||||
project_list_str = generate_project_list(sorted(packages), comment_tmpl)
|
||||
with open(variant_dir / "index.html", "w") as f:
|
||||
f.write(project_list_str)
|
||||
|
||||
@ -263,7 +274,7 @@ def generate_index_and_metadata(
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
package_files, wheel_base_dir, package_dir
|
||||
package_files, wheel_base_dir, package_dir, comment
|
||||
)
|
||||
with open(package_dir / "index.html", "w") as f:
|
||||
f.write(index_str)
|
||||
@ -271,7 +282,7 @@ def generate_index_and_metadata(
|
||||
f.write(metadata_str)
|
||||
|
||||
# Generate top-level project list index
|
||||
project_list_str = generate_project_list(sorted(subdir_names))
|
||||
project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
|
||||
with open(index_base_dir / "index.html", "w") as f:
|
||||
f.write(project_list_str)
|
||||
|
||||
@ -283,6 +294,7 @@ if __name__ == "__main__":
|
||||
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
|
||||
--output-dir <output_directory> : directory to store generated index files
|
||||
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
|
||||
--comment <comment_string> : (optional) comment string to include in generated HTML files
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
@ -312,6 +324,12 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help="Alias variant name for the default variant",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--comment",
|
||||
type=str,
|
||||
default="",
|
||||
help="Optional comment string to include in generated HTML files",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -366,5 +384,6 @@ if __name__ == "__main__":
|
||||
index_base_dir=index_base_dir,
|
||||
default_variant=None,
|
||||
alias_to_default=args.alias_to_default,
|
||||
comment=args.comment.strip(),
|
||||
)
|
||||
print(f"Successfully generated index and metadata in {output_dir}")
|
||||
|
||||
@ -81,7 +81,10 @@ else
|
||||
alias_arg=""
|
||||
fi
|
||||
|
||||
$PYTHON pip install regex && .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
|
||||
# HACK: we do not need regex module here, but it is required by pre-commit hook
|
||||
# To avoid any external dependency, we simply replace it back to the stdlib re module
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
|
||||
|
||||
# copy indices to /<commit>/ unconditionally
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
|
||||
@ -718,17 +718,6 @@ steps:
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: OpenAI API correctness # 10min
|
||||
timeout_in_minutes: 15
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@ -974,19 +963,6 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
||||
timeout_in_minutes: 70
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 60min
|
||||
timeout_in_minutes: 120
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -1162,21 +1138,6 @@ steps:
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
|
||||
|
||||
- label: ROCm GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
@ -1194,16 +1155,6 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
|
||||
- label: Blackwell LM Eval Small Models
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
@ -1380,7 +1331,7 @@ steps:
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
|
||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||
# doesn't support LoRA yet
|
||||
#- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
@ -1446,37 +1397,6 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
##### H100 test #####
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
##### H200 test #####
|
||||
- label: Distributed Tests (H200) # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -1508,20 +1428,94 @@ steps:
|
||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### RL Integration Tests #####
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_2
|
||||
##### E2E Eval Tests #####
|
||||
- label: LM Eval Small Models (1 Card) # 15min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: Blackwell LM Eval Small Models
|
||||
timeout_in_minutes: 120
|
||||
gpu: b200
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
|
||||
timeout_in_minutes: 70
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_1
|
||||
# grade: Blocking
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: LM Eval Large Models (4 Card)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: a100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
|
||||
- label: LM Eval Large Models (H100) # optional
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
agent_pool: mi325_4
|
||||
# grade: Blocking
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||
|
||||
- label: ROCm LM Eval Large Models (8 Card)
|
||||
mirror_hardwares: [amdproduction]
|
||||
agent_pool: mi325_8
|
||||
num_gpus: 8
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
|
||||
|
||||
- label: ROCm GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: DeepSeek V2-Lite Accuracy
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@ -1554,4 +1548,19 @@ steps:
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
##### RL Integration Tests #####
|
||||
- label: Prime-RL Integration Test # 15min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
agent_pool: mi325_2
|
||||
# grade: Blocking
|
||||
timeout_in_minutes: 30
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- .buildkite/scripts/run-prime-rl-test.sh
|
||||
commands:
|
||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||
|
||||
@ -350,7 +350,8 @@ steps:
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- vllm/v1/attention
|
||||
- vllm/model_executor/layers
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
|
||||
160
docs/contributing/ci/nightly_builds.md
Normal file
160
docs/contributing/ci/nightly_builds.md
Normal file
@ -0,0 +1,160 @@
|
||||
# Nightly Builds of vLLM Wheels
|
||||
|
||||
vLLM maintains a per-commit wheel repository (commonly referred to as "nightly") at `https://wheels.vllm.ai` that provides pre-built wheels for every commit on the `main` branch since `v0.5.3`. This document explains how the nightly wheel index mechanism works.
|
||||
|
||||
## Build and Upload Process on CI
|
||||
|
||||
### Wheel Building
|
||||
|
||||
Wheels are built in the `Release` pipeline (`.buildkite/release-pipeline.yaml`) after a PR is merged into the main branch, with multiple variants:
|
||||
|
||||
- **Backend variants**: `cpu` and `cuXXX` (e.g., `cu129`, `cu130`).
|
||||
- **Architecture variants**: `x86_64` and `aarch64`.
|
||||
|
||||
Each build step:
|
||||
|
||||
1. Builds the wheel in a Docker container.
|
||||
2. Renames the wheel filename to use the correct manylinux tag (currently `manylinux_2_31`) for PEP 600 compliance.
|
||||
3. Uploads the wheel to S3 bucket `vllm-wheels` under `/{commit_hash}/`.
|
||||
|
||||
### Index Generation
|
||||
|
||||
After uploading each wheel, the `.buildkite/scripts/upload-wheels.sh` script:
|
||||
|
||||
1. **Lists all existing wheels** in the commit directory from S3
|
||||
2. **Generates indices** using `.buildkite/scripts/generate-nightly-index.py`:
|
||||
- Parses wheel filenames to extract metadata (version, variant, platform tags).
|
||||
- Creates HTML index files (`index.html`) for PyPI compatibility.
|
||||
- Generates machine-readable `metadata.json` files.
|
||||
3. **Uploads indices** to multiple locations (overriding existing ones):
|
||||
- `/{commit_hash}/` - Always uploaded for commit-specific access.
|
||||
- `/nightly/` - Only for commits on `main` branch (not PRs).
|
||||
- `/{version}/` - Only for release wheels (no `dev` in its version).
|
||||
|
||||
!!! tip "Handling Concurrent Builds"
|
||||
The index generation script can handle multiple variants being built concurrently by always listing all wheels in the commit directory before generating indices, avoiding race conditions.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
The S3 bucket structure follows this pattern:
|
||||
|
||||
```text
|
||||
s3://vllm-wheels/
|
||||
├── {commit_hash}/ # Commit-specific wheels and indices
|
||||
│ ├── vllm-*.whl # All wheel files
|
||||
│ ├── index.html # Project list (default variant)
|
||||
│ ├── vllm/
|
||||
│ │ ├── index.html # Package index (default variant)
|
||||
│ │ └── metadata.json # Metadata (default variant)
|
||||
│ ├── cu129/ # Variant subdirectory
|
||||
│ │ ├── index.html # Project list (cu129 variant)
|
||||
│ │ └── vllm/
|
||||
│ │ ├── index.html # Package index (cu129 variant)
|
||||
│ │ └── metadata.json # Metadata (cu129 variant)
|
||||
│ ├── cu130/ # Variant subdirectory
|
||||
│ ├── cpu/ # Variant subdirectory
|
||||
│ └── .../ # More variant subdirectories
|
||||
├── nightly/ # Latest main branch wheels (mirror of latest commit)
|
||||
└── {version}/ # Release version indices (e.g., 0.11.2)
|
||||
```
|
||||
|
||||
All built wheels are stored in `/{commit_hash}/`, while different indices are generated and reference them.
|
||||
This avoids duplication of wheel files.
|
||||
|
||||
For example, you can specify the following URLs to use different indices:
|
||||
|
||||
- `https://wheels.vllm.ai/nightly/cu130` for the latest main branch wheels built with CUDA 13.0.
|
||||
- `https://wheels.vllm.ai/{commit_hash}` for wheels built at a specific commit (default variant).
|
||||
- `https://wheels.vllm.ai/0.12.0/cpu` for 0.12.0 release wheels built for CPU variant.
|
||||
|
||||
Please note that not all variants are present on every commit. The available variants are subject to change over time, e.g., changing cu130 to cu131.
|
||||
|
||||
### Variant Organization
|
||||
|
||||
Indices are organized by variant:
|
||||
|
||||
- **Default variant**: Wheels without variant suffix (i.e., built with the current `VLLM_MAIN_CUDA_VERSION`) are placed in the root.
|
||||
- **Variant subdirectories**: Wheels with variant suffixes (e.g., `+cu130`, `.cpu`) are organized in subdirectories.
|
||||
- **Alias to default**: The default variant can have an alias (e.g., `cu129` for now) for consistency and convenience.
|
||||
|
||||
The variant is extracted from the wheel filename (as described in the [file name convention](https://packaging.python.org/en/latest/specifications/binary-distribution-format/#file-name-convention)):
|
||||
|
||||
- The variant is encoded in the local version identifier (e.g. `+cu129` or `dev<N>+g<hash>.cu130`).
|
||||
- Examples:
|
||||
- `vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl` → default variant
|
||||
- `vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl` → `cu129` variant
|
||||
- `vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl` → `cu130` variant
|
||||
|
||||
## Index Generation Details
|
||||
|
||||
The `generate-nightly-index.py` script performs the following:
|
||||
|
||||
1. **Parses wheel filenames** using regex to extract:
|
||||
- Package name
|
||||
- Version (with variant extracted)
|
||||
- Python tag, ABI tag, platform tag
|
||||
- Build tag (if present)
|
||||
2. **Groups wheels by variant**, then by package name:
|
||||
- Currently only `vllm` is built, but the structure supports multiple packages in the future.
|
||||
3. **Generates HTML indices** (compliant with the [Simple repository API](https://packaging.python.org/en/latest/specifications/simple-repository-api/#simple-repository-api)):
|
||||
- Top-level `index.html`: Lists all packages and variant subdirectories
|
||||
- Package-level `index.html`: Lists all wheel files for that package
|
||||
- Uses relative paths to wheel files for portability
|
||||
4. **Generates metadata.json**:
|
||||
- Machine-readable JSON containing all wheel metadata
|
||||
- Includes `path` field with URL-encoded relative path to wheel file
|
||||
- Used by `setup.py` to locate compatible pre-compiled wheels during Python-only builds
|
||||
|
||||
### Special Handling for AWS Services
|
||||
|
||||
The wheels and indices are directly stored on AWS S3, and we use AWS CloudFront as a CDN in front of the S3 bucket.
|
||||
|
||||
Since S3 does not provide proper directory listing, to support PyPI-compatible simple repository API behavior, we deploy a CloudFront Function that:
|
||||
|
||||
- redirects any URL that does not end with `/` and does not look like a file (i.e., does not contain a dot `.` in the last path segment) to the same URL with a trailing `/`
|
||||
- appends `/index.html` to any URL that ends with `/`
|
||||
|
||||
For example, the following requests would be handled as:
|
||||
|
||||
- `/nightly` -> `/nightly/index.html`
|
||||
- `/nightly/cu130/` -> `/nightly/cu130/index.html`
|
||||
- `/nightly/index.html` or `/nightly/vllm.whl` -> unchanged
|
||||
|
||||
!!! note "AWS S3 Filename Escaping"
|
||||
|
||||
S3 will automatically escape filenames upon upload according to its [naming rule](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html). The direct impact on vllm is that `+` in filenames will be converted to `%2B`. We take special care in the index generation script to escape filenames properly when generating the HTML indices and JSON metadata, to ensure the URLs are correct and can be directly used.
|
||||
|
||||
## Usage of precompiled wheels in `setup.py` {#precompiled-wheels-usage}
|
||||
|
||||
When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
|
||||
|
||||
1. **Determines wheel location** via `precompiled_wheel_utils.determine_wheel_url()`:
|
||||
- Env var `VLLM_PRECOMPILED_WHEEL_LOCATION` (user-specified URL/path) always takes precedence and skips all other steps.
|
||||
- Determines the variant from `VLLM_MAIN_CUDA_VERSION` (can be overridden with env var `VLLM_PRECOMPILED_WHEEL_VARIANT`); the default variant will also be tried as a fallback.
|
||||
- Determines the _base commit_ (explained later) of this branch (can be overridden with env var `VLLM_PRECOMPILED_WHEEL_COMMIT`).
|
||||
2. **Fetches metadata** from `https://wheels.vllm.ai/{commit}/vllm/metadata.json` (for the default variant) or `https://wheels.vllm.ai/{commit}/{variant}/vllm/metadata.json` (for a specific variant).
|
||||
3. **Selects compatible wheel** based on:
|
||||
- Package name (`vllm`)
|
||||
- Platform tag (architecture match)
|
||||
4. **Downloads and extracts** precompiled binaries from the wheel:
|
||||
- C++ extension modules (`.so` files)
|
||||
- Flash Attention Python modules
|
||||
- Triton kernel Python files
|
||||
5. **Patches package_data** to include extracted files in the installation
|
||||
|
||||
!!! note "What is the base commit?"
|
||||
|
||||
The base commit is determined by finding the merge-base
|
||||
between the current branch and upstream `main`, ensuring
|
||||
compatibility between source code and precompiled binaries.
|
||||
|
||||
_Note: it's users' responsibility to ensure there is no native code (e.g., C++ or CUDA) changes before using precompiled wheels._
|
||||
|
||||
## Implementation Files
|
||||
|
||||
Key files involved in the nightly wheel mechanism:
|
||||
|
||||
- **`.buildkite/release-pipeline.yaml`**: CI pipeline that builds wheels
|
||||
- **`.buildkite/scripts/upload-wheels.sh`**: Script that uploads wheels and generates indices
|
||||
- **`.buildkite/scripts/generate-nightly-index.py`**: Python script that generates PyPI-compatible indices
|
||||
- **`setup.py`**: Contains `precompiled_wheel_utils` class for fetching and using precompiled wheels
|
||||
333
docs/deployment/integrations/kthena.md
Normal file
333
docs/deployment/integrations/kthena.md
Normal file
@ -0,0 +1,333 @@
|
||||
# Kthena
|
||||
|
||||
[**Kthena**](https://github.com/volcano-sh/kthena) is a Kubernetes-native LLM inference platform that transforms how organizations deploy and manage Large Language Models in production. Built with declarative model lifecycle management and intelligent request routing, it provides high performance and enterprise-grade scalability for LLM inference workloads.
|
||||
|
||||
This guide shows how to deploy a production-grade, **multi-node vLLM** service on Kubernetes.
|
||||
|
||||
We’ll:
|
||||
|
||||
- Install the required components (Kthena + Volcano).
|
||||
- Deploy a multi-node vLLM model via Kthena’s `ModelServing` CR.
|
||||
- Validate the deployment.
|
||||
|
||||
---
|
||||
|
||||
## 1. Prerequisites
|
||||
|
||||
You need:
|
||||
|
||||
- A Kubernetes cluster with **GPU nodes**.
|
||||
- `kubectl` access with cluster-admin or equivalent permissions.
|
||||
- **Volcano** installed for gang scheduling.
|
||||
- **Kthena** installed with the `ModelServing` CRD available.
|
||||
- A valid **Hugging Face token** if loading models from Hugging Face Hub.
|
||||
|
||||
### 1.1 Install Volcano
|
||||
|
||||
```bash
|
||||
helm repo add volcano-sh https://volcano-sh.github.io/helm-charts
|
||||
helm repo update
|
||||
helm install volcano volcano-sh/volcano -n volcano-system --create-namespace
|
||||
```
|
||||
|
||||
This provides the gang-scheduling and network topology features used by Kthena.
|
||||
|
||||
### 1.2 Install Kthena
|
||||
|
||||
```bash
|
||||
helm install kthena oci://ghcr.io/volcano-sh/charts/kthena --version v0.1.0 --namespace kthena-system --create-namespace
|
||||
```
|
||||
|
||||
- The `kthena-system` namespace is created.
|
||||
- Kthena controllers and CRDs, including `ModelServing`, are installed and healthy.
|
||||
|
||||
Validate:
|
||||
|
||||
```bash
|
||||
kubectl get crd | grep modelserving
|
||||
```
|
||||
|
||||
You should see:
|
||||
|
||||
```text
|
||||
modelservings.workload.serving.volcano.sh ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. The Multi-Node vLLM `ModelServing` Example
|
||||
|
||||
Kthena provides an example manifest to deploy a **multi-node vLLM cluster running Llama**. Conceptually this is equivalent to the vLLM production stack Helm deployment, but expressed with `ModelServing`.
|
||||
|
||||
A simplified version of the example (`llama-multinode`) looks like:
|
||||
|
||||
- `spec.replicas: 1` – one `ServingGroup` (one logical model deployment).
|
||||
- `roles`:
|
||||
- `entryTemplate` – defines **leader** pods that run:
|
||||
- vLLM’s **multi-node cluster bootstrap script** (Ray cluster).
|
||||
- vLLM **OpenAI-compatible API server**.
|
||||
- `workerTemplate` – defines **worker** pods that join the leader’s Ray cluster.
|
||||
|
||||
Key points from the example YAML:
|
||||
|
||||
- **Image**: `vllm/vllm-openai:latest` (matches upstream vLLM images).
|
||||
- **Command** (leader):
|
||||
|
||||
```yaml
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- >
|
||||
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
|
||||
python3 -m vllm.entrypoints.openai.api_server
|
||||
--port 8080
|
||||
--model meta-llama/Llama-3.1-405B-Instruct
|
||||
--tensor-parallel-size 8
|
||||
--pipeline-parallel-size 2
|
||||
```
|
||||
|
||||
- **Command** (worker):
|
||||
|
||||
```yaml
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- >
|
||||
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Deploying Multi-Node llama vLLM via Kthena
|
||||
|
||||
### 3.1 Prepare the Manifest
|
||||
|
||||
**Recommended**: use a Secret instead of a raw env var:
|
||||
|
||||
```bash
|
||||
kubectl create secret generic hf-token \
|
||||
-n default \
|
||||
--from-literal=HUGGING_FACE_HUB_TOKEN='<your-token>'
|
||||
```
|
||||
|
||||
### 3.2 Apply the `ModelServing`
|
||||
|
||||
```bash
|
||||
cat <<EOF | kubectl apply -f -
|
||||
apiVersion: workload.serving.volcano.sh/v1alpha1
|
||||
kind: ModelServing
|
||||
metadata:
|
||||
name: llama-multinode
|
||||
namespace: default
|
||||
spec:
|
||||
schedulerName: volcano
|
||||
replicas: 1 # group replicas
|
||||
template:
|
||||
restartGracePeriodSeconds: 60
|
||||
gangPolicy:
|
||||
minRoleReplicas:
|
||||
405b: 1
|
||||
roles:
|
||||
- name: 405b
|
||||
replicas: 2
|
||||
entryTemplate:
|
||||
spec:
|
||||
containers:
|
||||
- name: leader
|
||||
image: vllm/vllm-openai:latest
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token
|
||||
key: HUGGING_FACE_HUB_TOKEN
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
|
||||
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
memory: 1124Gi
|
||||
ephemeral-storage: 800Gi
|
||||
requests:
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 15Gi
|
||||
workerReplicas: 1
|
||||
workerTemplate:
|
||||
spec:
|
||||
containers:
|
||||
- name: worker
|
||||
image: vllm/vllm-openai:latest
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
memory: 1124Gi
|
||||
ephemeral-storage: 800Gi
|
||||
requests:
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token
|
||||
key: HUGGING_FACE_HUB_TOKEN
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 15Gi
|
||||
EOF
|
||||
```
|
||||
|
||||
Kthena will:
|
||||
|
||||
- Create a `ModelServing` object.
|
||||
- Derive a `PodGroup` for Volcano gang scheduling.
|
||||
- Create the leader and worker pods for each `ServingGroup` and `Role`.
|
||||
|
||||
---
|
||||
|
||||
## 4. Verifying the Deployment
|
||||
|
||||
### 4.1 Check ModelServing Status
|
||||
|
||||
Use the snippet from the Kthena docs:
|
||||
|
||||
```bash
|
||||
kubectl get modelserving -oyaml | grep status -A 10
|
||||
```
|
||||
|
||||
You should see something like:
|
||||
|
||||
```yaml
|
||||
status:
|
||||
availableReplicas: 1
|
||||
conditions:
|
||||
- type: Available
|
||||
status: "True"
|
||||
reason: AllGroupsReady
|
||||
message: All Serving groups are ready
|
||||
- type: Progressing
|
||||
status: "False"
|
||||
...
|
||||
replicas: 1
|
||||
updatedReplicas: 1
|
||||
```
|
||||
|
||||
### 4.2 Check Pods
|
||||
|
||||
List pods for your deployment:
|
||||
|
||||
```bash
|
||||
kubectl get pod -owide -l modelserving.volcano.sh/name=llama-multinode
|
||||
```
|
||||
|
||||
Example output (from docs):
|
||||
|
||||
```text
|
||||
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE ...
|
||||
default llama-multinode-0-405b-0-0 1/1 Running 0 15m 10.244.0.56 192.168.5.12 ...
|
||||
default llama-multinode-0-405b-0-1 1/1 Running 0 15m 10.244.0.58 192.168.5.43 ...
|
||||
default llama-multinode-0-405b-1-0 1/1 Running 0 15m 10.244.0.57 192.168.5.58 ...
|
||||
default llama-multinode-0-405b-1-1 1/1 Running 0 15m 10.244.0.53 192.168.5.36 ...
|
||||
```
|
||||
|
||||
Pod name pattern:
|
||||
|
||||
- `llama-multinode-<group-idx>-<role-name>-<replica-idx>-<ordinal>`.
|
||||
|
||||
The first number indicates `ServingGroup`. The second (`405b`) is the `Role`. The remaining indices identify the pod within the role.
|
||||
|
||||
---
|
||||
|
||||
## 6. Accessing the vLLM OpenAI-Compatible API
|
||||
|
||||
Expose the entry via a Service:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: llama-multinode-openai
|
||||
namespace: default
|
||||
spec:
|
||||
selector:
|
||||
modelserving.volcano.sh/name: llama-multinode
|
||||
modelserving.volcano.sh/entry: "true"
|
||||
# optionally further narrow to leader role if you label it
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
type: ClusterIP
|
||||
```
|
||||
|
||||
Port-forward from your local machine:
|
||||
|
||||
```bash
|
||||
kubectl port-forward svc/llama-multinode-openai 30080:80 -n default
|
||||
```
|
||||
|
||||
Then:
|
||||
|
||||
- List models:
|
||||
|
||||
```bash
|
||||
curl -s http://localhost:30080/v1/models
|
||||
```
|
||||
|
||||
- Send a completion request (mirroring vLLM production stack docs):
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:30080/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Llama-3.1-405B-Instruct",
|
||||
"prompt": "Once upon a time,",
|
||||
"max_tokens": 10
|
||||
}'
|
||||
```
|
||||
|
||||
You should see an OpenAI-style response from vLLM.
|
||||
|
||||
---
|
||||
|
||||
## 7. Clean Up
|
||||
|
||||
To remove the deployment and its resources:
|
||||
|
||||
```bash
|
||||
kubectl delete modelserving llama-multinode -n default
|
||||
```
|
||||
|
||||
If you’re done with the entire stack:
|
||||
|
||||
```bash
|
||||
helm uninstall kthena -n kthena-system # or your Kthena release name
|
||||
helm uninstall volcano -n volcano-system
|
||||
```
|
||||
@ -14,6 +14,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
|
||||
- [InftyAI/llmaz](integrations/llmaz.md)
|
||||
- [KAITO](integrations/kaito.md)
|
||||
- [KServe](integrations/kserve.md)
|
||||
- [Kthena](integrations/kthena.md)
|
||||
- [KubeRay](integrations/kuberay.md)
|
||||
- [kubernetes-sigs/lws](frameworks/lws.md)
|
||||
- [meta-llama/llama-stack](integrations/llamastack.md)
|
||||
|
||||
@ -86,7 +86,7 @@ LLM(model, enforce_eager=True)
|
||||
```
|
||||
|
||||
To turn off just torch.compile, pass `mode = NONE` to the compilation config.
|
||||
(`-cc` is short for `--compilation_config`; `-O.*` dotted syntax is deprecated):
|
||||
(`-cc` is short for `--compilation_config`):
|
||||
|
||||
```sh
|
||||
# Online
|
||||
|
||||
@ -62,7 +62,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
|
||||
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
|
||||
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
|
||||
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
|
||||
- `vllm:gpu_cache_usage_perc` - Percentage of used cache blocks by vLLM.
|
||||
- `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM.
|
||||
- `vllm:request_prompt_tokens` - Request prompt length.
|
||||
- `vllm:request_generation_tokens` - Request generation length.
|
||||
- `vllm:request_success` - Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.
|
||||
|
||||
@ -443,6 +443,8 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
|
||||
print(generated_text)
|
||||
```
|
||||
|
||||
For Qwen3-VL, the `image_embeds` should contain both the base image embedding and deepstack features.
|
||||
|
||||
#### Audio Embeddings
|
||||
|
||||
You can pass pre-computed audio embeddings similar to image embeddings:
|
||||
|
||||
@ -18,6 +18,7 @@ vLLM currently supports the following reasoning models:
|
||||
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
|
||||
| [Holo2 series](https://huggingface.co/collections/Hcompany/holo2) | `holo2` | `json`, `regex` | ✅ |
|
||||
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
|
||||
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
|
||||
| [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) | `minimax_m2_append_think` | `json`, `regex` | ✅ |
|
||||
@ -28,6 +29,7 @@ vLLM currently supports the following reasoning models:
|
||||
IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
|
||||
DeepSeek-V3.1 tool calling is supported in non-thinking mode.
|
||||
Holo2 reasoning is enabled by default. To disable it, you must also pass `thinking=False` in your `chat_template_kwargs`.
|
||||
|
||||
## Quickstart
|
||||
|
||||
|
||||
@ -58,10 +58,14 @@ schemathesis==3.39.15
|
||||
|
||||
# Evaluation and benchmarking
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
|
||||
jiwer==4.0.0
|
||||
|
||||
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
|
||||
multiprocess==0.70.16
|
||||
|
||||
# Required for v1/metrics/test_engine_logger_apis.py
|
||||
ray[cgraph,default]>=2.48.0
|
||||
|
||||
# Plugins test
|
||||
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
|
||||
torchgeo==0.7.0
|
||||
|
||||
@ -260,13 +260,18 @@ def test_deep_sleep_fp8_kvcache():
|
||||
llm.sleep(level=2)
|
||||
|
||||
used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline
|
||||
assert used_bytes < 3 * GiB_bytes
|
||||
|
||||
# Rocm uses more memory for CudaGraphs, so we add 2 GiB more for the threshold
|
||||
rocm_extra_mem_bytes = 2 * GiB_bytes if current_platform.is_rocm() else 0
|
||||
mem_threshold_after_sleep = 3 * GiB_bytes + rocm_extra_mem_bytes
|
||||
assert used_bytes < mem_threshold_after_sleep
|
||||
|
||||
llm.wake_up(tags=["weights"])
|
||||
llm.collective_rpc("reload_weights")
|
||||
|
||||
used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline
|
||||
assert used_bytes < 4 * GiB_bytes
|
||||
mem_threshold_after_wake_up = 4 * GiB_bytes + rocm_extra_mem_bytes
|
||||
assert used_bytes < mem_threshold_after_wake_up
|
||||
|
||||
# now allocate kv cache and cuda graph memory
|
||||
llm.wake_up(tags=["kv_cache"])
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import functools
|
||||
import multiprocessing
|
||||
import tempfile
|
||||
from contextlib import contextmanager
|
||||
|
||||
@ -137,3 +139,67 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
|
||||
artifacts = compiled_mod.aot_compiled_fn._artifacts
|
||||
guards_string = artifacts.compiled_fn.shape_env.format_guards()
|
||||
assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
@use_vllm_config(make_vllm_config())
|
||||
def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Test that compiling gpt2 twice results in a cache hit and
|
||||
capture torch dynamic symbol creations to ensure make_symbol
|
||||
not called on cache hit.
|
||||
"""
|
||||
|
||||
import torch.fx.experimental.symbolic_shapes as symbolic_shapes_module
|
||||
from torch.utils._sympy.symbol import make_symbol
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
create_symbol_counter = multiprocessing.Value("i", 0)
|
||||
original_make_symbol = make_symbol
|
||||
|
||||
@functools.wraps(original_make_symbol)
|
||||
def counting_make_symbol(prefix, idx, **kwargs):
|
||||
with create_symbol_counter.get_lock():
|
||||
create_symbol_counter.value += 1
|
||||
return original_make_symbol(prefix, idx, **kwargs)
|
||||
|
||||
symbolic_shapes_module.make_symbol = counting_make_symbol
|
||||
try:
|
||||
with monkeypatch.context() as m, tempfile.TemporaryDirectory() as tmpdirname:
|
||||
m.setenv("VLLM_CACHE_ROOT", tmpdirname)
|
||||
m.setenv("VLLM_USE_AOT_COMPILE", "1")
|
||||
# First compilation - initialize model and generate
|
||||
llm_model = LLM(
|
||||
model="gpt2",
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
),
|
||||
max_model_len=256,
|
||||
)
|
||||
|
||||
llm_model.generate("Hello, my name is")
|
||||
assert create_symbol_counter.value == 2
|
||||
create_symbol_counter.value = 0
|
||||
|
||||
# Clean up first model
|
||||
del llm_model
|
||||
|
||||
# Second compilation - should hit cache
|
||||
m.setenv("VLLM_FORCE_AOT_LOAD", "1")
|
||||
llm_model = LLM(
|
||||
model="gpt2",
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
),
|
||||
max_model_len=256,
|
||||
)
|
||||
llm_model.generate("Hello, my name is")
|
||||
|
||||
assert create_symbol_counter.value == 0
|
||||
|
||||
finally:
|
||||
# Restore original method
|
||||
symbolic_shapes_module.make_symbol = original_make_symbol
|
||||
|
||||
@ -27,7 +27,7 @@ import threading
|
||||
from collections.abc import Generator
|
||||
from contextlib import nullcontext
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, TypedDict, TypeVar, cast
|
||||
from typing import Any, Callable, TypedDict, TypeVar, cast, TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
@ -67,6 +67,11 @@ from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
from vllm.utils.collection_utils import is_list_of
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
from transformers.generation.utils import GenerateOutput
|
||||
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
_TEST_DIR = os.path.dirname(__file__)
|
||||
@ -202,10 +207,7 @@ def dynamo_reset():
|
||||
|
||||
@pytest.fixture
|
||||
def example_prompts() -> list[str]:
|
||||
prompts = []
|
||||
for filename in _TEST_PROMPTS:
|
||||
prompts += _read_prompts(filename)
|
||||
return prompts
|
||||
return [prompt for filename in _TEST_PROMPTS for prompt in _read_prompts(filename)]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -224,10 +226,7 @@ class DecoderPromptType(Enum):
|
||||
|
||||
@pytest.fixture
|
||||
def example_long_prompts() -> list[str]:
|
||||
prompts = []
|
||||
for filename in _LONG_PROMPTS:
|
||||
prompts += _read_prompts(filename)
|
||||
return prompts
|
||||
return [prompt for filename in _LONG_PROMPTS for prompt in _read_prompts(filename)]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@ -353,10 +352,13 @@ class HfRunner:
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
else:
|
||||
model = auto_cls.from_pretrained(
|
||||
model_name,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**model_kwargs,
|
||||
model = cast(
|
||||
nn.Module,
|
||||
auto_cls.from_pretrained(
|
||||
model_name,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**model_kwargs,
|
||||
),
|
||||
)
|
||||
|
||||
# in case some unquantized custom models are not in same dtype
|
||||
@ -374,10 +376,12 @@ class HfRunner:
|
||||
self.model = model
|
||||
|
||||
if not skip_tokenizer_init:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
trust_remote_code=trust_remote_code,
|
||||
self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = (
|
||||
AutoTokenizer.from_pretrained(
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
trust_remote_code=trust_remote_code,
|
||||
)
|
||||
)
|
||||
|
||||
# don't put this import at the top level
|
||||
@ -495,7 +499,7 @@ class HfRunner:
|
||||
|
||||
outputs: list[tuple[list[list[int]], list[str]]] = []
|
||||
for inputs in all_inputs:
|
||||
output_ids = self.model.generate(
|
||||
output_ids: torch.Tensor = self.model.generate(
|
||||
**self.wrap_device(inputs),
|
||||
use_cache=True,
|
||||
**kwargs,
|
||||
@ -505,8 +509,7 @@ class HfRunner:
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=False,
|
||||
)
|
||||
output_ids = output_ids.cpu().tolist()
|
||||
outputs.append((output_ids, output_str))
|
||||
outputs.append((output_ids.cpu().tolist(), output_str))
|
||||
return outputs
|
||||
|
||||
def generate_greedy(
|
||||
@ -574,7 +577,7 @@ class HfRunner:
|
||||
|
||||
all_logprobs: list[list[torch.Tensor]] = []
|
||||
for inputs in all_inputs:
|
||||
output = self.model.generate(
|
||||
output: "GenerateOutput" = self.model.generate(
|
||||
**self.wrap_device(inputs),
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
@ -656,7 +659,7 @@ class HfRunner:
|
||||
all_output_strs: list[str] = []
|
||||
|
||||
for inputs in all_inputs:
|
||||
output = self.model.generate(
|
||||
output: "GenerateOutput" = self.model.generate(
|
||||
**self.wrap_device(inputs),
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
|
||||
@ -16,16 +16,35 @@ from typing import Literal, NamedTuple
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
|
||||
from tests.utils import RemoteOpenAIServer, create_new_process_for_each_test
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_context_parallel")
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
CP_TEST_MODELS = [
|
||||
# TODO support other models
|
||||
# [LANGUAGE GENERATION]
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
"Qwen/Qwen2.5-1.5B-Instruct",
|
||||
]
|
||||
|
||||
# GSM8K eval configuration
|
||||
NUM_QUESTIONS = 256 # Fast eval for CI
|
||||
NUM_SHOTS = 5 # Few-shot examples
|
||||
# tp accuracy with 2% buffer
|
||||
MIN_ACCURACY = {
|
||||
# .buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": 0.64,
|
||||
# .buildkite/lm-eval-harness/configs/Qwen2.5-1.5B-Instruct.yaml
|
||||
"Qwen/Qwen2.5-1.5B-Instruct": 0.52,
|
||||
}
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
|
||||
tp_size: int
|
||||
@ -38,7 +57,6 @@ class ParallelSetup(NamedTuple):
|
||||
|
||||
class CPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: str | None = None
|
||||
attn_backend: str | None = None
|
||||
|
||||
|
||||
@ -54,17 +72,20 @@ class CPTestSettings:
|
||||
*,
|
||||
tp_base: int = 4,
|
||||
pp_base: int = 1,
|
||||
dcp_base: int = 1,
|
||||
dcp_multipliers: list[float] | None = None,
|
||||
cp_kv_cache_interleave_size: int = 1,
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: str | None = None,
|
||||
attn_backend: str | None = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
if dcp_multipliers is None:
|
||||
dcp_multipliers = [
|
||||
0.5,
|
||||
]
|
||||
for eager_mode_val in [False]:
|
||||
for pp_multiplier in [1]:
|
||||
for dcp_multiplier in [0.5, 1]:
|
||||
for dcp_multiplier in dcp_multipliers:
|
||||
for chunked_prefill_val in [True]:
|
||||
parallel_setups.append(
|
||||
ParallelSetup(
|
||||
@ -82,7 +103,6 @@ class CPTestSettings:
|
||||
runner=runner,
|
||||
test_options=CPTestOptions(
|
||||
multi_node_only=multi_node_only,
|
||||
load_format=load_format,
|
||||
attn_backend=attn_backend,
|
||||
),
|
||||
)
|
||||
@ -101,7 +121,24 @@ class CPTestSettings:
|
||||
)
|
||||
|
||||
|
||||
def _compare_cp_with_tp(
|
||||
CP_TEXT_GENERATION_MODELS = {
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
|
||||
CPTestSettings.detailed(
|
||||
dcp_multipliers=[0.5, 1], cp_kv_cache_interleave_size=64
|
||||
),
|
||||
],
|
||||
"Qwen/Qwen2.5-1.5B-Instruct": [
|
||||
CPTestSettings.detailed(
|
||||
cp_kv_cache_interleave_size=16, attn_backend="FLASH_ATTN"
|
||||
),
|
||||
CPTestSettings.detailed(
|
||||
cp_kv_cache_interleave_size=16, attn_backend="FLASHINFER"
|
||||
),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _test_cp_gsm8k(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
@ -121,7 +158,7 @@ def _compare_cp_with_tp(
|
||||
chunked_prefill,
|
||||
) = parallel_setup
|
||||
|
||||
multi_node_only, load_format, attn_backend = test_options
|
||||
multi_node_only, attn_backend = test_options
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
@ -130,22 +167,7 @@ def _compare_cp_with_tp(
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
hf_overrides = model_info.hf_overrides
|
||||
|
||||
if load_format == "dummy":
|
||||
# Avoid OOM
|
||||
text_overrides = {
|
||||
"num_hidden_layers": 4,
|
||||
"hidden_size": 512,
|
||||
"intermediate_size": 800,
|
||||
"num_attention_heads": 4,
|
||||
"num_key_value_heads": 1,
|
||||
}
|
||||
|
||||
if is_multimodal:
|
||||
hf_overrides.update({"text_config": text_overrides})
|
||||
else:
|
||||
hf_overrides.update(text_overrides)
|
||||
else:
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
@ -157,90 +179,70 @@ def _compare_cp_with_tp(
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
common_args = [
|
||||
server_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"4096",
|
||||
"--max-num-seqs",
|
||||
"8",
|
||||
"64",
|
||||
]
|
||||
if chunked_prefill:
|
||||
common_args.append("--enable-chunked-prefill")
|
||||
server_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
server_args.append("--enforce-eager")
|
||||
if runner != "auto":
|
||||
common_args.extend(["--runner", runner])
|
||||
server_args.extend(["--runner", runner])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
server_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
common_args.extend(["--tokenizer-mode", tokenizer_mode])
|
||||
if load_format:
|
||||
common_args.extend(["--load-format", load_format])
|
||||
server_args.extend(["--tokenizer-mode", tokenizer_mode])
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
server_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
|
||||
if not attn_backend:
|
||||
cp_env = tp_env = {}
|
||||
else:
|
||||
cp_env = tp_env = {
|
||||
"VLLM_ATTENTION_BACKEND": attn_backend,
|
||||
}
|
||||
|
||||
cp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--decode-context-parallel-size",
|
||||
str(dcp_size),
|
||||
"--dcp-kv-cache-interleave-size",
|
||||
str(cp_kv_cache_interleave_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
|
||||
tp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
|
||||
compare_two_settings(
|
||||
model_id,
|
||||
cp_args,
|
||||
tp_args,
|
||||
cp_env,
|
||||
tp_env,
|
||||
method=method,
|
||||
max_wait_seconds=720,
|
||||
server_args.extend(
|
||||
[
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--decode-context-parallel-size",
|
||||
str(dcp_size),
|
||||
"--dcp-kv-cache-interleave-size",
|
||||
str(cp_kv_cache_interleave_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
)
|
||||
|
||||
server_env = {}
|
||||
if attn_backend:
|
||||
server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
|
||||
|
||||
CP_TEXT_GENERATION_MODELS = {
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
|
||||
CPTestSettings.detailed(),
|
||||
CPTestSettings.detailed(tp_base=2),
|
||||
CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64),
|
||||
],
|
||||
"bigcode/gpt_bigcode-santacoder": [
|
||||
CPTestSettings.detailed(),
|
||||
CPTestSettings.detailed(tp_base=2),
|
||||
],
|
||||
}
|
||||
with RemoteOpenAIServer(
|
||||
model_id,
|
||||
server_args,
|
||||
env_dict=server_env,
|
||||
max_wait_seconds=720,
|
||||
) as remote_server:
|
||||
host = f"http://{remote_server.host}"
|
||||
port = remote_server.port
|
||||
|
||||
CP_TEST_MODELS = [
|
||||
# TODO support other models
|
||||
# [LANGUAGE GENERATION]
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
"bigcode/gpt_bigcode-santacoder",
|
||||
]
|
||||
# Run GSM8K evaluation
|
||||
results = evaluate_gsm8k(
|
||||
num_questions=NUM_QUESTIONS,
|
||||
num_shots=NUM_SHOTS,
|
||||
host=host,
|
||||
port=port,
|
||||
)
|
||||
|
||||
# Validate accuracy is reasonable
|
||||
accuracy = results["accuracy"]
|
||||
min_accuracy = MIN_ACCURACY[model_id]
|
||||
assert accuracy >= min_accuracy, (
|
||||
f"TP+DCP accuracy too low: {accuracy:.3f} < {min_accuracy:.3f}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -274,12 +276,12 @@ def test_cp_generation(
|
||||
):
|
||||
pytest.skip(reason="MLA+DCP requires compute capability of 9.0 or higher")
|
||||
if (
|
||||
model_id == "bigcode/gpt_bigcode-santacoder"
|
||||
model_id == "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
and torch.cuda.get_device_capability() != (9, 0)
|
||||
):
|
||||
pytest.skip(reason="GQA+DCP currently requires compute capability of 9.0")
|
||||
|
||||
_compare_cp_with_tp(
|
||||
_test_cp_gsm8k(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.distributed.eplb.rebalance_algo import rebalance_experts
|
||||
from vllm.distributed.eplb.policy.default import DefaultEplbPolicy
|
||||
|
||||
|
||||
def test_basic_rebalance():
|
||||
@ -23,7 +23,7 @@ def test_basic_rebalance():
|
||||
num_nodes = 2
|
||||
num_gpus = 8
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
@ -77,7 +77,7 @@ def test_single_gpu_case():
|
||||
num_nodes = 1
|
||||
num_gpus = 1
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
@ -99,7 +99,7 @@ def test_equal_weights():
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
@ -122,7 +122,7 @@ def test_extreme_weight_imbalance():
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
@ -150,7 +150,7 @@ def test_multiple_layers():
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
@ -175,14 +175,14 @@ def test_parameter_validation():
|
||||
# Test non-divisible case - this should handle normally without throwing
|
||||
# errors because the function will fall back to global load balancing
|
||||
# strategy
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, 8, 3, 2, 4)
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(weight, 8, 3, 2, 4)
|
||||
assert phy2log.shape == (1, 8)
|
||||
assert logcnt.shape == (1, 4)
|
||||
|
||||
# Test cases that will actually cause errors:
|
||||
# num_physical_experts not divisible by num_gpus
|
||||
with pytest.raises(AssertionError):
|
||||
rebalance_experts(weight, 7, 2, 2, 4) # 7 not divisible by 4
|
||||
DefaultEplbPolicy.rebalance_experts(weight, 7, 2, 2, 4) # 7 not divisible by 4
|
||||
|
||||
|
||||
def test_small_scale_hierarchical():
|
||||
@ -197,7 +197,7 @@ def test_small_scale_hierarchical():
|
||||
num_nodes = 2 # 2 nodes
|
||||
num_gpus = 4 # 4 GPUs
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
@ -224,7 +224,7 @@ def test_global_load_balance_fallback():
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
@ -246,7 +246,7 @@ def test_device_compatibility(device):
|
||||
num_nodes = 1
|
||||
num_gpus = 2
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
@ -263,7 +263,9 @@ def test_additional_cases():
|
||||
weight1 = torch.tensor(
|
||||
[[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
|
||||
)
|
||||
phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8)
|
||||
phy2log1, log2phy1, logcnt1 = DefaultEplbPolicy.rebalance_experts(
|
||||
weight1, 24, 8, 4, 8
|
||||
)
|
||||
|
||||
assert phy2log1.shape == (1, 24)
|
||||
assert logcnt1.shape == (1, 16)
|
||||
@ -276,7 +278,9 @@ def test_additional_cases():
|
||||
[12, 25, 50, 100, 150, 200], # Increasing weights
|
||||
]
|
||||
)
|
||||
phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2)
|
||||
phy2log2, log2phy2, logcnt2 = DefaultEplbPolicy.rebalance_experts(
|
||||
weight2, 10, 3, 1, 2
|
||||
)
|
||||
|
||||
assert phy2log2.shape == (2, 10)
|
||||
assert logcnt2.shape == (2, 6)
|
||||
@ -300,7 +304,7 @@ if __name__ == "__main__":
|
||||
num_nodes = 2
|
||||
num_gpus = 8
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
phy2log, log2phy, logcnt = DefaultEplbPolicy.rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
print(phy2log)
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from openai_harmony import Role
|
||||
from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem
|
||||
from openai_harmony import Author, Message, Role, TextContent
|
||||
|
||||
from vllm.entrypoints.harmony_utils import (
|
||||
has_custom_tools,
|
||||
parse_input_to_harmony_message,
|
||||
parse_output_message,
|
||||
)
|
||||
|
||||
|
||||
@ -257,6 +259,191 @@ class TestParseInputToHarmonyMessage:
|
||||
assert messages[0].content[1].text == "actual text"
|
||||
|
||||
|
||||
class TestParseOutputMessage:
|
||||
"""Tests for parse_output_message function."""
|
||||
|
||||
def test_commentary_with_no_recipient_creates_reasoning(self):
|
||||
"""Test that commentary with recipient=None (preambles) creates reasoning items.
|
||||
|
||||
Per Harmony format, commentary channel can contain preambles to calling
|
||||
multiple functions - explanatory text with no recipient.
|
||||
"""
|
||||
message = Message.from_role_and_content(
|
||||
Role.ASSISTANT, "I will now search for the weather information."
|
||||
)
|
||||
message = message.with_channel("commentary")
|
||||
# recipient is None by default, representing a preamble
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 1
|
||||
assert isinstance(output_items[0], ResponseReasoningItem)
|
||||
assert output_items[0].type == "reasoning"
|
||||
assert (
|
||||
output_items[0].content[0].text
|
||||
== "I will now search for the weather information."
|
||||
)
|
||||
assert output_items[0].content[0].type == "reasoning_text"
|
||||
|
||||
def test_commentary_with_function_recipient_creates_function_call(self):
|
||||
"""Test commentary with recipient='functions.X' creates function calls."""
|
||||
message = Message.from_role_and_content(
|
||||
Role.ASSISTANT, '{"location": "San Francisco", "units": "celsius"}'
|
||||
)
|
||||
message = message.with_channel("commentary")
|
||||
message = message.with_recipient("functions.get_weather")
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 1
|
||||
assert isinstance(output_items[0], ResponseFunctionToolCall)
|
||||
assert output_items[0].type == "function_call"
|
||||
assert output_items[0].name == "get_weather"
|
||||
assert (
|
||||
output_items[0].arguments
|
||||
== '{"location": "San Francisco", "units": "celsius"}'
|
||||
)
|
||||
assert output_items[0].call_id.startswith("call_")
|
||||
assert output_items[0].id.startswith("fc_")
|
||||
|
||||
def test_commentary_with_python_recipient_creates_reasoning(self):
|
||||
"""Test that commentary with recipient='python' creates reasoning items."""
|
||||
message = Message.from_role_and_content(
|
||||
Role.ASSISTANT, "import numpy as np\nprint(np.array([1, 2, 3]))"
|
||||
)
|
||||
message = message.with_channel("commentary")
|
||||
message = message.with_recipient("python")
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 1
|
||||
assert isinstance(output_items[0], ResponseReasoningItem)
|
||||
assert output_items[0].type == "reasoning"
|
||||
assert (
|
||||
output_items[0].content[0].text
|
||||
== "import numpy as np\nprint(np.array([1, 2, 3]))"
|
||||
)
|
||||
|
||||
def test_commentary_with_browser_recipient_creates_reasoning(self):
|
||||
"""Test that commentary with recipient='browser' creates reasoning items."""
|
||||
message = Message.from_role_and_content(
|
||||
Role.ASSISTANT, "Navigating to the specified URL"
|
||||
)
|
||||
message = message.with_channel("commentary")
|
||||
message = message.with_recipient("browser")
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 1
|
||||
assert isinstance(output_items[0], ResponseReasoningItem)
|
||||
assert output_items[0].type == "reasoning"
|
||||
assert output_items[0].content[0].text == "Navigating to the specified URL"
|
||||
|
||||
def test_commentary_with_container_recipient_creates_reasoning(self):
|
||||
"""Test that commentary with recipient='container' creates reasoning items."""
|
||||
message = Message.from_role_and_content(
|
||||
Role.ASSISTANT, "Running command in container"
|
||||
)
|
||||
message = message.with_channel("commentary")
|
||||
message = message.with_recipient("container")
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 1
|
||||
assert isinstance(output_items[0], ResponseReasoningItem)
|
||||
assert output_items[0].type == "reasoning"
|
||||
assert output_items[0].content[0].text == "Running command in container"
|
||||
|
||||
def test_commentary_with_empty_content_and_no_recipient(self):
|
||||
"""Test edge case: empty commentary with recipient=None."""
|
||||
message = Message.from_role_and_content(Role.ASSISTANT, "")
|
||||
message = message.with_channel("commentary")
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 1
|
||||
assert isinstance(output_items[0], ResponseReasoningItem)
|
||||
assert output_items[0].content[0].text == ""
|
||||
|
||||
def test_commentary_with_multiple_contents_and_no_recipient(self):
|
||||
"""Test multiple content items in commentary with no recipient."""
|
||||
contents = [
|
||||
TextContent(text="Step 1: Analyze the request"),
|
||||
TextContent(text="Step 2: Prepare to call functions"),
|
||||
]
|
||||
message = Message.from_role_and_contents(Role.ASSISTANT, contents)
|
||||
message = message.with_channel("commentary")
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 2
|
||||
assert all(isinstance(item, ResponseReasoningItem) for item in output_items)
|
||||
assert output_items[0].content[0].text == "Step 1: Analyze the request"
|
||||
assert output_items[1].content[0].text == "Step 2: Prepare to call functions"
|
||||
|
||||
def test_commentary_with_multiple_function_calls(self):
|
||||
"""Test multiple function calls in commentary channel."""
|
||||
contents = [
|
||||
TextContent(text='{"location": "San Francisco"}'),
|
||||
TextContent(text='{"location": "New York"}'),
|
||||
]
|
||||
message = Message.from_role_and_contents(Role.ASSISTANT, contents)
|
||||
message = message.with_channel("commentary")
|
||||
message = message.with_recipient("functions.get_weather")
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 2
|
||||
assert all(isinstance(item, ResponseFunctionToolCall) for item in output_items)
|
||||
assert output_items[0].name == "get_weather"
|
||||
assert output_items[1].name == "get_weather"
|
||||
assert output_items[0].arguments == '{"location": "San Francisco"}'
|
||||
assert output_items[1].arguments == '{"location": "New York"}'
|
||||
|
||||
def test_commentary_with_unknown_recipient_raises_error(self):
|
||||
"""Test that commentary with unknown recipient raises ValueError."""
|
||||
message = Message.from_role_and_content(Role.ASSISTANT, "some content")
|
||||
message = message.with_channel("commentary")
|
||||
message = message.with_recipient("unknown_recipient")
|
||||
|
||||
try:
|
||||
parse_output_message(message)
|
||||
raise AssertionError("Expected ValueError to be raised")
|
||||
except ValueError as e:
|
||||
assert "Unknown recipient: unknown_recipient" in str(e)
|
||||
|
||||
def test_analysis_channel_creates_reasoning(self):
|
||||
"""Test that analysis channel creates reasoning items."""
|
||||
message = Message.from_role_and_content(
|
||||
Role.ASSISTANT, "Analyzing the problem step by step..."
|
||||
)
|
||||
message = message.with_channel("analysis")
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 1
|
||||
assert isinstance(output_items[0], ResponseReasoningItem)
|
||||
assert output_items[0].type == "reasoning"
|
||||
assert (
|
||||
output_items[0].content[0].text == "Analyzing the problem step by step..."
|
||||
)
|
||||
|
||||
def test_non_assistant_message_returns_empty(self):
|
||||
"""Test that non-assistant messages return empty list.
|
||||
|
||||
Per the implementation, tool messages to assistant (e.g., search results)
|
||||
are not included in final output to align with OpenAI behavior.
|
||||
"""
|
||||
message = Message.from_author_and_content(
|
||||
Author.new(Role.TOOL, "functions.get_weather"),
|
||||
"The weather is sunny, 72°F",
|
||||
)
|
||||
|
||||
output_items = parse_output_message(message)
|
||||
|
||||
assert len(output_items) == 0
|
||||
|
||||
|
||||
def test_has_custom_tools() -> None:
|
||||
assert not has_custom_tools(set())
|
||||
assert not has_custom_tools({"web_search_preview", "code_interpreter", "container"})
|
||||
|
||||
@ -113,12 +113,10 @@ def test_mrope(
|
||||
is_neox_style = True
|
||||
|
||||
max_position = config.max_position_embeddings
|
||||
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
|
||||
rotary_dim = int(head_dim * partial_rotary_factor)
|
||||
|
||||
mrope_helper_class = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=rotary_dim,
|
||||
rotary_dim=head_dim,
|
||||
max_position=max_position,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_parameters=config.rope_parameters,
|
||||
@ -184,12 +182,10 @@ def test_mrope_torch_compile_tracing(
|
||||
)
|
||||
is_neox_style = True
|
||||
max_position = config.max_position_embeddings
|
||||
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
|
||||
rotary_dim = int(head_dim * partial_rotary_factor)
|
||||
|
||||
mrope_helper_class = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=rotary_dim,
|
||||
rotary_dim=head_dim,
|
||||
max_position=max_position,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_parameters=config.rope_parameters,
|
||||
|
||||
@ -1,160 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm.config import KVTransferConfig
|
||||
from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import SimpleBuffer
|
||||
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
|
||||
|
||||
# TODO: the test depends on a lot of fields in the current implementation.
|
||||
# We should have standard interface instead direct field access
|
||||
|
||||
|
||||
def test_run(my_rank, buffer, device):
|
||||
# buffer should be empty in the beginning
|
||||
if my_rank == 0:
|
||||
assert buffer.buffer_size == 0
|
||||
assert len(buffer.buffer) == 0
|
||||
|
||||
print(f"My rank: {my_rank}, device: {device}")
|
||||
|
||||
# insert
|
||||
tokens = torch.tensor([1, 2, 3]).to(device)
|
||||
roi = tokens > 0
|
||||
if my_rank == 0:
|
||||
key = 2.0 * torch.ones([5, 6]).to(device)
|
||||
value = 3.0 * torch.ones([5, 6]).to(device)
|
||||
|
||||
placeholder = torch.tensor([1]).to(device)
|
||||
|
||||
buffer.insert(tokens, roi, key, value, placeholder)
|
||||
|
||||
torch.distributed.barrier()
|
||||
|
||||
# drop_select
|
||||
if my_rank == 1:
|
||||
tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi)
|
||||
assert torch.allclose(tokens, tok)
|
||||
assert torch.allclose(roi, roi_)
|
||||
assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device))
|
||||
assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device))
|
||||
torch.distributed.barrier()
|
||||
|
||||
if my_rank == 0:
|
||||
assert buffer.buffer_size == 0
|
||||
assert len(buffer.buffer) == 0
|
||||
|
||||
print(f"My rank: {my_rank}, Test run passed!")
|
||||
|
||||
|
||||
def stress_test(my_rank, buf, device):
|
||||
torch.distributed.barrier()
|
||||
torch.manual_seed(100)
|
||||
|
||||
reqs = [
|
||||
(
|
||||
torch.rand(100).to(device), # tokens
|
||||
torch.ones(100).bool().to(device), # roi
|
||||
torch.rand(100).to(device), # key
|
||||
torch.rand(100).to(device), # value
|
||||
torch.rand(100).to(device), # hidden
|
||||
)
|
||||
for i in tqdm(range(200))
|
||||
]
|
||||
|
||||
random.seed(my_rank)
|
||||
random.shuffle(reqs)
|
||||
|
||||
torch.distributed.barrier()
|
||||
|
||||
n = 0
|
||||
|
||||
# the buffer size can only store 100 reqs
|
||||
# so the sender will occasionally block to wait for the receiver.
|
||||
for req in tqdm(reqs):
|
||||
if my_rank == 0:
|
||||
buf.insert(*req)
|
||||
else:
|
||||
tok, roi, k, v, h = req
|
||||
tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi)
|
||||
|
||||
if tok_ is None:
|
||||
assert roi_ is None
|
||||
assert k_ is None
|
||||
assert v_ is None
|
||||
assert h_ is None
|
||||
n += 1
|
||||
else:
|
||||
assert torch.allclose(tok, tok_)
|
||||
assert torch.allclose(roi, roi_)
|
||||
assert torch.allclose(k, k_)
|
||||
assert torch.allclose(v, v_)
|
||||
assert torch.allclose(h, h_)
|
||||
print(f"Rank {my_rank} done")
|
||||
torch.distributed.barrier()
|
||||
|
||||
if my_rank == 0:
|
||||
x = torch.tensor([0])
|
||||
torch.distributed.recv(x, 1)
|
||||
# the # of None received is the kv that are not selected
|
||||
assert x.item() == len(buf.buffer)
|
||||
# and the size of the buffer should be 2000 * buffer len
|
||||
print(buf.buffer_size)
|
||||
assert buf.buffer_size == 1700 * len(buf.buffer)
|
||||
else:
|
||||
torch.distributed.send(torch.tensor([n]), 0)
|
||||
|
||||
print(f"My rank: {my_rank}, Passed stress test!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
my_rank = int(os.environ["RANK"])
|
||||
|
||||
torch.distributed.init_process_group(
|
||||
backend="gloo",
|
||||
init_method="tcp://localhost:12398",
|
||||
world_size=2,
|
||||
rank=my_rank,
|
||||
)
|
||||
|
||||
print(f"initialized! My rank is {my_rank}")
|
||||
|
||||
config = KVTransferConfig(
|
||||
kv_connector="P2pNcclConnector",
|
||||
kv_buffer_device="cuda",
|
||||
kv_buffer_size=1e9,
|
||||
kv_rank=my_rank,
|
||||
kv_role="kv_both", # this arg doesn't matter in this test
|
||||
kv_parallel_size=2,
|
||||
kv_ip="127.0.0.1",
|
||||
kv_port=12345,
|
||||
)
|
||||
|
||||
data_pipe = PyNcclPipe(
|
||||
local_rank=my_rank,
|
||||
config=config,
|
||||
device="cuda",
|
||||
port_offset=0,
|
||||
)
|
||||
cpu_pipe = PyNcclPipe(
|
||||
local_rank=my_rank,
|
||||
config=config,
|
||||
device="cpu",
|
||||
port_offset=1,
|
||||
)
|
||||
|
||||
buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000)
|
||||
|
||||
test_run(my_rank, buffer, data_pipe.device)
|
||||
|
||||
stress_test(my_rank, buffer, data_pipe.device)
|
||||
|
||||
buffer.close()
|
||||
data_pipe.close()
|
||||
cpu_pipe.close()
|
||||
print("Done")
|
||||
@ -1,8 +0,0 @@
|
||||
#!/bin/bash
|
||||
RANK=0 python3 test_lookup_buffer.py &
|
||||
PID0=$!
|
||||
RANK=1 python3 test_lookup_buffer.py &
|
||||
PID1=$!
|
||||
|
||||
wait $PID0
|
||||
wait $PID1
|
||||
@ -1,62 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
||||
def run_python_script(script_name, timeout):
|
||||
script_name = f"kv_transfer/{script_name}"
|
||||
try:
|
||||
# Start both processes asynchronously using Popen
|
||||
process0 = subprocess.Popen(
|
||||
[sys.executable, script_name],
|
||||
env={"RANK": "0"}, # Set the RANK environment variable for process 0
|
||||
stdout=sys.stdout, # Pipe stdout to current stdout
|
||||
stderr=sys.stderr, # Pipe stderr to current stderr
|
||||
)
|
||||
|
||||
process1 = subprocess.Popen(
|
||||
[sys.executable, script_name],
|
||||
env={"RANK": "1"}, # Set the RANK environment variable for process 1
|
||||
stdout=sys.stdout, # Pipe stdout to current stdout
|
||||
stderr=sys.stderr, # Pipe stderr to current stderr
|
||||
)
|
||||
|
||||
# Wait for both processes to complete, with a timeout
|
||||
process0.wait(timeout=timeout)
|
||||
process1.wait(timeout=timeout)
|
||||
|
||||
# Check the return status of both processes
|
||||
if process0.returncode != 0:
|
||||
pytest.fail(f"Test {script_name} failed for RANK=0, {process0.returncode}")
|
||||
if process1.returncode != 0:
|
||||
pytest.fail(f"Test {script_name} failed for RANK=1, {process1.returncode}")
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
# If either process times out, terminate both and fail the test
|
||||
process0.terminate()
|
||||
process1.terminate()
|
||||
pytest.fail(f"Test {script_name} timed out")
|
||||
except Exception as e:
|
||||
pytest.fail(f"Test {script_name} failed with error: {str(e)}")
|
||||
|
||||
|
||||
# Define the test cases using pytest's parametrize
|
||||
@pytest.mark.parametrize(
|
||||
"script_name,timeout",
|
||||
[
|
||||
("test_lookup_buffer.py", 60), # Second test case with a 60-second timeout
|
||||
("test_send_recv.py", 120), # First test case with a 120-second timeout
|
||||
],
|
||||
)
|
||||
def test_run_python_script(script_name, timeout):
|
||||
# Check the number of GPUs
|
||||
if torch.cuda.device_count() < 2:
|
||||
pytest.skip(f"Skipping test {script_name} because <2 GPUs are available")
|
||||
|
||||
# Run the test if there are at least 2 GPUs
|
||||
run_python_script(script_name, timeout)
|
||||
@ -1,154 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm.config import KVTransferConfig
|
||||
from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
|
||||
|
||||
|
||||
def test_run(my_rank, pipe):
|
||||
print(f"rank {my_rank} test_run starts....")
|
||||
# test run
|
||||
x = torch.tensor([1]).to(pipe.device)
|
||||
y = torch.tensor([[2.0, 3.0, 4.0, 8.0]]).to(pipe.device)
|
||||
if my_rank == 0:
|
||||
pipe.send_tensor(x)
|
||||
print(f"rank {my_rank} sent tensor x")
|
||||
pipe.send_tensor(y)
|
||||
print(f"rank {my_rank} sent tensor y")
|
||||
x2 = pipe.recv_tensor()
|
||||
print(f"rank {my_rank} received x2 = ", x2)
|
||||
y2 = pipe.recv_tensor()
|
||||
print(f"rank {my_rank} received y2 = ", y2)
|
||||
|
||||
else:
|
||||
x2 = pipe.recv_tensor()
|
||||
print(f"rank {my_rank} received x2 = ", x2)
|
||||
y2 = pipe.recv_tensor()
|
||||
print(f"rank {my_rank} received y2 = ", y2)
|
||||
pipe.send_tensor(x)
|
||||
print(f"rank {my_rank} sent tensor x")
|
||||
pipe.send_tensor(y)
|
||||
print(f"rank {my_rank} sent tensor y")
|
||||
|
||||
assert torch.allclose(x, x2)
|
||||
assert torch.allclose(y, y2)
|
||||
|
||||
print(f"rank {my_rank} test_run passed!")
|
||||
|
||||
|
||||
def stress_test(my_rank, pipe):
|
||||
print(f"rank {my_rank} stress_test starts....")
|
||||
|
||||
tensors: list[torch.Tensor] = []
|
||||
|
||||
torch.distributed.barrier()
|
||||
torch.manual_seed(0)
|
||||
|
||||
for i in tqdm(range(500)):
|
||||
mean = torch.rand(1).item() * 100
|
||||
std = torch.rand(1).item() * 100
|
||||
size = torch.randint(900, 1000, (2,))
|
||||
x = torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device)
|
||||
|
||||
# 5% probability of sending a None
|
||||
if torch.rand(1).item() < 0.05:
|
||||
tensors.append(None)
|
||||
tensors.append(None)
|
||||
tensors.append(None)
|
||||
else:
|
||||
tensors.append(x)
|
||||
tensors.append(x.mean().unsqueeze(0))
|
||||
tensors.append(x.std().unsqueeze(0))
|
||||
|
||||
torch.distributed.barrier()
|
||||
|
||||
for i in tqdm(range(500)):
|
||||
if my_rank == int((i % 10) > 3):
|
||||
pipe.send_tensor(tensors[3 * i])
|
||||
pipe.send_tensor(tensors[3 * i + 1])
|
||||
pipe.send_tensor(tensors[3 * i + 2])
|
||||
else:
|
||||
x = pipe.recv_tensor()
|
||||
mean = pipe.recv_tensor()
|
||||
std = pipe.recv_tensor()
|
||||
|
||||
if x is None:
|
||||
assert mean is None
|
||||
assert std is None
|
||||
else:
|
||||
assert torch.allclose(x, tensors[3 * i])
|
||||
assert x.mean() == mean[0]
|
||||
assert x.std() == std[0]
|
||||
|
||||
torch.distributed.barrier()
|
||||
|
||||
|
||||
def latency_test(my_rank, pipe, nelement, ntensor):
|
||||
latencies = []
|
||||
|
||||
torch.distributed.barrier()
|
||||
|
||||
for i in tqdm(range(500)):
|
||||
tensors = []
|
||||
|
||||
if my_rank == 0:
|
||||
# create tensor
|
||||
tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)]
|
||||
|
||||
torch.distributed.barrier()
|
||||
|
||||
if my_rank == 0:
|
||||
t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device)
|
||||
for tensor in tensors:
|
||||
pipe.send_tensor(tensor)
|
||||
pipe.send_tensor(t)
|
||||
else:
|
||||
for _ in range(ntensor):
|
||||
pipe.recv_tensor()
|
||||
t = pipe.recv_tensor()
|
||||
latencies.append(time.time() - t.item())
|
||||
|
||||
torch.distributed.barrier()
|
||||
|
||||
print("Latency test passed.")
|
||||
print("Latency:", torch.tensor(latencies).mean().item() * 1000, "ms")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
my_rank = int(os.environ["RANK"])
|
||||
|
||||
torch.distributed.init_process_group(
|
||||
backend="gloo",
|
||||
init_method="tcp://localhost:12398",
|
||||
world_size=2,
|
||||
rank=my_rank,
|
||||
)
|
||||
|
||||
config = KVTransferConfig(
|
||||
kv_connector="P2pNcclConnector",
|
||||
kv_buffer_device="cuda",
|
||||
kv_buffer_size=1e9,
|
||||
kv_rank=my_rank,
|
||||
kv_role="kv_both", # this arg doesn't matter in this test
|
||||
kv_parallel_size=2,
|
||||
kv_ip="127.0.0.1",
|
||||
kv_port=12345,
|
||||
)
|
||||
|
||||
pipe = PyNcclPipe(
|
||||
local_rank=my_rank,
|
||||
config=config,
|
||||
)
|
||||
|
||||
test_run(my_rank, pipe)
|
||||
|
||||
stress_test(my_rank, pipe)
|
||||
|
||||
# Use this function if you want to test the latency of pipe impl.
|
||||
# latency_test(my_rank, pipe, 1024 * 8 * 128, 80)
|
||||
@ -1,9 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
RANK=0 python3 test_send_recv.py &
|
||||
PID0=$!
|
||||
RANK=1 python3 test_send_recv.py &
|
||||
PID1=$!
|
||||
|
||||
wait $PID0
|
||||
wait $PID1
|
||||
@ -20,7 +20,7 @@ from vllm.config.multimodal import (
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
|
||||
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
|
||||
from vllm.multimodal.inputs import MultiModalInputs
|
||||
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.tokenizers import (
|
||||
MistralTokenizer,
|
||||
@ -418,4 +418,4 @@ def _assert_inputs_equal(
|
||||
a_data.pop(key, None)
|
||||
b_data.pop(key, None)
|
||||
|
||||
assert a_data == b_data, msg
|
||||
assert batched_tensors_equal(a_data, b_data), msg
|
||||
|
||||
@ -5,6 +5,7 @@ import pytest
|
||||
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import batched_tensors_equal
|
||||
from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
|
||||
|
||||
from ...utils import build_model_context
|
||||
@ -103,7 +104,7 @@ def test_video_loader_consistency(
|
||||
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
|
||||
|
||||
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
|
||||
assert (
|
||||
static_outputs["mm_kwargs"].get_data()
|
||||
== dynamic_outputs["mm_kwargs"].get_data()
|
||||
assert batched_tensors_equal(
|
||||
static_outputs["mm_kwargs"].get_data(),
|
||||
dynamic_outputs["mm_kwargs"].get_data(),
|
||||
)
|
||||
|
||||
@ -130,10 +130,9 @@ def create_batched_mm_kwargs(
|
||||
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=processor_inputs.tokenization_kwargs,
|
||||
)["mm_kwargs"].require_data()
|
||||
items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
|
||||
|
||||
return group_mm_kwargs_by_modality(
|
||||
items,
|
||||
merge_by_field_config=model_cls.merge_by_field_config,
|
||||
[item for modality in supported_mm_limits for item in mm_kwargs[modality]]
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -416,7 +416,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"Qwen2ForCausalLM": _HfExamplesInfo(
|
||||
"Qwen/Qwen2-0.5B-Instruct", extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}
|
||||
"Qwen/Qwen2-0.5B-Instruct",
|
||||
extras={
|
||||
"2.5": "Qwen/Qwen2.5-0.5B-Instruct",
|
||||
"2.5-1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
|
||||
},
|
||||
),
|
||||
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
|
||||
"Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
|
||||
|
||||
@ -85,12 +85,6 @@ def _dummy_items(
|
||||
(_dummy_item("a", {"a1": 100}), 100),
|
||||
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
|
||||
(_dummy_items({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
|
||||
(
|
||||
_dummy_items(
|
||||
{"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}
|
||||
).get_data(),
|
||||
460,
|
||||
), # noqa: E501
|
||||
],
|
||||
)
|
||||
def test_cache_item_size(item, expected_size):
|
||||
@ -107,6 +101,9 @@ def test_cache_item_size(item, expected_size):
|
||||
cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update])
|
||||
assert cache.currsize == expected_size
|
||||
|
||||
cache[""] = item.get_data()
|
||||
assert cache.currsize == expected_size
|
||||
|
||||
|
||||
def _create_vllm_config(
|
||||
*,
|
||||
|
||||
@ -1,91 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def assert_nested_tensors_equal(expected: NestedTensors, actual: NestedTensors):
|
||||
assert type(expected) == type(actual) # noqa: E721
|
||||
if isinstance(expected, torch.Tensor):
|
||||
assert torch.equal(expected, actual)
|
||||
else:
|
||||
for expected_item, actual_item in zip(expected, actual):
|
||||
assert_nested_tensors_equal(expected_item, actual_item)
|
||||
|
||||
|
||||
def assert_multimodal_inputs_equal(
|
||||
expected: MultiModalKwargs, actual: MultiModalKwargs
|
||||
):
|
||||
assert set(expected.keys()) == set(actual.keys())
|
||||
for key in expected:
|
||||
assert_nested_tensors_equal(expected[key], actual[key])
|
||||
|
||||
|
||||
def test_multimodal_input_batch_single_tensor():
|
||||
t = torch.rand([1, 2])
|
||||
result = MultiModalKwargs.batch([{"image": t}])
|
||||
assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)})
|
||||
|
||||
|
||||
def test_multimodal_input_batch_multiple_tensors():
|
||||
a = torch.rand([1, 1, 2])
|
||||
b = torch.rand([1, 1, 2])
|
||||
c = torch.rand([1, 1, 2])
|
||||
result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
|
||||
assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])})
|
||||
|
||||
|
||||
def test_multimodal_input_batch_multiple_heterogeneous_tensors():
|
||||
a = torch.rand([1, 2, 2])
|
||||
b = torch.rand([1, 3, 2])
|
||||
c = torch.rand([1, 4, 2])
|
||||
result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
|
||||
assert_multimodal_inputs_equal(result, {"image": [a, b, c]})
|
||||
|
||||
|
||||
def test_multimodal_input_batch_nested_tensors():
|
||||
a = torch.rand([2, 3])
|
||||
b = torch.rand([2, 3])
|
||||
c = torch.rand([2, 3])
|
||||
result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b]}, {"image": [c]}])
|
||||
assert_multimodal_inputs_equal(
|
||||
result, {"image": torch.stack([a.unsqueeze(0), b.unsqueeze(0), c.unsqueeze(0)])}
|
||||
)
|
||||
|
||||
|
||||
def test_multimodal_input_batch_heterogeneous_lists():
|
||||
a = torch.rand([1, 2, 3])
|
||||
b = torch.rand([1, 2, 3])
|
||||
c = torch.rand([1, 2, 3])
|
||||
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
|
||||
assert_multimodal_inputs_equal(
|
||||
result, {"image": [torch.stack([a, b]), c.unsqueeze(0)]}
|
||||
)
|
||||
|
||||
|
||||
def test_multimodal_input_batch_multiple_batchable_lists():
|
||||
a = torch.rand([1, 2, 3])
|
||||
b = torch.rand([1, 2, 3])
|
||||
c = torch.rand([1, 2, 3])
|
||||
d = torch.rand([1, 2, 3])
|
||||
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
|
||||
assert_multimodal_inputs_equal(
|
||||
result, {"image": torch.stack([torch.stack([a, b]), torch.stack([c, d])])}
|
||||
)
|
||||
|
||||
|
||||
def test_multimodal_input_batch_mixed_stacking_depths():
|
||||
a = torch.rand([1, 2, 3])
|
||||
b = torch.rand([1, 3, 3])
|
||||
c = torch.rand([1, 4, 3])
|
||||
|
||||
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
|
||||
assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]})
|
||||
|
||||
result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}])
|
||||
assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]})
|
||||
188
tests/reasoning/test_holo2_reasoning_parser.py
Normal file
188
tests/reasoning/test_holo2_reasoning_parser.py
Normal file
@ -0,0 +1,188 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.reasoning.utils import run_reasoning_extraction
|
||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
|
||||
from vllm.reasoning.holo2_reasoning_parser import Holo2ReasoningParser
|
||||
from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
|
||||
|
||||
REASONING_MODEL_NAME = "HCompany/Holo2-4B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def tokenizer():
|
||||
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"thinking,expected_parser_type",
|
||||
[
|
||||
(True, DeepSeekR1ReasoningParser),
|
||||
(False, IdentityReasoningParser),
|
||||
],
|
||||
)
|
||||
def test_parser_selection(tokenizer, thinking, expected_parser_type):
|
||||
parser = Holo2ReasoningParser(
|
||||
tokenizer,
|
||||
chat_template_kwargs={
|
||||
"thinking": thinking,
|
||||
},
|
||||
)
|
||||
|
||||
assert isinstance(parser._parser, expected_parser_type)
|
||||
|
||||
|
||||
def test_holo2_default_parser_is_deepseekr1(tokenizer):
|
||||
parser = Holo2ReasoningParser(tokenizer)
|
||||
|
||||
assert isinstance(parser._parser, DeepSeekR1ReasoningParser)
|
||||
|
||||
|
||||
def test_holo2_supports_structured_output(tokenizer):
|
||||
# Structured output manager uses the reasoning parser to check if the
|
||||
# reasoning content is ended before applying the grammar. The main function
|
||||
# used is is_reasoning_end. This test checks if the parser is able to
|
||||
# correctly identify the end of the reasoning content.
|
||||
|
||||
# important to not pass chat_template_kwargs here as it is done in the
|
||||
# StructuredOutputManager
|
||||
parser = Holo2ReasoningParser(tokenizer)
|
||||
|
||||
end_token_id = tokenizer.encode("</think>", add_special_tokens=False)[0]
|
||||
|
||||
assert parser.is_reasoning_end([1, 2, 4, end_token_id])
|
||||
assert not parser.is_reasoning_end([1, 2, 4])
|
||||
assert parser.is_reasoning_end([1, 2, 4, end_token_id, 5])
|
||||
|
||||
|
||||
# thinking is True, non-streaming
|
||||
WITH_THINK = {
|
||||
"output": "This is a reasoning section</think>This is the rest",
|
||||
"reasoning": "This is a reasoning section",
|
||||
"content": "This is the rest",
|
||||
}
|
||||
# thinking is True, streaming
|
||||
WITH_THINK_STREAM = {
|
||||
"output": "This is a reasoning section</think>This is the rest",
|
||||
"reasoning": "This is a reasoning section",
|
||||
"content": "This is the rest",
|
||||
}
|
||||
# thinking is False, non-streaming
|
||||
THINKING_DISABLED = {
|
||||
"output": "This is the rest",
|
||||
"reasoning": None,
|
||||
"content": "This is the rest",
|
||||
}
|
||||
# thinking is False, streaming
|
||||
THINKING_DISABLED_STREAM = {
|
||||
"output": "This is the rest",
|
||||
"reasoning": None,
|
||||
"content": "This is the rest",
|
||||
}
|
||||
# thinking is False but the model output </think>, non-streaming
|
||||
THINKING_DISABLED_WITH_CLOSE_TAG = {
|
||||
"output": "</think>This is the rest",
|
||||
"reasoning": None,
|
||||
"content": "</think>This is the rest",
|
||||
}
|
||||
# thinking is False but the model output </think>, streaming
|
||||
THINKING_DISABLED_WITH_CLOSE_TAG_STREAM = {
|
||||
"output": "some text</think>This is the rest",
|
||||
"reasoning": None,
|
||||
"content": "some text</think>This is the rest",
|
||||
}
|
||||
COMPLETE_REASONING = {
|
||||
"output": "This is a reasoning section</think>",
|
||||
"reasoning": "This is a reasoning section",
|
||||
"content": None,
|
||||
}
|
||||
|
||||
TEST_CASES = [
|
||||
pytest.param(
|
||||
False,
|
||||
WITH_THINK,
|
||||
None,
|
||||
id="with_think",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
WITH_THINK_STREAM,
|
||||
None,
|
||||
id="with_think_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
WITH_THINK,
|
||||
{"thinking": True},
|
||||
id="with_think_enabled",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
WITH_THINK_STREAM,
|
||||
{"thinking": True},
|
||||
id="with_think_stream_enabled",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
THINKING_DISABLED,
|
||||
{"thinking": False},
|
||||
id="thinking_disabled",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
THINKING_DISABLED_STREAM,
|
||||
{"thinking": False},
|
||||
id="thinking_disabled_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
THINKING_DISABLED_WITH_CLOSE_TAG,
|
||||
{"thinking": False},
|
||||
id="thinking_disabled_with_close_tag",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
THINKING_DISABLED_WITH_CLOSE_TAG_STREAM,
|
||||
{"thinking": False},
|
||||
id="thinking_disabled_with_close_tag_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
COMPLETE_REASONING,
|
||||
None,
|
||||
id="complete_reasoning",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
COMPLETE_REASONING,
|
||||
None,
|
||||
id="complete_reasoning_stream",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, param_dict, chat_template_kwargs", TEST_CASES)
|
||||
def test_reasoning(
|
||||
streaming: bool,
|
||||
param_dict: dict,
|
||||
chat_template_kwargs: dict | None,
|
||||
tokenizer,
|
||||
):
|
||||
output = tokenizer.tokenize(param_dict["output"])
|
||||
output_tokens: list[str] = [
|
||||
tokenizer.convert_tokens_to_string([token]) for token in output
|
||||
]
|
||||
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser("holo2")(
|
||||
tokenizer,
|
||||
chat_template_kwargs=chat_template_kwargs,
|
||||
)
|
||||
|
||||
reasoning, content = run_reasoning_extraction(
|
||||
parser, output_tokens, streaming=streaming
|
||||
)
|
||||
|
||||
assert reasoning == param_dict["reasoning"]
|
||||
assert content == param_dict["content"]
|
||||
@ -460,23 +460,20 @@ def test_flat_product():
|
||||
]
|
||||
|
||||
|
||||
def test_o_legacy_syntax_deprecation(caplog_vllm):
|
||||
"""Test that -O.* dotted syntax emits warnings and converts correctly to -cc syntax."""
|
||||
def test_o_dotted_syntax_error():
|
||||
"""Test that -O.* dotted syntax raises a clear error message."""
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("-cc", "--compilation-config", type=json.loads)
|
||||
|
||||
# Test that -O.backend gets converted correctly AND emits warning
|
||||
args = parser.parse_args(["-O.backend=eager"])
|
||||
assert args.compilation_config == {"backend": "eager"}
|
||||
# Test that -O.* syntax raises a clear ValueError
|
||||
with pytest.raises(ValueError, match=r"The -O\.\* syntax is no longer supported"):
|
||||
parser.parse_args(["-O.backend=eager"])
|
||||
|
||||
# Check that deprecation warning was logged
|
||||
assert len(caplog_vllm.records) >= 1
|
||||
assert (
|
||||
"The -O.* dotted syntax for --compilation-config is deprecated"
|
||||
in caplog_vllm.text
|
||||
)
|
||||
with pytest.raises(ValueError, match=r"Please use -cc\.\* instead"):
|
||||
parser.parse_args(["-O.mode=2"])
|
||||
|
||||
# Test that -O.mode gets converted correctly
|
||||
# Note: warning_once won't emit again in same session
|
||||
args = parser.parse_args(["-O.mode=2"])
|
||||
assert args.compilation_config == {"mode": 2}
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=r"replace '-O\.cudagraph_mode=NONE' with '-cc\.cudagraph_mode=NONE'",
|
||||
):
|
||||
parser.parse_args(["-O.cudagraph_mode=NONE"])
|
||||
|
||||
@ -13,7 +13,7 @@ from vllm.v1.attention.backends.utils import (
|
||||
split_attn_metadata,
|
||||
split_decodes_and_prefills,
|
||||
)
|
||||
from vllm.v1.worker.ubatch_utils import create_ubatch_slices
|
||||
from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -294,8 +294,14 @@ def test_prefill_split_across_ubatches(
|
||||
qsl_np = common.query_start_loc_cpu.numpy()
|
||||
num_tokens = common.num_actual_tokens
|
||||
|
||||
ubatch_slices = create_ubatch_slices(num_scheduled_tokens, split_point)
|
||||
assert len(ubatch_slices) == 2
|
||||
ubatch_slices, _ = maybe_create_ubatch_slices(
|
||||
True,
|
||||
num_scheduled_tokens,
|
||||
num_tokens,
|
||||
batch_spec.batch_size,
|
||||
split_point=split_point,
|
||||
)
|
||||
assert ubatch_slices is not None and len(ubatch_slices) == 2
|
||||
|
||||
first_meta = _make_metadata_with_slice(ubatch_slices[0], common)
|
||||
second_meta = _make_metadata_with_slice(ubatch_slices[1], common)
|
||||
|
||||
@ -21,6 +21,7 @@ def test_reset_prefix_cache_e2e(monkeypatch):
|
||||
max_num_batched_tokens=32,
|
||||
max_model_len=2048,
|
||||
compilation_config={"mode": 0},
|
||||
dtype="float16",
|
||||
)
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
sampling_params = SamplingParams(
|
||||
|
||||
@ -9,10 +9,22 @@ correctly with the DeepSeek-V2-Lite model using GSM8K evaluation.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
# Detect Blackwell / B200 (compute capability 10.x)
|
||||
try:
|
||||
if torch.cuda.is_available():
|
||||
cap = torch.cuda.get_device_capability(0)
|
||||
IS_BLACKWELL = cap[0] >= 10
|
||||
else:
|
||||
IS_BLACKWELL = False
|
||||
except Exception:
|
||||
# Be conservative: if we can't detect, don't xfail by default
|
||||
IS_BLACKWELL = False
|
||||
|
||||
MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
DP_SIZE = 2
|
||||
|
||||
@ -33,6 +45,13 @@ DEEPEP_BACKENDS = [
|
||||
|
||||
|
||||
@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS)
|
||||
@pytest.mark.xfail(
|
||||
IS_BLACKWELL,
|
||||
reason=(
|
||||
"Temporary: DBO accuracy unstable on Blackwell "
|
||||
"(doesn't meet expectation of MIN_ACCURACY = 0.62)"
|
||||
),
|
||||
)
|
||||
def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
|
||||
"""
|
||||
Test DBO with DP+EP using GSM8K evaluation.
|
||||
|
||||
@ -1,9 +1,14 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import time
|
||||
from concurrent.futures import Future
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.config import StructuredOutputsConfig, VllmConfig
|
||||
from vllm.config.model import ModelConfig
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
from vllm.config.speculative import SpeculativeConfig
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
from vllm.v1.request import Request
|
||||
@ -116,3 +121,72 @@ def test_grammar_bitmask_with_specdec():
|
||||
) # EOS not the final token
|
||||
grammar_bitmask(request, prompt[i:]) # EOS not present
|
||||
grammar_bitmask(request, prompt[i:] + [tokenizer.eos_token_id])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("async_grammar", [True, False])
|
||||
def test_grammar_init_async_and_sync(async_grammar):
|
||||
"""Test grammar initialization works correctly in both async and sync modes.
|
||||
|
||||
This test validates that the distributed_executor_backend config option
|
||||
correctly controls whether grammar compilation happens asynchronously
|
||||
(via executor.submit) or synchronously. When set to "external_launcher",
|
||||
grammar compilation is synchronous to avoid deadlocks.
|
||||
"""
|
||||
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
|
||||
prompt = tokenizer.encode('{"a": "b"}')
|
||||
|
||||
# Use "external_launcher" for sync mode, None for async mode
|
||||
executor_backend = None if async_grammar else "external_launcher"
|
||||
vllm_config = VllmConfig(
|
||||
model_config=ModelConfig(tokenizer=TOKENIZER),
|
||||
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
|
||||
parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
|
||||
)
|
||||
structured_output_manager = StructuredOutputManager(vllm_config)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json='{"type": "object"}',
|
||||
),
|
||||
)
|
||||
sampling_params.structured_outputs._backend = "guidance"
|
||||
|
||||
request = Request(
|
||||
"test_request",
|
||||
prompt_token_ids=prompt,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=None,
|
||||
eos_token_id=tokenizer.eos_token_id,
|
||||
)
|
||||
|
||||
structured_output_manager.grammar_init(request)
|
||||
|
||||
# Check the internal _grammar type immediately after init
|
||||
# Before _check_grammar_completion is called, async mode should have a Future
|
||||
raw_grammar = request.structured_output_request._grammar
|
||||
if async_grammar:
|
||||
assert isinstance(raw_grammar, Future), (
|
||||
"Async mode should store a Future before completion"
|
||||
)
|
||||
else:
|
||||
assert not isinstance(raw_grammar, Future), (
|
||||
"Sync mode should store the grammar directly, not a Future"
|
||||
)
|
||||
|
||||
# Wait for grammar to be ready (handles both async and sync cases)
|
||||
start_time = time.time()
|
||||
while not request.structured_output_request._check_grammar_completion():
|
||||
if time.time() - start_time > 5: # 5-second timeout
|
||||
pytest.fail("Grammar compilation timed out")
|
||||
time.sleep(0.01)
|
||||
|
||||
# After completion, _grammar should no longer be a Future
|
||||
assert not isinstance(request.structured_output_request._grammar, Future)
|
||||
|
||||
# Verify grammar is properly initialized and functional
|
||||
grammar = request.structured_output_request.grammar
|
||||
assert grammar is not None
|
||||
assert not grammar.is_terminated()
|
||||
|
||||
# Verify the grammar can accept valid tokens
|
||||
assert grammar.accept_tokens(request.request_id, prompt)
|
||||
|
||||
@ -1842,6 +1842,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
||||
random_seed=args.seed,
|
||||
dataset_path=args.dataset_path,
|
||||
disable_shuffle=args.disable_shuffle,
|
||||
prefix_len=args.common_prefix_len,
|
||||
).sample(
|
||||
tokenizer=tokenizer,
|
||||
num_requests=args.num_prompts,
|
||||
|
||||
@ -1221,6 +1221,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
help="Repetition penalty sampling parameter. Only has effect on "
|
||||
"openai-compatible backends.",
|
||||
)
|
||||
sampling_group.add_argument(
|
||||
"--common-prefix-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Common prefix length shared by all prompts (used by random dataset)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--tokenizer-mode",
|
||||
|
||||
@ -402,6 +402,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
|
||||
self.extra_traceback = False
|
||||
|
||||
def run(self, *args):
|
||||
# maybe instead just assert inputs are fake?
|
||||
fake_args = [
|
||||
self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
|
||||
for t in args
|
||||
@ -416,11 +417,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
|
||||
kwargs: dict[str, Any],
|
||||
) -> Any:
|
||||
assert isinstance(target, str)
|
||||
|
||||
output = super().call_module(target, args, kwargs)
|
||||
|
||||
if target in self.compile_submod_names:
|
||||
index = self.compile_submod_names.index(target)
|
||||
submod = self.fetch_attr(target)
|
||||
|
||||
sym_shape_indices = [
|
||||
i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
|
||||
]
|
||||
@ -746,11 +749,21 @@ class VllmBackend:
|
||||
if not item.is_splitting_graph
|
||||
]
|
||||
|
||||
# Extract fake values from the graph to use them when needed.
|
||||
all_fake_values = []
|
||||
for i in graph.graph.find_nodes(op="placeholder"):
|
||||
all_fake_values.append(i.meta["example_value"])
|
||||
|
||||
fake_args = [
|
||||
all_fake_values[i] if isinstance(t, torch.Tensor) else t
|
||||
for i, t in enumerate(example_inputs)
|
||||
]
|
||||
|
||||
# propagate the split graph to the piecewise backend,
|
||||
# compile submodules with symbolic shapes
|
||||
PiecewiseCompileInterpreter(
|
||||
self.split_gm, submod_names_to_compile, self.vllm_config, self
|
||||
).run(*example_inputs)
|
||||
).run(*fake_args)
|
||||
|
||||
graph_path = os.path.join(local_cache_dir, "computation_graph.py")
|
||||
if not os.path.exists(graph_path):
|
||||
@ -780,14 +793,7 @@ class VllmBackend:
|
||||
)
|
||||
|
||||
# if we need to copy input buffers for cudagraph
|
||||
from torch._guards import detect_fake_mode
|
||||
|
||||
fake_mode = detect_fake_mode()
|
||||
fake_args = [
|
||||
fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
|
||||
for t in example_inputs
|
||||
]
|
||||
|
||||
#
|
||||
# index of tensors that have symbolic shapes (batch size)
|
||||
# for weights and static buffers, they will have concrete shapes.
|
||||
# symbolic shape only happens for input tensors.
|
||||
|
||||
@ -433,7 +433,6 @@ def _support_torch_compile(
|
||||
return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)
|
||||
|
||||
# This is the path for the first compilation.
|
||||
|
||||
# the first compilation needs to have dynamic shapes marked
|
||||
_mark_dynamic_inputs(
|
||||
self,
|
||||
|
||||
@ -5,6 +5,7 @@ from collections.abc import Iterable
|
||||
|
||||
import torch.fx
|
||||
from torch import SymInt
|
||||
from torch.fx.experimental.symbolic_shapes import statically_known_true
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
@ -116,12 +117,7 @@ class NoOpEliminationPass(VllmInductorPass):
|
||||
2. The dimensions both correspond to the same SymInt
|
||||
"""
|
||||
# Case 1
|
||||
if isinstance(i_dim, int) and isinstance(dim, int):
|
||||
return dim == i_dim
|
||||
# Case 2
|
||||
if isinstance(i_dim, SymInt) and isinstance(dim, SymInt):
|
||||
return dim == i_dim
|
||||
return False
|
||||
return statically_known_true(dim == i_dim)
|
||||
|
||||
def all_dims_equivalent(
|
||||
self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt]
|
||||
|
||||
@ -14,6 +14,7 @@ import torch._C._dynamo.guards
|
||||
import vllm.envs as envs
|
||||
from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.nvtx_pytorch_hooks import layerwise_nvtx_marker_context
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -92,12 +93,29 @@ class TorchCompileWithNoGuardsWrapper:
|
||||
|
||||
return self.forward(*args, **kwargs)
|
||||
|
||||
def _call_with_optional_nvtx_range(self, callable_fn, *args, **kwargs):
|
||||
if self.layerwise_nvtx_tracing_enabled:
|
||||
args_list = list(args)
|
||||
kwargs_dict = dict(kwargs)
|
||||
with layerwise_nvtx_marker_context(
|
||||
"Torch Compiled Module (input):{}".format(self.__class__.__name__),
|
||||
self,
|
||||
in_tensor=args_list,
|
||||
kwargs=kwargs_dict,
|
||||
) as ctx:
|
||||
ctx.result = callable_fn(*args, **kwargs)
|
||||
return ctx.result
|
||||
return callable_fn(*args, **kwargs)
|
||||
|
||||
def __init__(self):
|
||||
self.compiled = False
|
||||
|
||||
vllm_config = get_current_vllm_config()
|
||||
self.vllm_config = vllm_config
|
||||
mode = vllm_config.compilation_config.mode
|
||||
self.layerwise_nvtx_tracing_enabled = (
|
||||
vllm_config.observability_config.enable_layerwise_nvtx_tracing
|
||||
)
|
||||
if mode is None:
|
||||
raise RuntimeError("Compilation mode cannot be NO_COMPILATION")
|
||||
|
||||
@ -168,13 +186,19 @@ class TorchCompileWithNoGuardsWrapper:
|
||||
# Make sure a compilation is triggered by clearing dynamo
|
||||
# cache.
|
||||
torch._dynamo.eval_frame.remove_from_cache(self.original_code_object())
|
||||
return self._compiled_callable(*args, **kwargs)
|
||||
return self._call_with_optional_nvtx_range(
|
||||
self._compiled_callable, *args, **kwargs
|
||||
)
|
||||
else:
|
||||
with self._dispatch_to_compiled_code():
|
||||
return self.forward(*args, **kwargs)
|
||||
return self._call_with_optional_nvtx_range(
|
||||
self.forward, *args, **kwargs
|
||||
)
|
||||
else:
|
||||
with _compilation_context():
|
||||
return self._compiled_callable(*args, **kwargs)
|
||||
return self._call_with_optional_nvtx_range(
|
||||
self._compiled_callable, *args, **kwargs
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def forward(self, *args, **kwargs): ...
|
||||
|
||||
@ -29,7 +29,7 @@ CacheDType = Literal[
|
||||
"fp8_inc",
|
||||
"fp8_ds_mla",
|
||||
]
|
||||
MambaDType = Literal["auto", "float32"]
|
||||
MambaDType = Literal["auto", "float32", "float16"]
|
||||
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor", "xxhash", "xxhash_cbor"]
|
||||
KVOffloadingBackend = Literal["native", "lmcache"]
|
||||
|
||||
|
||||
@ -59,6 +59,11 @@ class ObservabilityConfig:
|
||||
"""Enable CUDA graph metrics (number of padded/unpadded tokens, runtime cudagraph
|
||||
dispatch modes, and their observed frequencies at every logging interval)."""
|
||||
|
||||
enable_layerwise_nvtx_tracing: bool = False
|
||||
"""Enable layerwise NVTX tracing. This traces the execution of each layer or
|
||||
module in the model and attach informations such as input/output shapes to
|
||||
nvtx range markers. Noted that this doesn't work with CUDA graphs enabled."""
|
||||
|
||||
@cached_property
|
||||
def collect_model_forward_time(self) -> bool:
|
||||
"""Whether to collect model forward time for the request."""
|
||||
|
||||
@ -35,6 +35,7 @@ logger = init_logger(__name__)
|
||||
ExpertPlacementStrategy = Literal["linear", "round_robin"]
|
||||
DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
|
||||
DataParallelBackend = Literal["ray", "mp"]
|
||||
EPLBPolicyOption = Literal["default"]
|
||||
|
||||
|
||||
@config
|
||||
@ -65,6 +66,9 @@ class EPLBConfig:
|
||||
Whether to use non-blocking EPLB.
|
||||
"""
|
||||
|
||||
policy: EPLBPolicyOption = "default"
|
||||
"""The policy type for expert parallel load balancing (EPLB)."""
|
||||
|
||||
|
||||
@config
|
||||
@dataclass
|
||||
|
||||
@ -671,36 +671,22 @@ class VllmConfig:
|
||||
|
||||
if current_platform.support_static_graph_mode():
|
||||
# if cudagraph_mode has full cudagraphs, we need to check support
|
||||
if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
|
||||
# decode context parallel does not support full cudagraphs
|
||||
if self.parallel_config.decode_context_parallel_size > 1:
|
||||
if (
|
||||
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||
and self.model_config is not None
|
||||
):
|
||||
if self.model_config.pooler_config is not None:
|
||||
logger.warning_once(
|
||||
"Decode context parallel (DCP) is enabled, which is "
|
||||
"incompatible with full CUDA graphs. "
|
||||
"Pooling models do not support full cudagraphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
# prefill context parallel do not support full cudagraphs
|
||||
elif self.parallel_config.prefill_context_parallel_size > 1:
|
||||
elif self.model_config.is_encoder_decoder:
|
||||
logger.warning_once(
|
||||
"Prefill context parallel (PCP) is enabled, which is "
|
||||
"incompatible with full CUDA graphs. "
|
||||
"Encoder-decoder models do not support full cudagraphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
elif self.model_config is not None:
|
||||
if self.model_config.pooler_config is not None:
|
||||
logger.warning_once(
|
||||
"Pooling models do not support full cudagraphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
elif self.model_config.is_encoder_decoder:
|
||||
logger.warning_once(
|
||||
"Encoder-decoder models do not support full cudagraphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
|
||||
# disable cudagraph when enforce eager execution
|
||||
if self.model_config is not None and self.model_config.enforce_eager:
|
||||
|
||||
@ -1,8 +1,3 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Expert parallelism load balancer (EPLB).
|
||||
"""
|
||||
|
||||
from .eplb_state import *
|
||||
from .rebalance_algo import *
|
||||
"""Expert parallelism load balancer (EPLB)."""
|
||||
|
||||
@ -45,7 +45,7 @@ from vllm.logger import init_logger
|
||||
from vllm.model_executor.models.interfaces import MixtureOfExperts
|
||||
|
||||
from .async_worker import start_async_worker
|
||||
from .rebalance_algo import rebalance_experts
|
||||
from .policy import EPLB_POLICIES, AbstractEplbPolicy, DefaultEplbPolicy
|
||||
from .rebalance_execute import move_from_buffer, rearrange_expert_weights_inplace
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -213,18 +213,23 @@ class EplbState:
|
||||
self.parallel_config = parallel_config
|
||||
self.device = device
|
||||
self.model_states: dict[str, EplbModelState] = {}
|
||||
self.policy: type[AbstractEplbPolicy] = DefaultEplbPolicy
|
||||
"""
|
||||
Selected EPLB algorithm class
|
||||
"""
|
||||
self.expert_load_window_step: int = 0
|
||||
"""
|
||||
Current step in the sliding window.
|
||||
|
||||
Different from `expert_rearrangement_step`,
|
||||
each EP rank may have its own `expert_load_window_step`.
|
||||
"""
|
||||
self.expert_load_window_step: int = 0
|
||||
self.expert_load_window_size: int = 0
|
||||
"""
|
||||
Size of the expert load sliding window.
|
||||
This is a constant and is taken from the config.
|
||||
"""
|
||||
self.expert_load_window_size: int = 0
|
||||
self.expert_rearrangement_step: int = 0
|
||||
"""
|
||||
Steps after last rearrangement.
|
||||
Will trigger a rearrangement if it exceeds the threshold.
|
||||
@ -415,6 +420,10 @@ class EplbState:
|
||||
)
|
||||
self.expert_rearrangement_step_interval = eplb_step_interval
|
||||
|
||||
# Set the policy based on the selected eplb algorithm type.
|
||||
policy_type = self.parallel_config.eplb_config.policy
|
||||
self.policy = EPLB_POLICIES[policy_type]
|
||||
logger.debug("Selected EPLB policy: %d", policy_type)
|
||||
if global_expert_load is not None:
|
||||
ep_group = get_ep_group().device_group
|
||||
assert global_expert_load.shape == (
|
||||
@ -441,7 +450,7 @@ class EplbState:
|
||||
new_physical_to_logical_map,
|
||||
new_logical_to_physical_map,
|
||||
new_logical_replica_count,
|
||||
) = rebalance_experts(
|
||||
) = self.policy.rebalance_experts(
|
||||
global_expert_load,
|
||||
num_replicas,
|
||||
num_groups,
|
||||
@ -776,6 +785,7 @@ class EplbState:
|
||||
f"{num_gpus=}, {num_nodes=}"
|
||||
)
|
||||
|
||||
# Get new expert mappings
|
||||
for eplb_model_state, global_expert_load_window in zip(
|
||||
self.model_states.values(), global_expert_load_windows
|
||||
):
|
||||
@ -784,7 +794,7 @@ class EplbState:
|
||||
new_physical_to_logical_map,
|
||||
new_logical_to_physical_map,
|
||||
new_logical_replica_count,
|
||||
) = rebalance_experts(
|
||||
) = self.policy.rebalance_experts(
|
||||
global_expert_load_window,
|
||||
num_replicas,
|
||||
num_groups,
|
||||
|
||||
19
vllm/distributed/eplb/policy/__init__.py
Normal file
19
vllm/distributed/eplb/policy/__init__.py
Normal file
@ -0,0 +1,19 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import get_args
|
||||
|
||||
from vllm.config.parallel import EPLBPolicyOption
|
||||
|
||||
from .abstract import AbstractEplbPolicy
|
||||
from .default import DefaultEplbPolicy
|
||||
|
||||
EPLB_POLICIES = {"default": DefaultEplbPolicy}
|
||||
|
||||
# Ensure that the EPLB_POLICIES keys match the EPLBPolicyOption values
|
||||
assert set(EPLB_POLICIES.keys()) == set(get_args(EPLBPolicyOption))
|
||||
|
||||
__all__ = [
|
||||
"AbstractEplbPolicy",
|
||||
"DefaultEplbPolicy",
|
||||
"EPLB_POLICIES",
|
||||
]
|
||||
40
vllm/distributed/eplb/policy/abstract.py
Normal file
40
vllm/distributed/eplb/policy/abstract.py
Normal file
@ -0,0 +1,40 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class AbstractEplbPolicy(ABC):
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def rebalance_experts(
|
||||
cls,
|
||||
weight: torch.Tensor,
|
||||
num_replicas: int,
|
||||
num_groups: int,
|
||||
num_nodes: int,
|
||||
num_ranks: int,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Entry point for expert-parallelism load balancer.
|
||||
|
||||
Parameters:
|
||||
weight: [layers, num_logical_experts], the load statistics
|
||||
for all logical experts
|
||||
num_replicas: number of physical experts, must be a multiple of
|
||||
`num_ranks`
|
||||
num_groups: number of expert groups
|
||||
num_nodes: number of server nodes
|
||||
num_ranks: number of ranks, must be a multiple of `num_nodes`
|
||||
|
||||
Returns:
|
||||
physical_to_logical_map: [layers, num_replicas], the expert
|
||||
index of each replica
|
||||
logical_to_physical_map: [layers, num_logical_experts, X],
|
||||
the replica indices for each expert
|
||||
expert_count: [layers, num_logical_experts], number of
|
||||
physical replicas for each logical expert
|
||||
"""
|
||||
raise NotImplementedError
|
||||
267
vllm/distributed/eplb/policy/default.py
Normal file
267
vllm/distributed/eplb/policy/default.py
Normal file
@ -0,0 +1,267 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Expert parallelism load balancer (EPLB) for vLLM.
|
||||
|
||||
This module implements the core rearrangement algorithm.
|
||||
|
||||
The rearrangement algorithm is adapted from
|
||||
[DeepSeek EPLB](https://github.com/deepseek-ai/eplb).
|
||||
|
||||
Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
|
||||
on how the EPLB algorithm works.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from .abstract import AbstractEplbPolicy
|
||||
|
||||
|
||||
class DefaultEplbPolicy(AbstractEplbPolicy):
    """Default EPLB policy, adapted from the DeepSeek EPLB algorithm.

    Balances expert load in three stages: pack expert groups onto nodes,
    greedily replicate the hottest experts within each node, then pack the
    resulting physical experts onto ranks.
    """

    @classmethod
    def balanced_packing(
        cls, weight: torch.Tensor, num_packs: int
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Pack n weighted objects to m packs, such that each bin contains exactly
        n/m objects and the weights of all packs are as balanced as possible.

        Parameters:
            weight: [X, n], the weight of each item
            num_packs: number of packs

        Returns:
            pack_index: [X, n], the pack index of each item
            rank_in_pack: [X, n], the rank of the item in the pack
        """
        num_layers, num_groups = weight.shape
        assert num_groups % num_packs == 0
        groups_per_pack = num_groups // num_packs

        device = weight.device

        # Trivial case: one item per pack -> identity assignment, rank 0.
        if groups_per_pack == 1:
            pack_index = torch.arange(
                weight.size(-1), dtype=torch.int64, device=device
            ).expand(weight.shape)
            rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device)
            return pack_index, rank_in_pack

        # The greedy loop below is scalar work; do it on CPU with numpy.
        weight_np = weight.cpu().numpy()

        # Sort and get indices in descending order
        indices_np = np.argsort(-weight_np, axis=-1)

        pack_index_np = np.full((num_layers, num_groups), -1, dtype=np.int64)
        rank_in_pack_np = np.full((num_layers, num_groups), -1, dtype=np.int64)

        # Run the packing algorithm: heaviest item first, each item goes to
        # the least-loaded pack that still has capacity.
        for i in range(num_layers):
            pack_weights = [0.0] * num_packs
            pack_items = [0] * num_packs

            for group in indices_np[i]:
                # Find a pack with capacity that has the lowest weight
                pack = min(
                    (j for j in range(num_packs) if pack_items[j] < groups_per_pack),
                    key=pack_weights.__getitem__,
                )

                assert pack_items[pack] < groups_per_pack
                pack_index_np[i, group] = pack
                rank_in_pack_np[i, group] = pack_items[pack]
                pack_weights[pack] += weight_np[i, group]
                pack_items[pack] += 1

        pack_index = torch.from_numpy(pack_index_np).to(device)
        rank_in_pack = torch.from_numpy(rank_in_pack_np).to(device)

        return pack_index, rank_in_pack

    @classmethod
    def replicate_experts(
        cls, weight: torch.Tensor, num_phy: int
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Replicate `num_log` experts to `num_phy` replicas, such that the maximum
        load of all replicas is minimized.

        Parameters:
            weight: [X, num_log]
            num_phy: total number of experts after replication

        Returns:
            phy2log: [X, num_phy], logical expert id of each physical expert
            rank: [X, num_phy], the replica rank
            logcnt: [X, num_log], number of replicas for each logical expert
        """
        n, num_log = weight.shape
        num_redundant = num_phy - num_log
        assert num_redundant >= 0
        device = weight.device
        # Slots [0, num_log) start as the identity mapping; slots
        # [num_log, num_phy) are filled greedily below.
        phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
        rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
        logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
        arangen = torch.arange(n, dtype=torch.int64, device=device)
        for i in range(num_log, num_phy):
            # Give the next replica to the expert with the highest
            # per-replica load (weight divided by current replica count).
            redundant_indices = (weight / logcnt).max(dim=-1).indices
            phy2log[:, i] = redundant_indices
            rank[:, i] = logcnt[arangen, redundant_indices]
            logcnt[arangen, redundant_indices] += 1
        return phy2log, rank, logcnt

    @classmethod
    def rebalance_experts_hierarchical(
        cls,
        weight: torch.Tensor,
        num_physical_experts: int,
        num_groups: int,
        num_nodes: int,
        num_gpus: int,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Parameters:
            weight: [num_moe_layers, num_logical_experts]
            num_physical_experts: number of physical experts after replication
            num_groups: number of expert groups
            num_nodes: number of server nodes, where the intra-node network
                (e.g., NVLink) is faster
            num_gpus: number of GPUs, must be a multiple of `num_nodes`

        Returns:
            phy2log: [layers, num_replicas], the expert
                index of each replica
            log2phy: [layers, num_logical_experts, X],
                the replica indices for each expert
            logcnt: [layers, num_logical_experts], number of
                physical replicas for each logical expert
        """
        num_layers, num_logical_experts = weight.shape
        assert num_logical_experts % num_groups == 0
        group_size = num_logical_experts // num_groups
        assert num_groups % num_nodes == 0
        groups_per_node = num_groups // num_nodes
        assert num_gpus % num_nodes == 0
        assert num_physical_experts % num_gpus == 0
        phy_experts_per_gpu = num_physical_experts // num_gpus

        def inverse(perm: torch.Tensor) -> torch.Tensor:
            # Row-wise inverse permutation: inv[perm[j]] = j.
            inv = torch.empty_like(perm)
            inv.scatter_(
                1,
                perm,
                torch.arange(
                    perm.size(1), dtype=torch.int64, device=perm.device
                ).expand(perm.shape),
            )
            return inv

        # Step 1: pack groups to nodes
        tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
        group_pack_index, group_rank_in_pack = cls.balanced_packing(
            tokens_per_group, num_nodes
        )
        # Map each logical expert to its "meta-logical" index, i.e. its
        # position after the groups are reordered node-by-node.
        log2mlog = (
            (
                (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
            ).unsqueeze(-1)
            + torch.arange(
                group_size, dtype=torch.int64, device=group_pack_index.device
            )
        ).flatten(-2)
        mlog2log = inverse(log2mlog)

        # Step 2: construct redundant experts within nodes
        # [num_layers * num_nodes, num_logical_experts // num_nodes]
        tokens_per_mlog = weight.gather(-1, mlog2log).view(
            -1, num_logical_experts // num_nodes
        )
        phy2mlog, phyrank, mlogcnt = cls.replicate_experts(
            tokens_per_mlog, num_physical_experts // num_nodes
        )

        # Step 3: pack physical_experts to GPUs
        # [num_layers * num_nodes, num_physical_experts // num_nodes]
        tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
        pack_index, rank_in_pack = cls.balanced_packing(
            tokens_per_phy, num_gpus // num_nodes
        )
        phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
        pphy2phy = inverse(phy2pphy)

        pphy2mlog = phy2mlog.gather(
            -1, pphy2phy
        )  # [num_layers * num_nodes, num_log_per_nodes]
        # Offset each node's local meta-logical ids into the global range.
        pphy2mlog = (
            pphy2mlog.view(num_layers, num_nodes, -1)
            + torch.arange(
                0,
                num_logical_experts,
                num_logical_experts // num_nodes,
                device=group_pack_index.device,
            ).view(1, -1, 1)
        ).flatten(-2)
        pphy2log = mlog2log.gather(-1, pphy2mlog)
        pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
        logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
        return pphy2log, pphyrank, logcnt

    @classmethod
    def rebalance_experts(
        cls,
        weight: torch.Tensor,
        num_replicas: int,
        num_groups: int,
        num_nodes: int,
        num_ranks: int,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Entry point for expert-parallelism load balancer.

        Parameters:
            weight: [layers, num_logical_experts], the load statistics for all
                logical experts
            num_replicas: number of physical experts, must be a multiple of
                `num_ranks`
            num_groups: number of expert groups
            num_nodes: number of server nodes, where the intra-node network
                (e.g., NVLink) is faster
            num_ranks: number of ranks, must be a multiple of `num_nodes`

        Returns:
            phy2log: [layers, num_replicas], the expert
                index of each replica
            log2phy: [layers, num_logical_experts, X],
                the replica indices for each expert
            logcnt: [layers, num_logical_experts], number of
                physical replicas for each logical expert
        """
        num_layers, num_logical_experts = weight.shape
        weight = weight.float()
        if num_groups % num_nodes == 0:
            # use hierarchical load-balance policy
            phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical(
                weight, num_replicas, num_groups, num_nodes, num_ranks
            )
        else:
            # use global load-balance policy (degenerate case: one group,
            # one node)
            phy2log, phyrank, logcnt = cls.rebalance_experts_hierarchical(
                weight, num_replicas, 1, 1, num_ranks
            )
        # Build the inverse map logical -> replicas, padded with -1 up to the
        # maximum possible replica count per expert.
        num_redundant_experts = num_replicas - num_logical_experts
        maxlogcnt = num_redundant_experts + 1
        log2phy: torch.Tensor = torch.full(
            (num_layers, num_logical_experts, maxlogcnt),
            -1,
            dtype=torch.int64,
            device=logcnt.device,
        )
        # Scatter each physical index into slot (expert, replica_rank).
        log2phy.view(num_layers, -1).scatter_(
            -1,
            phy2log * maxlogcnt + phyrank,
            torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
                num_layers, -1
            ),
        )
        return phy2log, log2phy, logcnt
|
||||
@ -1,260 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Expert parallelism load balancer (EPLB) for vLLM.
|
||||
|
||||
This module implements the core rearrangement algorithm.
|
||||
|
||||
The rearrangement algorithm is adapted from
|
||||
[DeepSeek EPLB](https://github.com/deepseek-ai/eplb).
|
||||
|
||||
Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
|
||||
on how the EPLB algorithm works.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def balanced_packing(
    weight: torch.Tensor, num_packs: int
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Pack n weighted items into `num_packs` packs of exactly n/num_packs items
    each, keeping the total weights of the packs as balanced as possible.

    Parameters:
        weight: [X, n], the weight of each item
        num_packs: number of packs

    Returns:
        pack_index: [X, n], the pack index of each item
        rank_in_pack: [X, n], the rank of the item in the pack
    """
    num_layers, num_items = weight.shape
    assert num_items % num_packs == 0
    capacity = num_items // num_packs

    device = weight.device

    # One item per pack: item i lands in pack i at rank 0.
    if capacity == 1:
        pack_index = torch.arange(
            weight.size(-1), dtype=torch.int64, device=device
        ).expand(weight.shape)
        rank_in_pack = torch.zeros_like(weight, dtype=torch.int64, device=device)
        return pack_index, rank_in_pack

    # The greedy placement below is scalar work; run it on CPU via numpy.
    weights = weight.cpu().numpy()

    # Visit items heaviest-first.
    order = np.argsort(-weights, axis=-1)

    pack_of = np.full((num_layers, num_items), -1, dtype=np.int64)
    rank_of = np.full((num_layers, num_items), -1, dtype=np.int64)

    for layer in range(num_layers):
        load = [0.0] * num_packs
        count = [0] * num_packs

        for item in order[layer]:
            # Among packs that still have room, pick the lightest; min()
            # keeps the lowest-index pack on ties.
            chosen = min(
                (p for p in range(num_packs) if count[p] < capacity),
                key=load.__getitem__,
            )

            pack_of[layer, item] = chosen
            rank_of[layer, item] = count[chosen]
            load[chosen] += weights[layer, item]
            count[chosen] += 1

    return (
        torch.from_numpy(pack_of).to(device),
        torch.from_numpy(rank_of).to(device),
    )
|
||||
|
||||
|
||||
def replicate_experts(
    weight: torch.Tensor, num_phy: int
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Replicate `num_log` logical experts into `num_phy` physical replicas so
    that the maximum per-replica load is minimized (greedy).

    Parameters:
        weight: [X, num_log]
        num_phy: total number of experts after replication

    Returns:
        phy2log: [X, num_phy], logical expert id of each physical expert
        rank: [X, num_phy], the replica rank
        logcnt: [X, num_log], number of replicas for each logical expert
    """
    num_layers, num_log = weight.shape
    assert num_phy - num_log >= 0
    device = weight.device

    # Slots [0, num_log) start as the identity mapping; the extra slots
    # [num_log, num_phy) are assigned one at a time below.
    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(
        num_layers, 1
    )
    rank = torch.zeros(num_layers, num_phy, dtype=torch.int64, device=device)
    logcnt = torch.ones(num_layers, num_log, dtype=torch.int64, device=device)

    layer_ids = torch.arange(num_layers, dtype=torch.int64, device=device)
    for slot in range(num_log, num_phy):
        # The expert whose per-replica load is currently highest receives
        # the next replica.
        chosen = (weight / logcnt).max(dim=-1).indices
        phy2log[:, slot] = chosen
        rank[:, slot] = logcnt[layer_ids, chosen]
        logcnt[layer_ids, chosen] += 1

    return phy2log, rank, logcnt
|
||||
|
||||
|
||||
def rebalance_experts_hierarchical(
    weight: torch.Tensor,
    num_physical_experts: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Hierarchical rebalancing: pack expert groups onto nodes, replicate hot
    experts within each node, then pack physical experts onto GPUs.

    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network
            (e.g., NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map (torch.Tensor):
            [num_moe_layers, num_physical_experts]
        logical_to_physical_map (torch.Tensor):
            [num_moe_layers, num_logical_experts, X]
        logical_count (torch.Tensor):
            [num_moe_layers, num_logical_experts]
    """
    num_layers, num_logical_experts = weight.shape
    assert num_logical_experts % num_groups == 0
    group_size = num_logical_experts // num_groups
    assert num_groups % num_nodes == 0
    groups_per_node = num_groups // num_nodes
    assert num_gpus % num_nodes == 0
    assert num_physical_experts % num_gpus == 0
    phy_experts_per_gpu = num_physical_experts // num_gpus

    def inverse(perm: torch.Tensor) -> torch.Tensor:
        # Row-wise inverse permutation: inv[perm[j]] = j.
        inv = torch.empty_like(perm)
        inv.scatter_(
            1,
            perm,
            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
                perm.shape
            ),
        )
        return inv

    # Step 1: pack groups to nodes
    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
    # Map each logical expert to its "meta-logical" index: its position
    # after the groups are reordered node-by-node.
    log2mlog = (
        (
            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
        ).unsqueeze(-1)
        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
    ).flatten(-2)
    mlog2log = inverse(log2mlog)

    # Step 2: construct redundant experts within nodes
    # [num_layers * num_nodes, num_logical_experts // num_nodes]
    tokens_per_mlog = weight.gather(-1, mlog2log).view(
        -1, num_logical_experts // num_nodes
    )
    phy2mlog, phyrank, mlogcnt = replicate_experts(
        tokens_per_mlog, num_physical_experts // num_nodes
    )

    # Step 3: pack physical_experts to GPUs
    # [num_layers * num_nodes, num_physical_experts // num_nodes]
    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
    pphy2phy = inverse(phy2pphy)

    pphy2mlog = phy2mlog.gather(
        -1, pphy2phy
    )  # [num_layers * num_nodes, num_log_per_nodes]
    # Offset each node's local meta-logical ids into the global id range.
    pphy2mlog = (
        pphy2mlog.view(num_layers, num_nodes, -1)
        + torch.arange(
            0,
            num_logical_experts,
            num_logical_experts // num_nodes,
            device=group_pack_index.device,
        ).view(1, -1, 1)
    ).flatten(-2)
    pphy2log = mlog2log.gather(-1, pphy2mlog)
    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
    return pphy2log, pphyrank, logcnt
|
||||
|
||||
|
||||
def rebalance_experts(
    weight: torch.Tensor,
    num_replicas: int,
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Entry point for expert-parallelism load balancer.

    Parameters:
        weight: [layers, num_logical_experts], the load statistics for all
            logical experts
        num_replicas: number of physical experts, must be a multiple of
            `num_gpus`
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network
            (e.g., NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map:
            [layers, num_replicas], the expert index of each replica
        logical_to_physical_map:
            [layers, num_logical_experts, X], the replica indices for each
            expert
        expert_count:
            [layers, num_logical_experts], number of physical
            replicas for each logical expert
    """
    num_layers, num_logical_experts = weight.shape
    weight = weight.float()
    if num_groups % num_nodes == 0:
        # use hierarchical load-balance policy
        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
            weight, num_replicas, num_groups, num_nodes, num_gpus
        )
    else:
        # use global load-balance policy (degenerate: one group, one node)
        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
            weight, num_replicas, 1, 1, num_gpus
        )
    # Build the inverse map logical -> replicas, padded with -1 up to the
    # maximum possible replica count per expert.
    num_redundant_experts = num_replicas - num_logical_experts
    maxlogcnt = num_redundant_experts + 1
    log2phy: torch.Tensor = torch.full(
        (num_layers, num_logical_experts, maxlogcnt),
        -1,
        dtype=torch.int64,
        device=logcnt.device,
    )
    # Scatter each physical index into slot (expert, replica_rank).
    log2phy.view(num_layers, -1).scatter_(
        -1,
        phy2log * maxlogcnt + phyrank,
        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
            num_layers, -1
        ),
    )
    return phy2log, log2phy, logcnt
|
||||
|
||||
|
||||
# Only the top-level entry point is part of this module's public API.
__all__ = ["rebalance_experts"]
|
||||
@ -1,179 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file contains a new class `KVLookupBufferBase` that allows developers to
|
||||
think of KV cache operations as inserting new KV cache entries (`insert`)
|
||||
into the lookup buffer and querying existing KV caches (`drop_select`)
|
||||
from the lookup buffer.
|
||||
|
||||
This file also contains a new class `KVStoreBufferBase` that allows developers
|
||||
to manage the KVCache buffer as a simple key-value storage buffer with basic
|
||||
put/get operations.
|
||||
|
||||
These classes above are abstracted behind class `KVCacheBufferBase`.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class KVCacheBufferBase(ABC):
    """Common abstract base for all KVCache buffer flavors."""

    @abstractmethod
    def close(self) -> None:
        """Release any resources held by this buffer.

        Called when the buffer is no longer needed; implementations clean up
        whatever state (threads, connections, memory) they own.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.
        """
        raise NotImplementedError
|
||||
|
||||
|
||||
class KVLookupBufferBase(KVCacheBufferBase):
    """
    Abstract base class for a KVCache lookup buffer.

    A lookup buffer is keyed by:
      - input_tokens: token IDs of the request
      - roi: a binary mask over input_tokens selecting the subset of tokens
        the KV cache actually covers (useful when vLLM is connected to an
        external KV cache service that only holds part of the input).
        NOTE: roi could later also describe which part of the KV the current
        process holds (each process may only hold a shard due to TP and PP);
        that is not implemented for now.

    and stores as its value:
      - key: the key tensor in the KV cache
      - value: the value tensor in the KV cache
      - hidden: the final hidden state produced by model forwarding, which
        lets vLLM skip re-running the model by transmitting the hidden state.
    """

    @abstractmethod
    def insert(
        self,
        input_tokens: torch.Tensor,
        roi: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        hidden: torch.Tensor,
    ) -> None:
        """Insert an entry into the lookup buffer.

        Conceptually equivalent to:
        ```
        buffer[input_tokens, roi] = [key, value, hidden]
        ```

        FIXME: in the future, we should only have two arguments, key and value,
        where key is a tensor dict and value is a tensor dict.

        FIXME: we should transmit both sampler outputs and the hidden states.

        Args:
            input_tokens (torch.Tensor): token IDs.
            roi (torch.Tensor): A binary mask on top of the input tokens
            key (torch.Tensor): The key tensor in the KV cache.
            value (torch.Tensor): The value tensor in the KV cache.
            hidden (torch.Tensor): The final hidden state tensor generated
                during model forwarding, used to bypass model forwarding.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def drop_select(
        self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
    ) -> list[torch.Tensor | None]:
        """Select a matching entry and *remove* it from the buffer.

        Conceptually equivalent to:
        ```
        ret = buffer.pop(input_tokens, roi)
        return ret
        ```

        Passing `None` for both `input_tokens` and `roi` selects an arbitrary
        entry, returns it, and removes it from the buffer — useful when
        offloading KV cache to a KV cache storage service.

        Args:
            input_tokens (torch.Tensor): token IDs.
            roi (torch.Tensor): A binary mask on top of the input tokens

        Returns:
            list[Optional[torch.Tensor]]: A list of tensors. Can be None.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.
        """
        raise NotImplementedError
|
||||
|
||||
|
||||
class KVStoreBufferBase(KVCacheBufferBase):
    """
    Abstract base class for a KVCache buffer with key-value store semantics.

    Behaves like a distributed key-value store with plain put/get operations,
    giving implementations fine-grained control over KVCache transfer:
      - Key: a unique string identifier for the cached entry
      - Value: a tensor to store/retrieve, or None (deletion / empty value)
    """

    @abstractmethod
    def put(
        self,
        key: str,
        value: torch.Tensor | None,
    ) -> None:
        """Store a key-value pair in the buffer.

        Args:
            key (str): Unique identifier for a tensor. The tensor may be a
                key cache tensor, a value cache tensor, or a hidden state
                tensor produced during model forwarding.

            value (Optional[torch.Tensor]): Tensor to be stored.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def get(
        self,
        key: str,
    ) -> torch.Tensor | None:
        """Retrieve a value from the buffer by key.

        Args:
            key (str): Unique identifier for a tensor. The tensor may be a
                key cache tensor, a value cache tensor, or a hidden state
                tensor produced during model forwarding.

        Returns:
            Optional[torch.Tensor]: Stored tensor if exists, None otherwise.

        Raises:
            NotImplementedError: This method must be implemented in subclasses.
        """
        raise NotImplementedError
|
||||
@ -1,164 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file contains a new class `MooncakeStore` that allows developers to
|
||||
think of KV cache transfer operations as putting new KV cache entries
|
||||
into a remote KVStore-based lookup buffer and getting existing KV caches
|
||||
from this remote lookup buffer.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
from safetensors.torch import load as safetensors_load
|
||||
from safetensors.torch import save as safetensors_save
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVStoreBufferBase
|
||||
from vllm.logger import init_logger
|
||||
|
||||
DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB
|
||||
DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@dataclass
class MooncakeStoreConfig:
    # Connection/sizing parameters for a MooncakeDistributedStore instance.
    local_hostname: str
    metadata_server: str
    global_segment_size: int
    local_buffer_size: int
    protocol: str
    device_name: str
    master_server_address: str

    @staticmethod
    def from_file(file_path: str) -> "MooncakeStoreConfig":
        """Load the config from a JSON file."""
        with open(file_path) as fin:
            raw = json.load(fin)
        # Missing size/protocol/device keys fall back to defaults; the
        # remaining keys yield None when absent.
        return MooncakeStoreConfig(
            local_hostname=raw.get("local_hostname"),
            metadata_server=raw.get("metadata_server"),
            global_segment_size=raw.get(
                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
            ),
            local_buffer_size=raw.get("local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE),
            protocol=raw.get("protocol", "tcp"),
            device_name=raw.get("device_name", ""),
            master_server_address=raw.get("master_server_address"),
        )

    @staticmethod
    def load_from_env() -> "MooncakeStoreConfig":
        """Load config from a file specified in the environment variable."""
        path = os.getenv("MOONCAKE_CONFIG_PATH")
        if path is None:
            raise ValueError(
                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
            )
        return MooncakeStoreConfig.from_file(path)
|
||||
|
||||
|
||||
class MooncakeStore(KVStoreBufferBase):
    """KVStore-backed buffer that puts/gets KV cache entries via a remote
    Mooncake distributed store."""

    def __init__(
        self,
        config: VllmConfig,
    ):
        # Mooncake is an optional dependency; fail with install guidance.
        try:
            from mooncake.store import MooncakeDistributedStore
        except ImportError as e:
            raise ImportError(
                "Please install mooncake by following the instructions at "
                "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
                "to run vLLM with MooncakeConnector."
            ) from e

        try:
            self.store = MooncakeDistributedStore()
            self.config = MooncakeStoreConfig.load_from_env()
            logger.info("Mooncake Configuration loaded successfully.")

            self.store.setup(
                self.config.local_hostname,
                self.config.metadata_server,
                self.config.global_segment_size,
                self.config.local_buffer_size,
                self.config.protocol,
                self.config.device_name,
                self.config.master_server_address,
            )

        except ValueError as e:
            logger.error("Configuration loading failed: %s", e)
            raise
        except Exception as exc:
            logger.error("An error occurred while loading the configuration: %s", exc)
            raise

    def close(self):
        # MooncakeDistributedStore will automatically call the destructor, so
        # it is unnecessary to close it manually.
        pass

    def put(
        self,
        key: str,
        value: torch.Tensor | None,
    ) -> None:
        # A message queue needs to be introduced before making it asynchronous.
        if value is None:
            return
        self._put_impl(key, value)

    def get(
        self,
        key: str,
    ) -> torch.Tensor | None:
        # A message queue needs to be introduced before making it asynchronous.
        return self._get_impl(key)

    def _put_impl(
        self,
        key: str,
        value: torch.Tensor,
    ) -> None:
        """Put KVCache to Mooncake Store"""
        # Record the CUDA device index (or -1 for CPU) alongside the tensor
        # so _get_impl can restore it to the same device.
        on_cuda = value.device.type == "cuda"
        device_id = value.device.index if on_cuda else -1
        device_tensor = torch.tensor(device_id, dtype=torch.int32)
        payload = safetensors_save({"tensor": value, "device_id": device_tensor})
        try:
            self.store.put(key, payload)
        except TypeError as err:
            logger.error("Failed to put value into Mooncake Store: %s", err)
            raise TypeError("Mooncake Store Put Type Error.") from err

    def _get_impl(
        self,
        key: str,
    ) -> torch.Tensor | None:
        """Get KVCache from Mooncake Store"""
        try:
            data = self.store.get(key)
        except TypeError as err:
            logger.error("Failed to get value from Mooncake Store: %s", err)
            raise TypeError("Mooncake Store Get Type Error.") from err

        if not data:
            return None

        loaded = safetensors_load(data)
        tensor = loaded["tensor"]
        device_id = int(loaded["device_id"].item())
        # device_id >= 0 means the tensor originally lived on that CUDA device.
        target = torch.device("cuda", device_id) if device_id >= 0 else torch.device(
            "cpu"
        )
        return tensor.to(target)
|
||||
@ -1,242 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Implements a distributed key-value (KV) cache transfer mechanism.
|
||||
|
||||
Key Features:
|
||||
- Distributed KV cache transmission using PyNccl pipes.
|
||||
- Non-blocking `insert`, blocking `drop_select`.
|
||||
- Use CPU signal pipe to avoid racing condition
|
||||
- Handles buffer size constraints and provide backpressure mechanism to
|
||||
stop the prefill instance when the decode instance is slow.
|
||||
"""
|
||||
|
||||
import threading
|
||||
from collections import deque
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.distributed.kv_transfer.kv_lookup_buffer.base import KVLookupBufferBase
|
||||
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class SimpleBuffer(KVLookupBufferBase):
|
||||
def __init__(
|
||||
self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: float
|
||||
):
|
||||
"""
|
||||
signal_pipe: on CPU
|
||||
|
||||
NOTE: on-device recv will block all threads in the process, making the
|
||||
KV cache producer unable to listen to new request while transmitting
|
||||
KV cache. Luckily CPU recv only blocks the current thread so we use
|
||||
CPU recv to listen to new request.
|
||||
|
||||
data_pipe: on device (e.g. GPU)
|
||||
"""
|
||||
|
||||
self.buffer: deque[list[torch.Tensor]] = deque()
|
||||
|
||||
self.buffer_size = 0
|
||||
self.buffer_size_threshold = buffer_size_thresh
|
||||
self.buffer_cv = threading.Condition()
|
||||
self.signal_pipe = signal_pipe
|
||||
self.data_pipe = data_pipe
|
||||
self.request_handling_thread: threading.Thread | None = None
|
||||
|
||||
self.normal_signal = torch.tensor([0], device="cpu")
|
||||
self.end_signal = None
|
||||
|
||||
def _matches(
|
||||
self,
|
||||
tokens_roi_sender: list[torch.Tensor],
|
||||
tokens_roi_recver: list[torch.Tensor],
|
||||
):
|
||||
# tokens_roi_sender: tokens and roi of the producer (in the buffer)
|
||||
# tokens_roi_recver: tokens and roi of the consumer (query)
|
||||
|
||||
tokens_sender = tokens_roi_sender[0]
|
||||
tokens_recver = tokens_roi_recver[0]
|
||||
roi_sender = tokens_roi_sender[1]
|
||||
roi_recver = tokens_roi_recver[1]
|
||||
|
||||
if tokens_recver is None:
|
||||
# consumer sends an empty request
|
||||
# semantics: DROP SELECT * LIMIT 1
|
||||
# so any of the data in the buffer can be drop-selected
|
||||
return True
|
||||
|
||||
# Assuming that roi is a binary mask on tokens
|
||||
tokens_sender = tokens_sender[roi_sender]
|
||||
tokens_recver = tokens_recver[roi_recver]
|
||||
|
||||
# simple common prefix matching
|
||||
min_length = min(len(tokens_sender), len(tokens_recver))
|
||||
if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]):
|
||||
return min_length
|
||||
|
||||
return 0
|
||||
|
||||
def _send_tensor_and_dec_size(self, tensor: torch.Tensor | None) -> None:
|
||||
assert tensor is not None, "Use self.data_pipe.send(None) instead"
|
||||
self.buffer_size -= tensor.element_size() * tensor.numel()
|
||||
if tensor.dtype == torch.bool:
|
||||
tensor = tensor.float()
|
||||
self.data_pipe.send_tensor(tensor)
|
||||
|
||||
def _get_element_size(self, data: list | torch.Tensor | None):
|
||||
if isinstance(data, torch.Tensor):
|
||||
return data.element_size() * data.numel()
|
||||
if not data:
|
||||
# cannot perform `not data` on a tensor
|
||||
# so this check needs to go after the check above
|
||||
return 0
|
||||
|
||||
raise AssertionError(f"Unknown data type {type(data)}")
|
||||
|
||||
def _add_to_buffer(
|
||||
self,
|
||||
input_tokens: torch.Tensor,
|
||||
roi: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
hidden: torch.Tensor,
|
||||
):
|
||||
if isinstance(input_tokens, torch.Tensor):
|
||||
input_tokens = input_tokens.clone()
|
||||
if isinstance(roi, torch.Tensor):
|
||||
roi = roi.clone()
|
||||
if isinstance(key, torch.Tensor):
|
||||
key = key.clone()
|
||||
if isinstance(value, torch.Tensor):
|
||||
value = value.clone()
|
||||
if isinstance(hidden, torch.Tensor):
|
||||
hidden = hidden.clone()
|
||||
|
||||
buffer_item = [input_tokens, roi, key, value, hidden]
|
||||
data_size = sum([self._get_element_size(data) for data in buffer_item])
|
||||
|
||||
with self.buffer_cv:
|
||||
if self.buffer_size + data_size > self.buffer_size_threshold:
|
||||
# log outside the while loop to avoid this message being logged
|
||||
# repeatedly.
|
||||
logger.debug("KV transfer buffer is full. Handling...")
|
||||
while self.buffer_size + data_size > self.buffer_size_threshold:
|
||||
self.buffer_cv.wait()
|
||||
|
||||
self.buffer_size += data_size
|
||||
self.buffer.append(buffer_item)
|
||||
self.buffer_cv.notify()
|
||||
|
||||
def _is_end_signal(self, signal):
|
||||
return signal is None
|
||||
|
||||
def drop_select_handler(self):
|
||||
try:
|
||||
while True:
|
||||
signal = self.signal_pipe.recv_tensor()
|
||||
if self._is_end_signal(signal):
|
||||
logger.info("Received end signal!")
|
||||
break
|
||||
|
||||
input_tokens = self.data_pipe.recv_tensor()
|
||||
|
||||
roi = self.data_pipe.recv_tensor()
|
||||
assert roi is not None, (
|
||||
"Please provide the roi when sending drop-select request"
|
||||
)
|
||||
roi = roi > 0.5
|
||||
tokens_roi_recver = [input_tokens, roi]
|
||||
|
||||
def is_buffer_available(
|
||||
tokens_roi_recver: list[torch.Tensor],
|
||||
) -> bool:
|
||||
# perform input tokens and roi matching
|
||||
# FIXME: this matching is O(n), ideally it should be O(1)
|
||||
# but this buffer size won't (and shouldn't) be too large so
|
||||
# the fix is not urgent.
|
||||
for _ in range(len(self.buffer)):
|
||||
if self._matches(self.buffer[0], tokens_roi_recver) > 0:
|
||||
return True
|
||||
# rotate the element we just accessed to the end
|
||||
self.buffer.rotate(-1)
|
||||
return False
|
||||
|
||||
with self.buffer_cv:
|
||||
while not is_buffer_available(tokens_roi_recver):
|
||||
logger.debug("KV transfer buffer is not available. Waiting...")
|
||||
self.buffer_cv.wait()
|
||||
# need to clone the tensor
|
||||
# in case the tensor is freed before sending finishes
|
||||
matched_item = self.buffer.popleft()
|
||||
for tensor in matched_item:
|
||||
self._send_tensor_and_dec_size(tensor)
|
||||
self.buffer_cv.notify()
|
||||
|
||||
except RuntimeError as e:
|
||||
if "Connection closed by peer" not in str(e):
|
||||
raise e
|
||||
|
||||
logger.debug("Closing drop_select_handler")
|
||||
|
||||
def drop_select(
|
||||
self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
|
||||
) -> list[torch.Tensor | None]:
|
||||
assert self.request_handling_thread is None, (
|
||||
"drop_select should be called by the KV cache consumer "
|
||||
"(e.g. the decode vLLM instance)"
|
||||
)
|
||||
|
||||
if isinstance(input_tokens, torch.Tensor):
|
||||
input_tokens = input_tokens.clone()
|
||||
if isinstance(roi, torch.Tensor):
|
||||
roi = roi.clone().float()
|
||||
|
||||
self.signal_pipe.send_tensor(self.normal_signal)
|
||||
self.data_pipe.send_tensor(input_tokens)
|
||||
self.data_pipe.send_tensor(roi)
|
||||
|
||||
input_tokens = self.data_pipe.recv_tensor()
|
||||
roi = self.data_pipe.recv_tensor()
|
||||
if roi is not None:
|
||||
# convert from float tensor to bool tensor
|
||||
# as PyNccl does not support sending bool tensor
|
||||
roi = roi > 0.5
|
||||
key = self.data_pipe.recv_tensor()
|
||||
value = self.data_pipe.recv_tensor()
|
||||
hidden = self.data_pipe.recv_tensor()
|
||||
|
||||
return [input_tokens, roi, key, value, hidden]
|
||||
|
||||
def insert(
|
||||
self,
|
||||
input_tokens: torch.Tensor,
|
||||
roi: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
hidden: torch.Tensor,
|
||||
) -> None:
|
||||
self._add_to_buffer(input_tokens, roi, key, value, hidden)
|
||||
|
||||
# when calling the insert, the current process is a sender
|
||||
# need to launch the request handler and start listening to request.
|
||||
if self.request_handling_thread is None:
|
||||
self.request_handling_thread = threading.Thread(
|
||||
target=self.drop_select_handler
|
||||
)
|
||||
self.request_handling_thread.start()
|
||||
|
||||
def close(self):
|
||||
if (
|
||||
hasattr(self, "request_handling_thread")
|
||||
and self.request_handling_thread is not None
|
||||
):
|
||||
self.request_handling_thread.join()
|
||||
|
||||
else:
|
||||
# TODO: have a explicit close signal and have a explicit way to
|
||||
# check if it's requester
|
||||
self.signal_pipe.send_tensor(self.end_signal)
|
||||
@ -1,66 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file defines an interface `KVPipeBase`
|
||||
that provides an abstraction for sending and receiving tensors, or None, via
|
||||
distributed communications.
|
||||
|
||||
All classes instantiated from this interface are assumed to be a FIFO pipe.
|
||||
|
||||
If your distributed communication platform already supports key-value lookup,
|
||||
you can bypass this interface and directly start from `kv_lookup_buffer`.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class KVPipeBase(ABC):
|
||||
"""
|
||||
This class provides an interface for sending and receiving tensors, or
|
||||
None, by distributed communications.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def send_tensor(self, tensor: torch.Tensor | None) -> None:
|
||||
"""Send a tensor, or None, via the pipe.
|
||||
|
||||
Need to support sending None -- important for error handling.
|
||||
|
||||
TODO: add a `key` argument so that we can use traditional
|
||||
key-value database as the distributed communication mechanism behind
|
||||
the pipe.
|
||||
|
||||
Args:
|
||||
tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None.
|
||||
|
||||
Raises:
|
||||
NotImplementedError: This method must be implemented in subclasses.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def recv_tensor(self) -> torch.Tensor | None:
|
||||
"""Receive a tensor (can be None) from the pipeline.
|
||||
|
||||
Returns:
|
||||
Optional[torch.Tensor]: The tensor received from the pipeline. Can
|
||||
be None.
|
||||
|
||||
Raises:
|
||||
NotImplementedError: This method must be implemented in subclasses.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def close(self) -> None:
|
||||
"""Close the pipeline and release resources.
|
||||
|
||||
This method is responsible for closing the communication pipeline
|
||||
and releasing any resources associated with it.
|
||||
|
||||
Raises:
|
||||
NotImplementedError: This method must be implemented in subclasses.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@ -1,295 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
import zmq
|
||||
from safetensors.torch import load as safetensors_load
|
||||
from safetensors.torch import save as safetensors_save
|
||||
|
||||
from vllm.config.kv_transfer import KVTransferConfig
|
||||
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.network_utils import join_host_port, make_zmq_path, split_host_port
|
||||
|
||||
logger = init_logger(__name__)
|
||||
NONE_INT = -150886311
|
||||
|
||||
|
||||
@dataclass
|
||||
class MooncakeTransferEngineConfig:
|
||||
prefill_url: str
|
||||
decode_url: str
|
||||
metadata_backend: str | None
|
||||
metadata_server: str
|
||||
protocol: str
|
||||
device_name: str
|
||||
|
||||
@staticmethod
|
||||
def from_file(file_path: str) -> "MooncakeTransferEngineConfig":
|
||||
"""Load the config from a JSON file."""
|
||||
with open(file_path) as fin:
|
||||
config = json.load(fin)
|
||||
return MooncakeTransferEngineConfig(
|
||||
prefill_url=config.get("prefill_url"),
|
||||
decode_url=config.get("decode_url"),
|
||||
metadata_backend=config.get("metadata_backend", None),
|
||||
metadata_server=config.get("metadata_server"),
|
||||
protocol=config.get("protocol", "tcp"),
|
||||
device_name=config.get("device_name", ""),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def load_from_env() -> "MooncakeTransferEngineConfig":
|
||||
"""Load config from a file specified in the environment variable."""
|
||||
config_file_path = os.getenv("MOONCAKE_CONFIG_PATH")
|
||||
if config_file_path is None:
|
||||
raise ValueError(
|
||||
"The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
|
||||
)
|
||||
return MooncakeTransferEngineConfig.from_file(config_file_path)
|
||||
|
||||
|
||||
class MooncakeTransferEngine:
|
||||
"""Handles the transfer of data using mooncake_vllm_adaptor and ZeroMQ."""
|
||||
|
||||
def __init__(self, kv_rank: int, local_rank: int):
|
||||
try:
|
||||
from mooncake.engine import TransferEngine
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"Please install mooncake by following the instructions at "
|
||||
"https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501
|
||||
"to run vLLM with MooncakeConnector."
|
||||
) from e
|
||||
|
||||
self.engine = TransferEngine()
|
||||
self.local_rank = local_rank
|
||||
|
||||
try:
|
||||
self.config = MooncakeTransferEngineConfig.load_from_env()
|
||||
logger.info("Mooncake Configuration loaded successfully.")
|
||||
except ValueError as e:
|
||||
logger.error(e)
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.error("An error occurred while loading the configuration: %s", exc)
|
||||
raise
|
||||
prefill_host, base_prefill_port = split_host_port(self.config.prefill_url)
|
||||
decode_host, base_decode_port = split_host_port(self.config.decode_url)
|
||||
|
||||
# Avoid ports conflict when running prefill and decode on the same node
|
||||
if prefill_host == decode_host and base_prefill_port == base_decode_port:
|
||||
base_decode_port = base_decode_port + 100
|
||||
|
||||
prefill_port = base_prefill_port + self.local_rank
|
||||
decode_port = base_decode_port + self.local_rank
|
||||
self.prefill_url = join_host_port(prefill_host, prefill_port)
|
||||
self.decode_url = join_host_port(decode_host, decode_port)
|
||||
|
||||
self.initialize(
|
||||
self.prefill_url if kv_rank == 0 else self.decode_url,
|
||||
self.config.metadata_server,
|
||||
self.config.protocol,
|
||||
self.config.device_name,
|
||||
self.config.metadata_backend,
|
||||
)
|
||||
|
||||
self.remote_url = self.decode_url if kv_rank == 0 else self.prefill_url
|
||||
|
||||
# Initialize ZeroMQ context and sockets
|
||||
self.context = zmq.Context() # type: ignore[attr-defined]
|
||||
self.sender_socket = self.context.socket(zmq.constants.PUSH)
|
||||
self.receiver_socket = self.context.socket(zmq.constants.PULL)
|
||||
self.sender_ack = self.context.socket(zmq.constants.PULL)
|
||||
self.receiver_ack = self.context.socket(zmq.constants.PUSH)
|
||||
|
||||
self.buffer_cleaner = ThreadPoolExecutor(max_workers=1)
|
||||
self._setup_metadata_sockets(
|
||||
kv_rank, prefill_host, base_prefill_port, decode_host, base_decode_port
|
||||
)
|
||||
|
||||
def _setup_metadata_sockets(
|
||||
self, kv_rank: int, p_host: str, p_port: int, d_host: str, d_port: int
|
||||
) -> None:
|
||||
"""Set up ZeroMQ sockets for sending and receiving data."""
|
||||
# Offsets < 8 are left for initialization in case tp and pp are enabled
|
||||
p_rank_offset = p_port + 8 + self.local_rank * 2
|
||||
d_rank_offset = d_port + 8 + self.local_rank * 2
|
||||
if kv_rank == 0:
|
||||
self.sender_socket.bind(make_zmq_path("tcp", p_host, p_rank_offset + 1))
|
||||
self.receiver_socket.connect(
|
||||
make_zmq_path("tcp", d_host, d_rank_offset + 1)
|
||||
)
|
||||
self.sender_ack.connect(make_zmq_path("tcp", d_host, d_rank_offset + 2))
|
||||
self.receiver_ack.bind(make_zmq_path("tcp", p_host, p_rank_offset + 2))
|
||||
else:
|
||||
self.receiver_socket.connect(
|
||||
make_zmq_path("tcp", p_host, p_rank_offset + 1)
|
||||
)
|
||||
self.sender_socket.bind(make_zmq_path("tcp", d_host, d_rank_offset + 1))
|
||||
self.receiver_ack.bind(make_zmq_path("tcp", d_host, d_rank_offset + 2))
|
||||
self.sender_ack.connect(make_zmq_path("tcp", p_host, p_rank_offset + 2))
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
local_hostname: str,
|
||||
metadata_server: str,
|
||||
protocol: str,
|
||||
device_name: str,
|
||||
metadata_backend: str | None,
|
||||
) -> None:
|
||||
"""Initialize the mooncake instance."""
|
||||
if metadata_backend is None:
|
||||
self.engine.initialize(
|
||||
local_hostname, metadata_server, protocol, device_name
|
||||
)
|
||||
else:
|
||||
supported_backend = ["etcd", "redis"]
|
||||
metadata_backend = metadata_backend.lower()
|
||||
if metadata_backend not in supported_backend:
|
||||
raise ValueError(
|
||||
"Mooncake Configuration error. `metadata_backend`"
|
||||
f" should be one of {supported_backend}."
|
||||
)
|
||||
|
||||
self.engine.initialize_ext(
|
||||
local_hostname, metadata_server, protocol, device_name, metadata_backend
|
||||
)
|
||||
|
||||
def allocate_managed_buffer(self, length: int) -> int:
|
||||
"""Allocate a managed buffer of the specified length."""
|
||||
ret = self.engine.allocate_managed_buffer(length)
|
||||
if ret <= 0:
|
||||
logger.error("Allocation Return Error")
|
||||
raise Exception("Allocation Return Error")
|
||||
return ret
|
||||
|
||||
def free_managed_buffer(self, buffer: int, length: int) -> int:
|
||||
"""Free a previously allocated managed buffer."""
|
||||
return self.engine.free_managed_buffer(buffer, length)
|
||||
|
||||
def transfer_sync(self, buffer: int, peer_buffer_address: int, length: int) -> int:
|
||||
"""Synchronously transfer data to the specified address."""
|
||||
ret = self.engine.transfer_sync_read(
|
||||
self.remote_url, buffer, peer_buffer_address, length
|
||||
)
|
||||
if ret < 0:
|
||||
logger.error("Transfer Return Error")
|
||||
raise Exception("Transfer Return Error")
|
||||
return ret
|
||||
|
||||
def write_bytes_to_buffer(self, buffer: int, user_data: bytes, length: int) -> int:
|
||||
"""Write bytes to the allocated buffer."""
|
||||
return self.engine.write_bytes_to_buffer(buffer, user_data, length)
|
||||
|
||||
def read_bytes_from_buffer(self, buffer: int, length: int) -> bytes:
|
||||
"""Read bytes from the allocated buffer."""
|
||||
return self.engine.read_bytes_from_buffer(buffer, length)
|
||||
|
||||
def wait_for_ack(self, src_ptr: int, length: int) -> None:
|
||||
"""Asynchronously wait for ACK from the receiver."""
|
||||
ack = self.sender_ack.recv()
|
||||
if ack != b"ACK":
|
||||
logger.error("Failed to receive ACK from the receiver")
|
||||
|
||||
self.free_managed_buffer(src_ptr, length)
|
||||
|
||||
def send_bytes(self, user_data: bytes) -> None:
|
||||
"""Send bytes to the remote process."""
|
||||
length = len(user_data)
|
||||
src_ptr = self.allocate_managed_buffer(length)
|
||||
self.write_bytes_to_buffer(src_ptr, user_data, length)
|
||||
self.sender_socket.send_multipart(
|
||||
[struct.pack("!Q", src_ptr), struct.pack("!Q", length)]
|
||||
)
|
||||
self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length)
|
||||
|
||||
def recv_bytes(self) -> bytes:
|
||||
"""Receive bytes from the remote process."""
|
||||
data = self.receiver_socket.recv_multipart()
|
||||
src_ptr = struct.unpack("!Q", data[0])[0]
|
||||
length = struct.unpack("!Q", data[1])[0]
|
||||
dst_ptr = self.allocate_managed_buffer(length)
|
||||
self.transfer_sync(dst_ptr, src_ptr, length)
|
||||
ret = self.read_bytes_from_buffer(dst_ptr, length)
|
||||
|
||||
# Buffer cleanup
|
||||
self.receiver_ack.send(b"ACK")
|
||||
self.free_managed_buffer(dst_ptr, length)
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
class MooncakePipe(KVPipeBase):
|
||||
"""MooncakeTransferEngine based Pipe implementation."""
|
||||
|
||||
def __init__(
|
||||
self, local_rank: int, config: KVTransferConfig, device: str | None = None
|
||||
):
|
||||
"""Initialize the mooncake pipe and set related parameters."""
|
||||
self.config = config
|
||||
self.local_rank = local_rank
|
||||
self.kv_rank = self.config.kv_rank
|
||||
assert self.kv_rank is not None
|
||||
if device is None:
|
||||
self.device = self._select_device(self.config.kv_buffer_device)
|
||||
else:
|
||||
self.device = self._select_device(device)
|
||||
|
||||
self.transfer_engine = MooncakeTransferEngine(self.kv_rank, self.local_rank)
|
||||
self.transport_thread: ThreadPoolExecutor | None = None
|
||||
self.none_tensor = torch.tensor([NONE_INT], device=self.device)
|
||||
|
||||
def _select_device(self, device: str) -> torch.device:
|
||||
"""Select available device (CUDA or CPU)."""
|
||||
logger.info("Selecting device: %s", device)
|
||||
if device == "cuda":
|
||||
return torch.device(f"cuda:{self.local_rank}")
|
||||
else:
|
||||
return torch.device("cpu")
|
||||
|
||||
def tensor_hash(self, tensor: torch.Tensor) -> int:
|
||||
"""Calculate the hash value of the tensor."""
|
||||
return hash(tensor.data_ptr())
|
||||
|
||||
def _send_impl(self, tensor: torch.Tensor) -> None:
|
||||
"""Implement the tensor sending logic using safetensors."""
|
||||
self.transfer_engine.send_bytes(safetensors_save({"tensor": tensor}))
|
||||
|
||||
def _recv_impl(self) -> torch.Tensor:
|
||||
"""Implement the tensor receiving logic using safetensors."""
|
||||
data = self.transfer_engine.recv_bytes()
|
||||
return safetensors_load(data)["tensor"].to(self.device)
|
||||
|
||||
def send_tensor(self, tensor: torch.Tensor | None) -> None:
|
||||
"""Send tensor to the target process."""
|
||||
if self.transport_thread is None:
|
||||
self.transport_thread = ThreadPoolExecutor(max_workers=1)
|
||||
tensor = tensor if tensor is not None else self.none_tensor
|
||||
assert len(tensor.shape) > 0
|
||||
self.transport_thread.submit(self._send_impl, tensor)
|
||||
|
||||
def recv_tensor(self) -> torch.Tensor | None:
|
||||
"""Receive tensor from other processes."""
|
||||
if self.transport_thread is None:
|
||||
self.transport_thread = ThreadPoolExecutor(max_workers=1)
|
||||
tensor = self.transport_thread.submit(self._recv_impl).result()
|
||||
if tensor.numel() == 1 and tensor.item() == NONE_INT:
|
||||
return None
|
||||
else:
|
||||
return tensor
|
||||
|
||||
def close(self) -> None:
|
||||
"""Cleanup logic when closing the pipe."""
|
||||
self.transfer_engine.sender_socket.close()
|
||||
self.transfer_engine.receiver_socket.close()
|
||||
self.transfer_engine.sender_ack.close()
|
||||
self.transfer_engine.receiver_ack.close()
|
||||
self.transfer_engine.context.term() # Terminate the ZMQ context
|
||||
logger.info("Closed the transfer engine and cleaned up resources.")
|
||||
@ -1,285 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This module implements a PyNccl pipe for sending and receiving
|
||||
Optional[torch.Tensor] between distributed ranks with advanced
|
||||
communication features.
|
||||
|
||||
Key Features:
|
||||
- Supports sending and receiving tensors with metadata
|
||||
- Handles both CUDA and CPU device communications
|
||||
- Implements a non-blocking tensor transfer mechanism
|
||||
- Manages buffer size and provides backpressure control
|
||||
- Supports distributed process groups with configurable parameters
|
||||
"""
|
||||
|
||||
import threading
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config.kv_transfer import KVTransferConfig
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class BrokenPipeException(Exception):
|
||||
def __init__(self, message):
|
||||
self.message = message
|
||||
super().__init__(self.message)
|
||||
|
||||
|
||||
Metadata = dict[str, torch.Tensor | None]
|
||||
|
||||
|
||||
class PyNcclPipe(KVPipeBase):
|
||||
METADATA_LENGTH = 16
|
||||
MAX_TENSOR_DIMENSIONS = 14
|
||||
METADATA_DTYPE = torch.int64
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
local_rank: int,
|
||||
config: KVTransferConfig,
|
||||
device: str | None = None,
|
||||
port_offset: int = 0,
|
||||
):
|
||||
self.config = config
|
||||
self.local_rank = local_rank
|
||||
self.kv_rank = self.config.kv_rank
|
||||
assert self.kv_rank is not None
|
||||
self.kv_parallel_size = self.config.kv_parallel_size
|
||||
if device is None:
|
||||
self.device = self._select_device(self.config.kv_buffer_device)
|
||||
else:
|
||||
self.device = self._select_device(device)
|
||||
|
||||
# build distributed connection and send/recv implementation
|
||||
store_timeout = self.config.get_from_extra_config("store_timeout", 300)
|
||||
self.group = StatelessProcessGroup.create(
|
||||
host=self.config.kv_ip,
|
||||
port=self.config.kv_port + port_offset,
|
||||
rank=self.kv_rank,
|
||||
world_size=self.kv_parallel_size,
|
||||
store_timeout=store_timeout,
|
||||
)
|
||||
# add a barrier to make sure the connection is initiated properly
|
||||
self.group.barrier()
|
||||
impl = self._get_device_send_recv_impl(self.group)
|
||||
self.device_send_func, self.device_recv_func = impl
|
||||
# set target rank
|
||||
self.target_rank_for_send = (self.kv_rank + 1) % self.kv_parallel_size
|
||||
self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size
|
||||
|
||||
# transportation-related variables
|
||||
self.transport_thread: ThreadPoolExecutor | None = None
|
||||
self.buffer_size = 0
|
||||
self.buffer_size_lock = threading.Lock()
|
||||
self.buffer_size_thresh = self.config.kv_buffer_size
|
||||
|
||||
def _get_device_send_recv_impl(
|
||||
self, group: StatelessProcessGroup
|
||||
) -> tuple[
|
||||
Callable[[torch.Tensor, int], None], Callable[[torch.Tensor, int], None]
|
||||
]:
|
||||
send: Callable[[torch.Tensor, int], None]
|
||||
recv: Callable[[torch.Tensor, int], None]
|
||||
if self.device.type == "cuda":
|
||||
# use PyNCCL for send / recv
|
||||
comm = PyNcclCommunicator(group, device=self.local_rank)
|
||||
comm.disabled = False
|
||||
send, recv = comm.send, comm.recv # type: ignore
|
||||
else:
|
||||
# This send / recv implementation here is NOT intended to transfer
|
||||
# KV caches (and should NOT be repurposed to transfer KV caches).
|
||||
# Currently it is only used to transmit control-plane messages
|
||||
# for PyNcclBuffer.
|
||||
send = group.send_obj
|
||||
|
||||
def my_recv(x, src):
|
||||
x[...] = group.recv_obj(src)
|
||||
|
||||
recv = my_recv
|
||||
|
||||
return send, recv
|
||||
|
||||
def _select_device(self, device: str):
|
||||
logger.info("Selecting device: %s", device)
|
||||
if device == "cuda":
|
||||
return torch.device(f"cuda:{self.local_rank}")
|
||||
else:
|
||||
return torch.device("cpu")
|
||||
|
||||
def _make_metadata(self, tensor: torch.Tensor | None) -> Metadata:
|
||||
"""
|
||||
Create the metadata as a dictionary based on the input tensor.
|
||||
|
||||
Args:
|
||||
tensor: The input tensor or None if no tensor is provided.
|
||||
|
||||
Returns:
|
||||
metadata: A dictionary with the following keys:
|
||||
- "dtype": The data type of the tensor or None.
|
||||
- "shape": The shape of the tensor or None.
|
||||
"""
|
||||
if tensor is None:
|
||||
return {"dtype": None, "shape": None}
|
||||
else:
|
||||
return {"dtype": tensor.dtype, "shape": tensor.shape}
|
||||
|
||||
def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor:
|
||||
"""
|
||||
Create a buffer to receive the tensor based on the provided metadata.
|
||||
|
||||
Args:
|
||||
metadata: A dictionary with keys "dtype" and "shape",
|
||||
describing the tensor's data type and shape.
|
||||
|
||||
Returns:
|
||||
buffer: A tensor of the specified type and shape,
|
||||
allocated on `self.device`.
|
||||
"""
|
||||
return torch.empty(
|
||||
metadata["shape"], dtype=metadata["dtype"], device=self.device
|
||||
)
|
||||
|
||||
def _send_metadata(self, metadata: Metadata):
|
||||
"""
|
||||
Send the metadata dictionary to the target rank.
|
||||
|
||||
Args:
|
||||
metadata: A dictionary with keys "dtype" and "shape".
|
||||
"""
|
||||
self.group.send_obj(metadata, self.target_rank_for_send)
|
||||
|
||||
def _recv_metadata(self) -> Metadata:
|
||||
"""
|
||||
Receive the metadata dictionary from the target rank.
|
||||
|
||||
Returns:
|
||||
metadata: A dictionary with keys "dtype" and "shape"
|
||||
describing the tensor.
|
||||
"""
|
||||
return self.group.recv_obj(self.target_rank_for_recv)
|
||||
|
||||
def _send_impl(self, tensor: torch.Tensor | None) -> None:
|
||||
"""
|
||||
The actual implementation of sending the tensor and its metadata to the
|
||||
target rank.
|
||||
|
||||
Args:
|
||||
tensor: The input tensor to be sent, or `None` if no tensor is
|
||||
being sent.
|
||||
"""
|
||||
metadata = self._make_metadata(tensor)
|
||||
self._send_metadata(metadata)
|
||||
if tensor is not None:
|
||||
self.device_send_func(tensor.to(self.device), self.target_rank_for_send)
|
||||
|
||||
def _recv_impl(self) -> torch.Tensor | None:
|
||||
"""
|
||||
The actual implementation of receiving a tensor and its metadata from
|
||||
the target rank.
|
||||
|
||||
Returns:
|
||||
buffer: The received tensor, or `None` if no tensor is received.
|
||||
"""
|
||||
metadata = self._recv_metadata()
|
||||
if metadata["dtype"] is None:
|
||||
return None
|
||||
buffer = self._prepare_recv_buffer(metadata)
|
||||
self.device_recv_func(buffer, self.target_rank_for_recv)
|
||||
|
||||
return buffer
|
||||
|
||||
def send_tensor_wrapper(
|
||||
self, tensor: torch.Tensor | None, tensor_size: int
|
||||
) -> None:
|
||||
"""
|
||||
Wrapper for _send_impl to handle exceptions and update buffer size.
|
||||
"""
|
||||
try:
|
||||
self._send_impl(tensor)
|
||||
|
||||
with self.buffer_size_lock:
|
||||
self.buffer_size -= tensor_size
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"[rank%d]: Exception when trying to send %s, msg: %s",
|
||||
torch.distributed.get_rank(),
|
||||
str(tensor),
|
||||
str(e),
|
||||
)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
|
||||
def block_if_full(self):
|
||||
"""
|
||||
Block the current thread if the buffer size is larger than the
|
||||
threshold.
|
||||
"""
|
||||
while self.buffer_size > self.buffer_size_thresh:
|
||||
logger.debug("KV cache transfer pipe is full. Waiting...")
|
||||
time.sleep(0.05)
|
||||
|
||||
def send_tensor(self, tensor: torch.Tensor | None) -> None:
|
||||
"""
|
||||
Sends a tensor and its metadata to the destination rank in a
|
||||
non-blocking way.
|
||||
|
||||
Args:
|
||||
tensor: The tensor to send, or `None` if no tensor is being sent.
|
||||
"""
|
||||
if self.transport_thread is None:
|
||||
self.transport_thread = ThreadPoolExecutor(max_workers=1)
|
||||
|
||||
if tensor is not None:
|
||||
tensor_size = tensor.element_size() * tensor.numel()
|
||||
else:
|
||||
tensor_size = 0
|
||||
|
||||
self.block_if_full()
|
||||
|
||||
with self.buffer_size_lock:
|
||||
self.buffer_size += tensor_size
|
||||
|
||||
self.transport_thread.submit(self.send_tensor_wrapper, tensor, tensor_size)
|
||||
|
||||
def recv_tensor(self) -> torch.Tensor | None:
|
||||
"""
|
||||
Receives a tensor and its metadata from the source rank. Blocking call.
|
||||
|
||||
Returns:
|
||||
The received tensor, or `None` if no tensor is received.
|
||||
"""
|
||||
if self.transport_thread is None:
|
||||
self.transport_thread = ThreadPoolExecutor(max_workers=1)
|
||||
|
||||
future = self.transport_thread.submit(self._recv_impl)
|
||||
|
||||
try:
|
||||
tensor = future.result()
|
||||
except Exception as e:
|
||||
logger.error("Encountering exception in KV receiving thread")
|
||||
logger.error("%s", e)
|
||||
logger.error("My device: %s", self.device)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
raise e
|
||||
|
||||
return tensor
|
||||
|
||||
def close(self):
|
||||
"""
|
||||
Close the pipe and release associated resources.
|
||||
"""
|
||||
if hasattr(self, "transport_thread") and self.transport_thread is not None:
|
||||
self.transport_thread.shutdown()
|
||||
@ -520,6 +520,9 @@ class EngineArgs:
|
||||
ObservabilityConfig, "kv_cache_metrics_sample"
|
||||
)
|
||||
cudagraph_metrics: bool = ObservabilityConfig.cudagraph_metrics
|
||||
enable_layerwise_nvtx_tracing: bool = (
|
||||
ObservabilityConfig.enable_layerwise_nvtx_tracing
|
||||
)
|
||||
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
|
||||
scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
|
||||
|
||||
@ -1032,6 +1035,10 @@ class EngineArgs:
|
||||
"--cudagraph-metrics",
|
||||
**observability_kwargs["cudagraph_metrics"],
|
||||
)
|
||||
observability_group.add_argument(
|
||||
"--enable-layerwise-nvtx-tracing",
|
||||
**observability_kwargs["enable_layerwise_nvtx_tracing"],
|
||||
)
|
||||
|
||||
# Scheduler arguments
|
||||
scheduler_kwargs = get_kwargs(SchedulerConfig)
|
||||
@ -1711,6 +1718,7 @@ class EngineArgs:
|
||||
kv_cache_metrics=self.kv_cache_metrics,
|
||||
kv_cache_metrics_sample=self.kv_cache_metrics_sample,
|
||||
cudagraph_metrics=self.cudagraph_metrics,
|
||||
enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing,
|
||||
)
|
||||
|
||||
# Compilation config overrides
|
||||
|
||||
@ -455,11 +455,13 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
|
||||
output_items.extend(_parse_function_call(message, recipient))
|
||||
|
||||
# Built-in tools on commentary channel are treated as reasoning for now
|
||||
elif recipient is not None and (
|
||||
recipient.startswith("python")
|
||||
or recipient.startswith("browser")
|
||||
or recipient.startswith("container")
|
||||
elif (
|
||||
recipient is None # Preambles: explanatory text before tool calls
|
||||
or recipient.startswith(("python", "browser", "container"))
|
||||
):
|
||||
# Per Harmony format, commentary channel can contain preambles to calling
|
||||
# multiple functions - explanatory text with no recipient. Built-in tool
|
||||
# recipients (python/browser/container) also generate reasoning output.
|
||||
output_items.extend(_parse_reasoning_content(message))
|
||||
else:
|
||||
raise ValueError(f"Unknown recipient: {recipient}")
|
||||
|
||||
@ -1072,10 +1072,15 @@ class OpenAIServingChat(OpenAIServing):
|
||||
# wasn't ready to send a token, then
|
||||
# get the next token without streaming a chunk
|
||||
if delta_message is None:
|
||||
if output.finish_reason is None:
|
||||
# NOTE: If return_token_ids is enabled, we still need to
|
||||
# send a chunk with token_ids even if delta_message is None
|
||||
# to ensure all tokens are included in the response
|
||||
if (
|
||||
output.finish_reason is None
|
||||
and not request.return_token_ids
|
||||
):
|
||||
continue
|
||||
else:
|
||||
delta_message = DeltaMessage()
|
||||
delta_message = DeltaMessage()
|
||||
|
||||
# Log streaming delta if output logging is enabled
|
||||
if self.enable_log_outputs and self.request_logger:
|
||||
|
||||
@ -345,6 +345,10 @@ class FusedMoEQuantConfig:
|
||||
def use_mxfp4_w4a16(self) -> bool:
|
||||
return self._a1.dtype is None and self._w1.dtype == "mxfp4"
|
||||
|
||||
@property
|
||||
def use_mxfp4_w4a4(self) -> bool:
|
||||
return self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4"
|
||||
|
||||
@property
|
||||
def use_nvfp4_w4a4(self) -> bool:
|
||||
return self.quant_dtype == "nvfp4"
|
||||
|
||||
@ -863,7 +863,8 @@ class FusedMoE(CustomOp):
|
||||
use_chunked_impl: bool,
|
||||
) -> tuple[bool, torch.Tensor | None]:
|
||||
use_shared_experts_stream = (
|
||||
has_separate_shared_experts
|
||||
current_platform.is_cuda()
|
||||
and has_separate_shared_experts
|
||||
and not use_chunked_impl
|
||||
and self.shared_experts_stream is not None
|
||||
and (
|
||||
|
||||
@ -221,8 +221,8 @@ def rocm_aiter_fused_experts(
|
||||
|
||||
else:
|
||||
quant_method = QuantMethod.NO.value
|
||||
# quark moe for mxfp4 w_dtype
|
||||
if quant_config.use_mxfp4_w4a16:
|
||||
# quark moe for mxfp4 w_dtype mxfp4 a_dtype
|
||||
if quant_config.use_mxfp4_w4a4:
|
||||
quant_method = QuantMethod.BLOCK_1X32.value
|
||||
# w8a8 block-scaled
|
||||
if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
|
||||
|
||||
@ -124,12 +124,16 @@ class Fp8MoeBackend(Enum):
|
||||
|
||||
|
||||
def get_fp8_moe_backend(
|
||||
block_quant: bool, moe_parallel_config: FusedMoEParallelConfig
|
||||
block_quant: bool,
|
||||
moe_parallel_config: FusedMoEParallelConfig,
|
||||
with_lora_support: bool,
|
||||
) -> Fp8MoeBackend:
|
||||
"""
|
||||
Select the primary FP8 MoE backend
|
||||
Note: Shape-specific fallbacks may still occur at runtime.
|
||||
"""
|
||||
if with_lora_support:
|
||||
return Fp8MoeBackend.TRITON
|
||||
# Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
|
||||
if (
|
||||
current_platform.is_cuda()
|
||||
@ -665,7 +669,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
self.weight_block_size = self.quant_config.weight_block_size
|
||||
self.block_quant: bool = self.weight_block_size is not None
|
||||
self.fp8_backend = get_fp8_moe_backend(
|
||||
self.block_quant, layer.moe_parallel_config
|
||||
self.block_quant, layer.moe_parallel_config, self.moe.is_lora_enabled
|
||||
)
|
||||
|
||||
self.marlin_input_dtype = None
|
||||
@ -1084,6 +1088,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
from vllm.model_executor.layers.fused_moe import (
|
||||
BatchedDeepGemmExperts,
|
||||
BatchedTritonExperts,
|
||||
TritonExperts,
|
||||
TritonOrDeepGemmExperts,
|
||||
)
|
||||
|
||||
@ -1116,7 +1121,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
num_dispatchers=prepare_finalize.num_dispatchers(),
|
||||
quant_config=self.moe_quant_config,
|
||||
)
|
||||
|
||||
elif self.moe.is_lora_enabled:
|
||||
return TritonExperts(quant_config=self.moe_quant_config)
|
||||
elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
|
||||
# Select GEMM experts with block-scale when weights are block-quantized
|
||||
experts = select_cutlass_fp8_gemm_impl(
|
||||
|
||||
@ -30,7 +30,6 @@ def get_rope(
|
||||
is_neox_style: bool = True,
|
||||
rope_parameters: dict[str, Any] | None = None,
|
||||
dtype: torch.dtype | None = None,
|
||||
partial_rotary_factor: float = 1.0,
|
||||
dual_chunk_attention_config: dict[str, Any] | None = None,
|
||||
) -> RotaryEmbedding:
|
||||
if dtype is None:
|
||||
@ -55,6 +54,10 @@ def get_rope(
|
||||
else:
|
||||
dual_chunk_attention_args = None
|
||||
|
||||
partial_rotary_factor = 1.0
|
||||
if rope_parameters is not None:
|
||||
partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
|
||||
if partial_rotary_factor < 1.0:
|
||||
rotary_dim = int(rotary_dim * partial_rotary_factor)
|
||||
key = (
|
||||
|
||||
@ -148,8 +148,6 @@ class ApertusAttention(nn.Module):
|
||||
if head_dim is None:
|
||||
head_dim = self.hidden_size // self.total_num_heads
|
||||
self.head_dim = head_dim
|
||||
# Phi models introduced a partial_rotary_factor parameter in the config
|
||||
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@ -228,11 +226,10 @@ class ApertusAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=int(self.partial_rotary_factor * self.head_dim),
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=is_neox_style,
|
||||
partial_rotary_factor=self.partial_rotary_factor,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -127,8 +127,6 @@ class BailingAttention(nn.Module):
|
||||
prefix=f"{prefix}.dense",
|
||||
)
|
||||
|
||||
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
|
||||
|
||||
self.rotary_dim = getattr(config, "rotary_dim", self.head_dim)
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
@ -137,7 +135,6 @@ class BailingAttention(nn.Module):
|
||||
max_position=config.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
partial_rotary_factor=self.partial_rotary_factor,
|
||||
)
|
||||
|
||||
self.attn = Attention(
|
||||
|
||||
@ -178,9 +178,7 @@ class BambaAttentionDecoderLayer(nn.Module):
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
|
||||
if hasattr(config, "partial_rotary_factor"):
|
||||
rotary_dim = int(self.head_dim * config.partial_rotary_factor)
|
||||
elif hasattr(config, "attn_rotary_emb"):
|
||||
if hasattr(config, "attn_rotary_emb"):
|
||||
rotary_dim = config.attn_rotary_emb # for backward compatibility
|
||||
else:
|
||||
rotary_dim = self.head_dim # default
|
||||
|
||||
@ -8,7 +8,6 @@ import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import set_default_rope_theta
|
||||
from vllm.utils.math_utils import cdiv, round_up
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
|
||||
@ -78,8 +77,6 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
|
||||
if not model_config.enforce_eager:
|
||||
max_position = round_up(max_position, 8)
|
||||
|
||||
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
|
||||
|
||||
config.rotary_kwargs = {
|
||||
"head_size": head_dim,
|
||||
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
|
||||
@ -119,8 +116,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
|
||||
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
|
||||
max_trained_positions = getattr(config, "max_trained_positions", 2048)
|
||||
|
||||
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
|
||||
|
||||
config.rotary_kwargs = {
|
||||
"head_size": head_dim,
|
||||
"rotary_dim": rotary_emb_dim,
|
||||
@ -490,6 +485,26 @@ class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
|
||||
logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
|
||||
|
||||
|
||||
class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
|
||||
@staticmethod
|
||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
||||
"""Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
|
||||
(or not explicitly set), to the value specified in the HF config, or to
|
||||
float16 if not specified.
|
||||
"""
|
||||
cache_config = vllm_config.cache_config
|
||||
if cache_config.mamba_ssm_cache_dtype == "auto":
|
||||
hf_config = vllm_config.model_config.hf_config
|
||||
mamba_ssm_cache_dtype = getattr(
|
||||
hf_config, "mamba_ssm_cache_dtype", "float16"
|
||||
)
|
||||
logger.info(
|
||||
"Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
|
||||
mamba_ssm_cache_dtype,
|
||||
)
|
||||
cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype
|
||||
|
||||
|
||||
MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
|
||||
"GteModel": SnowflakeGteNewModelConfig,
|
||||
"GteNewModel": GteNewModelConfig,
|
||||
@ -507,4 +522,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
|
||||
"Mamba2ForCausalLM": MambaModelConfig,
|
||||
"FalconMambaForCausalLM": MambaModelConfig,
|
||||
"DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
|
||||
"NemotronHForCausalLM": NemotronHForCausalLMConfig,
|
||||
}
|
||||
|
||||
@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargs,
|
||||
MultiModalKwargsItems,
|
||||
NestedTensors,
|
||||
)
|
||||
from vllm.multimodal.parse import (
|
||||
@ -305,7 +305,7 @@ class DeepseekOCRMultiModalProcessor(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
|
||||
@ -242,9 +242,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
|
||||
if hasattr(config, "partial_rotary_factor"):
|
||||
rotary_dim = self.head_dim * config.partial_rotary_factor
|
||||
elif hasattr(config, "attn_rotary_emb"):
|
||||
if hasattr(config, "attn_rotary_emb"):
|
||||
rotary_dim = config.attn_rotary_emb # for backward compatibility
|
||||
else:
|
||||
rotary_dim = self.head_dim # default
|
||||
|
||||
@ -10,7 +10,8 @@ from .utils import PPMissingLayer
|
||||
|
||||
class GlmForCausalLM(LlamaForCausalLM):
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
vllm_config.model_config.hf_config.partial_rotary_factor = 0.5
|
||||
hf_config = vllm_config.model_config.hf_config
|
||||
hf_config.rope_parameters["partial_rotary_factor"] = 0.5
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
# Hack Llama model to fit HF format GLM implementation
|
||||
# Attention difference between GLM and Llama:
|
||||
|
||||
@ -78,7 +78,7 @@ class Glm4Attention(nn.Module):
|
||||
# Number of KV heads is less than TP size, so we replicate
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert tp_size % self.total_num_kv_heads == 0
|
||||
partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
|
||||
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
self.head_dim = head_dim or hidden_size // self.total_num_heads
|
||||
self.rotary_dim = self.head_dim
|
||||
@ -106,7 +106,6 @@ class Glm4Attention(nn.Module):
|
||||
rotary_dim=self.rotary_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=config.rope_parameters,
|
||||
partial_rotary_factor=partial_rotary_factor,
|
||||
is_neox_style=False,
|
||||
)
|
||||
self.attn = Attention(
|
||||
|
||||
@ -282,13 +282,12 @@ class Glm4MoeAttention(nn.Module):
|
||||
prefix=f"{prefix}.o_proj",
|
||||
)
|
||||
|
||||
partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
|
||||
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
partial_rotary_factor=partial_rotary_factor,
|
||||
)
|
||||
self.attn = Attention(
|
||||
self.num_heads,
|
||||
|
||||
@ -89,16 +89,14 @@ class GPTNeoXAttention(nn.Module):
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.dense",
|
||||
)
|
||||
scaling = self.head_size**-0.5
|
||||
rotary_dim = int(self.head_size * config.rotary_pct)
|
||||
assert rotary_dim % 2 == 0
|
||||
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_size,
|
||||
rotary_dim=rotary_dim,
|
||||
rotary_dim=self.head_size,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
scaling = self.head_size**-0.5
|
||||
self.attn = Attention(
|
||||
self.num_heads,
|
||||
self.head_size,
|
||||
|
||||
@ -78,7 +78,7 @@ class SupportsMultiModal(Protocol):
|
||||
`multimodal_config.mm_encoder_tp_mode="data"`.
|
||||
"""
|
||||
|
||||
merge_by_field_config: ClassVar[bool] = False
|
||||
merge_by_field_config: ClassVar[bool] = True
|
||||
"""
|
||||
A flag that indicates which implementation of
|
||||
`vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
|
||||
|
||||
@ -28,7 +28,7 @@ from vllm.model_executor.models.utils import (
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
||||
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
|
||||
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
@ -103,7 +103,7 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
image_token_id = hf_config.image_token_index
|
||||
|
||||
@ -149,8 +149,6 @@ class LlamaAttention(nn.Module):
|
||||
if head_dim is None:
|
||||
head_dim = self.hidden_size // self.total_num_heads
|
||||
self.head_dim = head_dim
|
||||
# Phi models introduced a partial_rotary_factor parameter in the config
|
||||
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@ -265,7 +263,6 @@ class LlamaAttention(nn.Module):
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=getattr(config, "rope_parameters", None),
|
||||
is_neox_style=is_neox_style,
|
||||
partial_rotary_factor=self.partial_rotary_factor,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -52,7 +52,6 @@ from vllm.multimodal.evs import (
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargs,
|
||||
MultiModalKwargsItems,
|
||||
VideoItem,
|
||||
)
|
||||
@ -849,17 +848,18 @@ class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
if "image_num_patches" in out_mm_kwargs:
|
||||
image_num_patches = out_mm_kwargs["image_num_patches"]
|
||||
out_mm_data = out_mm_kwargs.get_data()
|
||||
if "image_num_patches" in out_mm_data:
|
||||
image_num_patches = out_mm_data["image_num_patches"]
|
||||
assert isinstance(image_num_patches, torch.Tensor)
|
||||
image_num_patches = image_num_patches.tolist()
|
||||
elif "image_embeds" in out_mm_kwargs:
|
||||
elif "image_embeds" in out_mm_data:
|
||||
# to compute num_patches (similar to Qwen2-VL)
|
||||
image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
|
||||
image_num_patches = [None] * len(out_mm_data["image_embeds"])
|
||||
else:
|
||||
image_num_patches = []
|
||||
|
||||
|
||||
@ -178,7 +178,6 @@ class NemotronAttention(nn.Module):
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.partial_rotary_factor = config.partial_rotary_factor
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
@ -203,7 +202,6 @@ class NemotronAttention(nn.Module):
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
partial_rotary_factor=self.partial_rotary_factor,
|
||||
)
|
||||
self.attn = Attention(
|
||||
self.num_heads,
|
||||
|
||||
@ -122,7 +122,6 @@ class DeciLMAttention(LlamaAttention):
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=is_neox_style,
|
||||
partial_rotary_factor=self.partial_rotary_factor,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -23,7 +23,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargs,
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
|
||||
from vllm.multimodal.processing import (
|
||||
@ -153,7 +153,7 @@ class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo])
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, Any],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
@ -62,7 +62,7 @@ from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFeatureSpec,
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargs,
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
@ -307,7 +307,7 @@ class PaddleOCRVLMultiModalProcessor(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
|
||||
hf_config = self.info.get_hf_config()
|
||||
|
||||
@ -40,7 +40,6 @@ from .siglip import SiglipVisionModel
|
||||
from .utils import (
|
||||
AutoWeightsLoader,
|
||||
WeightsMapper,
|
||||
flatten_bn,
|
||||
init_vllm_registered_model,
|
||||
maybe_prefix,
|
||||
)
|
||||
@ -252,6 +251,8 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
|
||||
dummy_inputs=PaliGemmaDummyInputsBuilder,
|
||||
)
|
||||
class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
merge_by_field_config = True
|
||||
|
||||
packed_modules_mapping = {
|
||||
"qkv_proj": [
|
||||
"q_proj",
|
||||
@ -327,9 +328,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
|
||||
return None
|
||||
|
||||
if pixel_values is not None:
|
||||
pixel_values = flatten_bn(pixel_values, concat=True)
|
||||
|
||||
h = w = self.config.vision_config.image_size
|
||||
|
||||
return PaliGemmaImagePixelInputs(
|
||||
type="pixel_values",
|
||||
data=pixel_values,
|
||||
@ -337,8 +337,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
|
||||
)
|
||||
|
||||
if image_embeds is not None:
|
||||
image_embeds = flatten_bn(image_embeds, concat=True)
|
||||
|
||||
return PaliGemmaImageEmbeddingInputs(
|
||||
type="image_embeds",
|
||||
data=image_embeds,
|
||||
|
||||
@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module):
|
||||
self.num_heads = self.total_num_heads // tensor_parallel_world_size
|
||||
self.head_dim = self.hidden_size // self.total_num_heads
|
||||
self.max_position_embeddings = config.max_position_embeddings
|
||||
self.partial_rotary_factor = config.partial_rotary_factor
|
||||
self.is_causal = True
|
||||
|
||||
assert (self.head_dim * self.total_num_heads) == self.hidden_size
|
||||
@ -138,7 +137,6 @@ class PersimmonAttention(nn.Module):
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
partial_rotary_factor=self.partial_rotary_factor,
|
||||
)
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.attn = Attention(
|
||||
|
||||
@ -109,10 +109,7 @@ class PhiAttention(nn.Module):
|
||||
)
|
||||
|
||||
scaling = self.head_size**-0.5
|
||||
rotary_dim = int(
|
||||
config.partial_rotary_factor
|
||||
* (config.hidden_size // config.num_attention_heads)
|
||||
)
|
||||
rotary_dim = config.hidden_size // config.num_attention_heads
|
||||
assert rotary_dim % 2 == 0
|
||||
|
||||
max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
|
||||
|
||||
@ -77,7 +77,7 @@ from vllm.multimodal.evs import (
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFeatureSpec,
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargs,
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import MultiModalDataItems
|
||||
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
|
||||
@ -973,7 +973,7 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, Any],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
@ -750,7 +750,6 @@ class Qwen3NextAttention(nn.Module):
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=config.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
partial_rotary_factor=config.partial_rotary_factor,
|
||||
dual_chunk_attention_config=self.dual_chunk_attention_config,
|
||||
)
|
||||
|
||||
|
||||
@ -103,7 +103,7 @@ from .qwen2_5_vl import (
|
||||
Qwen2_5_VLVideoInputs,
|
||||
Qwen2_5_VLVideoPixelInputs,
|
||||
)
|
||||
from .qwen2_vl import Qwen2VLProcessingInfo
|
||||
from .qwen2_vl import Qwen2VLMultiModalDataParser, Qwen2VLProcessingInfo
|
||||
from .qwen3 import Qwen3ForCausalLM, Qwen3Model
|
||||
from .utils import (
|
||||
AutoWeightsLoader,
|
||||
@ -884,7 +884,10 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
|
||||
|
||||
class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]):
|
||||
def _get_data_parser(self) -> MultiModalDataParser:
|
||||
return MultiModalDataParser(video_needs_metadata=True)
|
||||
return Qwen2VLMultiModalDataParser(
|
||||
self.info.get_hf_config().vision_config.spatial_merge_size,
|
||||
video_needs_metadata=True,
|
||||
)
|
||||
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
|
||||
@ -119,9 +119,6 @@ class StablelmAttention(nn.Module):
|
||||
self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
|
||||
self.head_dim = self.hidden_size // self.total_num_heads
|
||||
self.max_position_embeddings = config.max_position_embeddings
|
||||
self.partial_rotary_factor = getattr(
|
||||
config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
|
||||
)
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_key_value_heads * self.head_dim
|
||||
@ -154,7 +151,6 @@ class StablelmAttention(nn.Module):
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.config.max_position_embeddings,
|
||||
rope_parameters=self.config.rope_parameters,
|
||||
partial_rotary_factor=self.partial_rotary_factor,
|
||||
)
|
||||
self.attn = Attention(
|
||||
self.num_heads,
|
||||
|
||||
@ -25,7 +25,6 @@ from .inputs import (
|
||||
MultiModalBatchedField,
|
||||
MultiModalFeatureSpec,
|
||||
MultiModalFieldElem,
|
||||
MultiModalKwargs,
|
||||
MultiModalKwargsItem,
|
||||
MultiModalKwargsItems,
|
||||
NestedTensors,
|
||||
@ -90,7 +89,6 @@ MultiModalCacheValue: TypeAlias = (
|
||||
| MultiModalProcessorCacheItemMetadata
|
||||
| MultiModalKwargsItems
|
||||
| MultiModalKwargsItem
|
||||
| MultiModalKwargs
|
||||
| Mapping[str, NestedTensors]
|
||||
)
|
||||
|
||||
@ -108,12 +106,7 @@ class MultiModalCache:
|
||||
# These are not subclasses of dict
|
||||
if isinstance(
|
||||
leaf,
|
||||
(
|
||||
MultiModalKwargs,
|
||||
MultiModalKwargsItems,
|
||||
MultiModalKwargsItem,
|
||||
MultiModalFieldElem,
|
||||
),
|
||||
(MultiModalKwargsItems, MultiModalKwargsItem, MultiModalFieldElem),
|
||||
):
|
||||
return cls.get_item_size(leaf.data) # type: ignore
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import UserDict, defaultdict
|
||||
from collections.abc import Mapping, Sequence
|
||||
from collections.abc import Mapping, Sequence, Set
|
||||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
from itertools import accumulate
|
||||
@ -201,8 +201,10 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
|
||||
|
||||
|
||||
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
|
||||
"""Equality check between
|
||||
[`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects."""
|
||||
"""
|
||||
Equality check between
|
||||
[`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.
|
||||
"""
|
||||
if isinstance(a, torch.Tensor):
|
||||
return isinstance(b, torch.Tensor) and torch.equal(a, b)
|
||||
elif isinstance(b, torch.Tensor):
|
||||
@ -224,10 +226,24 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
|
||||
BatchedTensorInputs: TypeAlias = dict[str, NestedTensors]
|
||||
"""
|
||||
A dictionary containing nested tensors which have been batched via
|
||||
[`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch].
|
||||
[`MultiModalKwargsItems.get_data`][vllm.multimodal.inputs.MultiModalKwargsItems.get_data].
|
||||
"""
|
||||
|
||||
|
||||
def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> bool:
|
||||
"""
|
||||
Equality check between
|
||||
[`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
|
||||
"""
|
||||
for k in a:
|
||||
if k not in b:
|
||||
return False
|
||||
if not nested_tensors_equal(a[k], b[k]):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@dataclass
|
||||
class MultiModalFeatureSpec:
|
||||
"""
|
||||
@ -823,7 +839,14 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
|
||||
|
||||
return self # type: ignore[return-value]
|
||||
|
||||
def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs":
|
||||
def get_data(
|
||||
self,
|
||||
*,
|
||||
device: torch.types.Device = None,
|
||||
pin_memory: bool = False,
|
||||
cpu_fields: Set[str] = frozenset(),
|
||||
) -> BatchedTensorInputs:
|
||||
"""Construct a dictionary of keyword arguments to pass to the model."""
|
||||
elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
|
||||
for modality, items in self.items():
|
||||
for i, item in enumerate(items):
|
||||
@ -835,12 +858,23 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
|
||||
for key, elem in item.items():
|
||||
elems_by_key[key].append(elem)
|
||||
|
||||
return MultiModalKwargs(
|
||||
{
|
||||
key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
|
||||
for key, elems in elems_by_key.items()
|
||||
}
|
||||
)
|
||||
data = {
|
||||
key: elems[0].field.reduce_data(elems, pin_memory=pin_memory)
|
||||
for key, elems in elems_by_key.items()
|
||||
}
|
||||
|
||||
if device is not None:
|
||||
for k in data.keys() - cpu_fields:
|
||||
data[k] = json_map_leaves(
|
||||
(
|
||||
lambda x: x.to(device=device, non_blocking=True)
|
||||
if isinstance(x, torch.Tensor)
|
||||
else x
|
||||
),
|
||||
data[k],
|
||||
)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
MultiModalKwargsOptionalItems: TypeAlias = (
|
||||
@ -849,6 +883,7 @@ MultiModalKwargsOptionalItems: TypeAlias = (
|
||||
)
|
||||
|
||||
|
||||
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.")
|
||||
class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||
"""
|
||||
A dictionary that represents the keyword arguments to
|
||||
@ -882,91 +917,6 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||
):
|
||||
return MultiModalKwargsItems.from_seq(items).get_data(pin_memory=pin_memory)
|
||||
|
||||
@staticmethod
|
||||
def _try_stack(
|
||||
nested_tensors: NestedTensors, pin_memory: bool = False
|
||||
) -> NestedTensors:
|
||||
"""
|
||||
Stack the inner dimensions that have the same shape in
|
||||
a nested list of tensors.
|
||||
|
||||
Thus, a dimension represented by a list means that the inner
|
||||
dimensions are different for each element along that dimension.
|
||||
"""
|
||||
if isinstance(nested_tensors, torch.Tensor):
|
||||
return nested_tensors
|
||||
|
||||
# TODO: Remove these once all models have been migrated
|
||||
if isinstance(nested_tensors, np.ndarray):
|
||||
return torch.from_numpy(nested_tensors)
|
||||
if isinstance(nested_tensors, (int, float)):
|
||||
return torch.tensor(nested_tensors)
|
||||
|
||||
stacked = [MultiModalKwargs._try_stack(t, pin_memory) for t in nested_tensors]
|
||||
if not is_list_of(stacked, torch.Tensor, check="all"):
|
||||
# Only tensors (not lists) can be stacked.
|
||||
return stacked
|
||||
|
||||
tensors_ = cast(list[torch.Tensor], stacked)
|
||||
if len(tensors_) == 1:
|
||||
# An optimization when `tensors_` contains only one tensor:
|
||||
# - produce exactly same result as `torch.stack(tensors_)`
|
||||
# - will achieve zero-copy if the tensor is contiguous
|
||||
return tensors_[0].unsqueeze(0).contiguous()
|
||||
|
||||
if any(t.shape != tensors_[0].shape for t in tensors_):
|
||||
# The tensors have incompatible shapes and can't be stacked.
|
||||
return tensors_
|
||||
|
||||
outputs = torch.empty(
|
||||
len(tensors_),
|
||||
*tensors_[0].shape,
|
||||
dtype=tensors_[0].dtype,
|
||||
device=tensors_[0].device,
|
||||
pin_memory=pin_memory,
|
||||
)
|
||||
return torch.stack(tensors_, out=outputs)
|
||||
|
||||
@staticmethod
|
||||
def batch(
|
||||
inputs_list: list["MultiModalKwargs"], pin_memory: bool = False
|
||||
) -> BatchedTensorInputs:
|
||||
"""
|
||||
Batch multiple inputs together into a dictionary.
|
||||
|
||||
The resulting dictionary has the same keys as the inputs.
|
||||
If the corresponding value from each input is a tensor and they all
|
||||
share the same shape, the output value is a single batched tensor;
|
||||
otherwise, the output value is a list containing the original value
|
||||
from each input.
|
||||
"""
|
||||
if len(inputs_list) == 0:
|
||||
return {}
|
||||
|
||||
# We need to consider the case where each item in the batch
|
||||
# contains different modalities (i.e. different keys).
|
||||
item_lists = defaultdict[str, list[NestedTensors]](list)
|
||||
|
||||
for inputs in inputs_list:
|
||||
for k, v in inputs.items():
|
||||
item_lists[k].append(v)
|
||||
|
||||
return {
|
||||
k: MultiModalKwargs._try_stack(item_list, pin_memory)
|
||||
for k, item_list in item_lists.items()
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def as_kwargs(
|
||||
batched_inputs: BatchedTensorInputs,
|
||||
*,
|
||||
device: torch.types.Device,
|
||||
) -> BatchedTensorInputs:
|
||||
return json_map_leaves(
|
||||
lambda x: x.to(device=device, non_blocking=True),
|
||||
batched_inputs,
|
||||
)
|
||||
|
||||
def __getitem__(self, key: str):
|
||||
if key not in self:
|
||||
raise KeyError(
|
||||
|
||||
@ -19,7 +19,6 @@ from PIL import Image, UnidentifiedImageError
|
||||
import vllm.envs as envs
|
||||
from vllm.connections import HTTPConnection, global_http_connection
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.jsontree import json_map_leaves
|
||||
from vllm.utils.registry import ExtensionManager
|
||||
|
||||
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
|
||||
@ -427,59 +426,25 @@ def group_mm_kwargs_by_modality(
|
||||
Yields:
|
||||
A tuple `(modality, num_items, grouped_kwargs)`.
|
||||
"""
|
||||
if merge_by_field_config is None:
|
||||
raise RuntimeError(
|
||||
"`group_mm_kwargs_by_modality` now requires "
|
||||
"`merge_by_field_config` arg, please update your model runner "
|
||||
"according to https://github.com/vllm-project/vllm/pull/25676."
|
||||
)
|
||||
if merge_by_field_config is False:
|
||||
# TODO: After v0.13, remove merge_by_field_config attribute from model impls
|
||||
if merge_by_field_config is not None:
|
||||
logger.warning_once(
|
||||
"The legacy code for batching multi-modal kwargs is deprecated and "
|
||||
"will be removed in v0.12. Please update your model with "
|
||||
"`merge_by_field_config=True` to use the new code defined by "
|
||||
"`MultiModalFieldConfig`. You can refer to "
|
||||
"https://github.com/vllm-project/vllm/issues/26149 "
|
||||
"for some examples on how to do this."
|
||||
"The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
|
||||
"is deprecated and will be removed in v0.13."
|
||||
)
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, MultiModalKwargsItems
|
||||
from vllm.multimodal.inputs import MultiModalKwargsItems
|
||||
|
||||
for modality, items in groupby(mm_kwargs, key=lambda item: item.modality):
|
||||
items_lst = list(items)
|
||||
mm_kwargs_items = MultiModalKwargsItems.from_seq(items_lst)
|
||||
mm_kwargs_data = mm_kwargs_items.get_data(
|
||||
device=device,
|
||||
pin_memory=pin_memory,
|
||||
cpu_fields=multimodal_cpu_fields,
|
||||
)
|
||||
|
||||
if merge_by_field_config:
|
||||
mm_kwargs_group: BatchedTensorInputs = dict(
|
||||
MultiModalKwargsItems.from_seq(items_lst).get_data(
|
||||
pin_memory=pin_memory
|
||||
)
|
||||
)
|
||||
|
||||
if device is not None:
|
||||
mm_kwargs_group = {
|
||||
k: json_map_leaves(
|
||||
lambda x: x.to(device=device, non_blocking=True)
|
||||
if isinstance(x, torch.Tensor)
|
||||
else x,
|
||||
v,
|
||||
)
|
||||
if k not in multimodal_cpu_fields
|
||||
else v
|
||||
for k, v in mm_kwargs_group.items()
|
||||
}
|
||||
else:
|
||||
mm_kwargs_group = MultiModalKwargs.as_kwargs(
|
||||
MultiModalKwargs.batch(
|
||||
[
|
||||
MultiModalKwargsItems.from_seq([item]).get_data()
|
||||
for item in items_lst
|
||||
],
|
||||
pin_memory=pin_memory,
|
||||
),
|
||||
device=device,
|
||||
)
|
||||
|
||||
yield modality, len(items_lst), mm_kwargs_group
|
||||
yield modality, len(items_lst), mm_kwargs_data
|
||||
|
||||
|
||||
def fetch_audio(
|
||||
|
||||
@ -233,6 +233,23 @@ class CudaPlatformBase(Platform):
|
||||
from vllm.config import CUDAGraphMode
|
||||
|
||||
compilation_config = vllm_config.compilation_config
|
||||
if compilation_config.cudagraph_mode.has_full_cudagraphs():
|
||||
# decode context parallel does not support full cudagraphs
|
||||
if parallel_config.decode_context_parallel_size > 1:
|
||||
logger.warning_once(
|
||||
"Decode context parallel (DCP) is enabled, which is "
|
||||
"incompatible with full CUDA graphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
# prefill context parallel do not support full cudagraphs
|
||||
elif parallel_config.prefill_context_parallel_size > 1:
|
||||
logger.warning_once(
|
||||
"Prefill context parallel (PCP) is enabled, which is "
|
||||
"incompatible with full CUDA graphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
if (
|
||||
parallel_config.all2all_backend == "deepep_high_throughput"
|
||||
and parallel_config.data_parallel_size > 1
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user