mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-03 10:57:02 +08:00
Merge branch 'main' into rename_file_info_to_pkg/file
This commit is contained in:
commit
0da34283f7
@ -1068,7 +1068,7 @@ steps:
|
||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
@ -1095,10 +1095,11 @@ steps:
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
- label: ROCm GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
agent_pool: mi325_1
|
||||
mirror_hardwares: [amdproduction]
|
||||
optional: true # run on nightlies
|
||||
source_file_dependencies:
|
||||
- tests/evals/gpt_oss
|
||||
@ -1107,7 +1108,7 @@ steps:
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
|
||||
@ -478,10 +478,11 @@ steps:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||
# Limit to no custom ops to reduce running time
|
||||
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
|
||||
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||
|
||||
- label: Cudagraph test
|
||||
timeout_in_minutes: 20
|
||||
@ -925,7 +926,7 @@ steps:
|
||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
|
||||
- label: Blackwell Fusion Tests # 30 min
|
||||
- label: Blackwell Fusion and Compile Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
@ -946,7 +947,9 @@ steps:
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||
# Wrap with quotes to escape yaml
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell Fusion E2E Tests # 30 min
|
||||
timeout_in_minutes: 40
|
||||
@ -969,8 +972,6 @@ steps:
|
||||
- nvidia-smi
|
||||
# Run all e2e fusion tests
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||
|
||||
- label: Blackwell GPT-OSS Eval
|
||||
timeout_in_minutes: 60
|
||||
@ -1266,7 +1267,8 @@ steps:
|
||||
- pytest -v -s tests/compile/test_async_tp.py
|
||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
||||
- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
2
.github/CODEOWNERS
vendored
2
.github/CODEOWNERS
vendored
@ -57,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
|
||||
# Transformers backend
|
||||
# Transformers modeling backend
|
||||
/vllm/model_executor/models/transformers @hmellor
|
||||
/tests/models/test_transformers.py @hmellor
|
||||
|
||||
|
||||
81
.github/workflows/macos-smoke-test.yml
vendored
Normal file
81
.github/workflows/macos-smoke-test.yml
vendored
Normal file
@ -0,0 +1,81 @@
|
||||
name: macOS Apple Silicon Smoke Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch: # Manual trigger
|
||||
|
||||
jobs:
|
||||
macos-m1-smoke-test:
|
||||
runs-on: macos-latest
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
enable-cache: true
|
||||
cache-dependency-glob: |
|
||||
requirements/**/*.txt
|
||||
pyproject.toml
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Create virtual environment
|
||||
run: |
|
||||
uv venv
|
||||
echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"
|
||||
|
||||
- name: Install dependencies and build vLLM
|
||||
run: |
|
||||
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
|
||||
uv pip install -e .
|
||||
env:
|
||||
CMAKE_BUILD_PARALLEL_LEVEL: 4
|
||||
|
||||
- name: Verify installation
|
||||
run: |
|
||||
python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
||||
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
|
||||
|
||||
- name: Smoke test vllm serve
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
# Start server in background
|
||||
vllm serve Qwen/Qwen3-0.6B \
|
||||
--max-model-len=2048 \
|
||||
--load-format=dummy \
|
||||
--enforce-eager \
|
||||
--port 8000 &
|
||||
|
||||
SERVER_PID=$!
|
||||
|
||||
# Wait for server to start
|
||||
for i in {1..30}; do
|
||||
if curl -s http://localhost:8000/health > /dev/null; then
|
||||
echo "Server started successfully"
|
||||
break
|
||||
fi
|
||||
if [ "$i" -eq 30 ]; then
|
||||
echo "Server failed to start"
|
||||
kill "$SERVER_PID"
|
||||
exit 1
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
|
||||
# Test health endpoint
|
||||
curl -f http://localhost:8000/health
|
||||
|
||||
# Test completion
|
||||
curl -f http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "Qwen/Qwen3-0.6B",
|
||||
"prompt": "Hello",
|
||||
"max_tokens": 5
|
||||
}'
|
||||
|
||||
# Cleanup
|
||||
kill "$SERVER_PID"
|
||||
@ -3,10 +3,9 @@ MD007:
|
||||
MD013: false
|
||||
MD024:
|
||||
siblings_only: true
|
||||
MD031:
|
||||
list_items: false
|
||||
MD033: false
|
||||
MD045: false
|
||||
MD046: false
|
||||
MD051: false
|
||||
MD052: false
|
||||
MD053: false
|
||||
MD059: false
|
||||
|
||||
@ -861,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
|
||||
# Hadacore kernels
|
||||
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
|
||||
if(HADACORE_ARCHS)
|
||||
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
|
||||
380
benchmarks/benchmark_batch_invariance.py
Executable file
380
benchmarks/benchmark_batch_invariance.py
Executable file
@ -0,0 +1,380 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
|
||||
|
||||
This benchmark runs the same workload twice:
|
||||
1. With VLLM_BATCH_INVARIANT=0 (baseline)
|
||||
2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
|
||||
|
||||
And reports the timing and throughput metrics for comparison.
|
||||
|
||||
Environment variables:
|
||||
VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
|
||||
VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
|
||||
VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
|
||||
VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
|
||||
VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
|
||||
VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
|
||||
VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
|
||||
VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
|
||||
VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
|
||||
VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
|
||||
VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
|
||||
|
||||
Example usage:
|
||||
# Benchmark qwen3 (default)
|
||||
python benchmarks/benchmark_batch_invariance.py
|
||||
|
||||
# Benchmark deepseek with 8 GPUs
|
||||
VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
|
||||
python benchmarks/benchmark_batch_invariance.py
|
||||
|
||||
# Quick test with fewer trials
|
||||
VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
|
||||
python benchmarks/benchmark_batch_invariance.py
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
    """Build one synthetic prompt: a random opener followed by filler text.

    A target size is drawn uniformly from ``[min_words, max_words]`` (the upper
    bound is raised to ``min_words`` when it is smaller). The target only
    controls how often the filler sentence is repeated: once per 50 target
    words, and not at all for targets of 50 or less.

    Args:
        min_words: Lower bound of the drawn target size.
        max_words: Upper bound of the drawn target size.

    Returns:
        The chosen opener, optionally followed by the repeated filler sentence.
    """
    openers = (
        "Question: What is the capital of France?\nAnswer: The capital of France is",
        "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
        "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
        "Once upon a time in a distant galaxy, there lived",
        "The old man walked slowly down the street, remembering",
        "In the year 2157, humanity finally discovered",
        "To implement a binary search tree in Python, first we need to",
        "The algorithm works by iterating through the array and",
        "Here's how to optimize database queries using indexing:",
        "The Renaissance was a period in European history that",
        "Climate change is caused by several factors including",
        "The human brain contains approximately 86 billion neurons which",
        "I've been thinking about getting a new laptop because",
        "Yesterday I went to the store and bought",
        "My favorite thing about summer is definitely",
    )
    filler = " This is an interesting topic that deserves more explanation. "

    # Draw order matters for reproducibility: opener first, then the size.
    opener = random.choice(openers)
    upper_bound = max(min_words, max_words)
    target = random.randint(min_words, upper_bound)

    # Short targets get the bare opener.
    if target <= 50:
        return opener
    return opener + filler * (target // 50)
|
||||
|
||||
|
||||
def run_benchmark_with_batch_invariant(
    model: str,
    tp_size: int,
    max_batch_size: int,
    num_trials: int,
    min_prompt: int,
    max_prompt: int,
    max_tokens: int,
    temperature: float,
    gpu_mem_util: float,
    max_model_len: int,
    backend: str,
    batch_invariant: bool,
    seed: int = 12345,
) -> dict:
    """
    Run the benchmark with the specified configuration.

    Creates an engine, runs one warmup generation on a fixed "needle" prompt,
    then times ``num_trials`` batches that each contain the needle at a random
    position among random filler prompts.

    Note: this writes ``VLLM_ATTENTION_BACKEND`` and ``VLLM_BATCH_INVARIANT``
    into ``os.environ`` and does not restore the previous values.

    Args:
        model: Model name passed to ``LLM``.
        tp_size: Passed as ``tensor_parallel_size``.
        max_batch_size: Passed as ``max_num_seqs``; also the upper bound of the
            per-trial batch size (lower bound is half of it, at least 1).
        num_trials: Number of timed batches.
        min_prompt: Lower size bound forwarded to ``_random_prompt``.
        max_prompt: Upper size bound forwarded to ``_random_prompt``.
        max_tokens: ``SamplingParams.max_tokens``.
        temperature: ``SamplingParams.temperature``.
        gpu_mem_util: Passed as ``gpu_memory_utilization``.
        max_model_len: Passed as ``max_model_len``.
        backend: Value written to ``VLLM_ATTENTION_BACKEND``.
        batch_invariant: Selects ``VLLM_BATCH_INVARIANT`` = "1" or "0".
        seed: Seed for Python's ``random`` (batch sizes, needle position and
            filler prompts). Sampling itself uses a fixed seed.

    Returns:
        A dict with ``init_time``, ``avg_time``, ``min_time``, ``max_time``,
        ``total_tokens``, ``total_prompts``, ``throughput`` (tokens/s),
        ``prompts_per_sec`` and the raw ``trial_times`` list.

    Raises:
        ValueError: If ``num_trials`` or ``max_batch_size`` is below 1.
    """
    # Fail fast on values that would otherwise only blow up after the
    # expensive engine start-up: zero trials divides by zero in the
    # statistics, and a zero batch bound leaves no slot for the needle.
    if num_trials < 1:
        raise ValueError(f"num_trials must be >= 1, got {num_trials}")
    if max_batch_size < 1:
        raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")

    random.seed(seed)

    # Set environment variables
    os.environ["VLLM_ATTENTION_BACKEND"] = backend
    os.environ["VLLM_BATCH_INVARIANT"] = "1" if batch_invariant else "0"

    print(f"\n{'=' * 80}")
    print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
    print(f" Model: {model}")
    print(f" TP Size: {tp_size}")
    print(f" Backend: {backend}")
    print(f" Max Batch Size: {max_batch_size}")
    print(f" Trials: {num_trials}")
    print(f" Max Tokens: {max_tokens}")
    print(f"{'=' * 80}\n")

    sampling = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=max_tokens,
        seed=20240919,
    )

    needle_prompt = "There once was a "

    llm = None
    try:
        # Create LLM engine
        start_init = time.perf_counter()
        llm = LLM(
            model=model,
            max_num_seqs=max_batch_size,
            gpu_memory_utilization=gpu_mem_util,
            max_model_len=max_model_len,
            dtype="bfloat16",
            tensor_parallel_size=tp_size,
            enable_prefix_caching=False,
        )
        init_time = time.perf_counter() - start_init
        print(f"Engine initialization time: {init_time:.2f}s\n")

        # Generate baseline; this also serves as the warmup run.
        print("Generating baseline (warmup)...")
        baseline_out = llm.generate([needle_prompt], sampling)
        assert len(baseline_out) == 1
        baseline_text = baseline_out[0].outputs[0].text
        print(f"Baseline output: '{baseline_text[:50]}...'\n")

        # Run trials and measure timing
        trial_times: list[float] = []
        total_tokens = 0
        total_prompts = 0

        for trial in range(num_trials):
            # Keep at least one slot so the needle always has a position,
            # even when max_batch_size is 1 (1 // 2 == 0).
            batch_size = random.randint(max(1, max_batch_size // 2), max_batch_size)
            needle_pos = random.randint(0, batch_size - 1)
            prompts: list[str] = [
                needle_prompt
                if i == needle_pos
                else _random_prompt(min_prompt, max_prompt)
                for i in range(batch_size)
            ]

            # Measure time for this trial (generation only, not prompt building)
            start_time = time.perf_counter()
            outputs = llm.generate(prompts, sampling)
            trial_time = time.perf_counter() - start_time

            trial_times.append(trial_time)
            total_prompts += len(prompts)

            # Count generated tokens of the first completion of each request
            total_tokens += sum(
                len(output.outputs[0].token_ids)
                for output in outputs
                if output.outputs
            )

            print(
                f"Trial {trial + 1}/{num_trials}: "
                f"batch_size={batch_size}, "
                f"time={trial_time:.2f}s"
            )

            # Sanity check: the output at the needle slot belongs to the needle
            needle_output = outputs[needle_pos]
            assert needle_output.prompt == needle_prompt

        # Compute statistics
        total_time = sum(trial_times)
        avg_time = total_time / len(trial_times)
        min_time = min(trial_times)
        max_time = max(trial_times)
        throughput = total_tokens / total_time
        prompts_per_sec = total_prompts / total_time

        print(f"\n{'=' * 80}")
        print("RESULTS:")
        print(f" Average time per trial: {avg_time:.2f}s")
        print(f" Min time: {min_time:.2f}s")
        print(f" Max time: {max_time:.2f}s")
        print(f" Total tokens generated: {total_tokens}")
        print(f" Total prompts processed: {total_prompts}")
        print(f" Throughput: {throughput:.2f} tokens/s")
        print(f" Prompts/s: {prompts_per_sec:.2f}")
        print(f"{'=' * 80}\n")

        return {
            "init_time": init_time,
            "avg_time": avg_time,
            "min_time": min_time,
            "max_time": max_time,
            "total_tokens": total_tokens,
            "total_prompts": total_prompts,
            "throughput": throughput,
            "prompts_per_sec": prompts_per_sec,
            "trial_times": trial_times,
        }

    finally:
        # Best-effort cleanup: a failing shutdown must not mask the result or
        # the original exception.
        if llm is not None:
            with contextlib.suppress(Exception):
                llm.shutdown()
|
||||
|
||||
|
||||
def main():
    """Benchmark baseline vs. batch-invariant mode and print a comparison.

    Settings are read from the ``VLLM_BENCH_*`` environment variables. The same
    workload is run twice, first with ``batch_invariant=False`` and then with
    ``batch_invariant=True``, and the relative differences are printed.

    Returns:
        0 once the comparison has been printed, or 1 when the platform check
        (CUDA with device capability 90) fails.
    """
    rule = "=" * 80
    env = os.environ.get

    # Platform gate: report why we cannot run instead of starting the engine.
    unsupported = (
        not current_platform.is_cuda()
        or not current_platform.has_device_capability(90)
    )
    if unsupported:
        print("ERROR: Requires CUDA and >= Hopper (SM90)")
        print(f"Current platform: {current_platform.device_type}")
        if current_platform.is_cuda():
            print(f"Device capability: {current_platform.get_device_capability()}")
        return 1

    # Settings from the environment, parsed in a fixed order.
    model = env("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
    tp_size = int(env("VLLM_BENCH_TP_SIZE", "1"))
    max_batch_size = int(env("VLLM_BENCH_BATCH_SIZE", "128"))
    num_trials = int(env("VLLM_BENCH_NUM_TRIALS", "5"))
    min_prompt = int(env("VLLM_BENCH_MIN_PROMPT", "1024"))
    max_prompt = int(env("VLLM_BENCH_MAX_PROMPT", "2048"))
    max_tokens = int(env("VLLM_BENCH_MAX_TOKENS", "128"))
    temperature = float(env("VLLM_BENCH_TEMPERATURE", "0.0"))
    gpu_mem_util = float(env("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
    max_model_len = int(env("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
    backend = env("VLLM_BENCH_BACKEND", "FLASH_ATTN")

    print("\n" + rule)
    print("VLLM BATCH INVARIANCE BENCHMARK")
    print(rule)
    print("\nConfiguration:")
    print(f" Model: {model}")
    print(f" Tensor Parallel Size: {tp_size}")
    print(f" Attention Backend: {backend}")
    print(f" Max Batch Size: {max_batch_size}")
    print(f" Number of Trials: {num_trials}")
    print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
    print(f" Max Tokens to Generate: {max_tokens}")
    print(f" Temperature: {temperature}")
    print(f" GPU Memory Utilization: {gpu_mem_util}")
    print(f" Max Model Length: {max_model_len}")
    print(rule)

    # Both phases share everything except the batch_invariant switch.
    shared_kwargs = {
        "model": model,
        "tp_size": tp_size,
        "max_batch_size": max_batch_size,
        "num_trials": num_trials,
        "min_prompt": min_prompt,
        "max_prompt": max_prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "gpu_mem_util": gpu_mem_util,
        "max_model_len": max_model_len,
        "backend": backend,
    }

    # Phase 1: baseline, batch invariance off.
    print("\n" + rule)
    print("PHASE 1: Running WITHOUT batch invariance (baseline)")
    print(rule)
    baseline_results = run_benchmark_with_batch_invariant(
        **shared_kwargs, batch_invariant=False
    )

    # Phase 2: batch invariance on.
    print("\n" + rule)
    print("PHASE 2: Running WITH batch invariance")
    print(rule)
    batch_inv_results = run_benchmark_with_batch_invariant(
        **shared_kwargs, batch_invariant=True
    )

    print("\n" + rule)
    print("COMPARISON: Batch Invariance vs Baseline")
    print(rule)

    def relative_change_pct(metric: str) -> float:
        """Change of *metric* from baseline to batch-invariant, in percent."""
        reference = baseline_results[metric]
        return (batch_inv_results[metric] - reference) / reference * 100

    # All three ratios are computed before any comparison line is printed.
    init_overhead_pct = relative_change_pct("init_time")
    time_overhead_pct = relative_change_pct("avg_time")
    throughput_change_pct = relative_change_pct("throughput")

    print("\nInitialization Time:")
    print(f" Baseline: {baseline_results['init_time']:.2f}s")
    print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s")
    print(f" Overhead: {init_overhead_pct:+.2f}%")

    print("\nAverage Trial Time:")
    print(f" Baseline: {baseline_results['avg_time']:.2f}s")
    print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
    print(f" Overhead: {time_overhead_pct:+.2f}%")

    print("\nThroughput (tokens/s):")
    print(f" Baseline: {baseline_results['throughput']:.2f}")
    print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}")
    print(f" Change: {throughput_change_pct:+.2f}%")

    print("\nPrompts/s:")
    print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}")
    print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")

    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    if time_overhead_pct > 0:
        time_verdict = (
            f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
            "overhead"
        )
    else:
        time_verdict = (
            f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
            "faster (unexpected!)"
        )
    print(time_verdict)

    if abs(throughput_change_pct) < 1.0:
        throughput_verdict = "Throughput difference is negligible (< 1%)"
    elif throughput_change_pct < 0:
        throughput_verdict = (
            f"Throughput decreased by {-throughput_change_pct:.1f}% "
            "with batch invariance"
        )
    else:
        throughput_verdict = (
            f"Throughput increased by {throughput_change_pct:.1f}% "
            "with batch invariance (unexpected!)"
        )
    print(throughput_verdict)

    print(rule + "\n")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit is
    # raised directly because the bare `exit` helper is injected by the `site`
    # module and is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(main())
|
||||
@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
|
||||
----------------------------------------------------------------------------------------------------
|
||||
```
|
||||
|
||||
If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
|
||||
and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
|
||||
benchmark-only runtime so the reported throughput stays comparable).
|
||||
|
||||
### JSON configuration file for synthetic conversations generation
|
||||
|
||||
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
|
||||
|
||||
@ -561,8 +561,11 @@ async def client_main(
|
||||
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
|
||||
)
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
# Set unique seed per client (each client runs in its own process)
|
||||
# Add 1 to ensure no client uses the same seed as the main process
|
||||
client_seed = args.seed + client_id + 1
|
||||
random.seed(client_seed)
|
||||
np.random.seed(client_seed)
|
||||
|
||||
# Active conversations
|
||||
active_convs: ConversationsMap = {}
|
||||
@ -1073,6 +1076,7 @@ def process_statistics(
|
||||
verbose: bool,
|
||||
gen_conv_args: GenConvArgs | None = None,
|
||||
excel_output: bool = False,
|
||||
warmup_runtime_sec: float | None = None,
|
||||
) -> None:
|
||||
if len(client_metrics) == 0:
|
||||
logger.info("No samples to process")
|
||||
@ -1166,8 +1170,13 @@ def process_statistics(
|
||||
# Convert milliseconds to seconds
|
||||
runtime_sec = runtime_sec / 1000.0
|
||||
requests_per_sec = float(len(df)) / runtime_sec
|
||||
|
||||
params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
|
||||
params = {
|
||||
"runtime_sec": runtime_sec,
|
||||
"requests_per_sec": requests_per_sec,
|
||||
}
|
||||
if warmup_runtime_sec is not None:
|
||||
params["warmup_runtime_sec"] = warmup_runtime_sec
|
||||
params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
|
||||
|
||||
# Generate a summary of relevant metrics (and drop irrelevant data)
|
||||
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
|
||||
@ -1490,6 +1499,7 @@ async def main() -> None:
|
||||
f"Invalid --warmup-percentage={args.warmup_percentage}"
|
||||
) from None
|
||||
|
||||
# Set global seeds for main process
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
|
||||
@ -1548,6 +1558,8 @@ async def main() -> None:
|
||||
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
|
||||
)
|
||||
|
||||
warmup_runtime_sec: float | None = None
|
||||
|
||||
# Warm-up step
|
||||
if args.warmup_step:
|
||||
# Only send a single user prompt from every conversation.
|
||||
@ -1562,26 +1574,56 @@ async def main() -> None:
|
||||
# all clients should finish their work before exiting
|
||||
warmup_bench_args = bench_args._replace(early_stop=False)
|
||||
|
||||
logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
|
||||
logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
|
||||
warmup_start_ns = time.perf_counter_ns()
|
||||
conversations, _ = await main_mp(
|
||||
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
|
||||
)
|
||||
logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
|
||||
warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
|
||||
logger.info(
|
||||
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
|
||||
Color.PURPLE,
|
||||
warmup_runtime_sec,
|
||||
warmup_runtime_sec * 1000,
|
||||
Color.RESET,
|
||||
)
|
||||
logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
|
||||
|
||||
# Run the benchmark
|
||||
start_time = time.perf_counter_ns()
|
||||
benchmark_start_ns = time.perf_counter_ns()
|
||||
client_convs, client_metrics = await main_mp(
|
||||
client_args, req_args, bench_args, tokenizer, conversations
|
||||
)
|
||||
total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
|
||||
benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
|
||||
|
||||
# Calculate requests per second
|
||||
total_runtime_sec = total_runtime_ms / 1000.0
|
||||
rps = len(client_metrics) / total_runtime_sec
|
||||
requests_per_sec = len(client_metrics) / benchmark_runtime_sec
|
||||
benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
|
||||
logger.info(
|
||||
f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
|
||||
f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
|
||||
"%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
|
||||
"requests per second: %.3f%s",
|
||||
Color.GREEN,
|
||||
benchmark_runtime_sec,
|
||||
benchmark_runtime_ms,
|
||||
requests_per_sec,
|
||||
Color.RESET,
|
||||
)
|
||||
if warmup_runtime_sec is not None:
|
||||
total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
|
||||
logger.info(
|
||||
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
|
||||
Color.GREEN,
|
||||
warmup_runtime_sec,
|
||||
warmup_runtime_sec * 1000,
|
||||
Color.RESET,
|
||||
)
|
||||
logger.info(
|
||||
"%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
|
||||
Color.GREEN,
|
||||
total_runtime_sec,
|
||||
total_runtime_sec * 1000,
|
||||
Color.RESET,
|
||||
)
|
||||
|
||||
# Benchmark parameters
|
||||
params = {
|
||||
@ -1606,6 +1648,7 @@ async def main() -> None:
|
||||
verbose=args.verbose,
|
||||
gen_conv_args=gen_conv_args,
|
||||
excel_output=args.excel_output,
|
||||
warmup_runtime_sec=warmup_runtime_sec,
|
||||
)
|
||||
|
||||
if args.output_file is not None:
|
||||
|
||||
@ -38,7 +38,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309
|
||||
GIT_TAG 58e0626a692f09241182582659e3bf8f16472659
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
@ -754,7 +754,7 @@ class AttentionScheduler {
|
||||
return l2_cache_size >> 1; // use 50% of L2 cache
|
||||
}
|
||||
// Fallback if sysctlbyname fails
|
||||
return 128 * 1024 >> 1; // use 50% of 128KB
|
||||
return 128LL * 1024 >> 1; // use 50% of 128KB
|
||||
#else
|
||||
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
|
||||
TORCH_CHECK_NE(l2_cache_size, -1);
|
||||
|
||||
@ -100,6 +100,9 @@ void cpu_attention_with_kv_cache(
|
||||
const torch::Tensor& scheduler_metadata,
|
||||
const std::optional<torch::Tensor>& s_aux);
|
||||
|
||||
// Note: only here to avoid import errors
|
||||
void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
|
||||
|
||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// vLLM custom ops
|
||||
|
||||
@ -275,6 +278,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
|
||||
"float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()",
|
||||
&cpu_attention_with_kv_cache);
|
||||
|
||||
// placeholders
|
||||
ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
|
||||
ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
|
||||
ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
|
||||
|
||||
@ -37,6 +37,16 @@
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#define FINAL_MASK 0xffffffffffffffffULL
|
||||
|
||||
#if defined(HIP_VERSION) && HIP_VERSION < 70000000
|
||||
// On ROCm versions before 7.0, __syncwarp isn't defined. The below
|
||||
// implementation is copy/pasted from the implementation in ROCm 7.0
|
||||
__device__ inline void __syncwarp() {
|
||||
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
|
||||
__builtin_amdgcn_wave_barrier();
|
||||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#define FINAL_MASK 0xffffffff
|
||||
#endif
|
||||
|
||||
@ -802,7 +802,7 @@ torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace) {
|
||||
});
|
||||
|
||||
if (numel % 256 != 0) {
|
||||
out = out.index({torch::indexing::Slice(0, numel / had_size)});
|
||||
out = out.narrow(0, 0, numel / had_size);
|
||||
}
|
||||
|
||||
if (inplace && out.data_ptr() != x.data_ptr()) {
|
||||
|
||||
@ -116,6 +116,26 @@ struct sm90_fp8_config_default {
|
||||
ClusterShape, KernelSchedule, EpilogueSchedule>>;
|
||||
};
|
||||
|
||||
template <typename InType, typename OutType, bool EnableBias>
|
||||
struct sm90_fp8_config_M8192_K6144 {
|
||||
// M >= 8192, K >= 6144
|
||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||
using KernelSchedule =
|
||||
cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
|
||||
using EpilogueSchedule =
|
||||
typename cutlass::epilogue::TmaWarpSpecializedCooperative;
|
||||
using TileShape = Shape<_256, _128, _128>;
|
||||
using ClusterShape = Shape<_2, _1, _1>;
|
||||
|
||||
using Cutlass3xGemm = conditional_t<
|
||||
EnableBias,
|
||||
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
|
||||
TileShape, ClusterShape, KernelSchedule,
|
||||
EpilogueSchedule>,
|
||||
cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
|
||||
ClusterShape, KernelSchedule, EpilogueSchedule>>;
|
||||
};
|
||||
|
||||
template <typename InType, typename OutType, bool EnableBias>
|
||||
struct sm90_fp8_config_M128 {
|
||||
// M in (64, 128]
|
||||
@ -273,6 +293,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
||||
using Cutlass3xGemmDefault =
|
||||
typename sm90_fp8_config_default<InType, OutType,
|
||||
EnableBias>::Cutlass3xGemm;
|
||||
using Cutlass3xGemmM8192_K6144 =
|
||||
typename sm90_fp8_config_M8192_K6144<InType, OutType,
|
||||
EnableBias>::Cutlass3xGemm;
|
||||
using Cutlass3xGemmM128 =
|
||||
typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
|
||||
|
||||
@ -291,6 +314,7 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
||||
|
||||
uint32_t const m = a.size(0);
|
||||
uint32_t const n = b.size(1);
|
||||
uint32_t const k = a.size(1);
|
||||
|
||||
if (m <= 16) {
|
||||
// m in [1, 16]
|
||||
@ -312,6 +336,9 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
|
||||
// m in (64, 128]
|
||||
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
|
||||
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
|
||||
} else if (m >= 8192 && k >= 6144) {
|
||||
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM8192_K6144>(
|
||||
out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
|
||||
} else {
|
||||
// m in (128, inf)
|
||||
return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(
|
||||
|
||||
@ -17,10 +17,7 @@ RUN python3 -m pip install --upgrade pip
|
||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
|
||||
# Install UV
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Activate virtual environment and add uv to PATH
|
||||
ENV PATH="/root/.local/bin:$PATH"
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
|
||||
ARG TRITON_BRANCH="57c693b6"
|
||||
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
||||
ARG PYTORCH_BRANCH="1c57644d"
|
||||
@ -7,7 +7,7 @@ ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="0e60e394"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="9716b1b8"
|
||||
ARG AITER_BRANCH="59bd8ff2"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
@ -19,6 +19,9 @@ ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx11
|
||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||
ENV AITER_ROCM_ARCH=gfx942;gfx950
|
||||
|
||||
# Required for RCCL in ROCm7.1
|
||||
ENV HSA_NO_SCRATCH_RECLAIM=1
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
RUN mkdir -p /app
|
||||
|
||||
@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \
|
||||
libxext6 \
|
||||
libgl1 \
|
||||
lsb-release \
|
||||
libaio-dev \
|
||||
numactl \
|
||||
wget \
|
||||
vim \
|
||||
@ -68,8 +69,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
# install nixl from source code
|
||||
ENV NIXL_VERSION=0.7.0
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip uninstall oneccl oneccl-devel -y
|
||||
|
||||
@ -46,7 +46,10 @@ nav:
|
||||
- contributing/model/multimodal.md
|
||||
- contributing/model/transcription.md
|
||||
- CI: contributing/ci
|
||||
- Design Documents: design
|
||||
- Design Documents:
|
||||
- Plugins:
|
||||
- design/*plugin*.md
|
||||
- design/*
|
||||
- API Reference:
|
||||
- api/README.md
|
||||
- api/vllm
|
||||
|
||||
@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
|
||||
Where to get started with vLLM depends on the type of user. If you are looking to:
|
||||
|
||||
- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
|
||||
- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
|
||||
- Build vLLM, we recommend starting with [Developer Guide](./contributing)
|
||||
- Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
|
||||
- Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md)
|
||||
|
||||
For information about the development of vLLM, see:
|
||||
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_latency.md"
|
||||
--8<-- "docs/argparse/bench_latency.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_serve.md"
|
||||
--8<-- "docs/argparse/bench_serve.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_sweep_plot.md"
|
||||
--8<-- "docs/argparse/bench_sweep_plot.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_sweep_serve.md"
|
||||
--8<-- "docs/argparse/bench_sweep_serve.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_sweep_serve_sla.md"
|
||||
--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/bench_throughput.md"
|
||||
--8<-- "docs/argparse/bench_throughput.inc.md"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# vllm chat
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/chat.md"
|
||||
--8<-- "docs/argparse/chat.inc.md"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# vllm complete
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/complete.md"
|
||||
--8<-- "docs/argparse/complete.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/run-batch.md"
|
||||
--8<-- "docs/argparse/run-batch.inc.md"
|
||||
|
||||
@ -4,6 +4,6 @@
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
## Arguments
|
||||
|
||||
--8<-- "docs/argparse/serve.md"
|
||||
--8<-- "docs/argparse/serve.inc.md"
|
||||
|
||||
@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
## CLI Arguments
|
||||
|
||||
The `vllm serve` command is used to launch the OpenAI-compatible server.
|
||||
To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
|
||||
To see the available options, take a look at the [CLI Reference](../cli/README.md)!
|
||||
|
||||
## Configuration file
|
||||
|
||||
|
||||
@ -10,8 +10,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu
|
||||
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
|
||||
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
|
||||
|
||||
[Benchmark CLI]: #benchmark-cli
|
||||
|
||||
## Benchmark CLI
|
||||
|
||||
This section guides you through running benchmark tests with the extensive
|
||||
@ -985,7 +983,7 @@ each document has close to 512 tokens.
|
||||
|
||||
Please note that the `/v1/rerank` is also supported by embedding models. So if you're running
|
||||
with an embedding model, also set `--no_reranker`. Because in this case the query is
|
||||
treated as a individual prompt by the server, here we send `random_batch_size - 1` documents
|
||||
treated as an individual prompt by the server, here we send `random_batch_size - 1` documents
|
||||
to account for the extra prompt which is the query. The token accounting to report the
|
||||
throughput numbers correctly is also adjusted.
|
||||
|
||||
|
||||
@ -95,7 +95,7 @@ when manually triggering a build on Buildkite. This branch accomplishes two thin
|
||||
to warm it up so that future builds are faster.
|
||||
|
||||
<p align="center" width="100%">
|
||||
<img width="60%" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
|
||||
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
|
||||
</p>
|
||||
|
||||
## Update dependencies
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Summary
|
||||
|
||||
!!! important
|
||||
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
Many decoder language models can now be automatically loaded using the [Transformers modeling backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
|
||||
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
||||
|
||||
|
||||
@ -249,7 +249,7 @@ No extra registration is required beyond having your model class available via t
|
||||
## Examples in-tree
|
||||
|
||||
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
|
||||
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
|
||||
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
|
||||
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
|
||||
|
||||
## Test with the API
|
||||
|
||||
@ -224,6 +224,6 @@ snakeviz expensive_function.prof
|
||||
|
||||
Use the `VLLM_GC_DEBUG` environment variable to debug GC costs.
|
||||
|
||||
- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times
|
||||
- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elapsed times
|
||||
- VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5
|
||||
collected objects for each gc.collect
|
||||
|
||||
@ -29,8 +29,8 @@ pip install vllm
|
||||
- API Path: `/chat/completions`
|
||||
- Model: `qwen/Qwen1.5-0.5B-Chat`
|
||||
|
||||

|
||||

|
||||
|
||||
1. Go to `Just chat`, and start to chat:
|
||||
|
||||

|
||||

|
||||
|
||||
@ -46,12 +46,12 @@ And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compos
|
||||
- **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
|
||||
- **Completion Mode**: `Completion`
|
||||
|
||||

|
||||

|
||||
|
||||
1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
|
||||
|
||||

|
||||

|
||||
|
||||
1. Click the chatbot you just created to open the chat interface and start interacting with the model:
|
||||
|
||||

|
||||

|
||||
|
||||
@ -156,7 +156,7 @@ In this guide, we demonstrate manual deployment using the [`rednote-hilab/dots.o
|
||||
|
||||
## Advanced Deployment Details
|
||||
|
||||
With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications.
|
||||
With the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications.
|
||||
|
||||
Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts.
|
||||
|
||||
@ -167,4 +167,4 @@ The platform integrates seamlessly with the Hugging Face Hub, allowing you to de
|
||||
- Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog
|
||||
- Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index)
|
||||
- Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm)
|
||||
- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
|
||||
- Understand the [Transformers modeling backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
|
||||
|
||||
@ -128,7 +128,7 @@ A [CUDAGraphWrapper][vllm.compilation.cuda_graph.CUDAGraphWrapper] instance wrap
|
||||
3. Otherwise, i.e., the runtime_mode matches the mode of the wrapper, the wrapper will perform CUDA Graphs capture (if key does not exist, create
|
||||
a new entry and cache it) or replay (if key exists in the cache).
|
||||
|
||||
The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and cenralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
|
||||
The above steps are based on the assumption that the CUDA Graphs wrapper would directly trust what’s in the forward context (controlled by the dispatcher). This lets us simplify and centralize the logic, reducing the complexity as well as the risk of mismatched state between the wrappers and the dispatcher. It also allows reusing the wrapper class for both `FULL` and `PIECEWISE` runtime modes. See the implementation [here](https://github.com/vllm-project/vllm/blob/f751e50b7a2aae3110d83ed0d88202fc91b3e78a/vllm/compilation/cuda_graph.py#L106).
|
||||
|
||||
#### Nested Wrapper design
|
||||
|
||||
|
||||
@ -19,9 +19,9 @@ The input activation format completely depends on the All2All Dispatch being use
|
||||
|
||||
The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below
|
||||
|
||||

|
||||

|
||||
|
||||

|
||||

|
||||
|
||||
!!! note
|
||||
The main difference, in terms of operations, between the Batched and Non-Batched cases is the Permute / Unpermute operations. All other operations remain.
|
||||
@ -57,7 +57,7 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts
|
||||
The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions.
|
||||
The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, the `prepare_no_receive` function is like `prepare` except it does not wait to receive results from other workers. Instead, it returns a "receiver" callback that must be invoked to wait for the final results from the other workers. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
|
||||
|
||||

|
||||

|
||||
|
||||
### FusedMoEPermuteExpertsUnpermute
|
||||
|
||||
@ -88,7 +88,7 @@ The core FusedMoE implementation performs a series of operations. It would be in
|
||||
It is sometimes efficient to perform TopK weight application and Reduction inside the `FusedMoEPermuteExpertsUnpermute::apply()`. Find an example [here](https://github.com/vllm-project/vllm/pull/20228). We have a `TopKWeightAndReduce` abstract class to facilitate such implementations. Please refer to the TopKWeightAndReduce section.
|
||||
`FusedMoEPermuteExpertsUnpermute::finalize_weight_and_reduce_impl()` returns the `TopKWeightAndReduce` object that the implementation wants the `FusedMoEPrepareAndFinalize::finalize()` to use.
|
||||
|
||||

|
||||

|
||||
|
||||
### FusedMoEModularKernel
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# IO Processor Plugins
|
||||
|
||||
IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
|
||||
IO Processor plugins are a feature that allows pre- and post-processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
|
||||
|
||||
When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
|
||||
|
||||
|
||||
@ -411,7 +411,7 @@ Logits processor `update_state()` implementations should assume the following mo
|
||||
|
||||
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
|
||||
|
||||
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
|
||||
* **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
|
||||
|
||||
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
|
||||
|
||||
@ -548,7 +548,7 @@ Built-in logits processors are always loaded when the vLLM engine starts. See th
|
||||
|
||||
Review these logits processor implementations for guidance on writing built-in logits processors.
|
||||
|
||||
Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforemented logits processor programming model.
|
||||
Additionally, the following logits-processor-like functionalities are hard-coded into the sampler and do not yet utilize the programming model described above. Most of them will be refactored to use the aforementioned logits processor programming model.
|
||||
|
||||
* Allowed token IDs
|
||||
|
||||
|
||||
220
docs/design/lora_resolver_plugins.md
Normal file
220
docs/design/lora_resolver_plugins.md
Normal file
@ -0,0 +1,220 @@
|
||||
# LoRA Resolver Plugins
|
||||
|
||||
This document describes vLLM's LoRA resolver plugins, which are built on the `LoRAResolver` framework.
|
||||
They automatically discover and load LoRA adapters from a specified local storage path, eliminating the need for manual configuration or server restarts.
|
||||
|
||||
## Overview
|
||||
|
||||
LoRA Resolver Plugins provide a flexible way to dynamically load LoRA adapters at runtime. When vLLM
|
||||
receives a request for a LoRA adapter that hasn't been loaded yet, the resolver plugins will attempt
|
||||
to locate and load the adapter from their configured storage locations. This enables:
|
||||
|
||||
- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
|
||||
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source.
|
||||
- **Automatic Discovery**: Seamless integration with existing LoRA workflows
|
||||
- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before using LoRA Resolver Plugins, ensure the following environment variables are configured:
|
||||
|
||||
### Required Environment Variables
|
||||
|
||||
1. **`VLLM_ALLOW_RUNTIME_LORA_UPDATING`**: Must be set to `true` or `1` to enable dynamic LoRA loading
|
||||
```bash
|
||||
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
|
||||
```
|
||||
|
||||
2. **`VLLM_PLUGINS`**: Must include the desired resolver plugins (comma-separated list)
|
||||
```bash
|
||||
export VLLM_PLUGINS=lora_filesystem_resolver
|
||||
```
|
||||
|
||||
3. **`VLLM_LORA_RESOLVER_CACHE_DIR`**: Must be set to a valid directory path for filesystem resolver
|
||||
```bash
|
||||
export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
|
||||
```
|
||||
|
||||
### Optional Environment Variables
|
||||
|
||||
- **`VLLM_PLUGINS`**: If not set, all available plugins will be loaded. If set to empty string, no plugins will be loaded.
|
||||
|
||||
## Available Resolvers
|
||||
|
||||
### lora_filesystem_resolver
|
||||
|
||||
The filesystem resolver is installed with vLLM by default and enables loading LoRA adapters from a local directory structure.
|
||||
|
||||
#### Setup Steps
|
||||
|
||||
1. **Create the LoRA adapter storage directory**:
|
||||
```bash
|
||||
mkdir -p /path/to/lora/adapters
|
||||
```
|
||||
|
||||
2. **Set environment variables**:
|
||||
```bash
|
||||
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=true
|
||||
export VLLM_PLUGINS=lora_filesystem_resolver
|
||||
export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora/adapters
|
||||
```
|
||||
|
||||
3. **Start vLLM server**:
|
||||
Replace `your-base-model` below with your base model, for example `meta-llama/Llama-2-7b-hf`. If the model requires authentication, make sure your Hugging Face token is set in the environment first, e.g. `export HF_TOKEN=<your-token>`.
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model your-base-model \
|
||||
--enable-lora
|
||||
```
|
||||
|
||||
#### Directory Structure Requirements
|
||||
|
||||
The filesystem resolver expects LoRA adapters to be organized in the following structure:
|
||||
|
||||
```text
|
||||
/path/to/lora/adapters/
|
||||
├── adapter1/
|
||||
│ ├── adapter_config.json
|
||||
│ ├── adapter_model.bin
|
||||
│ └── tokenizer files (if applicable)
|
||||
├── adapter2/
|
||||
│ ├── adapter_config.json
|
||||
│ ├── adapter_model.bin
|
||||
│ └── tokenizer files (if applicable)
|
||||
└── ...
|
||||
```
|
||||
|
||||
Each adapter directory must contain:
|
||||
|
||||
- **`adapter_config.json`**: Required configuration file with the following structure:
|
||||
```json
|
||||
{
|
||||
"peft_type": "LORA",
|
||||
"base_model_name_or_path": "your-base-model-name",
|
||||
"r": 16,
|
||||
"lora_alpha": 32,
|
||||
"target_modules": ["q_proj", "v_proj"],
|
||||
"bias": "none",
|
||||
"modules_to_save": null,
|
||||
"use_rslora": false,
|
||||
"use_dora": false
|
||||
}
|
||||
```
|
||||
|
||||
- **`adapter_model.bin`**: The LoRA adapter weights file
|
||||
|
||||
#### Usage Example
|
||||
|
||||
1. **Prepare your LoRA adapter**:
|
||||
```bash
|
||||
# Assuming you have a LoRA adapter in /tmp/my_lora_adapter
|
||||
cp -r /tmp/my_lora_adapter /path/to/lora/adapters/my_sql_adapter
|
||||
```
|
||||
|
||||
2. **Verify the directory structure**:
|
||||
```bash
|
||||
ls -la /path/to/lora/adapters/my_sql_adapter/
|
||||
# Should show: adapter_config.json, adapter_model.bin, etc.
|
||||
```
|
||||
|
||||
3. **Make a request using the adapter**:
|
||||
```bash
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "my_sql_adapter",
|
||||
"prompt": "Generate a SQL query for:",
|
||||
"max_tokens": 50,
|
||||
"temperature": 0.1
|
||||
}'
|
||||
```
|
||||
|
||||
#### How It Works
|
||||
|
||||
1. vLLM receives a request for a LoRA adapter named `my_sql_adapter`
|
||||
2. The filesystem resolver checks if `/path/to/lora/adapters/my_sql_adapter/` exists
|
||||
3. If found, it validates the `adapter_config.json` file
|
||||
4. If the configuration matches the base model and is valid, the adapter is loaded
|
||||
5. The request is processed normally with the newly loaded adapter
|
||||
6. The adapter remains available for future requests
|
||||
|
||||
## Advanced Configuration
|
||||
|
||||
### Multiple Resolvers
|
||||
|
||||
You can configure multiple resolver plugins to load adapters from different sources:
|
||||
|
||||
In the example below, `lora_s3_resolver` is a hypothetical custom resolver that you would need to implement yourself.
|
||||
|
||||
```bash
|
||||
export VLLM_PLUGINS=lora_filesystem_resolver,lora_s3_resolver
|
||||
```
|
||||
|
||||
All listed resolvers are enabled; at request time, vLLM tries them in order until one succeeds.
|
||||
|
||||
### Custom Resolver Implementation
|
||||
|
||||
To implement your own resolver plugin:
|
||||
|
||||
1. **Create a new resolver class**:
|
||||
```python
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
class CustomResolver(LoRAResolver):
|
||||
async def resolve_lora(self, base_model_name: str, lora_name: str) -> LoRARequest | None:
|
||||
# Your custom resolution logic here
|
||||
pass
|
||||
```
|
||||
|
||||
2. **Register the resolver**:
|
||||
```python
|
||||
def register_custom_resolver():
|
||||
resolver = CustomResolver()
|
||||
LoRAResolverRegistry.register_resolver("Custom Resolver", resolver)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **"VLLM_LORA_RESOLVER_CACHE_DIR must be set to a valid directory"**
|
||||
- Ensure the directory exists and is accessible
|
||||
- Check file permissions on the directory
|
||||
|
||||
2. **"LoRA adapter not found"**
|
||||
- Verify the adapter directory name matches the requested model name
|
||||
- Check that `adapter_config.json` exists and is valid JSON
|
||||
- Ensure `adapter_model.bin` exists in the directory
|
||||
|
||||
3. **"Invalid adapter configuration"**
|
||||
- Verify `peft_type` is set to "LORA"
|
||||
- Check that `base_model_name_or_path` matches your base model
|
||||
- Ensure `target_modules` is properly configured
|
||||
|
||||
4. **"LoRA rank exceeds maximum"**
|
||||
- Check that the `r` value in `adapter_config.json` doesn't exceed the `max_lora_rank` setting
|
||||
|
||||
### Debugging Tips
|
||||
|
||||
1. **Enable debug logging**:
|
||||
```bash
|
||||
export VLLM_LOGGING_LEVEL=DEBUG
|
||||
```
|
||||
|
||||
2. **Verify environment variables**:
|
||||
```bash
|
||||
echo $VLLM_ALLOW_RUNTIME_LORA_UPDATING
|
||||
echo $VLLM_PLUGINS
|
||||
echo $VLLM_LORA_RESOLVER_CACHE_DIR
|
||||
```
|
||||
|
||||
3. **Test adapter configuration**:
|
||||
```bash
|
||||
python -c "
|
||||
import json
|
||||
with open('/path/to/lora/adapters/my_adapter/adapter_config.json') as f:
|
||||
config = json.load(f)
|
||||
print('Config valid:', config)
|
||||
"
|
||||
```
|
||||
@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
|
||||
|
||||
## Fused MoE Experts Kernels
|
||||
|
||||
The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adatpers so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
|
||||
There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
|
||||
|
||||
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ You can use vLLM *custom arguments* to pass in arguments which are not part of t
|
||||
Custom arguments can be useful if, for example, you want to use a [custom logits processor](./custom_logitsprocs.md) without modifying the vLLM source code.
|
||||
|
||||
!!! note
|
||||
Make sure your custom logits processor have implemented `validate_params` for custom arguments. Otherwise invalid custom arguments can cause unexpected behaviour.
|
||||
Make sure your custom logits processor implements `validate_params` for custom arguments. Otherwise, invalid custom arguments can cause unexpected behaviour.
|
||||
|
||||
## Offline Custom Arguments
|
||||
|
||||
|
||||
@ -71,7 +71,7 @@ Logits processor `update_state()` implementations should assume the following mo
|
||||
|
||||
* **"Condense" the batch to be contiguous:** starting with the lowest-index empty slot (which was caused by a Remove), apply a Unidirectional Move from the current highest non-empty slot in the batch to fill the empty slot. Proceed with additional Unidirectional Move operations in order of increasing empty slot destination index and decreasing non-empty slot source index until the batch is contiguous
|
||||
|
||||
* **Shrink the batch:** a side-effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
|
||||
* **Shrink the batch:** a side effect of condensing the batch is that empty slots resulting from Remove operations are grouped in a contiguous block at the end of the batch array. Thus, after condensing, update `BatchUpdate.batch_size` to reflect the number of non-empty slots
|
||||
|
||||
5. Reorder the batch for improved efficiency. Depending on the attention backend implementation and the current characteristics of the batch, zero or more Swap Move operations may be applied to reorder the batch
|
||||
|
||||
@ -286,7 +286,7 @@ Once you have created a custom subclass (like `WrappedPerReqLogitsProcessor`) wh
|
||||
|
||||
## Ways to Load Your Custom Logits Processor in vLLM
|
||||
|
||||
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits logits processors cannot be loaded on-demand for individual requests.
|
||||
Logits processors are loaded at initialization. Critically, the set of loaded logits processors cannot be modified after the vLLM engine finishes loading, and new logits processors cannot be loaded on-demand for individual requests.
|
||||
|
||||
This section details different ways of making your logits processor visible to vLLM and triggering vLLM to load your logits processor.
|
||||
|
||||
@ -438,7 +438,7 @@ The examples below show how a user would pass a custom argument (`target_token`)
|
||||
|
||||
## Best Practices for Writing Custom Logits Processors
|
||||
|
||||
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus it is important to implement these methods efficiently.
|
||||
Once vLLM loads a logits processor during initialization, then vLLM will invoke `update_state()` and `apply()` against that logits processor in every engine step. Both methods operate on all requests which currently reside in the vLLM persistent batch. Thus, it is important to implement these methods efficiently.
|
||||
|
||||
* Write efficient `apply()` and `update_state()` implementations in light of the fact that logits processors operate at batch granularity
|
||||
* For example, you may be able to use efficient vectorized operations to implement `apply()` or update internal state vectors in `update_state()`
|
||||
@ -465,4 +465,4 @@ Once vLLM loads a logits processor during initialization, then vLLM will invoke
|
||||
|
||||
* **Note:** for wrapped per-request logits processors, the `AdapterLogitsProcessor` base-class handles this by default
|
||||
|
||||
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
|
||||
* `is_argmax_invariant()` can be hard-coded to `True` or `False` if the logits processor has consistent behavior. However, the argmax invariance may also be determined programmatically (i.e. if your logits processor is user-customizable in some way that impacts whether the logits processor is argmax invariant). For this reason, `is_argmax_invariant()` is not a class method
|
||||
|
||||
@ -91,6 +91,6 @@ Disaggregated prefilling is highly related to infrastructure, so vLLM relies on
|
||||
|
||||
We recommend three implementation approaches:
|
||||
|
||||
- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
|
||||
- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and much more (like editing vLLM's model input to perform customized prefilling, etc.). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions.
|
||||
- **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL.
|
||||
- **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`.
|
||||
|
||||
@ -4,7 +4,7 @@ This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09
|
||||
|
||||
LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vllm.model_executor.models.interfaces.SupportsLoRA].
|
||||
|
||||
Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save
|
||||
Adapters can be efficiently served on a per-request basis with minimal overhead. First we download the adapter(s) and save
|
||||
them locally with
|
||||
|
||||
```python
|
||||
|
||||
@ -298,7 +298,7 @@ There are two steps to generate and deploy a mixed precision model quantized wit
|
||||
|
||||
Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later.
|
||||
|
||||
As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benifits. They are:
|
||||
As examples, we provide some ready-to-use quantized mixed precision models to show the usage in vLLM and the accuracy benefits. They are:
|
||||
|
||||
- amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
|
||||
- amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
|
||||
|
||||
@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels.
|
||||
|
||||
### Which `dtype` should be used?
|
||||
|
||||
- Currently vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem.
|
||||
- Currently, vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problems.
|
||||
|
||||
### How to launch a vLLM service on CPU?
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
|
||||
|
||||
Currently the CPU implementation for s390x architecture supports FP32 datatype only.
|
||||
Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
|
||||
|
||||
!!! warning
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
|
||||
@ -83,7 +83,7 @@ uv pip install dist/*.whl
|
||||
!!! example "Troubleshooting"
|
||||
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
|
||||
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
|
||||
- `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
|
||||
- `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
|
||||
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
|
||||
```toml title="pyproject.toml"
|
||||
[build-system]
|
||||
|
||||
@ -1,12 +1,15 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
import importlib.util
|
||||
import logging
|
||||
import sys
|
||||
import traceback
|
||||
from argparse import SUPPRESS, HelpFormatter
|
||||
from argparse import SUPPRESS, Action, HelpFormatter
|
||||
from collections.abc import Iterable
|
||||
from importlib.machinery import ModuleSpec
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
from typing import TYPE_CHECKING, Literal
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from pydantic_core import core_schema
|
||||
@ -19,6 +22,11 @@ ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
|
||||
sys.path.insert(0, str(ROOT_DIR))
|
||||
|
||||
|
||||
def mock_if_no_torch(mock_module: str, mock: MagicMock):
    """Register ``mock`` as ``mock_module`` when ``torch`` cannot be found.

    Args:
        mock_module: Dotted module name to register in ``sys.modules``.
        mock: Object to install under that name.

    When ``importlib.util.find_spec("torch")`` finds a spec, nothing is
    registered and ``sys.modules`` is left untouched.
    """
    torch_spec = importlib.util.find_spec("torch")
    if torch_spec:
        # torch is discoverable, so the real module is left in place.
        return
    sys.modules[mock_module] = mock
|
||||
|
||||
|
||||
# Mock custom op code
|
||||
class MockCustomOp:
|
||||
@staticmethod
|
||||
@ -29,18 +37,21 @@ class MockCustomOp:
|
||||
return decorator
|
||||
|
||||
|
||||
noop = lambda *a, **k: None
|
||||
sys.modules["vllm._C"] = MagicMock()
|
||||
sys.modules["vllm.model_executor.custom_op"] = MagicMock(CustomOp=MockCustomOp)
|
||||
sys.modules["vllm.utils.torch_utils"] = MagicMock(direct_register_custom_op=noop)
|
||||
mock_if_no_torch("vllm._C", MagicMock())
|
||||
mock_if_no_torch("vllm.model_executor.custom_op", MagicMock(CustomOp=MockCustomOp))
|
||||
mock_if_no_torch(
|
||||
"vllm.utils.torch_utils", MagicMock(direct_register_custom_op=lambda *a, **k: None)
|
||||
)
|
||||
|
||||
|
||||
# Mock any version checks by reading from compiled CI requirements
|
||||
with open(ROOT_DIR / "requirements/test.txt") as f:
|
||||
VERSIONS = dict(line.strip().split("==") for line in f if "==" in line)
|
||||
importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0"
|
||||
|
||||
|
||||
# Make torch.nn.Parameter safe to inherit from
|
||||
sys.modules["torch.nn"] = MagicMock(Parameter=object)
|
||||
mock_if_no_torch("torch.nn", MagicMock(Parameter=object))
|
||||
|
||||
|
||||
class PydanticMagicMock(MagicMock):
|
||||
@ -49,31 +60,34 @@ class PydanticMagicMock(MagicMock):
|
||||
def __init__(self, *args, **kwargs):
|
||||
name = kwargs.pop("name", None)
|
||||
super().__init__(*args, **kwargs)
|
||||
self.__spec__ = importlib.machinery.ModuleSpec(name, None)
|
||||
self.__spec__ = ModuleSpec(name, None)
|
||||
|
||||
def __get_pydantic_core_schema__(self, source_type, handler):
|
||||
return core_schema.any_schema()
|
||||
|
||||
|
||||
def auto_mock(module_name: str, attr: str, max_mocks: int = 100):
    """Import ``attr`` from ``module_name``, mocking missing modules on the way.

    Each attempt imports ``module_name`` and returns ``attr`` from it if the
    module has such an attribute; otherwise ``module_name.attr`` is imported
    as a submodule and returned. When an import fails with
    ``ModuleNotFoundError``, the missing module is replaced in ``sys.modules``
    by a ``PydanticMagicMock`` and the import is retried.

    Args:
        module_name: Dotted name of the module to import from.
        attr: Attribute or submodule name to fetch from ``module_name``.
        max_mocks: Maximum number of import attempts before giving up.

    Returns:
        The resolved attribute or submodule.

    Raises:
        ImportError: If the import still fails after ``max_mocks`` attempts.
    """
    logger.info("Importing %s from %s", attr, module_name)

    for _ in range(max_mocks):
        try:
            module = importlib.import_module(module_name)

            # First treat attr as an attr, then as a submodule
            if hasattr(module, attr):
                return getattr(module, attr)

            return importlib.import_module(f"{module_name}.{attr}")
        except ModuleNotFoundError as e:
            assert e.name is not None
            logger.info("Mocking %s for argparse doc generation", e.name)
            sys.modules[e.name] = PydanticMagicMock(name=e.name)
        except Exception:
            # logger.exception appends the active traceback itself, so the
            # format string must only contain placeholders for the two
            # arguments supplied here; an extra "%s" breaks message formatting.
            logger.exception("Failed to import %s.%s", module_name, attr)

    raise ImportError(
        f"Failed to import {module_name}.{attr} after mocking {max_mocks} imports"
    )
|
||||
|
||||
|
||||
@ -91,21 +105,26 @@ ChatCommand = auto_mock("vllm.entrypoints.cli.openai", "ChatCommand")
|
||||
CompleteCommand = auto_mock("vllm.entrypoints.cli.openai", "CompleteCommand")
|
||||
openai_cli_args = auto_mock("vllm.entrypoints.openai", "cli_args")
|
||||
openai_run_batch = auto_mock("vllm.entrypoints.openai", "run_batch")
|
||||
FlexibleArgumentParser = auto_mock(
|
||||
"vllm.utils.argparse_utils", "FlexibleArgumentParser"
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = auto_mock(
|
||||
"vllm.utils.argparse_utils", "FlexibleArgumentParser"
|
||||
)
|
||||
|
||||
|
||||
class MarkdownFormatter(HelpFormatter):
|
||||
"""Custom formatter that generates markdown for argument groups."""
|
||||
|
||||
def __init__(self, prog, starting_heading_level=3):
|
||||
super().__init__(prog, max_help_position=float("inf"), width=float("inf"))
|
||||
def __init__(self, prog: str, starting_heading_level: int = 3):
|
||||
super().__init__(prog, max_help_position=sys.maxsize, width=sys.maxsize)
|
||||
|
||||
self._section_heading_prefix = "#" * starting_heading_level
|
||||
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
|
||||
self._markdown_output = []
|
||||
|
||||
def start_section(self, heading):
|
||||
def start_section(self, heading: str):
|
||||
if heading not in {"positional arguments", "options"}:
|
||||
heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
|
||||
self._markdown_output.append(heading_md)
|
||||
@ -113,14 +132,14 @@ class MarkdownFormatter(HelpFormatter):
|
||||
def end_section(self):
|
||||
pass
|
||||
|
||||
def add_text(self, text):
|
||||
def add_text(self, text: str):
|
||||
if text:
|
||||
self._markdown_output.append(f"{text.strip()}\n\n")
|
||||
|
||||
def add_usage(self, usage, actions, groups, prefix=None):
|
||||
pass
|
||||
|
||||
def add_arguments(self, actions):
|
||||
def add_arguments(self, actions: Iterable[Action]):
|
||||
for action in actions:
|
||||
if len(action.option_strings) == 0 or "--help" in action.option_strings:
|
||||
continue
|
||||
@ -169,7 +188,7 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
|
||||
# Auto-mock runtime imports
|
||||
if tb_list := traceback.extract_tb(e.__traceback__):
|
||||
path = Path(tb_list[-1].filename).relative_to(ROOT_DIR)
|
||||
auto_mock(module=".".join(path.parent.parts), attr=path.stem)
|
||||
auto_mock(module_name=".".join(path.parent.parts), attr=path.stem)
|
||||
return create_parser(add_cli_args, **kwargs)
|
||||
else:
|
||||
raise e
|
||||
@ -209,7 +228,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
|
||||
# Generate documentation for each parser
|
||||
for stem, parser in parsers.items():
|
||||
doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
|
||||
doc_path = ARGPARSE_DOC_DIR / f"{stem}.inc.md"
|
||||
# Specify encoding for building on Windows
|
||||
with open(doc_path, "w", encoding="utf-8") as f:
|
||||
f.write(super(type(parser), parser).format_help())
|
||||
|
||||
@ -15,9 +15,9 @@ These models are what we list in [supported text models](#list-of-text-only-lang
|
||||
|
||||
### Transformers
|
||||
|
||||
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
|
||||
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <5% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers modeling backend".
|
||||
|
||||
Currently, the Transformers backend works for the following:
|
||||
Currently, the Transformers modeling backend works for the following:
|
||||
|
||||
- Modalities: embedding models, language models and vision-language models*
|
||||
- Architectures: encoder-only, decoder-only, mixture-of-experts
|
||||
@ -25,7 +25,7 @@ Currently, the Transformers backend works for the following:
|
||||
|
||||
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
|
||||
|
||||
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
|
||||
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers modeling backend, it will be compatible with the following features of vLLM:
|
||||
|
||||
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
|
||||
- Any combination of the following vLLM parallelisation schemes:
|
||||
@ -44,7 +44,7 @@ llm.apply_model(lambda model: print(type(model)))
|
||||
|
||||
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
|
||||
|
||||
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
|
||||
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers modeling backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
|
||||
|
||||
!!! note
|
||||
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast, native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
|
||||
@ -53,12 +53,12 @@ If a model has a vLLM implementation but you would prefer to use the Transformer
|
||||
|
||||
If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM!
|
||||
|
||||
For a model to be compatible with the Transformers backend for vLLM it must:
|
||||
For a model to be compatible with the Transformers modeling backend for vLLM it must:
|
||||
|
||||
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
|
||||
- The model directory must have the correct structure (e.g. `config.json` is present).
|
||||
- `config.json` must contain `auto_map.AutoModel`.
|
||||
- be a Transformers backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
|
||||
- be a Transformers modeling backend for vLLM compatible model (see [Writing custom models](#writing-custom-models)):
|
||||
- Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
|
||||
|
||||
If the compatible model is:
|
||||
@ -66,13 +66,13 @@ If the compatible model is:
|
||||
- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md).
|
||||
- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference](../serving/offline_inference.md) or `vllm serve <MODEL_DIR>` for the [openai-compatible-server](../serving/openai_compatible_server.md).
|
||||
|
||||
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
|
||||
This means that, with the Transformers modeling backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
|
||||
|
||||
#### Writing custom models
|
||||
|
||||
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
|
||||
This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers modeling backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
|
||||
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
To make your model compatible with the Transformers modeling backend, it needs:
|
||||
|
||||
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
|
||||
- If your model is encoder-only:
|
||||
@ -134,7 +134,7 @@ Here is what happens in the background when this model is loaded:
|
||||
|
||||
1. The config is loaded.
|
||||
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into one of the Transformers backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
3. `MyModel` is loaded into one of the Transformers modeling backend classes in [vllm/model_executor/models/transformers](../../vllm/model_executor/models/transformers) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
|
||||
That's it!
|
||||
|
||||
@ -182,7 +182,7 @@ To determine whether a given model is natively supported, you can check the `con
|
||||
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
|
||||
|
||||
Models do not _need_ to be natively supported to be used in vLLM.
|
||||
The [Transformers backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
The [Transformers modeling backend](#transformers) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
|
||||
!!! tip
|
||||
The easiest way to check if your model is really supported at runtime is to run the program below:
|
||||
@ -351,6 +351,7 @@ th {
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
| `AfmoeForCausalLM` | Afmoe | TBA | ✅︎ | ✅︎ |
|
||||
| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ |
|
||||
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ |
|
||||
| `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ |
|
||||
@ -451,7 +452,7 @@ th {
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | |
|
||||
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|
|
||||
@ -669,7 +670,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ |
|
||||
| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I<sup>+</sup>/ V<sup>+</sup> | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>E+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ |
|
||||
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
|
||||
@ -684,7 +685,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ |
|
||||
| `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I<sup>+</sup> | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ |
|
||||
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ |
|
||||
| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ |
|
||||
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ |
|
||||
@ -720,7 +721,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ |
|
||||
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
Some models are supported only via the [Transformers modeling backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers modeling backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|
|
||||
@ -785,6 +786,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
!!! note
|
||||
`VoxtralForConditionalGeneration` requires `mistral-common[audio]` to be installed.
|
||||
|
||||
### Pooling Models
|
||||
|
||||
See [this page](./pooling_models.md) for more information on how to use pooling models.
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# Using vLLM
|
||||
|
||||
First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
|
||||
First, vLLM must be [installed](../getting_started/installation/README.md) for your chosen device in either a Python or Docker environment.
|
||||
|
||||
Then, vLLM supports the following usage patterns:
|
||||
|
||||
|
||||
@ -43,6 +43,7 @@ class ModelRequestData(NamedTuple):
|
||||
|
||||
|
||||
# Voxtral
|
||||
# Make sure to install mistral-common[audio].
|
||||
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.chunk import (
|
||||
|
||||
49
examples/online_serving/token_generation_client.py
Normal file
49
examples/online_serving/token_generation_client.py
Normal file
@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import httpx
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
GEN_ENDPOINT = "http://localhost:8000/inference/v1/generate"
|
||||
DUMMY_API_KEY = "empty"
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
transport = httpx.HTTPTransport()
|
||||
headers = {"Authorization": f"Bearer {DUMMY_API_KEY}"}
|
||||
client = httpx.Client(
|
||||
transport=transport,
|
||||
base_url=GEN_ENDPOINT,
|
||||
timeout=600,
|
||||
headers=headers,
|
||||
)
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "How many countries are in the EU?"},
|
||||
]
|
||||
|
||||
|
||||
def main(client):
    """Send a token-ID generation request and print the decoded reply.

    The chat ``messages`` are rendered through the tokenizer's chat template
    for ``MODEL_NAME``, posted to ``GEN_ENDPOINT`` as ``token_ids``, and the
    ``token_ids`` of the first returned choice are decoded and printed.

    Args:
        client: HTTP client used to POST the request; it must provide
            ``post(url, json=...)`` returning a response with
            ``raise_for_status()`` and ``json()``.
    """
    separator = "-" * 50
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # The template output is sent as-is in the "token_ids" field.
    prompt_token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    sampling_params = {"max_tokens": 24, "temperature": 0.2, "detokenize": False}
    request_body = {
        "model": MODEL_NAME,
        "token_ids": prompt_token_ids,
        "sampling_params": sampling_params,
        "stream": False,
    }

    response = client.post(GEN_ENDPOINT, json=request_body)
    response.raise_for_status()
    result = response.json()

    print(result)
    print(separator)
    print("Token generation results:")
    # Decoding happens client-side from the IDs in the first choice.
    generated_ids = result["choices"][0]["token_ids"]
    print(tokenizer.decode(generated_ids))
    print(separator)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(client)
|
||||
@ -1,6 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
|
||||
echo "Please run 'pip install -r requirements/lint.txt', followed by"
|
||||
echo "'pre-commit install' to install the pre-commit hooks."
|
||||
echo "Then linters will run automatically before each commit."
|
||||
@ -30,8 +30,8 @@ filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/31
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
pyzmq >= 25.0.0
|
||||
msgspec
|
||||
gguf >= 0.13.0
|
||||
mistral_common[image,audio] >= 1.8.5
|
||||
gguf >= 0.17.0
|
||||
mistral_common[image] >= 1.8.5
|
||||
opencv-python-headless >= 4.11.0 # required for video IO
|
||||
pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
|
||||
@ -4,8 +4,9 @@ packaging>=24.2
|
||||
setuptools>=77.0.3,<81.0.0
|
||||
setuptools-scm>=8
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.8.0+cpu; platform_machine == "x86_64"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
|
||||
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
|
||||
torch==2.9.0; platform_system == "Darwin"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
|
||||
@ -10,3 +10,7 @@ mkdocs-minify-plugin
|
||||
regex
|
||||
ruff
|
||||
pydantic
|
||||
|
||||
# For generating argparse docs.
|
||||
# Adding requirements here should only be used as a last resort.
|
||||
msgspec # Needed for multiple inheritance involving msgspec.Struct
|
||||
@ -22,6 +22,8 @@ from vllm.config import (
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from .. import silly_attention # noqa: F401
|
||||
|
||||
@ -193,7 +195,14 @@ def run_model(
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
|
||||
def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_multi_graph_piecewise_compile(
|
||||
use_inductor_graph_partition: bool, use_bytecode_hook: bool, monkeypatch
|
||||
):
|
||||
# Set the environment variable for this test
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
|
||||
@ -21,6 +21,8 @@ from vllm.config import (
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from ..silly_attention import get_global_counter, reset_global_counter
|
||||
|
||||
@ -124,6 +126,7 @@ def _run_simple_model(
|
||||
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
@torch.inference_mode()
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_simple_piecewise_compile(use_inductor):
|
||||
_run_simple_model(
|
||||
splitting_ops=["silly::attention"],
|
||||
|
||||
@ -29,6 +29,8 @@ from vllm.config import (
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from .. import silly_attention # noqa: F401
|
||||
|
||||
@ -334,6 +336,7 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
|
||||
("inductor", True), # Inductor, Inductor partition
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_toy_llama(
|
||||
backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
|
||||
):
|
||||
@ -513,4 +516,8 @@ def benchmark():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
benchmark()
|
||||
# Protect against subprocess reimport when using spawn_new_process_for_each_test
|
||||
import os
|
||||
|
||||
if os.environ.get("RUNNING_IN_SUBPROCESS") != "1":
|
||||
benchmark()
|
||||
|
||||
@ -20,13 +20,22 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ..utils import flat_product, multi_gpu_test
|
||||
|
||||
def is_blackwell() -> bool:
    """Return whether we are running on Blackwell (device capability 100).

    A lot of tests in this module depend on this check, either to skip or
    to pick a backend.
    """
    return current_platform.is_device_capability(100)
|
||||
|
||||
|
||||
class Matches(NamedTuple):
    """Expected per-pass match counts for one model/backend test case.

    Each test compares the field relevant to it against the count parsed
    from the corresponding compilation-pass log line. Every field defaults
    to 0.
    """

    # Count in the "Fused quant onto N attention nodes" log line.
    attention_fusion: int = 0
    # "Replaced N patterns" count from collective_fusion.py in the
    # allreduce + rmsnorm test.
    allreduce_fusion: int = 0
    # "Replaced N patterns" count from sequence_parallelism.py.
    sequence_parallel: int = 0
    # "Replaced N patterns" count from collective_fusion.py in the async TP test.
    async_tp: int = 0
|
||||
|
||||
|
||||
class ModelBackendTestCase(NamedTuple):
|
||||
model_name: str
|
||||
model_kwargs: dict[str, Any]
|
||||
backend: AttentionBackendEnum
|
||||
attention_fusions: int
|
||||
allreduce_fusions: int | None = None
|
||||
matches: Matches
|
||||
|
||||
|
||||
MODELS_FP8: list[ModelBackendTestCase] = []
|
||||
@ -38,17 +47,33 @@ if current_platform.is_cuda():
|
||||
ModelBackendTestCase(
|
||||
# Use smaller model for L40s in CI
|
||||
model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
attention_fusions=32,
|
||||
allreduce_fusions=65,
|
||||
# TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
|
||||
# so FI attention+fp8_quant is at least tested once
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=AttentionBackendEnum.FLASHINFER
|
||||
if is_blackwell()
|
||||
else AttentionBackendEnum.TRITON_ATTN,
|
||||
matches=Matches(
|
||||
attention_fusion=32,
|
||||
allreduce_fusion=65,
|
||||
sequence_parallel=65,
|
||||
async_tp=128,
|
||||
),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=AttentionBackendEnum.FLASHINFER,
|
||||
attention_fusions=48,
|
||||
allreduce_fusions=96,
|
||||
# TODO FlashInfer attn broken on Hopper with kvcache=fp8:
|
||||
# https://github.com/vllm-project/vllm/issues/28568
|
||||
# TODO FlashInfer attn broken on Blackwell for llama4:
|
||||
# https://github.com/vllm-project/vllm/issues/28604
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
matches=Matches(
|
||||
attention_fusion=48,
|
||||
allreduce_fusion=96,
|
||||
sequence_parallel=96,
|
||||
async_tp=95, # mlp is moe, no fusion there
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@ -57,8 +82,12 @@ if current_platform.is_cuda():
|
||||
model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
|
||||
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
|
||||
backend=AttentionBackendEnum.FLASHINFER,
|
||||
attention_fusions=32,
|
||||
allreduce_fusions=65,
|
||||
matches=Matches(
|
||||
attention_fusion=32,
|
||||
allreduce_fusion=65,
|
||||
sequence_parallel=65,
|
||||
async_tp=128,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@ -68,15 +97,23 @@ if current_platform.is_cuda():
|
||||
model_name="meta-llama/Llama-3.1-8B-Instruct",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
attention_fusions=0,
|
||||
allreduce_fusions=65,
|
||||
matches=Matches(
|
||||
attention_fusion=0,
|
||||
allreduce_fusion=65,
|
||||
sequence_parallel=65,
|
||||
async_tp=128,
|
||||
),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="Qwen/Qwen3-30B-A3B",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
attention_fusions=0,
|
||||
allreduce_fusions=97,
|
||||
matches=Matches(
|
||||
attention_fusion=0,
|
||||
allreduce_fusion=97,
|
||||
sequence_parallel=97,
|
||||
async_tp=96, # MLP is MoE, half the fusions of dense
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
@ -86,19 +123,19 @@ elif current_platform.is_rocm():
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.TRITON_ATTN,
|
||||
attention_fusions=32,
|
||||
matches=Matches(attention_fusion=32),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.ROCM_ATTN,
|
||||
attention_fusions=32,
|
||||
matches=Matches(attention_fusion=32),
|
||||
),
|
||||
ModelBackendTestCase(
|
||||
model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
|
||||
model_kwargs=dict(max_model_len=1024),
|
||||
backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
|
||||
attention_fusions=32,
|
||||
matches=Matches(attention_fusion=32),
|
||||
),
|
||||
]
|
||||
|
||||
@ -106,8 +143,7 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, "
|
||||
"attention_fusions, allreduce_fusions, custom_ops",
|
||||
"model_name, model_kwargs, backend, matches, custom_ops",
|
||||
# Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
|
||||
list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
|
||||
# quant_fp4 only has the custom impl
|
||||
@ -118,15 +154,14 @@ def test_attn_quant(
|
||||
model_name: str,
|
||||
model_kwargs: dict[str, Any],
|
||||
backend: AttentionBackendEnum,
|
||||
attention_fusions: int,
|
||||
allreduce_fusions: int,
|
||||
matches: Matches,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
monkeypatch,
|
||||
):
|
||||
if backend == AttentionBackendEnum.FLASHINFER and (
|
||||
not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
|
||||
not is_blackwell() or not has_flashinfer()
|
||||
):
|
||||
pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
@ -169,12 +204,12 @@ def test_attn_quant(
|
||||
with caplog_mp_spawn(logging.DEBUG) as log_holder:
|
||||
run_model(compilation_config, model_name, **model_kwargs)
|
||||
|
||||
matches = re.findall(
|
||||
log_matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 1, log_holder.text
|
||||
assert int(matches[0]) == attention_fusions
|
||||
assert len(log_matches) == 1, log_holder.text
|
||||
assert int(log_matches[0]) == matches.attention_fusion
|
||||
|
||||
|
||||
CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
|
||||
@ -187,8 +222,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, model_kwargs, backend, "
|
||||
"attention_fusions, allreduce_fusions, custom_ops",
|
||||
"model_name, model_kwargs, backend, matches, custom_ops",
|
||||
# Toggle RMSNorm and QuantFP8 for FP8 models
|
||||
list(
|
||||
flat_product(
|
||||
@ -209,8 +243,7 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
model_name: str,
|
||||
model_kwargs: dict,
|
||||
backend: AttentionBackendEnum,
|
||||
attention_fusions: int,
|
||||
allreduce_fusions: int,
|
||||
matches: Matches,
|
||||
custom_ops: str,
|
||||
inductor_graph_partition: bool,
|
||||
caplog_mp_spawn,
|
||||
@ -219,6 +252,13 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("Inductor graph partition requires torch>=2.9")
|
||||
|
||||
if "fp4" in model_name.lower() and not is_blackwell():
|
||||
pytest.skip("NVFP4 quant requires Blackwell")
|
||||
|
||||
if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
|
||||
# FlashInfer attn fusion requires Blackwell
|
||||
matches = matches._replace(attention_fusion=0)
|
||||
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
|
||||
if inductor_graph_partition:
|
||||
@ -258,23 +298,135 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
|
||||
run_model(
|
||||
compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
|
||||
)
|
||||
matches = re.findall(
|
||||
log_matches = re.findall(
|
||||
r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 2, log_holder.text
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(matches[0]) == attention_fusions
|
||||
assert int(matches[1]) == attention_fusions
|
||||
assert int(log_matches[0]) == matches.attention_fusion
|
||||
assert int(log_matches[1]) == matches.attention_fusion
|
||||
|
||||
matches = re.findall(
|
||||
log_matches = re.findall(
|
||||
r"collective_fusion.py:\d+] Replaced (\d+) patterns",
|
||||
log_holder.text,
|
||||
)
|
||||
assert len(matches) == 2, log_holder.text
|
||||
assert len(log_matches) == 2, log_holder.text
|
||||
|
||||
assert int(matches[0]) == allreduce_fusions
|
||||
assert int(matches[1]) == allreduce_fusions
|
||||
assert int(log_matches[0]) == matches.allreduce_fusion
|
||||
assert int(log_matches[1]) == matches.allreduce_fusion
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Toggle RMSNorm and QuantFP8 for FP8 models
    list(
        flat_product(
            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
        )
    )
    # Toggle RMSNorm for FP4 models and unquant models
    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
)
@pytest.mark.parametrize("inductor_graph_partition", [True, False])
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="sequence parallel only tested on CUDA",
)
def test_tp2_attn_quant_async_tp(
    model_name: str,
    model_kwargs: dict,
    backend: AttentionBackendEnum,
    matches: Matches,
    custom_ops: str,
    inductor_graph_partition: bool,
    caplog_mp_spawn,
    monkeypatch,
):
    """Run a TP=2 model with attention fusion, sequence parallelism and async TP.

    The model is compiled with the attention-fusion, no-op, sequence
    parallelism and async TP passes enabled, and the match counts that the
    passes write to the debug log are compared with the expectations
    carried in ``matches``.
    """
    # --- Skips and expectation adjustments -------------------------------
    if is_blackwell():
        # TODO: https://github.com/vllm-project/vllm/issues/27893
        pytest.skip("Blackwell is not supported for AsyncTP pass")

    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition requires torch>=2.9")

    if "fp4" in model_name.lower() and not is_blackwell():
        pytest.skip("NVFP4 quant requires Blackwell")

    if backend == AttentionBackendEnum.FLASHINFER:
        if not has_flashinfer():
            pytest.skip("FlashInfer backend requires flashinfer installed")
        if not is_blackwell():
            # FlashInfer attn fusion requires Blackwell
            matches = matches._replace(attention_fusion=0)

    # --- Compilation settings derived from the parameters ----------------
    if custom_ops:
        custom_ops_list = custom_ops.split(",")
    else:
        custom_ops_list = []

    mode = (
        CUDAGraphMode.FULL_AND_PIECEWISE
        if inductor_graph_partition
        else CUDAGraphMode.FULL_DECODE_ONLY
    )
    splitting_ops: list[str] | None = None if inductor_graph_partition else []

    env_overrides = {
        # Disable the compile cache to make sure custom passes run.
        # Otherwise, we can't verify fusion happened through the logs.
        "VLLM_DISABLE_COMPILE_CACHE": "1",
        # To capture subprocess logs, we need to know whether spawn or fork
        # is used. Force spawn as it is more general.
        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
        "VLLM_ATTENTION_BACKEND": backend.name,
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    pass_config = PassConfig(
        enable_attn_fusion=True,
        enable_noop=True,
        enable_sequence_parallelism=True,
        enable_async_tp=True,
    )
    compilation_config = CompilationConfig(
        # Testing properties
        use_inductor_graph_partition=inductor_graph_partition,
        cudagraph_mode=mode,
        custom_ops=custom_ops_list,
        splitting_ops=splitting_ops,
        # Common
        level=CompilationMode.VLLM_COMPILE,
        pass_config=pass_config,
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

    # --- Run the model and inspect the captured logs ---------------------
    with caplog_mp_spawn(logging.DEBUG) as log_holder:
        run_model(
            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
        )

    # (log line pattern, expected count); each line must appear exactly
    # twice -- presumably once per tensor-parallel rank.
    expected_counts = (
        (
            r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
            matches.attention_fusion,
        ),
        (
            r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
            matches.sequence_parallel,
        ),
        (
            r"collective_fusion.py:\d+] Replaced (\d+) patterns",
            matches.async_tp,
        ),
    )
    for pattern, expected in expected_counts:
        found = re.findall(pattern, log_holder.text)
        assert len(found) == 2, log_holder.text

        assert int(found[0]) == expected
        assert int(found[1]) == expected
|
||||
|
||||
|
||||
def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
|
||||
|
||||
@ -5,15 +5,15 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.compilation.fusion import RMSNormQuantFusionPass
|
||||
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
|
||||
from vllm.compilation.fx_utils import find_auto_fn
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
|
||||
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CUDAGraphMode,
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
@ -27,6 +27,7 @@ from vllm.distributed.parallel_state import (
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
@ -43,172 +44,157 @@ prompts = [
|
||||
]
|
||||
|
||||
|
||||
class TestModel(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, intermediate_size=32):
|
||||
class TestAllReduceRMSNormModel(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, eps=1e-6):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.gate_proj = torch.nn.Parameter(
|
||||
torch.empty((intermediate_size, hidden_size))
|
||||
)
|
||||
self.norm = RMSNorm(intermediate_size, 1e-05)
|
||||
# Initialize weights
|
||||
torch.nn.init.normal_(self.gate_proj, std=0.02)
|
||||
self.eps = eps
|
||||
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
|
||||
self.w = [torch.rand(hidden_size, hidden_size) for _ in range(3)]
|
||||
|
||||
def forward(self, hidden_states, residual):
|
||||
"""
|
||||
Forward pass implementing the operations in the FX graph
|
||||
def forward(self, x):
|
||||
z = torch.relu(x)
|
||||
x = resid = tensor_model_parallel_all_reduce(z)
|
||||
y = self.norm[0](x)
|
||||
|
||||
Args:
|
||||
hidden_states: Input tensor
|
||||
residual: Residual tensor from previous layer
|
||||
z2 = torch.mm(y, self.w[0])
|
||||
x2 = tensor_model_parallel_all_reduce(z2)
|
||||
|
||||
Returns:
|
||||
Tuple containing the output tensor
|
||||
"""
|
||||
# Reshape input
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
y2, resid = self.norm[1](x2, resid)
|
||||
|
||||
# matrix multiplication
|
||||
permute = self.gate_proj.permute(1, 0)
|
||||
mm = torch.mm(view, permute)
|
||||
z3 = torch.mm(y2, self.w[1])
|
||||
x3 = tensor_model_parallel_all_reduce(z3)
|
||||
|
||||
# Tensor parallel all-reduce
|
||||
all_reduce = tensor_model_parallel_all_reduce(mm)
|
||||
y3, resid = self.norm[2](x3, resid)
|
||||
|
||||
# layer normalization
|
||||
norm_output, residual_output = self.norm(all_reduce, residual)
|
||||
z4 = torch.mm(y3, self.w[2])
|
||||
x4 = tensor_model_parallel_all_reduce(z4)
|
||||
|
||||
return norm_output, residual_output
|
||||
y4, resid = self.norm[3](x4, resid)
|
||||
return y4
|
||||
|
||||
def ops_in_model_before(self):
|
||||
return [torch.ops.vllm.all_reduce.default]
|
||||
|
||||
def ops_in_model_after(self):
|
||||
return [
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
torch.ops.vllm.all_gather.default,
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
]
|
||||
|
||||
def ops_in_model(self):
|
||||
return [torch.ops._C.fused_add_rms_norm.default]
|
||||
if RMSNorm.enabled():
|
||||
return [
|
||||
torch.ops._C.rms_norm.default,
|
||||
torch.ops._C.fused_add_rms_norm.default,
|
||||
]
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
class TestQuantModel(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, intermediate_size=32):
|
||||
class TestAllReduceRMSNormStaticQuantFP8Model(torch.nn.Module):
|
||||
def __init__(self, hidden_size=16, eps=1e-6):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.vllm_config = get_current_vllm_config()
|
||||
self.gate_proj = torch.nn.Parameter(
|
||||
torch.empty((intermediate_size, hidden_size)), requires_grad=False
|
||||
)
|
||||
self.norm = RMSNorm(intermediate_size, 1e-05)
|
||||
# Initialize weights
|
||||
torch.nn.init.normal_(self.gate_proj, std=0.02)
|
||||
self.hidden_size = hidden_size
|
||||
self.eps = eps
|
||||
self.norm = [RMSNorm(hidden_size, eps) for i in range(4)]
|
||||
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
self.w = [
|
||||
torch.rand(hidden_size, hidden_size)
|
||||
.to(dtype=current_platform.fp8_dtype())
|
||||
.t()
|
||||
for _ in range(3)
|
||||
]
|
||||
|
||||
self.fp8_linear = Fp8LinearOp(act_quant_static=True)
|
||||
|
||||
self.scale = torch.rand(1, dtype=torch.float32)
|
||||
# Create a weight that is compatible with torch._scaled_mm,
|
||||
# which expects a column-major layout.
|
||||
self.w = torch.rand(hidden_size, intermediate_size).to(dtype=FP8_DTYPE).t()
|
||||
self.wscale = torch.rand(1, dtype=torch.float32)
|
||||
|
||||
def forward(self, hidden_states, residual):
|
||||
"""
|
||||
Forward pass implementing the operations in the FX graph
|
||||
|
||||
Args:
|
||||
hidden_states: Input tensor
|
||||
residual: Residual tensor from previous layer
|
||||
|
||||
Returns:
|
||||
Tuple containing the output tensor
|
||||
"""
|
||||
# Reshape input
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
|
||||
# matrix multiplication
|
||||
permute = self.gate_proj.permute(1, 0)
|
||||
mm = torch.mm(view, permute)
|
||||
|
||||
# Tensor parallel all-reduce
|
||||
all_reduce = tensor_model_parallel_all_reduce(mm)
|
||||
|
||||
# layer normalization
|
||||
norm_output, residual_output = self.norm(all_reduce, residual)
|
||||
|
||||
# scaled_mm with static input quantization
|
||||
fp8_linear_result = self.fp8_linear.apply(
|
||||
norm_output,
|
||||
self.w,
|
||||
self.wscale,
|
||||
input_scale=self.scale.to(norm_output.device),
|
||||
self.fp8_linear = Fp8LinearOp(
|
||||
act_quant_static=True,
|
||||
act_quant_group_shape=GroupShape.PER_TENSOR,
|
||||
)
|
||||
|
||||
return fp8_linear_result, residual_output
|
||||
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
|
||||
|
||||
def ops_in_model_before(self):
|
||||
ops_to_remove = [torch.ops.vllm.all_reduce.default] # Always removed by SP
|
||||
# The following are only removed if fusion happens
|
||||
if (
|
||||
self.vllm_config
|
||||
and self.vllm_config.compilation_config.pass_config.enable_fusion
|
||||
):
|
||||
ops_to_remove.extend(
|
||||
[
|
||||
torch.ops._C.fused_add_rms_norm.default,
|
||||
torch.ops._C.static_scaled_fp8_quant.default,
|
||||
]
|
||||
)
|
||||
return ops_to_remove
|
||||
def forward(self, hidden_states):
|
||||
# avoid having graph input be an arg to a pattern directly
|
||||
z = torch.relu(hidden_states)
|
||||
x = resid = tensor_model_parallel_all_reduce(z)
|
||||
y = self.norm[0](x)
|
||||
|
||||
z2 = self.fp8_linear.apply(
|
||||
y, self.w[0], self.wscale[0], input_scale=self.scale[0]
|
||||
)
|
||||
|
||||
x2 = tensor_model_parallel_all_reduce(z2)
|
||||
y2, resid = self.norm[1](x2, resid)
|
||||
|
||||
z3 = self.fp8_linear.apply(
|
||||
y2, self.w[1], self.wscale[1], input_scale=self.scale[1]
|
||||
)
|
||||
|
||||
x3 = tensor_model_parallel_all_reduce(z3)
|
||||
y3, resid = self.norm[2](x3, resid) # use resid here
|
||||
|
||||
z4 = self.fp8_linear.apply(
|
||||
y3, self.w[2], self.wscale[2], input_scale=self.scale[2]
|
||||
)
|
||||
x4 = tensor_model_parallel_all_reduce(z4)
|
||||
y4, resid = self.norm[3](x4, resid) # use resid here
|
||||
return y4
|
||||
|
||||
def ops_in_model_after(self):
|
||||
ops_to_add = [
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
return [
|
||||
torch.ops.vllm.all_gather.default,
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
]
|
||||
|
||||
def ops_in_model_before(self):
|
||||
return [
|
||||
torch.ops.vllm.all_reduce.default,
|
||||
]
|
||||
# The following is only added if fusion happens
|
||||
if (
|
||||
self.vllm_config
|
||||
and self.vllm_config.compilation_config.pass_config.enable_fusion
|
||||
):
|
||||
ops_to_add.append(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default)
|
||||
return ops_to_add
|
||||
|
||||
def ops_in_model(self):
|
||||
if (
|
||||
self.vllm_config
|
||||
and self.vllm_config.compilation_config.pass_config.enable_fusion
|
||||
):
|
||||
# If fusion happens, the fused op is the one
|
||||
# we check for (de)functionalization
|
||||
if self.vllm_config.compilation_config.pass_config.enable_fusion:
|
||||
return [torch.ops._C.fused_add_rms_norm_static_fp8_quant.default]
|
||||
else:
|
||||
# If no fusion, the original ops are checked
|
||||
elif RMSNorm.enabled():
|
||||
return [
|
||||
torch.ops._C.fused_add_rms_norm.default,
|
||||
# TODO functionalization pass does not handle this yet
|
||||
# torch.ops._C.static_scaled_fp8_quant.default,
|
||||
]
|
||||
elif self.fp8_linear.quant_fp8.enabled():
|
||||
return [
|
||||
torch.ops._C.static_scaled_fp8_quant.default,
|
||||
]
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("test_model_cls", [TestModel, TestQuantModel])
|
||||
@pytest.mark.parametrize(
|
||||
"test_model_cls, custom_ops",
|
||||
[
|
||||
(TestAllReduceRMSNormModel, "+rms_norm"),
|
||||
(TestAllReduceRMSNormModel, "-rms_norm"),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,+quant_fp8"),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, "+rms_norm,-quant_fp8"),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,+quant_fp8"),
|
||||
(TestAllReduceRMSNormStaticQuantFP8Model, "-rms_norm,-quant_fp8"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize("seq_len", [16])
|
||||
@pytest.mark.parametrize("hidden_size", [16])
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("enable_fusion", [True, False])
|
||||
@pytest.mark.parametrize("dynamic", [False, True])
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
|
||||
def test_sequence_parallelism_pass(
|
||||
test_model_cls: type[torch.nn.Module],
|
||||
custom_ops: str,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
enable_fusion: bool,
|
||||
dynamic: bool,
|
||||
):
|
||||
num_processes = 2
|
||||
|
||||
@ -220,11 +206,13 @@ def test_sequence_parallelism_pass(
|
||||
args=(
|
||||
num_processes,
|
||||
test_model_cls,
|
||||
custom_ops,
|
||||
batch_size,
|
||||
seq_len,
|
||||
hidden_size,
|
||||
dtype,
|
||||
enable_fusion,
|
||||
dynamic,
|
||||
),
|
||||
nprocs=nprocs,
|
||||
)
|
||||
@ -236,11 +224,13 @@ def sequence_parallelism_pass_on_test_model(
|
||||
local_rank: int,
|
||||
world_size: int,
|
||||
test_model_cls: type[torch.nn.Module],
|
||||
custom_ops: str,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
enable_fusion: bool,
|
||||
dynamic: bool,
|
||||
):
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
@ -264,12 +254,16 @@ def sequence_parallelism_pass_on_test_model(
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
|
||||
# configure vllm config for SequenceParallelismPass
|
||||
custom_ops_list = custom_ops.split(",") if custom_ops else []
|
||||
compilation_config = CompilationConfig(
|
||||
splitting_ops=[], # avoid automatic rms_norm enablement
|
||||
cudagraph_mode=CUDAGraphMode.NONE, # avoid piecewise warnings
|
||||
custom_ops=custom_ops_list,
|
||||
pass_config=PassConfig(
|
||||
enable_sequence_parallelism=True,
|
||||
enable_fusion=enable_fusion,
|
||||
enable_noop=True,
|
||||
)
|
||||
),
|
||||
) # NoOp needed for fusion
|
||||
device_config = DeviceConfig(device=torch.device("cuda"))
|
||||
|
||||
@ -289,7 +283,6 @@ def sequence_parallelism_pass_on_test_model(
|
||||
with set_current_vllm_config(vllm_config):
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
assert (
|
||||
sequence_parallelism_pass.compilation_config.splitting_ops
|
||||
@ -310,38 +303,29 @@ def sequence_parallelism_pass_on_test_model(
|
||||
|
||||
passes_for_backend.append(cleanup_pass)
|
||||
|
||||
backend_no_func = TestBackend(*passes_for_backend)
|
||||
backend_func = TestBackend(*passes_for_backend, func_pass)
|
||||
backend = TestBackend(*passes_for_backend)
|
||||
|
||||
model = test_model_cls(hidden_size, hidden_size * 2)
|
||||
model = test_model_cls(hidden_size)
|
||||
|
||||
hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
|
||||
compiled_model_no_func = torch.compile(model, backend=backend_no_func)
|
||||
compiled_model_no_func(hidden_states, residual)
|
||||
compiled_model_func = torch.compile(model, backend=backend_func)
|
||||
compiled_model_func(hidden_states, residual)
|
||||
if dynamic:
|
||||
torch._dynamo.mark_dynamic(hidden_states, 0)
|
||||
|
||||
assert sequence_parallelism_pass.matched_count == 1
|
||||
compiled_model = torch.compile(model, backend=backend)
|
||||
compiled_model(hidden_states)
|
||||
|
||||
assert sequence_parallelism_pass.matched_count == 4
|
||||
|
||||
# In pre-nodes, all reduce should be there,
|
||||
# reduce scatter and all gather should not
|
||||
backend_no_func.check_before_ops(model.ops_in_model_before())
|
||||
for op in model.ops_in_model_before():
|
||||
assert backend.op_count(op, before=True) == 4
|
||||
|
||||
# In post-nodes, reduce scatter and all gather should be there,
|
||||
# all reduce should not
|
||||
backend_no_func.check_after_ops(model.ops_in_model_after())
|
||||
for op in model.ops_in_model_after():
|
||||
assert backend.op_count(op, before=False) == 4
|
||||
|
||||
# check if the functionalization pass is applied
|
||||
for op in model.ops_in_model():
|
||||
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
|
||||
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None
|
||||
|
||||
# make sure the ops were all de-functionalized
|
||||
found = dict()
|
||||
for node in backend_func.graph_post_pass.nodes:
|
||||
for op in model.ops_in_model():
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
assert all(found[op] for op in model.ops_in_model())
|
||||
find_auto_fn(backend.graph_post_pass.nodes, op)
|
||||
|
||||
@ -2,59 +2,134 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
|
||||
from vllm.config import CompilationMode
|
||||
from vllm.compilation.wrapper import TorchCompileWithNoGuardsWrapper
|
||||
from vllm.config import (
|
||||
CompilationConfig,
|
||||
CompilationMode,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
|
||||
|
||||
class MyMod(torch.nn.Module):
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
if cache is not None:
|
||||
return x + cache
|
||||
return x * 2
|
||||
if x.size()[0] >= 4:
|
||||
return x * 2
|
||||
else:
|
||||
return x * 100
|
||||
|
||||
|
||||
class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
|
||||
class MyWrapper(TorchCompileWithNoGuardsWrapper):
|
||||
def __init__(self, model):
|
||||
self.model = model
|
||||
compiled_callable = torch.compile(self.forward, backend="eager")
|
||||
super().__init__(
|
||||
compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
|
||||
super().__init__()
|
||||
|
||||
def forward(self, x: torch.Tensor): # type: ignore[override]
|
||||
# this is the function to be compiled
|
||||
return self.model(x)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
|
||||
"""Test basic functionality of TorchCompileWithNoGuardsWrapper."""
|
||||
# Set the environment variable for this test
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
|
||||
# Create a proper vLLM config instead of mocking
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.compilation_config = CompilationConfig()
|
||||
vllm_config.compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
|
||||
vllm_config.compilation_config.backend = "inductor"
|
||||
|
||||
# Test DYNAMO_TRACE_ONCE
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch._dynamo.reset()
|
||||
mod = MyMod()
|
||||
wrapper = MyWrapper(mod)
|
||||
|
||||
# First call should trigger compilation
|
||||
x = torch.tensor([1, 2, 3, 4])
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
result1 = wrapper(x)
|
||||
expected1 = torch.tensor([2, 4, 6, 8])
|
||||
assert torch.allclose(result1, expected1), (
|
||||
f"Expected {expected1}, got {result1}"
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
# this is the function to be compiled
|
||||
return self.model(x, cache)
|
||||
# Second call should use compiled code
|
||||
x2 = torch.tensor([1, 2, 3])
|
||||
result2 = wrapper(x2)
|
||||
expected2 = torch.tensor([2, 4, 6])
|
||||
assert torch.allclose(result2, expected2), (
|
||||
f"Expected {expected2}, got {result2}"
|
||||
)
|
||||
|
||||
def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
|
||||
# let torch.compile compile twice
|
||||
if len(self.compiled_codes) == 2:
|
||||
dispatch_id = 0 if cache is None else 1
|
||||
with self.dispatch_to_code(dispatch_id):
|
||||
return self.forward(x, cache)
|
||||
else:
|
||||
return self.compiled_callable(x, cache)
|
||||
# without the wrapper result would be different.
|
||||
result3 = mod(x2)
|
||||
expected3 = torch.tensor([100, 200, 300])
|
||||
|
||||
assert torch.allclose(result3, expected3), (
|
||||
f"Expected {result3}, got {expected3}"
|
||||
)
|
||||
|
||||
def test_torch_compile_wrapper():
|
||||
mod = MyMod()
|
||||
wrappers = []
|
||||
for i in range(3):
|
||||
torch._dynamo.reset()
|
||||
# with STOCK_TORCH_COMPILE we do not remove guards.
|
||||
vllm_config.compilation_config.mode = CompilationMode.STOCK_TORCH_COMPILE
|
||||
torch._dynamo.reset()
|
||||
with set_current_vllm_config(vllm_config):
|
||||
mod = MyMod()
|
||||
wrapper = MyWrapper(mod)
|
||||
wrappers.append(wrapper)
|
||||
x = torch.tensor([1])
|
||||
wrapper(x, None) # profile run, compile
|
||||
# create a cache tensor
|
||||
cache = torch.tensor([2])
|
||||
wrapper(x, cache) # warm up with cache, recompile
|
||||
|
||||
# for new input, dispatch to the compiled code directly
|
||||
new_x = torch.tensor([3])
|
||||
assert wrapper(new_x, None).item() == 6 # dispatch to the first compiled code
|
||||
assert wrapper(new_x, cache).item() == 5 # dispatch to the second compiled code
|
||||
# First call should trigger compilation
|
||||
x = torch.tensor([1, 2, 3, 4])
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
for wrapper in wrappers:
|
||||
# make sure they have independent compiled codes
|
||||
assert len(wrapper.compiled_codes) == 2
|
||||
result1 = wrapper(x)
|
||||
expected1 = torch.tensor([2, 4, 6, 8])
|
||||
assert torch.allclose(result1, expected1), (
|
||||
f"Expected {expected1}, got {result1}"
|
||||
)
|
||||
|
||||
# Second call should trigger another compilation
|
||||
x2 = torch.tensor([1, 2, 3])
|
||||
result2 = wrapper(x2)
|
||||
expected2 = torch.tensor([100, 200, 300])
|
||||
assert torch.allclose(result2, expected2), (
|
||||
f"Expected {expected2}, got {result2}"
|
||||
)
|
||||
|
||||
# NO_COMPILATION level not supported.
|
||||
vllm_config.compilation_config.mode = None
|
||||
torch._dynamo.reset()
|
||||
with set_current_vllm_config(vllm_config):
|
||||
torch._dynamo.reset()
|
||||
mod = MyMod()
|
||||
|
||||
try:
|
||||
wrapper = MyWrapper(mod)
|
||||
except Exception:
|
||||
return
|
||||
raise AssertionError("expected an exception to be raised")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the test once per VLLM_USE_BYTECODE_HOOK value.

    class MockMonkeypatch:
        """Stand-in for pytest's ``monkeypatch`` that writes to os.environ."""

        def setenv(self, name, value):
            os.environ[name] = value

    mp = MockMonkeypatch()

    for banner, use_hook in (
        ("Testing with VLLM_USE_BYTECODE_HOOK=False", False),
        ("Testing with VLLM_USE_BYTECODE_HOOK=True", True),
    ):
        print(banner)
        test_torch_compile_wrapper(use_hook, mp)

    print("All tests passed!")
|
||||
|
||||
437
tests/distributed/test_multiproc_executor.py
Normal file
437
tests/distributed/test_multiproc_executor.py
Normal file
@ -0,0 +1,437 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Integration tests for MultiprocExecutor at the executor level.
|
||||
This test directly tests the executor without going through the LLM interface,
|
||||
focusing on executor initialization, RPC calls, and distributed execution.
|
||||
"""
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import get_open_port
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
|
||||
|
||||
def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    distributed_executor_backend: str = "mp",
    nnodes: int = 1,
    node_rank: int = 0,
    master_port: int = 0,
) -> VllmConfig:
    """Build a VllmConfig for ``MODEL`` through EngineArgs.

    The engine arguments always set ``enforce_eager=True``. The node settings
    (``nnodes``, ``node_rank``, ``master_port``) are written onto
    ``parallel_config`` only when more than one node is requested or the node
    rank is non-zero; custom all-reduce is additionally disabled when there
    is more than one node.
    """
    vllm_config = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    ).create_engine_config()

    multi_node = nnodes > 1
    if not (multi_node or node_rank > 0):
        # Default single-node topology: nothing to override.
        return vllm_config

    # Override distributed node settings
    parallel_config = vllm_config.parallel_config
    parallel_config.nnodes = nnodes
    parallel_config.node_rank = node_rank
    parallel_config.master_port = master_port
    if multi_node:
        parallel_config.disable_custom_all_reduce = True

    return vllm_config
|
||||
|
||||
|
||||
def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput:
    """Return a SchedulerOutput with no scheduled requests and zero tokens.

    This is a simplified construction; a real SchedulerOutput would have to
    follow the actual vLLM v1 API. ``num_requests`` is accepted but is not
    used by the body.
    """
    empty_schedule = dict(
        scheduled_new_reqs=[],
        scheduled_resumed_reqs=[],
        scheduled_running_reqs=[],
        num_scheduled_tokens={},
        total_num_scheduled_tokens=0,
    )
    return SchedulerOutput(**empty_schedule)
|
||||
|
||||
|
||||
def test_multiproc_executor_initialization():
    """Test that MultiprocExecutor can be initialized with proper config."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )

    # Create executor - this should initialize workers
    executor = MultiprocExecutor(vllm_config=vllm_config)

    try:
        # Verify executor properties
        assert executor.world_size == 1, "World size should be 1 for single GPU"
        assert executor.local_world_size == 1, "Local world size should be 1"
        assert hasattr(executor, "workers"), "Executor should have workers"
        assert len(executor.workers) == 1, "Should have 1 worker for single GPU"
    finally:
        # Shut down even when an assertion fails so worker processes are not
        # left running for the tests that follow.
        executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_initialization_tensor_parallel():
    """Test MultiprocExecutor initialization with tensor parallelism."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        pipeline_parallel_size=1,
    )

    # Create executor
    executor = MultiprocExecutor(vllm_config=vllm_config)

    try:
        # Verify executor properties
        assert executor.world_size == 2, "World size should be 2 for TP=2"
        assert executor.local_world_size == 2, "Local world size should be 2"
        assert len(executor.workers) == 2, "Should have 2 workers for TP=2"

        # Verify output rank calculation
        output_rank = executor._get_output_rank()
        assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1"
    finally:
        # Shut down even when an assertion fails so worker processes are not
        # left running for the tests that follow.
        executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_collective_rpc():
    """Check that a health-check RPC succeeds on a TP=2 executor."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # The health check should complete without raising.
        executor.check_health()

        # Only the RPC mechanism is exercised here, not model execution.
        assert not executor.is_failed, "Executor should not be in failed state"
    finally:
        executor.shutdown()
|
||||
|
||||
|
||||
def test_multiproc_executor_failure_callback():
    """Check when a registered failure callback is invoked.

    A callback registered on a healthy executor must not run; one registered
    after ``is_failed`` has been set must run right away.
    """
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
        )
    )

    try:
        invocations = []

        def on_failure():
            invocations.append(True)

        # Registering on a healthy executor must not fire the callback.
        executor.register_failure_callback(on_failure)
        assert not invocations, "Callback should not be invoked immediately"

        # Simulate failure by flipping the flag directly.
        executor.is_failed = True

        # Registering again should now fire the callback immediately.
        executor.register_failure_callback(on_failure)
        assert len(invocations) == 1, (
            "Callback should be invoked when executor is failed"
        )
    finally:
        executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_worker_monitor():
    """Check worker processes are alive while running and gone after shutdown."""
    import time

    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # Every worker process must be running.
        for live_worker in executor.workers:
            assert live_worker.proc.is_alive(), (
                f"Worker rank {live_worker.rank} should be alive"
            )

        # The executor itself must not have been marked failed.
        assert not executor.is_failed, "Executor should not be in failed state"
    finally:
        executor.shutdown()

    # Give processes time to terminate before checking them.
    time.sleep(0.5)
    for stopped_worker in executor.workers:
        assert not stopped_worker.proc.is_alive(), (
            f"Worker rank {stopped_worker.rank} should terminate after shutdown"
        )
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_get_response_message_queues():
    """Check ``get_response_mqs`` with and without ``unique_reply_rank``."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
        )
    )

    try:
        # Without a rank filter there is one queue per worker.
        all_queues = executor.get_response_mqs()
        assert len(all_queues) == 2, "Should have 2 message queues for 2 workers"

        # With a rank filter exactly one queue is returned.
        per_rank_checks = (
            (0, "Should have 1 message queue for rank 0"),
            (1, "Should have 1 message queue for rank 1"),
        )
        for rank, failure_message in per_rank_checks:
            rank_queues = executor.get_response_mqs(unique_reply_rank=rank)
            assert len(rank_queues) == 1, failure_message
    finally:
        executor.shutdown()
|
||||
|
||||
|
||||
def test_multiproc_executor_shutdown_cleanup():
    """Check that shutdown stops workers, sets the event, and can be repeated."""
    import time

    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
        )
    )

    # The executor must have come up with at least one worker.
    assert hasattr(executor, "workers"), "Executor should have workers"
    assert len(executor.workers) > 0, "Should have at least one worker"

    executor.shutdown()

    # Give processes time to terminate before inspecting them.
    time.sleep(0.5)
    for stopped_worker in executor.workers:
        assert not stopped_worker.proc.is_alive(), (
            "Worker processes should be terminated"
        )

    assert executor.shutdown_event.is_set(), "Shutdown event should be set"

    # Multiple shutdowns should be safe (idempotent)
    for _ in range(2):
        executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_pipeline_parallel():
    """Check executor properties for a TP=2, PP=2 configuration."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            pipeline_parallel_size=2,
        )
    )

    try:
        assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2"
        assert len(executor.workers) == 4, "Should have 4 workers"

        # For TP=2, PP=2 the output comes from the last PP stage (ranks 2-3),
        # specifically rank 2, the first rank of that stage.
        assert executor._get_output_rank() == 2, (
            "Output rank should be 2 (first rank of last PP stage)"
        )

        # The number of concurrent batches is expected to match the PP size.
        assert executor.max_concurrent_batches == 2, (
            "Max concurrent batches should equal PP size"
        )
    finally:
        executor.shutdown()
|
||||
|
||||
|
||||
def test_multiproc_executor_properties():
    """Check ``supports_pp`` and the world-size values on a TP=1, PP=1 executor."""
    executor = MultiprocExecutor(
        vllm_config=create_vllm_config(
            tensor_parallel_size=1,
            pipeline_parallel_size=1,
        )
    )

    try:
        assert MultiprocExecutor.supports_pp is True, (
            "MultiprocExecutor should support pipeline parallelism"
        )

        parallel_config = executor.parallel_config

        # world_size is expected to be the product of the TP and PP sizes.
        expected_world_size = (
            parallel_config.tensor_parallel_size
            * parallel_config.pipeline_parallel_size
        )
        assert executor.world_size == expected_world_size, (
            "World size should equal TP * PP"
        )

        # local_world_size is expected to be the per-node share of world_size.
        expected_local_world_size = (
            parallel_config.world_size // parallel_config.nnodes
        )
        assert executor.local_world_size == expected_local_world_size, (
            "Local world size should be world_size / nnodes"
        )
    finally:
        executor.shutdown()
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_multi_node():
    """
    Test MultiprocExecutor with multi-node configuration.

    This simulates 2 nodes with TP=4:
    - Node 0 (rank 0): Uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2
    - Node 1 (rank 1): Uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2
    Total world_size = 4, nnodes = 2

    Each simulated node runs in its own process and reports a result dict
    through a shared queue; the parent then checks that both processes
    exited on their own and that both reported success.
    """
    from queue import Empty

    num_nodes = 2
    port = get_open_port()

    # symm_mem does not work for simulating multi instance in single node.
    # Remember the previous value so the override can be undone afterwards
    # instead of leaking into other tests in this process.
    symm_mem_env = "VLLM_ALLREDUCE_USE_SYMM_MEM"
    previous_symm_mem = os.environ.get(symm_mem_env)
    os.environ[symm_mem_env] = "0"

    def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int):
        """Run a single node's executor."""
        import time

        executor = None
        try:
            # Set CUDA_VISIBLE_DEVICES for this node
            if node_rank == 0:
                os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
            else:
                os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

            # Create config for this node
            vllm_config = create_vllm_config(
                tensor_parallel_size=4,  # Total TP across all nodes
                pipeline_parallel_size=1,
                nnodes=2,  # 2 nodes
                node_rank=node_rank,
                master_port=port,  # same port
            )

            # Create executor for this node
            executor = MultiprocExecutor(vllm_config=vllm_config)

            # Verify node-specific properties
            assert executor.world_size == 4, (
                f"World size should be 4 on node {node_rank}"
            )
            assert executor.local_world_size == 2, (
                f"Local world size should be 2 on node {node_rank}"
            )
            assert len(executor.workers) == 2, (
                f"Should have 2 local workers on node {node_rank}"
            )

            # Verify worker ranks are correct for this node
            expected_ranks = [node_rank * 2, node_rank * 2 + 1]
            actual_ranks = sorted(w.rank for w in executor.workers)
            assert actual_ranks == expected_ranks, (
                f"Node {node_rank} should have workers "
                f"with ranks {expected_ranks}, got {actual_ranks}"
            )

            # Verify all workers are alive
            for worker in executor.workers:
                assert worker.proc.is_alive(), (
                    f"Worker rank {worker.rank} should be alive on node {node_rank}"
                )

            # Put success result in queue BEFORE shutdown to avoid hanging
            result_queue.put({"node": node_rank, "success": True})
            time.sleep(2)
        except Exception as e:
            # Put failure result in queue, then re-raise with the original
            # traceback intact.
            result_queue.put({"node": node_rank, "success": False, "error": str(e)})
            raise
        finally:
            if executor is not None:
                executor.shutdown()

    # Create a queue to collect results from both processes
    result_queue: multiprocessing.Queue[dict[str, int | bool | str]] = (
        multiprocessing.Queue()
    )

    try:
        # Start both node processes
        processes = []
        for node_rank in range(num_nodes):
            p = multiprocessing.Process(
                target=run_node,
                args=(node_rank, result_queue, port),
                name=f"Node{node_rank}",
            )
            p.start()
            processes.append(p)

        # Wait for both processes to complete, escalating from terminate to
        # kill for any process that does not exit in time.
        all_completed = True
        for p in processes:
            p.join(timeout=60)
            if p.is_alive():
                p.terminate()
                p.join(timeout=20)
                if p.is_alive():
                    p.kill()
                    p.join()
                all_completed = False

        # Collect at most one result per node. The children have already
        # exited, so a bounded wait is enough; a node that died without
        # reporting must fail the test rather than hang it.
        results: list[dict[str, int | bool | str]] = []
        for _ in range(num_nodes):
            try:
                results.append(result_queue.get(timeout=5))
            except Empty:
                break
    finally:
        # Undo the environment override made above.
        if previous_symm_mem is None:
            os.environ.pop(symm_mem_env, None)
        else:
            os.environ[symm_mem_env] = previous_symm_mem

    assert all_completed, "Not all processes completed successfully"
    assert len(results) == num_nodes, f"Expected 2 results, got {len(results)}"

    # Results arrive in completion order, so index them by the reported node
    # instead of by queue position.
    results_by_node = {result["node"]: result for result in results}
    for node in range(num_nodes):
        assert node in results_by_node, f"Node {node} reported no result: {results}"
        assert results_by_node[node]["success"], (
            f"Node {node} failed: {results_by_node[node]}"
        )
|
||||
@ -18,6 +18,7 @@ import pytest
|
||||
from vllm.config.compilation import CompilationMode
|
||||
from vllm.config.model import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
@ -161,6 +162,7 @@ def _compare_sp(
|
||||
test_options: SPTestOptions,
|
||||
num_gpus_available: int,
|
||||
use_inductor_graph_partition: bool,
|
||||
enable_async_tp: bool,
|
||||
*,
|
||||
method: Literal["generate", "encode"],
|
||||
is_multimodal: bool,
|
||||
@ -244,10 +246,10 @@ def _compare_sp(
|
||||
|
||||
compilation_config = {
|
||||
"mode": CompilationMode.VLLM_COMPILE,
|
||||
"custom_ops": ["+rms_norm"],
|
||||
"compile_sizes": [4, 8],
|
||||
"pass_config": {
|
||||
"enable_sequence_parallelism": True,
|
||||
"enable_async_tp": enable_async_tp,
|
||||
"enable_fusion": enable_fusion,
|
||||
"enable_noop": True,
|
||||
},
|
||||
@ -307,6 +309,7 @@ SP_TEST_MODELS = [
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
|
||||
@pytest.mark.parametrize("enable_async_tp", [False]) # TODO: enable async TP
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_sp_generation(
|
||||
model_id: str,
|
||||
@ -316,10 +319,19 @@ def test_tp_sp_generation(
|
||||
test_options: SPTestOptions,
|
||||
num_gpus_available,
|
||||
use_inductor_graph_partition: bool,
|
||||
enable_async_tp: bool,
|
||||
):
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
# Skip FP8 SP-only test on sm89 (compute capability 8.9)
|
||||
if (
|
||||
"fp8" in model_id.lower()
|
||||
and current_platform.get_device_capability() < (9, 0)
|
||||
and (not enable_async_tp)
|
||||
):
|
||||
pytest.skip("FP8 reduction support begins with sm90 capable devices.")
|
||||
|
||||
_compare_sp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
@ -328,6 +340,7 @@ def test_tp_sp_generation(
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
use_inductor_graph_partition,
|
||||
enable_async_tp=enable_async_tp,
|
||||
method="generate",
|
||||
is_multimodal=False,
|
||||
)
|
||||
|
||||
@ -17,7 +17,7 @@ def chat_server_with_force_include_usage(request): # noqa: F811
|
||||
"128",
|
||||
"--enforce-eager",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
"4",
|
||||
"--enable-force-include-usage",
|
||||
"--port",
|
||||
"55857",
|
||||
@ -78,7 +78,7 @@ def transcription_server_with_force_include_usage():
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
"4",
|
||||
"--enforce-eager",
|
||||
"--enable-force-include-usage",
|
||||
"--gpu-memory-utilization",
|
||||
|
||||
@ -16,6 +16,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
from vllm import version
|
||||
|
||||
from ...conftest import LocalAssetServer
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODELS = {
|
||||
@ -69,7 +70,6 @@ async def client(server):
|
||||
|
||||
|
||||
_PROMPT = "Hello my name is Robert and I love magic"
|
||||
_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
|
||||
def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
|
||||
@ -250,6 +250,7 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_metrics_exist(
|
||||
local_asset_server: LocalAssetServer,
|
||||
server: RemoteOpenAIServer,
|
||||
client: openai.AsyncClient,
|
||||
model_key: str,
|
||||
@ -265,13 +266,21 @@ async def test_metrics_exist(
|
||||
temperature=0.0,
|
||||
)
|
||||
else:
|
||||
# https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg
|
||||
await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": _IMAGE_URL}},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": local_asset_server.url_for(
|
||||
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
),
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
}
|
||||
|
||||
262
tests/entrypoints/openai/test_serving_tokens.py
Normal file
262
tests/entrypoints/openai/test_serving_tokens.py
Normal file
@ -0,0 +1,262 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.v1.engine.detokenizer import check_stop_strings
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
GEN_ENDPOINT = "/inference/v1/generate"
|
||||
|
||||
|
||||
def get_vocab_size(model_name):
    """Return ``get_vocab_size()`` of a ModelConfig built for ``model_name``.

    The config is created with ``seed=0`` and ``dtype="bfloat16"``.
    """
    model_config = ModelConfig(model=model_name, seed=0, dtype="bfloat16")
    vocab_size = model_config.get_vocab_size()
    return vocab_size
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def tokenizer():
    """Module-scoped tokenizer loaded for ``MODEL_NAME``."""
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return loaded_tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def messages():
    """Module-scoped chat history: one system message and one user question."""
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": "How many countries are in the EU?"}
    return [system_message, user_message]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server(request):
    """Start a RemoteOpenAIServer for ``MODEL_NAME`` and yield it.

    Extra CLI arguments may be supplied through indirect parametrization
    (``request.param``): a list or tuple is appended element-wise, any other
    non-None value is appended as a single string.
    """
    cli_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
    ]

    extra_args = getattr(request, "param", None)
    if isinstance(extra_args, (list, tuple)):
        cli_args.extend(extra_args)
    elif extra_args is not None:
        cli_args.append(str(extra_args))

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    """Yield an httpx.AsyncClient bound to the test server.

    A Unix-domain-socket transport is used when ``server.uds`` is set;
    otherwise httpx picks its default transport. Every request carries the
    server's dummy API key as a bearer token.
    """
    uds_transport = None
    if server.uds:
        uds_transport = httpx.AsyncHTTPTransport(uds=server.uds)

    async with httpx.AsyncClient(
        transport=uds_transport,
        base_url=server.url_root,
        timeout=600,
        headers={"Authorization": f"Bearer {server.DUMMY_API_KEY}"},
    ) as http_client:
        yield http_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_generate_endpoint(client):
    """Smoke test: the generate endpoint returns a body with ``choices``."""
    request_body = {
        "model": MODEL_NAME,
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 5},
        "stream": False,
    }

    response = await client.post(GEN_ENDPOINT, json=request_body)
    response.raise_for_status()

    assert "choices" in response.json()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_same_response_as_chat_completions(client, tokenizer, messages):
    """Compare the generate endpoint with chat completions.

    The chat template is applied locally to obtain prompt token ids, which
    are sent to the generate endpoint; the returned token ids are decoded
    and compared with the text returned by ``/v1/chat/completions`` for the
    same messages. Both settings of ``ignore_eos`` are checked.
    """
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    for ignore_eos in (True, False):
        payload = {
            "model": MODEL_NAME,
            "token_ids": token_ids,
            "sampling_params": {
                "max_tokens": 24,
                "temperature": 0.0,
                # NOTE coordinator will set this to skip detokenization
                "detokenize": False,
                "ignore_eos": ignore_eos,
            },
            "stream": False,
        }
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        # Fail with the HTTP status instead of a KeyError on "choices" below.
        generate_resp.raise_for_status()
        generate_data = generate_resp.json()
        generate_res = tokenizer.decode(
            generate_data["choices"][0]["token_ids"], skip_special_tokens=True
        )

        payload = {
            "model": MODEL_NAME,
            "messages": messages,
            "max_tokens": 24,
            "temperature": 0.0,
            "stream": False,
            "ignore_eos": ignore_eos,
            "chat_template_kwargs": dict(enable_thinking=False),
        }
        completions_resp = await client.post("/v1/chat/completions", json=payload)
        completions_resp.raise_for_status()
        completions_data = completions_resp.json()
        completions_res = completions_data["choices"][0]["message"]["content"]

        assert generate_res == completions_res
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_stop_string_workflow(client, tokenizer, messages):
    """Exercise stop-string handling around the generate endpoint.

    A request combining ``detokenize=False`` with a stop string is expected
    to be rejected with an HTTP error. The same request without the stop
    string is then decoded locally, the stop string is applied with
    ``check_stop_strings``, and the truncated text is compared with the
    chat-completions output produced with the same stop string.
    """
    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    payload = {
        "model": MODEL_NAME,
        "token_ids": token_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
            # stop strings are only supported when detokenize is True.
            "stop": ["27 member"],
        },
        # TODO stream test is much more interesting
        "stream": False,
    }
    rejected_resp = await client.post(GEN_ENDPOINT, json=payload)
    # Only the status check is expected to raise, so keep the guarded body
    # down to that single call.
    with pytest.raises(httpx.HTTPStatusError):
        rejected_resp.raise_for_status()

    payload["sampling_params"]["stop"] = None
    generate_resp = await client.post(
        GEN_ENDPOINT, json=payload, headers={"X-Request-Id": "42"}
    )
    # Fail with the HTTP status instead of a KeyError on "choices" below.
    generate_resp.raise_for_status()
    generate_data = generate_resp.json()
    generate_res = tokenizer.decode(
        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
    )

    # NOTE This is under the responsibility of the coordinator
    # stop_checker = StopChecker(
    #     max_model_len=1024, get_tokenizer_for_seq=lambda _: tokenizer
    # )
    stop_str, truncate_to = check_stop_strings(
        generate_res, len(generate_res), ["27 member"], False
    )
    assert stop_str == "27 member"
    # abort request that hit stop string (requires tokens-only mode)
    # res = await client.post("/abort_requests", json={"request_ids": ["generate-tokens-42"]})  # noqa: E501
    # res.raise_for_status()
    generate_res = generate_res[:truncate_to]

    # Get stop_str response from chat completions
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "stop": ["27 member"],
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    completions_resp = await client.post("/v1/chat/completions", json=payload)
    completions_resp.raise_for_status()
    completions_data = completions_resp.json()
    completions_res = completions_data["choices"][0]["message"]["content"]
    assert generate_res == completions_res
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "server",
    [
        [
            "--enable-lora",
            "--lora-modules",
            "Alice=charent/self_cognition_Alice",
            "Bob=charent/self_cognition_Bob",
            "--max-lora-rank",
            "64",
            "--max-cpu-loras",
            "2",
        ]
    ],
    indirect=True,
)
async def test_generate_with_lora_adapter(client, tokenizer, messages):
    """Check that the tokens-in/tokens-out endpoint honors LoRA adapters.

    The test first confirms both adapters are listed by ``/v1/models``, then
    runs a smoke request against the ``Alice`` adapter, and finally compares a
    greedy generation through ``GEN_ENDPOINT`` with the same request sent to
    ``/v1/chat/completions``.
    """
    # Verify adapters are listed
    models_resp = await client.get("/v1/models")
    models_resp.raise_for_status()
    models = {m["id"] for m in models_resp.json().get("data", [])}
    assert {"Alice", "Bob"}.issubset(models)

    # Generate using a LoRA adapter by specifying its name as the model
    payload = {
        "model": "Alice",
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 5},
        "stream": False,
    }
    resp = await client.post(GEN_ENDPOINT, json=payload)
    resp.raise_for_status()
    data = resp.json()
    assert "choices" in data

    token_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    payload = {
        "model": "Alice",
        "token_ids": token_ids,
        "sampling_params": {
            "max_tokens": 24,
            "temperature": 0.0,
            "detokenize": False,
        },
        "stream": False,
    }
    generate_resp = await client.post(GEN_ENDPOINT, json=payload)
    # Fail on the HTTP status instead of an opaque KeyError on "choices" below.
    generate_resp.raise_for_status()
    generate_data = generate_resp.json()
    generate_res = tokenizer.decode(
        generate_data["choices"][0]["token_ids"], skip_special_tokens=True
    )

    payload = {
        "model": "Alice",
        "messages": messages,
        "max_tokens": 24,
        "temperature": 0.0,
        "stream": False,
        "chat_template_kwargs": dict(enable_thinking=False),
    }
    completions_resp = await client.post("/v1/chat/completions", json=payload)
    # Same reasoning as above: surface server errors directly.
    completions_resp.raise_for_status()
    completions_data = completions_resp.json()
    completions_res = completions_data["choices"][0]["message"]["content"]

    # Greedy decoding on both paths should yield the same text.
    assert generate_res == completions_res
|
||||
@ -40,8 +40,6 @@ NUM_EXPERTS = [8, 64]
|
||||
TOP_KS = [1, 2, 6]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -33,8 +33,6 @@ if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
# Test configurations
|
||||
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
|
||||
|
||||
@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (7, 0):
|
||||
pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
DTYPES = [torch.bfloat16]
|
||||
|
||||
|
||||
@ -42,8 +42,6 @@ MNK_FACTORS = [
|
||||
]
|
||||
|
||||
vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
|
||||
@ -7,6 +7,7 @@ fp8 block-quantized case.
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
from contextlib import contextmanager
|
||||
|
||||
import pytest
|
||||
import torch.distributed
|
||||
@ -14,6 +15,7 @@ from torch.distributed import ProcessGroup
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEQuantConfig,
|
||||
fp8_w8a8_moe_quant_config,
|
||||
@ -61,6 +63,23 @@ requires_deep_gemm = pytest.mark.skipif(
|
||||
P = ParamSpec("P")
|
||||
|
||||
|
||||
@contextmanager
def with_dp_metadata(M: int, world_size: int):
    """Run the enclosed code inside a forward context for expert-parallel DP.

    A fresh ``VllmConfig`` is created with ``data_parallel_size`` set to
    ``world_size`` and expert parallelism enabled; every rank is reported as
    holding ``M`` tokens.
    """
    # One entry per rank, each rank reporting the same token count.
    tokens_per_rank = torch.tensor([M] * world_size, device="cpu", dtype=torch.int)

    dp_config = VllmConfig()
    dp_config.parallel_config.data_parallel_size = world_size
    dp_config.parallel_config.enable_expert_parallel = True

    with set_forward_context(
        None,
        dp_config,
        num_tokens=M,
        num_tokens_across_dp=tokens_per_rank,
    ):
        yield
|
||||
|
||||
|
||||
def next_power_of_2(x):
|
||||
import math
|
||||
|
||||
@ -285,18 +304,21 @@ def deepep_deepgemm_moe_impl(
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
out = mk.forward(
|
||||
hidden_states=test_tensors.rank_tokens,
|
||||
w1=w1,
|
||||
w2=w2,
|
||||
topk_weights=test_tensors.topk_weights,
|
||||
topk_ids=test_tensors.topk,
|
||||
inplace=False,
|
||||
activation="silu",
|
||||
global_num_experts=num_experts,
|
||||
expert_map=build_expert_map(),
|
||||
apply_router_weight_on_input=False,
|
||||
)
|
||||
with with_dp_metadata(
|
||||
M=test_tensors.rank_tokens.size(0), world_size=pgi.world_size
|
||||
):
|
||||
out = mk.forward(
|
||||
hidden_states=test_tensors.rank_tokens,
|
||||
w1=w1,
|
||||
w2=w2,
|
||||
topk_weights=test_tensors.topk_weights,
|
||||
topk_ids=test_tensors.topk,
|
||||
inplace=False,
|
||||
activation="silu",
|
||||
global_num_experts=num_experts,
|
||||
expert_map=build_expert_map(),
|
||||
apply_router_weight_on_input=False,
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
|
||||
@ -45,8 +45,6 @@ MNK_FACTORS = [
|
||||
]
|
||||
|
||||
vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def quant_fp8_per_tensor_batches(a):
|
||||
@ -79,10 +77,14 @@ class TestData:
|
||||
|
||||
@staticmethod
|
||||
def make_moe_tensors_8bit(
|
||||
m: int, k: int, n: int, e: int, reorder: bool
|
||||
m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu"
|
||||
) -> "TestData":
|
||||
is_gated = activation != "relu2_no_mul"
|
||||
|
||||
hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
|
||||
w13 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16)
|
||||
w13 = torch.randn(
|
||||
(e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16
|
||||
)
|
||||
w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
# Scale to fp8
|
||||
@ -192,18 +194,22 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
|
||||
@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"])
|
||||
def test_flashinfer_cutlass_moe_fp8_no_graph(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
activation: str,
|
||||
monkeypatch,
|
||||
):
|
||||
current_platform.seed_everything(7)
|
||||
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
|
||||
with set_current_vllm_config(vllm_config):
|
||||
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=False)
|
||||
td = TestData.make_moe_tensors_8bit(
|
||||
m, k, n, e, reorder=False, activation=activation
|
||||
)
|
||||
|
||||
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
|
||||
topk_weights, topk_ids, _ = FusedMoE.select_experts(
|
||||
@ -235,7 +241,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
inplace=False,
|
||||
activation="silu",
|
||||
activation=activation,
|
||||
global_num_experts=e,
|
||||
expert_map=None,
|
||||
apply_router_weight_on_input=True,
|
||||
@ -255,7 +261,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
|
||||
td.layer,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
activation="silu",
|
||||
activation=activation,
|
||||
global_num_experts=e,
|
||||
expert_map=None,
|
||||
apply_router_weight_on_input=True,
|
||||
|
||||
@ -81,8 +81,6 @@ FUSED_MOE_WN16_MNK_FACTORS = [
|
||||
]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def run_moe_test(
|
||||
|
||||
@ -192,8 +192,6 @@ def pplx_cutlass_moe(
|
||||
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def _pplx_moe(
|
||||
|
||||
@ -81,8 +81,6 @@ TOP_KS = [1, 2, 6]
|
||||
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def torch_prepare(
|
||||
|
||||
@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
|
||||
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
|
||||
|
||||
@ -29,8 +29,6 @@ if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
# Test configurations
|
||||
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
|
||||
|
||||
@ -18,8 +18,6 @@ if current_platform.get_device_capability() < (7, 0):
|
||||
pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.scheduler_config.max_num_seqs = 128
|
||||
vllm_config.scheduler_config.max_model_len = 8192
|
||||
|
||||
DTYPES = [torch.half, torch.bfloat16]
|
||||
M = [1, 33, 64, 222]
|
||||
|
||||
169
tests/model_executor/test_eagle_quantization.py
Normal file
169
tests/model_executor/test_eagle_quantization.py
Normal file
@ -0,0 +1,169 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import LoadConfig, ModelConfig, SpeculativeConfig, VllmConfig
|
||||
from vllm.model_executor.models.utils import get_draft_quant_config
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Devices the parametrized tests run on: "cuda:0" only when exactly one GPU is
# reported, otherwise "cuda:0" and "cuda:1"; plain "cpu" on non-CUDA-like
# platforms.
DEVICES = (
    ["cpu"]
    if not current_platform.is_cuda_alike()
    else [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
)
|
||||
|
||||
|
||||
def test_get_draft_quant_config_with_draft_model():
    """A configured draft model yields the draft model's quantization config."""
    draft_model_config = Mock(spec=ModelConfig)
    load_config = Mock(spec=LoadConfig)

    speculative_config = Mock(spec=SpeculativeConfig)
    speculative_config.draft_model_config = draft_model_config

    vllm_config = Mock(spec=VllmConfig)
    vllm_config.speculative_config = speculative_config
    vllm_config.load_config = load_config

    expected_quant_config = Mock()
    with patch.object(
        VllmConfig, "get_quantization_config", return_value=expected_quant_config
    ) as patched_get_quant_config:
        result = get_draft_quant_config(vllm_config)

        # Verify the function calls get_quantization_config with draft model config
        patched_get_quant_config.assert_called_once_with(
            draft_model_config, load_config
        )

    assert result == expected_quant_config
|
||||
|
||||
|
||||
def test_get_draft_quant_config_without_draft_model():
    """Without a draft model config, no quantization config is returned."""
    speculative_config = Mock(spec=SpeculativeConfig)
    speculative_config.draft_model_config = None

    vllm_config = Mock(spec=VllmConfig)
    vllm_config.speculative_config = speculative_config
    vllm_config.load_config = Mock(spec=LoadConfig)

    assert get_draft_quant_config(vllm_config) is None
|
||||
|
||||
|
||||
@torch.inference_mode()
@pytest.mark.parametrize("device", DEVICES)
def test_fc_layer_quant_config_usage(dist_init, device) -> None:
    """Build a ``ReplicatedLinear`` with and without a quant config.

    Checks that the layer stores the ``quant_config`` it was given, keeps its
    input/output sizes, and that the unquantized layer produces an output of
    the expected shape.
    """
    # `torch` is already imported at module level (the decorator above uses
    # it), so only the layer class is imported locally.
    from vllm.model_executor.layers.linear import ReplicatedLinear

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)

    # NOTE(review): this changes the process-wide default device and is not
    # restored afterwards — confirm no other test relies on the default.
    torch.set_default_device(device)

    input_size = 256
    output_size = 128

    fc_no_quant = ReplicatedLinear(
        input_size=input_size,
        output_size=output_size,
        bias=False,
        params_dtype=torch.float16,
        quant_config=None,
        prefix="fc",
    )

    assert fc_no_quant.quant_config is None
    assert fc_no_quant.input_size == input_size
    assert fc_no_quant.output_size == output_size

    mock_quant_config = Mock()
    fc_with_quant = ReplicatedLinear(
        input_size=input_size,
        output_size=output_size,
        bias=False,
        params_dtype=torch.float16,
        quant_config=mock_quant_config,
        prefix="fc",
    )

    assert fc_with_quant.quant_config == mock_quant_config

    # Check forward pass
    x = torch.randn(2, input_size, dtype=torch.float16)
    output, _ = fc_no_quant(x)
    assert output.shape == (2, output_size)
|
||||
|
||||
|
||||
def test_kv_cache_scale_name_handling():
    """A quant config that knows the weight returns its KV-cache scale name."""
    expected_scale_name = "layers.0.self_attn.kv_scale"
    weight_name = "layers.0.self_attn.k_proj.weight"

    # Mock a quant config that supports cache scales
    quant_config = Mock()
    quant_config.get_cache_scale = Mock(return_value=expected_scale_name)

    # Condition check in load_weights
    resolved_name = quant_config.get_cache_scale(weight_name)

    # Check if get_cache_scale is called and returns expected value
    quant_config.get_cache_scale.assert_called_once_with(weight_name)
    assert resolved_name == expected_scale_name
|
||||
|
||||
|
||||
def test_kv_cache_scale_name_no_scale():
    """A weight without a cache scale resolves to ``None``."""
    # Mock a quant config that returns None for get_cache_scale
    mock_quant_config = Mock()
    mock_quant_config.get_cache_scale = Mock(return_value=None)

    name = "layers.0.mlp.gate_proj.weight"
    scale_name = mock_quant_config.get_cache_scale(name)

    # Verify the lookup was made with the weight name, matching the companion
    # test for weights that do have a cache scale.
    mock_quant_config.get_cache_scale.assert_called_once_with(name)
    # Should return None for weights that don't have cache scales
    assert scale_name is None
|
||||
|
||||
|
||||
def test_maybe_remap_kv_scale_name():
    """Smoke-test ``maybe_remap_kv_scale_name`` on a non-matching scale name."""
    from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name

    known_params = {
        "layers.0.self_attn.kv_scale": Mock(),
        "layers.1.self_attn.kv_scale": Mock(),
    }
    candidate = "layers.0.self_attn.some_scale"

    remapped = maybe_remap_kv_scale_name(candidate, known_params)

    # NOTE(review): deliberately permissive — accepts a known parameter name,
    # the unchanged input, or None; consider pinning the exact expected value.
    assert remapped in known_params or remapped == candidate or remapped is None
|
||||
|
||||
|
||||
def test_load_weights_kv_scale_handling():
    """Replay the KV-cache-scale branch of a ``load_weights`` loop on mocks.

    The quant config maps a k_proj weight name to a KV scale parameter name;
    the matching parameter is looked up and the loaded tensor is reduced to
    its first element when it is not already a scalar.
    """
    kv_scale_param = Mock()
    kv_scale_param.weight_loader = Mock()

    params_dict = {
        "layers.0.self_attn.kv_scale": kv_scale_param,
    }

    mock_quant_config = Mock()
    mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale")

    # Load_weights logic for KV cache scales
    name = "layers.0.self_attn.k_proj.weight"
    loaded_weight_tensor = torch.tensor([1.0, 2.0])

    scale_name = mock_quant_config.get_cache_scale(name)
    # Assert unconditionally: if the scale lookup ever came back empty, the
    # test must fail rather than skip every check.
    assert scale_name == "layers.0.self_attn.kv_scale"

    param = params_dict[scale_name]
    assert param is kv_scale_param

    weight_to_load = (
        loaded_weight_tensor
        if loaded_weight_tensor.dim() == 0
        else loaded_weight_tensor[0]
    )
    assert weight_to_load == loaded_weight_tensor[0]
|
||||
@ -348,9 +348,14 @@ def test_fp32_cache_state(
|
||||
|
||||
|
||||
# Helper functions for the APC tests
|
||||
def _get_vllm_runner_params(model, max_model_len, tensor_parallel_size=1):
|
||||
def _get_vllm_runner_params(
|
||||
model: str,
|
||||
max_model_len: int,
|
||||
tensor_parallel_size: int = 1,
|
||||
):
|
||||
return {
|
||||
"model_name": model,
|
||||
"enable_chunked_prefill": True,
|
||||
"enable_prefix_caching": False,
|
||||
"max_model_len": max_model_len,
|
||||
"tensor_parallel_size": tensor_parallel_size,
|
||||
|
||||
@ -11,7 +11,7 @@ from vllm import TokensPrompt
|
||||
["Qwen/Qwen3-0.6B"],
|
||||
)
|
||||
@torch.inference_mode
|
||||
def test_embed_models(hf_runner, vllm_runner, model: str):
|
||||
def test_extract_hidden_states(hf_runner, vllm_runner, model: str):
|
||||
n_prompt_tokens = [55, 56, 57]
|
||||
token_prompts = [[1024 + i for i in range(n)] for n in n_prompt_tokens]
|
||||
|
||||
@ -21,7 +21,7 @@ def test_embed_models(hf_runner, vllm_runner, model: str):
|
||||
enforce_eager=True,
|
||||
runner="pooling",
|
||||
enable_chunked_prefill=False,
|
||||
enable_prefix_caching=False,
|
||||
enable_prefix_caching=True,
|
||||
) as vllm_model:
|
||||
pooling_outputs = vllm_model.llm.encode(
|
||||
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
|
||||
@ -30,4 +30,29 @@ def test_embed_models(hf_runner, vllm_runner, model: str):
|
||||
|
||||
for n, output in zip(n_prompt_tokens, pooling_outputs):
|
||||
assert len(output.prompt_token_ids) == n
|
||||
assert len(output.outputs.data) == n
|
||||
assert output.num_cached_tokens == 0
|
||||
|
||||
# test enable_prefix_caching plus all pooling
|
||||
# we need to skip reading cache at this request by
|
||||
# request.skip_reading_prefix_cache
|
||||
pooling_outputs = vllm_model.llm.encode(
|
||||
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
|
||||
pooling_task="token_embed",
|
||||
)
|
||||
|
||||
for n, output in zip(n_prompt_tokens, pooling_outputs):
|
||||
assert len(output.prompt_token_ids) == n
|
||||
assert len(output.outputs.data) == n
|
||||
assert output.num_cached_tokens == 0
|
||||
|
||||
# skip_reading_prefix_cache can still write to cache
|
||||
# to accelerate following requests
|
||||
pooling_outputs = vllm_model.llm.encode(
|
||||
[TokensPrompt(prompt_token_ids=t) for t in token_prompts],
|
||||
pooling_task="embed",
|
||||
)
|
||||
|
||||
for n, output in zip(n_prompt_tokens, pooling_outputs):
|
||||
assert len(output.prompt_token_ids) == n
|
||||
assert output.num_cached_tokens > 0
|
||||
|
||||
@ -12,6 +12,7 @@ import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import (
|
||||
AutoModel,
|
||||
AutoModelForCausalLM,
|
||||
AutoModelForImageTextToText,
|
||||
AutoModelForTextToWaveform,
|
||||
)
|
||||
@ -691,6 +692,23 @@ VLM_TEST_SETTINGS = {
|
||||
patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
|
||||
hf_model_kwargs={"revision": "refs/pr/5"},
|
||||
),
|
||||
"paddleocr_vl": VLMTestInfo(
|
||||
models=["PaddlePaddle/PaddleOCR-VL"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
img_idx_to_prompt=lambda idx: (
|
||||
"<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
|
||||
),
|
||||
multi_image_prompt=(
|
||||
"Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
|
||||
"Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
|
||||
"Describe these two images separately."
|
||||
),
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForCausalLM,
|
||||
image_size_factors=[(), (0.25,)],
|
||||
),
|
||||
"phi3v": VLMTestInfo(
|
||||
models=["microsoft/Phi-3.5-vision-instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
|
||||
115
tests/models/multimodal/generation/test_multimodal_gguf.py
Normal file
115
tests/models/multimodal/generation/test_multimodal_gguf.py
Normal file
@ -0,0 +1,115 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Literal, NamedTuple
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
from pytest import MarkDecorator
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.utils.torch_utils import set_default_torch_num_threads
|
||||
|
||||
from ....conftest import PromptImageInput, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
class GGUFMMTestConfig(NamedTuple):
    """One multimodal GGUF checkpoint plus the inputs used to exercise it."""

    # Model id of the unquantized reference; also used as the tokenizer for
    # the GGUF run.
    original_model: str
    # Hub repository that hosts the GGUF files.
    gguf_repo: str
    # File name of the language-model weights inside `gguf_repo`.
    gguf_backbone: str
    # File name of the multimodal projector inside `gguf_repo`.
    gguf_mmproj: str
    # Text prompts sent to both models.
    prompt: list[str]
    # Multimodal inputs forwarded as keyword arguments to generation.
    mm_data: dict[Literal["images"], PromptImageInput]
    max_model_len: int = 4096
    # NOTE(review): this default list is shared by every instance that does
    # not pass `marks`; it must not be mutated.
    marks: list[MarkDecorator] = []

    @property
    def gguf_model(self):
        """Download both GGUF files and return the local backbone path."""
        # The projector's path is not used here; presumably it only needs to
        # be present in the local hub cache next to the backbone — confirm.
        hf_hub_download(self.gguf_repo, filename=self.gguf_mmproj)
        backbone_path = hf_hub_download(self.gguf_repo, filename=self.gguf_backbone)
        return backbone_path
|
||||
|
||||
|
||||
# Gemma 3 4B instruct: QAT q4_0 GGUF backbone with an f16 multimodal projector,
# prompted with a single stop-sign image.
GEMMA3_CONFIG = GGUFMMTestConfig(
    original_model="google/gemma-3-4b-it",
    gguf_repo="google/gemma-3-4b-it-qat-q4_0-gguf",
    gguf_backbone="gemma-3-4b-it-q4_0.gguf",
    gguf_mmproj="mmproj-model-f16-4B.gguf",
    prompt=["<start_of_image>Describe this image in detail:"],
    mm_data={"images": [ImageAsset("stop_sign").pil_image]},
    marks=[pytest.mark.core_model],
)

# Every configuration that `test_models` is parametrized over.
MODELS_TO_TEST = [GEMMA3_CONFIG]
|
||||
|
||||
|
||||
def run_multimodal_gguf_test(
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Compare greedy logprobs of a GGUF checkpoint with its original model.

    The GGUF model is run first and released before the unquantized model is
    loaded; both receive the same prompts and multimodal data.
    """

    def _greedy_logprobs(runner):
        # Identical request for both models so the outputs are comparable.
        return runner.generate_greedy_logprobs(
            prompts=model.prompt,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            **model.mm_data,
        )

    # Run gguf model.
    with (
        set_default_torch_num_threads(1),
        vllm_runner(
            model_name=model.gguf_model,
            enforce_eager=True,
            tokenizer_name=model.original_model,
            dtype=dtype,
            max_model_len=model.max_model_len,
        ) as gguf_runner,
    ):
        gguf_outputs = _greedy_logprobs(gguf_runner)

    # Run unquantized model.
    with vllm_runner(
        model_name=model.original_model,
        enforce_eager=True,  # faster tests
        dtype=dtype,
        max_model_len=model.max_model_len,
    ) as reference_runner:
        original_outputs = _greedy_logprobs(reference_runner)

    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not is_quant_method_supported("gguf"),
    reason="gguf is not supported on this GPU type.",
)
@pytest.mark.parametrize(
    "model",
    [pytest.param(cfg, marks=cfg.marks) for cfg in MODELS_TO_TEST],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(
    vllm_runner: type[VllmRunner],
    model: GGUFMMTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Run the GGUF-vs-original logprob comparison for each configured model."""
    run_multimodal_gguf_test(
        vllm_runner,
        model,
        dtype,
        max_tokens,
        num_logprobs,
    )
|
||||
@ -34,6 +34,7 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
def test_qwen2_5_vl_evs_functionality(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
@ -42,10 +43,14 @@ def test_qwen2_5_vl_evs_functionality(
|
||||
num_frames: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
use_bytecode_hook: bool,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""Test EVS (Efficient Video Sampling) functionality with different
|
||||
pruning rates.
|
||||
"""
|
||||
# Set the environment variable for this test
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
|
||||
# Sample frames from video assets
|
||||
sampled_vids = [
|
||||
@ -86,6 +91,7 @@ def test_qwen2_5_vl_evs_functionality(
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
def test_qwen2_5_vl_evs_batched_videos(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
@ -94,6 +100,8 @@ def test_qwen2_5_vl_evs_batched_videos(
|
||||
num_frames: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
use_bytecode_hook: bool,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""Test EVS functionality with batched videos.
|
||||
|
||||
@ -102,6 +110,8 @@ def test_qwen2_5_vl_evs_batched_videos(
|
||||
2. Both pruning configurations work with multiple videos
|
||||
3. The model doesn't crash when processing multiple videos simultaneously
|
||||
"""
|
||||
# Set the environment variable for this test
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
# Sample frames from video assets
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
|
||||
@ -78,6 +78,12 @@ DOLPHIN_CONFIG = GGUFTestConfig(
|
||||
gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf",
|
||||
)
|
||||
|
||||
GEMMA3_CONFIG = GGUFTestConfig(
|
||||
original_model="google/gemma-3-270m-it",
|
||||
gguf_repo="ggml-org/gemma-3-270m-it-qat-GGUF",
|
||||
gguf_filename="gemma-3-270m-it-qat-Q4_0.gguf",
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
|
||||
QWEN2_CONFIG,
|
||||
@ -85,6 +91,7 @@ MODELS = [
|
||||
GPT2_CONFIG,
|
||||
STABLELM_CONFIG,
|
||||
DOLPHIN_CONFIG,
|
||||
GEMMA3_CONFIG,
|
||||
# STARCODER_CONFIG, # broken
|
||||
]
|
||||
|
||||
@ -148,7 +155,7 @@ def check_model_outputs(
|
||||
"model",
|
||||
[pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
|
||||
@ -173,6 +173,10 @@ class _HfExamplesInfo:
|
||||
|
||||
_TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
# [Decoder-only]
|
||||
"AfmoeForCausalLM": _HfExamplesInfo(
|
||||
"arcee-ai/Trinity-Nano",
|
||||
is_available_online=False,
|
||||
),
|
||||
"ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"),
|
||||
"AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True),
|
||||
"AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True),
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test the functionality of the Transformers backend."""
|
||||
"""Test the functionality of the Transformers modeling backend."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
@ -85,7 +85,7 @@ def test_models(
|
||||
required = Version("5.0.0.dev")
|
||||
if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
|
||||
pytest.skip(
|
||||
"MoE models with the Transformers backend require "
|
||||
"MoE models with the Transformers modeling backend require "
|
||||
f"transformers>={required}, but got {installed}"
|
||||
)
|
||||
|
||||
|
||||
@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
|
||||
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("max_tokens", [8])
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
@pytest.mark.parametrize(
|
||||
"use_aiter", [True, False] if current_platform.is_rocm() else [False]
|
||||
@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(model_path, dtype=dtype) as vllm_model:
|
||||
with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
|
||||
# Test loading a quantized checkpoint
|
||||
compare_two_settings(
|
||||
"neuralmagic/Qwen2-1.5B-Instruct-FP8",
|
||||
[],
|
||||
["--cpu-offload-gb", "1"],
|
||||
["--enforce_eager"],
|
||||
["--enforce_eager", "--cpu-offload-gb", "1"],
|
||||
max_wait_seconds=480,
|
||||
)
|
||||
|
||||
@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
|
||||
# Test GPTQ Marlin
|
||||
compare_two_settings(
|
||||
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
|
||||
[],
|
||||
["--cpu-offload-gb", "1"],
|
||||
["--enforce_eager"],
|
||||
["--enforce_eager", "--cpu-offload-gb", "1"],
|
||||
max_wait_seconds=480,
|
||||
)
|
||||
|
||||
@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
|
||||
# Test AWQ Marlin
|
||||
compare_two_settings(
|
||||
"Qwen/Qwen2-1.5B-Instruct-AWQ",
|
||||
[],
|
||||
["--cpu-offload-gb", "1"],
|
||||
["--enforce_eager"],
|
||||
["--enforce_eager", "--cpu-offload-gb", "1"],
|
||||
max_wait_seconds=480,
|
||||
)
|
||||
|
||||
@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
|
||||
# Test wNa16
|
||||
compare_two_settings(
|
||||
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
|
||||
[],
|
||||
["--cpu-offload-gb", "1"],
|
||||
["--enforce_eager"],
|
||||
["--enforce_eager", "--cpu-offload-gb", "1"],
|
||||
max_wait_seconds=480,
|
||||
)
|
||||
|
||||
@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [10])
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
def test_model_experts_int8_startup(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
|
||||
with vllm_runner(
|
||||
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
@ -45,10 +45,10 @@ def test_model_load_and_run(
|
||||
if force_marlin:
|
||||
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
|
||||
|
||||
with vllm_runner(model_id) as llm:
|
||||
with vllm_runner(model_id, enforce_eager=True) as llm:
|
||||
# note: this does not test accuracy, just that we can run through
|
||||
# see lm-eval tests for accuracy
|
||||
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
|
||||
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
print(outputs[0][1])
|
||||
|
||||
|
||||
@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(
|
||||
|
||||
# `LLM.apply_model` requires pickling a function.
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
|
||||
with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
attn = model.model.layers[0].self_attn.attn
|
||||
@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run(
|
||||
|
||||
# note: this does not test accuracy, just that we can run through
|
||||
# see lm-eval tests for accuracy
|
||||
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
|
||||
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
print(outputs[0][1])
|
||||
|
||||
|
||||
@ -142,7 +142,10 @@ def test_load_fp16_model(
|
||||
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
|
||||
|
||||
with vllm_runner(
|
||||
"facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
|
||||
"facebook/opt-125m",
|
||||
quantization="fp8",
|
||||
enforce_eager=True,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
) as llm:
|
||||
|
||||
def check_model(model):
|
||||
|
||||
@ -26,7 +26,7 @@ DTYPE = ["bfloat16"]
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", DTYPE)
|
||||
def test_ipex_quant(vllm_runner, model, dtype):
|
||||
with vllm_runner(model, dtype=dtype) as llm:
|
||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
||||
with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
|
||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
|
||||
assert output
|
||||
print(output)
|
||||
|
||||
@ -49,4 +49,4 @@ def test_lm_head(
|
||||
|
||||
vllm_model.apply_model(check_model)
|
||||
|
||||
print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])
|
||||
print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user