[V0 Deprecation] Remove LLMEngine (#25033)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent 367a480bd3
commit 52c2a8d4ad
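
After this change, offline engine construction goes through the V1 engine only; the old vllm.engine.llm_engine.LLMEngine and its V0-only code paths are removed. A minimal sketch of the V1-only usage, based on the imports and calls that appear in the updated tests in this diff (the model name is just an illustration, and the final step() call is an assumption about the V1 engine keeping the same stepping API):

from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.llm_engine import LLMEngine  # the only LLMEngine left after this commit

# Build the engine from CLI-style arguments.
engine_args = EngineArgs(model="facebook/opt-125m", enforce_eager=True)
engine = LLMEngine.from_engine_args(engine_args=engine_args)

# Submit a request and advance the engine.
engine.add_request("0", "foo", SamplingParams(max_tokens=1))
engine.step()  # assumption: V1 LLMEngine exposes the same step() loop as the removed V0 engine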
@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi

if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi

if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi

@ -110,7 +110,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
@ -163,7 +163,6 @@ steps:
- tests/v1/engine/test_engine_core_client.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@ -314,12 +313,11 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@ -894,7 +892,7 @@ steps:
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py

.github/CODEOWNERS
@ -5,7 +5,6 @@
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill

@ -1,510 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from argparse import RawTextHelpFormatter
|
||||
from collections.abc import Generator
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any, Optional, TypeAlias
|
||||
|
||||
import torch
|
||||
import tqdm
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.profiler.layerwise_profile import layerwise_profile
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
BATCH_SIZE_DEFAULT = 1
|
||||
PROMPT_LEN_DEFAULT = 256
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProfileContext:
|
||||
engine_args: EngineArgs
|
||||
prompt_len: int
|
||||
batch_size: int
|
||||
|
||||
# The profiler can run in 2 modes,
|
||||
# 1. Run profiler for user specified num_steps
|
||||
num_steps: Optional[int] = None
|
||||
# 2. Run profiler until all requests complete
|
||||
complete_num_requests_per_step: Optional[int] = None
|
||||
|
||||
save_chrome_traces_folder: Optional[str] = None
|
||||
|
||||
|
||||
def get_dtype(dtype: str):
|
||||
if dtype == "torch.float":
|
||||
return torch.float
|
||||
else:
|
||||
return dtype
|
||||
|
||||
|
||||
OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
|
||||
|
||||
|
||||
def compute_request_output_lengths(
|
||||
batch_size: int, step_requests: list[int]
|
||||
) -> OutputLen_NumReqs_Map:
|
||||
"""
|
||||
Given the number of requests, batch_size, and the number of requests
|
||||
that each engine-step should process, step_requests, determine the
|
||||
output lengths of the requests such that step_request is honoured.
|
||||
|
||||
Example:
|
||||
if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
|
||||
then return,
|
||||
{2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
|
||||
32 requests should have output length 2,
|
||||
32 requests should have output length 3,
|
||||
32 requests should have output length 4,
|
||||
31 requests should have output length 5,
|
||||
1 request should have output length 6.
|
||||
|
||||
Args:
|
||||
batch_size (int): Number of requests submitted for profile. This is
|
||||
args.batch_size.
|
||||
step_requests (list[int]): step_requests[i] is the number of requests
|
||||
that the ith engine step should process.
|
||||
|
||||
Returns:
|
||||
OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
|
||||
number of requests required to have that output-length as values.
|
||||
"""
|
||||
ol_nr: OutputLen_NumReqs_Map = {}
|
||||
|
||||
# Number of request that are assigned an output-length
|
||||
num_reqs_assigned: int = 0
|
||||
num_steps: int = len(step_requests)
|
||||
|
||||
# sanity check. The first step (prefill-step), must process all requests.
|
||||
assert step_requests[0] == batch_size
|
||||
|
||||
# Begin assignments from the last step.
|
||||
output_length: int = num_steps
|
||||
for num_requests_at_step in reversed(step_requests):
|
||||
if num_reqs_assigned == batch_size:
|
||||
break
|
||||
|
||||
assert num_reqs_assigned < batch_size
|
||||
|
||||
# Remove the number of requests that have been determined
|
||||
# to participate in this step and beyond.
|
||||
num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
|
||||
assert num_reqs_unassigned_at_step >= 0
|
||||
|
||||
if num_reqs_unassigned_at_step > 0:
|
||||
ol_nr[output_length] = num_reqs_unassigned_at_step
|
||||
num_reqs_assigned += num_reqs_unassigned_at_step
|
||||
|
||||
output_length -= 1
|
||||
|
||||
# sanity checks.
|
||||
assert sum(ol_nr.values()) == batch_size, (
|
||||
"Number of requests in output-length assignment does not match "
|
||||
f"batch-size.\n batch size {batch_size} - "
|
||||
f"step requests {step_requests} - assignments {ol_nr}"
|
||||
)
|
||||
|
||||
# Check that the output-length is in [1, num-steps]. Output length must be
|
||||
# at least 1 as all requests must participate in the prefill-step.
|
||||
assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
|
||||
"Output lengths of requests should be in range "
|
||||
f"[1, num-engine-steps].\n batch size {batch_size} - "
|
||||
f"step requests {step_requests} - assignments {ol_nr}"
|
||||
)
|
||||
|
||||
return ol_nr
|
||||
|
||||
|
||||
def determine_requests_per_step(context: ProfileContext) -> list[int]:
|
||||
"""
|
||||
Determine number of requests each engine step should process.
|
||||
If context.num_steps is set, then all engine steps process the
|
||||
same number of requests and the output list is of length
|
||||
context.num_steps.
|
||||
|
||||
If context.complete_num_requests_per_step is set, then each decode step
|
||||
processes fewer and fewer requests until there are no requests to process.
|
||||
In this case, the output list is as big as the number of steps
|
||||
required to process all requests.
|
||||
|
||||
Args:
|
||||
context: ProfileContext object.
|
||||
|
||||
Returns:
|
||||
list[int]: Number of requests to process for all engine-steps.
|
||||
output[i], contains the number of requests that the ith step
|
||||
should process.
|
||||
"""
|
||||
if context.num_steps:
|
||||
# All requests must run until num_engine_steps. This implies
|
||||
# that their output lengths must be equal to num_engine_steps.
|
||||
return [context.batch_size] * context.num_steps
|
||||
|
||||
assert (
|
||||
context.complete_num_requests_per_step
|
||||
and context.complete_num_requests_per_step > 0
|
||||
), (
|
||||
f"Expected a positive complete_num_requests_per_step argument."
|
||||
f"Instead got {context.complete_num_requests_per_step}"
|
||||
)
|
||||
|
||||
# We start dropping after the first decode step.
|
||||
step_requests = [
|
||||
context.batch_size, # prefill
|
||||
context.batch_size, # decode
|
||||
]
|
||||
|
||||
num_running_requests = context.batch_size
|
||||
num_running_requests -= context.complete_num_requests_per_step
|
||||
while num_running_requests > 0:
|
||||
step_requests.append(num_running_requests)
|
||||
num_running_requests -= context.complete_num_requests_per_step
|
||||
|
||||
if step_requests[-1] != 1:
|
||||
# have 1 request running at the last step. This is often
|
||||
# useful
|
||||
step_requests.append(1)
|
||||
|
||||
return step_requests
|
||||
|
||||
|
||||
def run_profile(
|
||||
context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
|
||||
):
|
||||
print("Run profile with:")
|
||||
for key, value in asdict(context).items():
|
||||
print(f" {key} = {value}")
|
||||
|
||||
requests_per_step: list[int] = determine_requests_per_step(context)
|
||||
|
||||
ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
|
||||
context.batch_size, requests_per_step
|
||||
)
|
||||
|
||||
num_steps_to_profile: int = len(requests_per_step)
|
||||
max_output_len: int = max(ol_nr.keys())
|
||||
assert max_output_len >= 1
|
||||
|
||||
# Create sampling params
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
# max_tokens is set on a per-request basis.
|
||||
max_tokens=None,
|
||||
ignore_eos=True,
|
||||
)
|
||||
|
||||
# Create LLM
|
||||
llm = LLM(**asdict(context.engine_args))
|
||||
batch_size = context.batch_size
|
||||
prompt_len = context.prompt_len
|
||||
|
||||
scheduler_config = llm.llm_engine.vllm_config.scheduler_config
|
||||
max_model_len = llm.llm_engine.model_config.max_model_len
|
||||
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
|
||||
max_num_seqs = scheduler_config.max_num_seqs
|
||||
|
||||
if batch_size * prompt_len > max_num_batched_tokens:
|
||||
print(
|
||||
f"ERROR: chosen batch_size * prompt_len "
|
||||
f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
|
||||
f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
|
||||
f"and therefore cannot be run in a single profile step, please "
|
||||
f"choose a smaller batch size or prompt length, or increase "
|
||||
f"--max-num-batched-tokens"
|
||||
)
|
||||
sys.exit(-1)
|
||||
if batch_size > max_num_seqs:
|
||||
print(
|
||||
f"ERROR: chosen batch_size ({batch_size}) is larger than "
|
||||
f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
|
||||
f"single profile step, please choose a smaller batch size"
|
||||
)
|
||||
sys.exit(-1)
|
||||
print(
|
||||
"llm.llm_engine.model_config.max_model_len: ",
|
||||
llm.llm_engine.model_config.max_model_len,
|
||||
)
|
||||
if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
|
||||
print(
|
||||
f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
|
||||
f"{max_output_len} = {prompt_len + max_output_len}) is larger "
|
||||
f"than the model's max_model_len ({max_model_len}), please "
|
||||
f"choose a smaller prompt_len or max_output_len, or increase "
|
||||
f"--max-model-len"
|
||||
)
|
||||
sys.exit(-1)
|
||||
|
||||
def add_requests():
|
||||
def get_output_len_generator() -> Generator[int, Any, Any]:
|
||||
for output_len, num_reqs in ol_nr.items():
|
||||
for _ in range(num_reqs):
|
||||
yield output_len
|
||||
|
||||
output_len_generator = get_output_len_generator()
|
||||
for i in range(batch_size):
|
||||
sampling_params.max_tokens = next(output_len_generator)
|
||||
assert isinstance(sampling_params.max_tokens, int)
|
||||
|
||||
prompt_token_ids = torch.randint(
|
||||
llm.get_tokenizer().vocab_size, size=(prompt_len,)
|
||||
).tolist()
|
||||
|
||||
llm.llm_engine.add_request(
|
||||
request_id=f"seq{i}",
|
||||
prompt={"prompt_token_ids": prompt_token_ids},
|
||||
params=sampling_params,
|
||||
)
|
||||
|
||||
def abort_requests():
|
||||
for i in range(batch_size):
|
||||
llm.llm_engine.abort_request(f"seq{i}")
|
||||
|
||||
# Warm up run
|
||||
print("Warm up run ...")
|
||||
add_requests()
|
||||
llm.llm_engine.step() # Prefill
|
||||
llm.llm_engine.step() # Decode
|
||||
abort_requests()
|
||||
|
||||
print("Profile run ...")
|
||||
add_requests()
|
||||
|
||||
with layerwise_profile() as prefill_prof:
|
||||
llm.llm_engine.step() # First step is prefill
|
||||
|
||||
decode_profs = []
|
||||
for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
|
||||
num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
|
||||
with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
|
||||
llm.llm_engine.step()
|
||||
decode_profs.append(decode_prof)
|
||||
|
||||
decode_results_list = [prof.results for prof in decode_profs]
|
||||
prefill_results = prefill_prof.results
|
||||
has_decode = len(decode_results_list) > 0
|
||||
|
||||
LINE_WIDTH = 80
|
||||
print("=" * LINE_WIDTH)
|
||||
print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
|
||||
print("=" * LINE_WIDTH)
|
||||
print()
|
||||
prefill_results.print_model_table()
|
||||
|
||||
if has_decode:
|
||||
print()
|
||||
print("=" * LINE_WIDTH)
|
||||
print(
|
||||
f"= First Decode Step Model Table "
|
||||
f"(prompt_len={prompt_len}, batch_size={batch_size})"
|
||||
)
|
||||
print("=" * LINE_WIDTH)
|
||||
print()
|
||||
decode_results_list[0].print_model_table()
|
||||
|
||||
print()
|
||||
print("=" * LINE_WIDTH)
|
||||
print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
|
||||
print("=" * LINE_WIDTH)
|
||||
print()
|
||||
prefill_results.print_summary_table()
|
||||
|
||||
if has_decode:
|
||||
print()
|
||||
print("=" * LINE_WIDTH)
|
||||
print(
|
||||
f"= First Decode Step Summary Table "
|
||||
f"(prompt_len={prompt_len}, batch_size={batch_size})"
|
||||
)
|
||||
print("=" * LINE_WIDTH)
|
||||
print()
|
||||
decode_results_list[0].print_summary_table()
|
||||
|
||||
if csv_output:
|
||||
csv_filename_base = (
|
||||
csv_output[:-4] if csv_output.endswith(".csv") else csv_output
|
||||
)
|
||||
prefill_results.export_model_stats_table_csv(
|
||||
csv_filename_base + "_prefill_model_table.csv"
|
||||
)
|
||||
prefill_results.export_summary_stats_table_csv(
|
||||
csv_filename_base + "_prefill_summary_table.csv"
|
||||
)
|
||||
|
||||
if has_decode:
|
||||
decode_results_list[0].export_model_stats_table_csv(
|
||||
csv_filename_base + "_decode_model_table.csv"
|
||||
)
|
||||
decode_results_list[0].export_summary_stats_table_csv(
|
||||
csv_filename_base + "_decode_summary_table.csv"
|
||||
)
|
||||
|
||||
if json_output:
|
||||
cuda_devices = [
|
||||
torch.cuda.get_device_properties(dev_idx)
|
||||
for dev_idx in range(torch.cuda.device_count())
|
||||
]
|
||||
|
||||
json_dict = {
|
||||
"context": {
|
||||
"python_version": f"{sys.version}",
|
||||
"torch_version": f"{torch.__version__}",
|
||||
"torch_cuda_version": f"{torch.version.cuda}",
|
||||
"cuda_devices": f"{cuda_devices}",
|
||||
**asdict(context),
|
||||
},
|
||||
"prefill": prefill_results.convert_stats_to_dict(),
|
||||
}
|
||||
|
||||
if has_decode:
|
||||
for idx, dr in enumerate(decode_results_list):
|
||||
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
|
||||
|
||||
# Add .json to json_output filename if it doesn't exist already.
|
||||
json_output_file = (
|
||||
json_output if json_output.endswith(".json") else json_output + ".json"
|
||||
)
|
||||
with open(json_output_file, "w+") as f:
|
||||
json.dump(json_dict, f, indent=2)
|
||||
pass
|
||||
|
||||
if context.save_chrome_traces_folder is not None:
|
||||
os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
|
||||
prefill_prof.profiler.export_chrome_trace(
|
||||
context.save_chrome_traces_folder + "/prefill.json"
|
||||
)
|
||||
for idx, decode_prof in enumerate(decode_profs):
|
||||
decode_prof.profiler.export_chrome_trace(
|
||||
context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
|
||||
)
|
||||
print(
|
||||
"Traces saved as prefill.json and decode_1.json, etc."
|
||||
f" in folder {context.save_chrome_traces_folder}"
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="""
|
||||
Profile a model
|
||||
|
||||
example:
|
||||
```
|
||||
python examples/offline_inference/profiling.py \\
|
||||
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
|
||||
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
|
||||
--enforce-eager run_num_steps -n 2
|
||||
```
|
||||
|
||||
then you can use various tools to analyze the json output
|
||||
terminal ascii tables:
|
||||
```
|
||||
python tools/profiler/print_layerwise_table.py \\
|
||||
--json-trace Llama31-8b-FP8.json --phase prefill --table summary
|
||||
```
|
||||
or create matplotlib stacked bar charts:
|
||||
```
|
||||
python tools/profiler/visualize_layerwise_profile.py \\
|
||||
--json-trace Llama31-8b-FP8.json \\
|
||||
--output-directory profile_breakdown --plot-metric pct_cuda_time
|
||||
```
|
||||
""",
|
||||
formatter_class=RawTextHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--csv",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Export the results as multiple csv file. This should be the root "
|
||||
"filename, will create <filename>_prefill_model_table.csv, "
|
||||
"<filename>_prefill_summary_table.csv, "
|
||||
"<filename>_decode_model_table.csv, and "
|
||||
"<filename>_decode_summary_table.csv",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Export the results as a json file. This should be the filename",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-chrome-traces-folder",
|
||||
type=str,
|
||||
help="Save chrome traces for the prefill and decode "
|
||||
"will save traces as prefill.json and decode_1.json, "
|
||||
"etc. inside this folder",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prompt-len",
|
||||
type=int,
|
||||
default=PROMPT_LEN_DEFAULT,
|
||||
help=f"Length of the random prompt to use when profiling, all batched "
|
||||
f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=BATCH_SIZE_DEFAULT,
|
||||
help=f"Number of requests to run as a single batch, "
|
||||
f"default={BATCH_SIZE_DEFAULT}",
|
||||
)
|
||||
|
||||
subparsers = parser.add_subparsers(dest="cmd")
|
||||
|
||||
run_num_steps_parser = subparsers.add_parser(
|
||||
"run_num_steps", help="This variation profiles n engine.step() invocations."
|
||||
)
|
||||
run_num_steps_parser.add_argument(
|
||||
"-n",
|
||||
"--num-steps",
|
||||
type=int,
|
||||
help="Number of engine steps to profile.\n"
|
||||
"Setting it to 1, profiles only the prefill step.\n"
|
||||
"Setting it to 2, profiles the prefill and first decode step\n"
|
||||
"Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
|
||||
"and so on ...",
|
||||
)
|
||||
|
||||
run_to_completion_parser = subparsers.add_parser(
|
||||
"run_to_completion",
|
||||
help="This variation profiles all the engine.step() invocations"
|
||||
"until the engine exhausts all submitted requests.",
|
||||
)
|
||||
run_to_completion_parser.add_argument(
|
||||
"-n",
|
||||
"--complete-num-requests-per-step",
|
||||
type=int,
|
||||
help="Complete complete_num_requests_per_step requests every decode step."
|
||||
"For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
|
||||
"the profiler is run for 6 engine steps, with the steps processing, "
|
||||
"128, 128, 96, 64, 32, 1 requests respectively.\n"
|
||||
"Note that we tack-on a one-request step at the end as it is often "
|
||||
"useful.",
|
||||
)
|
||||
|
||||
EngineArgs.add_cli_args(parser)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
context = ProfileContext(
|
||||
engine_args=EngineArgs.from_cli_args(args),
|
||||
**{
|
||||
k: v
|
||||
for k, v in vars(args).items()
|
||||
if k in inspect.signature(ProfileContext).parameters
|
||||
},
|
||||
)
|
||||
run_profile(context, csv_output=args.csv, json_output=args.json)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -11,7 +11,7 @@ from unittest.mock import Mock
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, envs
|
||||
from vllm import LLM
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
|
||||
|
||||
from ..conftest import HfRunner, VllmRunner
|
||||
@ -26,14 +26,6 @@ MODELS = [
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def test_vllm_gc_ed():
|
||||
"""Verify vllm instance is GC'ed when it is deleted"""
|
||||
llm = LLM("distilbert/distilgpt2")
|
||||
@ -76,12 +68,6 @@ def test_models(
|
||||
model_executor: str,
|
||||
enable_prompt_embeds: bool,
|
||||
) -> None:
|
||||
if not envs.VLLM_USE_V1:
|
||||
if async_scheduling:
|
||||
pytest.skip("async_scheduling only supported in v1.")
|
||||
if model_executor != "uni":
|
||||
pytest.skip("only test uniproc executor for v0.")
|
||||
|
||||
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
|
||||
pytest.skip(
|
||||
f"{backend} does not support gemma2 with full context length.")
|
||||
|
||||
@ -122,11 +122,12 @@ def test_cumem_with_cudagraph():
|
||||
# sleep mode with safetensors
|
||||
("meta-llama/Llama-3.2-1B", True),
|
||||
# sleep mode with pytorch checkpoint
|
||||
("facebook/opt-125m", False),
|
||||
("facebook/opt-125m", True),
|
||||
])
|
||||
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
assert use_v1
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
free, total = torch.cuda.mem_get_info()
|
||||
used_bytes_baseline = total - free # in case other process is running
|
||||
llm = LLM(model, enable_sleep_mode=True)
|
||||
|
||||
@ -54,8 +54,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
|
||||
# Use global backends
|
||||
global backend, backend_unfused
|
||||
|
||||
use_v1 = False # can be made a param once V1 support added
|
||||
monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1)))
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))
|
||||
|
||||
# Prompt 4 seems too open-ended, differs between fused and unfused
|
||||
|
||||
@ -160,26 +160,6 @@ def cleanup_VLLM_USE_V1(monkeypatch):
|
||||
monkeypatch.delenv("VLLM_USE_V1")
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def run_with_both_engines(request, monkeypatch):
|
||||
# Automatically runs tests twice, once with V1 and once without
|
||||
use_v1 = request.param
|
||||
# Tests decorated with `@skip_v1` are only run without v1
|
||||
skip_v0 = request.node.get_closest_marker("skip_v0")
|
||||
skip_v1 = request.node.get_closest_marker("skip_v1")
|
||||
|
||||
if use_v1:
|
||||
if skip_v1:
|
||||
pytest.skip("Skipping test on vllm V1")
|
||||
monkeypatch.setenv('VLLM_USE_V1', '1')
|
||||
else:
|
||||
if skip_v0:
|
||||
pytest.skip("Skipping test on vllm V0")
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def init_test_http_connection():
|
||||
# pytest_asyncio may use a different event loop per test
|
||||
|
||||
@ -25,12 +25,6 @@ TOKEN_IDS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
"""We can run both engines for this test."""
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
|
||||
@ -6,14 +6,6 @@ import pytest
|
||||
from vllm import LLM
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def test_empty_prompt():
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
|
||||
|
||||
@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
|
||||
"--port",
|
||||
port,
|
||||
],
|
||||
env={"VLLM_USE_V1": "1" if use_v1 else "0"})
|
||||
env={"VLLM_USE_V1": "1"})
|
||||
|
||||
def is_server_up(url):
|
||||
try:
|
||||
|
||||
@ -69,28 +69,20 @@ def generate_params():
|
||||
|
||||
@pytest.mark.parametrize("device, name, use_mla, block_size",
|
||||
generate_params())
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_env(
|
||||
device: str,
|
||||
name: str,
|
||||
use_mla: bool,
|
||||
block_size: int,
|
||||
use_v1: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Test attention backend selection with valid device-backend pairs."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv(STR_BACKEND_ENV_VAR, name)
|
||||
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
|
||||
|
||||
if name == "FLASHINFER" and not use_v1:
|
||||
pytest.skip("FlashInfer backend is only available on V1 engine")
|
||||
|
||||
if device == "cpu":
|
||||
if not use_v1:
|
||||
pytest.skip("CPU backend only supports V1")
|
||||
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float16, None, block_size,
|
||||
@ -137,7 +129,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = f"{name}_VLLM_V1" if use_v1 else name
|
||||
expected = f"{name}_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(16,
|
||||
@ -146,7 +138,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||
expected = "TRITON_ATTN_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
|
||||
elif device == "cuda":
|
||||
@ -163,11 +155,7 @@ def test_env(
|
||||
# - TRITON_MLA: fallback for other cases
|
||||
|
||||
if name == "CUTLASS_MLA":
|
||||
if not use_v1:
|
||||
# CUTLASS_MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"CUTLASS_MLA only supported on V1 engine")
|
||||
elif block_size != 128:
|
||||
if block_size != 128:
|
||||
# CUTLASS_MLA only supports block_size == 128
|
||||
pytest.skip(
|
||||
"CUTLASS_MLA only supports block_size 128")
|
||||
@ -181,11 +169,7 @@ def test_env(
|
||||
expected = "CUTLASS_MLA_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASHINFER_MLA":
|
||||
if not use_v1:
|
||||
# FlashInfer MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"FlashInfer MLA only supported on V1 engine")
|
||||
elif block_size not in [32, 64]:
|
||||
if block_size not in [32, 64]:
|
||||
# FlashInfer MLA only supports block_size 32 or 64
|
||||
pytest.skip(
|
||||
"FlashInfer MLA only supports block_size 32 "
|
||||
@ -217,23 +201,17 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = f"{name}_VLLM_V1" if use_v1 else name
|
||||
expected = f"{name}_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASH_ATTN_MLA":
|
||||
if not use_v1:
|
||||
# FlashAttention MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"FlashAttention MLA only supported on V1 engine"
|
||||
)
|
||||
else:
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_MLA"
|
||||
assert backend.get_name() == expected
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_MLA"
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
# TRITON_MLA or other fallback
|
||||
backend = get_attn_backend(16,
|
||||
@ -242,8 +220,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = ("TRITON_MLA_VLLM_V1"
|
||||
if use_v1 else "TRITON_MLA")
|
||||
expected = "TRITON_MLA_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASHINFER":
|
||||
backend = get_attn_backend(16,
|
||||
@ -252,7 +229,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
|
||||
expected = "FLASHINFER_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(32,
|
||||
@ -261,36 +238,30 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
||||
expected = "FLASH_ATTN_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
|
||||
if use_v1:
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_fp32_fallback(
|
||||
device: str,
|
||||
use_v1: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Test attention backend selection with fp32."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
if device == "cpu":
|
||||
if not use_v1:
|
||||
pytest.skip("CPU backend only supports V1")
|
||||
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, None, 16, False)
|
||||
@ -300,8 +271,7 @@ def test_fp32_fallback(
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CudaPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, None, 16, False)
|
||||
assert (backend.get_name() == "FLEX_ATTENTION"
|
||||
if use_v1 else "XFORMERS")
|
||||
assert backend.get_name() == "FLEX_ATTENTION"
|
||||
|
||||
|
||||
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
@ -357,12 +327,11 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
|
||||
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test that invalid attention backend names raise ValueError."""
|
||||
with monkeypatch.context() as m, patch(
|
||||
"vllm.attention.selector.current_platform", CudaPlatform()):
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
|
||||
|
||||
# Should raise ValueError for invalid backend
|
||||
|
||||
@ -6,10 +6,10 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions.
|
||||
import pytest
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
|
||||
|
||||
@ -15,7 +15,8 @@ from ...utils import check_logprobs_close
|
||||
# have a clean way to fall back, so we fail with
|
||||
# a clear msg when it happens.
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
# NOTE(woosuk): Skipping these tests until V1 supports them.
|
||||
# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
|
||||
# This list contains the model that are using AITER kernel.
|
||||
# Skip model that are not using AITER tests.
|
||||
@ -113,9 +114,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
if model in REQUIRES_V0:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
if use_rocm_aiter and (model in AITER_MODEL_LIST):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
|
||||
|
||||
@ -8,7 +8,7 @@ from tests.utils import multi_gpu_test
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ...utils import check_logprobs_close, check_outputs_equal
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# Mark all tests as hybrid
|
||||
pytestmark = pytest.mark.hybrid_model
|
||||
@ -88,15 +88,6 @@ def test_models(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
if model not in V0_UNSUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
vllm_v0_outputs = None
|
||||
|
||||
if model in V1_SUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
@ -104,14 +95,6 @@ def test_models(
|
||||
else:
|
||||
vllm_v1_outputs = None
|
||||
|
||||
if vllm_v0_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
if model in V1_SUPPORTED_MODELS:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@ -157,45 +140,6 @@ def test_batching(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
|
||||
def test_chunked_prefill(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
chunked_prefill_token_size: int,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
max_num_seqs = chunked_prefill_token_size
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_num_seqs=max_num_seqs) as vllm_model:
|
||||
chunked = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model,
|
||||
enable_chunked_prefill=False,
|
||||
max_num_seqs=max_num_seqs) as vllm_model:
|
||||
non_chunked = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=chunked,
|
||||
outputs_1_lst=non_chunked,
|
||||
name_0="chunked",
|
||||
name_1="non_chunked",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [10])
|
||||
def test_chunked_prefill_with_parallel_sampling(
|
||||
@ -257,38 +201,6 @@ def test_mamba_cache_cg_padding(
|
||||
"Could be related to mamba cache not padded correctly")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [20])
|
||||
def test_models_preemption_recompute(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""
|
||||
Tests that outputs are identical with and w/o preemptions (recompute).
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
scheduler = vllm_model.llm.llm_engine.scheduler[0]
|
||||
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
|
||||
preempt_vllm_outputs = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=preempt_vllm_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="vllm_preepmtions",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
|
||||
vllm_runner,
|
||||
@ -386,27 +298,10 @@ def test_full_cuda_graph(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
if model not in V0_UNSUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
vllm_v0_outputs = None
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
if vllm_v0_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
@ -442,27 +337,12 @@ def test_fp32_cache_state(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -82,7 +81,7 @@ def test_prm_models(
|
||||
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
|
||||
max_transformers_version="4.53.2")
|
||||
|
||||
if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
|
||||
if current_platform.is_cpu():
|
||||
pytest.skip("CPU only supports V1")
|
||||
|
||||
if current_platform.is_rocm():
|
||||
|
||||
@ -36,9 +36,6 @@ from ..utils import check_logprobs_close
|
||||
# NOTE: Increasing this in this suite will fail CI because we currently cannot
|
||||
# reset distributed env properly. Use a value > 1 just when you test.
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
# Due to low-precision numerical divergence, this test is too sensitive for
|
||||
# the async postprocessor
|
||||
@pytest.mark.parametrize("disable_async_output_proc", [True])
|
||||
def test_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@ -49,7 +46,6 @@ def test_models(
|
||||
enforce_eager: bool,
|
||||
backend: str,
|
||||
tensor_parallel_size: int,
|
||||
disable_async_output_proc: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""
|
||||
@ -74,7 +70,6 @@ def test_models(
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype="auto",
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -85,7 +80,6 @@ def test_models(
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -110,9 +104,6 @@ def test_models(
|
||||
])
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
# Due to low-precision numerical divergence, this test is too sensitive for
|
||||
# the async postprocessor
|
||||
@pytest.mark.parametrize("disable_async_output_proc", [True])
|
||||
def test_cpu_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@ -120,7 +111,6 @@ def test_cpu_models(
|
||||
base_model: str,
|
||||
test_model: str,
|
||||
max_tokens: int,
|
||||
disable_async_output_proc: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""
|
||||
@ -138,7 +128,6 @@ def test_cpu_models(
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype="auto",
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -148,7 +137,6 @@ def test_cpu_models(
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
|
||||
@ -7,7 +7,6 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
|
||||
from vllm.v1.engine.core import EngineCore as V1EngineCore
|
||||
@ -61,10 +60,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
False))
|
||||
|
||||
# Avoid calling model.forward()
|
||||
def _initialize_kv_caches_v0(self) -> None:
|
||||
self.cache_config.num_gpu_blocks = 0
|
||||
self.cache_config.num_cpu_blocks = 0
|
||||
|
||||
def _initialize_kv_caches_v1(self, vllm_config):
|
||||
kv_cache_specs = self.model_executor.get_kv_cache_specs()
|
||||
scheduler_kv_cache_config = get_kv_cache_configs(
|
||||
@ -76,12 +71,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
|
||||
return 1, 0, scheduler_kv_cache_config
|
||||
|
||||
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v0),
|
||||
patch.object(V1EngineCore, "_initialize_kv_caches",
|
||||
with (patch.object(V1EngineCore, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v1), monkeypatch.context() as m):
|
||||
if model_info.v0_only:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
# NOTE(woosuk): skip the test for V0-only models
|
||||
return
|
||||
|
||||
if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
|
||||
# Phi4FlashForCausalLM and MotifForCausalLM
|
||||
# only supports DIFFERENTIAL_FLASH_ATTN backend
|
||||
|
||||
@ -42,6 +42,7 @@ def test_oot_registration_text_generation(
|
||||
assert rest == ""
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_embedding(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
|
||||
@ -7,15 +7,6 @@ import torch
|
||||
from vllm.plugins import load_general_plugins
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||
all tests in the module.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
def test_platform_plugins():
|
||||
# simulate workload by running an example
|
||||
import runpy
|
||||
|
||||
@ -3,47 +3,18 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
|
||||
|
||||
class DummyV0Scheduler(Scheduler):
|
||||
|
||||
def schedule(self):
|
||||
raise Exception("Exception raised by DummyV0Scheduler")
|
||||
|
||||
|
||||
class DummyV1Scheduler(V1Scheduler):
|
||||
class DummyV1Scheduler(Scheduler):
|
||||
|
||||
def schedule(self):
|
||||
raise Exception("Exception raised by DummyV1Scheduler")
|
||||
|
||||
|
||||
def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with pytest.raises(Exception) as exception_info:
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="facebook/opt-125m",
|
||||
enforce_eager=True, # reduce test time
|
||||
scheduler_cls=DummyV0Scheduler,
|
||||
)
|
||||
|
||||
engine = LLMEngine.from_engine_args(engine_args=engine_args)
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
engine.add_request("0", "foo", sampling_params)
|
||||
engine.step()
|
||||
|
||||
assert str(
|
||||
exception_info.value) == "Exception raised by DummyV0Scheduler"
|
||||
|
||||
|
||||
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
@ -59,7 +30,7 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
|
||||
scheduler_cls=DummyV1Scheduler,
|
||||
)
|
||||
|
||||
engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
|
||||
engine = LLMEngine.from_engine_args(engine_args=engine_args)
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
engine.add_request("0", "foo", sampling_params)
|
||||
|
||||
@ -10,13 +10,6 @@ from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
"""We can run both engines for this test."""
|
||||
pass
|
||||
|
||||
|
||||
# FIXME(zhuohan): The test can not pass if we:
|
||||
# 1. Increase max_tokens to 256.
|
||||
# 2. Increase beam_width to 8.
|
||||
|
||||
@ -9,13 +9,6 @@ import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
"""We can run both engines for this test."""
|
||||
pass
|
||||
|
||||
|
||||
# We also test with llama because it has generation_config to specify EOS
|
||||
# (past regression).
|
||||
MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]
|
||||
|
||||
@ -8,12 +8,6 @@ from vllm import SamplingParams
|
||||
MODELS = ["distilbert/distilgpt2"]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
"""We can run both engines for this test."""
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_ranks(
|
||||
|
||||
@ -352,58 +352,3 @@ def test_decode_prompt_logprobs(complete_sequence: str,
|
||||
logprobs[token_id + 1].decoded_token
|
||||
for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
|
||||
])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
|
||||
def test_decode_prompt_logprobs_chunked_prefill(
|
||||
vllm_runner,
|
||||
model,
|
||||
chunked_prefill_token_size: int,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
):
|
||||
# VLLM V1 does not use incremental detokenization for
|
||||
# prompt logprobs, so this test strategy is irrelevant.
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
max_num_seqs = 256
|
||||
enable_chunked_prefill = False
|
||||
max_num_batched_tokens = None
|
||||
if chunked_prefill_token_size != -1:
|
||||
enable_chunked_prefill = True
|
||||
max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
with vllm_runner(model,
|
||||
dtype="half",
|
||||
max_logprobs=5,
|
||||
gpu_memory_utilization=0.5,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_num_seqs=max_num_seqs) as vllm_model:
|
||||
|
||||
vllm_sampling_params = SamplingParams(max_tokens=10,
|
||||
logprobs=5,
|
||||
prompt_logprobs=5,
|
||||
temperature=0.0)
|
||||
vllm_results = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=vllm_sampling_params)
|
||||
|
||||
for idx, result in enumerate(vllm_results):
|
||||
assert result.prompt_logprobs is not None
|
||||
assert result.prompt_logprobs[0] is None
|
||||
|
||||
# Compared detokenized prompts ids to original prompt.
|
||||
generated_string = ""
|
||||
for (prompt_token,
|
||||
prompt_logprobs) in zip(result.prompt_token_ids[1:],
|
||||
result.prompt_logprobs[1:]):
|
||||
# prompt_logprobs is a dict of the token_id: logprob
|
||||
# We select the token_id corresponding to the actual prompt
|
||||
# Decoded token in the detokenized string corresponding to this
|
||||
# prompt token.
|
||||
generated_string += prompt_logprobs[prompt_token].decoded_token
|
||||
|
||||
assert generated_string == example_prompts[idx], (
|
||||
"Detokenized prompt logprobs do not match original prompt")
|
||||
|
||||
@ -1508,14 +1508,6 @@ class EngineArgs:
|
||||
recommend_to_remove=True)
|
||||
return False
|
||||
|
||||
if self.kv_cache_dtype != "auto":
|
||||
supported = current_platform.is_kv_cache_dtype_supported(
|
||||
self.kv_cache_dtype, model_config)
|
||||
if not supported:
|
||||
_raise_or_fallback(feature_name="--kv-cache-dtype",
|
||||
recommend_to_remove=False)
|
||||
return False
|
||||
|
||||
# No Mamba or Encoder-Decoder so far.
|
||||
if not model_config.is_v1_compatible:
|
||||
_raise_or_fallback(feature_name=model_config.architectures,
|
||||
|
||||
File diff suppressed because it is too large
@ -11,7 +11,6 @@ from pydantic import ValidationError
from tqdm.auto import tqdm
from typing_extensions import TypeVar

import vllm.envs as envs
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
BeamSearchSequence,
create_sort_beams_key_function)
@ -19,7 +18,6 @@ from vllm.config import (CompilationConfig, ModelDType,
StructuredOutputsConfig, TokenizerMode, is_init_field)
from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
PoolerConfig, RunnerOption)
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
apply_hf_chat_template,
@ -54,6 +52,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
get_cached_tokenizer)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, Device, as_iter, is_list_of
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.sample.logits_processor import LogitsProcessor

if TYPE_CHECKING:
@ -309,11 +308,7 @@ class LLM:
self.request_counter = Counter()
self.default_sampling_params: Union[dict[str, Any], None] = None

if envs.VLLM_USE_V1:
supported_tasks = self.llm_engine \
.get_supported_tasks() # type: ignore
else:
supported_tasks = self.llm_engine.model_config.supported_tasks
supported_tasks = self.llm_engine.get_supported_tasks() # type: ignore

logger.info("Supported_tasks: %s", supported_tasks)

@ -1473,8 +1468,6 @@ class LLM:
Note:
This method is only available with the V1 LLM engine.
"""
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
assert isinstance(self.llm_engine, V1LLMEngine)
return self.llm_engine.get_metrics()

def _validate_and_add_requests(

@ -672,21 +672,15 @@ def tensorize_vllm_model(engine_args: "EngineArgs",
) as stream:
stream.write(encryption_params.key)

from vllm import LLMEngine
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
assert envs.VLLM_USE_V1

if not envs.VLLM_USE_V1:
engine = LLMEngine.from_engine_args(engine_args)
engine.model_executor.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)
else:
engine = V1LLMEngine.from_vllm_config(engine_config)
engine.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)
from vllm.v1.engine.llm_engine import LLMEngine

engine = LLMEngine.from_vllm_config(engine_config)
engine.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)


def tensorize_lora_adapter(lora_path: str,