mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-08 15:07:04 +08:00
Merge branch 'main' into woosuk/model-runner-v2
This commit is contained in:
commit
631b5b47c1
@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||
fi
|
||||
|
||||
@ -6,24 +6,28 @@
|
||||
# to generate the final pipeline yaml file.
|
||||
|
||||
# Documentation
|
||||
# label(str): the name of the test. emoji allowed.
|
||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||
# label(str): the name of the test. emojis allowed.
|
||||
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
|
||||
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
|
||||
# fast_check_only(bool): run this test on the fastcheck pipeline only
|
||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
|
||||
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
|
||||
# command(str): the single command to run for tests. incompatible with commands.
|
||||
# commands(list): the list of commands to run for the test. incompatible with command.
|
||||
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
||||
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
||||
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
||||
# in this case, commands must be specified. the first command runs on first host, the second
|
||||
# commands(list): the list of commands to run for the test. incompatible with command.
|
||||
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
|
||||
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
|
||||
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
|
||||
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
|
||||
# in this case, commands must be specified. the first command runs on the first host, the second
|
||||
# command runs on the second host.
|
||||
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
||||
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
|
||||
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
|
||||
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
|
||||
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
|
||||
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
|
||||
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
|
||||
|
||||
# When adding a test
|
||||
# - If the test belongs to an existing group, add it there
|
||||
# - If the test belongs to an existing group, add it there
|
||||
# - If the test is short, add to any existing step
|
||||
# - If the test takes more than 10min, then it is okay to create a new step.
|
||||
# Note that all steps execute in parallel.
|
||||
@ -110,7 +114,7 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration Test (API Server) # 100min
|
||||
timeout_in_minutes: 130
|
||||
@ -148,7 +152,6 @@ steps:
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/core/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
@ -163,7 +166,6 @@ steps:
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
commands:
|
||||
# test with tp=2 and external_dp=2
|
||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
@ -314,12 +316,11 @@ steps:
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||
|
||||
- label: Platform Tests (CUDA) # 4min
|
||||
timeout_in_minutes: 15
|
||||
@ -869,8 +870,6 @@ steps:
|
||||
- tests/distributed/
|
||||
- vllm/compilation
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/worker/worker.py
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/test_external_lb_dp.py
|
||||
@ -894,7 +893,7 @@ steps:
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
# this test fails consistently.
|
||||
# TODO: investigate and fix
|
||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s models/multimodal/generation/test_maverick.py
|
||||
|
||||
|
||||
3
.github/CODEOWNERS
vendored
3
.github/CODEOWNERS
vendored
@ -4,11 +4,8 @@
|
||||
# This list covers the "core" components of vLLM that require careful review
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/fused_moe @mgoin
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||
|
||||
@ -1,510 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from argparse import RawTextHelpFormatter
|
||||
from collections.abc import Generator
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any, Optional, TypeAlias
|
||||
|
||||
import torch
|
||||
import tqdm
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.profiler.layerwise_profile import layerwise_profile
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
BATCH_SIZE_DEFAULT = 1
|
||||
PROMPT_LEN_DEFAULT = 256
|
||||
|
||||
|
||||
@dataclass
class ProfileContext:
    """Configuration bundle for a single layerwise-profiling run."""

    # Engine construction arguments, forwarded verbatim to ``LLM(**...)``.
    engine_args: "EngineArgs"
    # Token length of every (random) prompt in the batch.
    prompt_len: int
    # Number of requests submitted as one batch.
    batch_size: int

    # The profiler can run in one of two modes:
    # 1. profile a fixed, user-specified number of engine steps
    num_steps: Optional[int] = None
    # 2. profile until all requests complete, retiring this many
    #    requests per decode step
    complete_num_requests_per_step: Optional[int] = None

    # If set, chrome traces are exported into this folder.
    save_chrome_traces_folder: Optional[str] = None
|
||||
|
||||
|
||||
def get_dtype(dtype: str):
    """Map the literal string "torch.float" to the torch dtype object.

    Any other value is returned unchanged (the engine accepts dtype
    strings such as "auto" or "float16" directly).
    """
    return torch.float if dtype == "torch.float" else dtype
|
||||
|
||||
|
||||
OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
|
||||
|
||||
|
||||
def compute_request_output_lengths(
|
||||
batch_size: int, step_requests: list[int]
|
||||
) -> OutputLen_NumReqs_Map:
|
||||
"""
|
||||
Given the number of requests, batch_size, and the number of requests
|
||||
that each engine-step should process, step_requests, determine the
|
||||
output lengths of the requests such that step_request is honoured.
|
||||
|
||||
Example:
|
||||
if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
|
||||
then return,
|
||||
{2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
|
||||
32 requests should have output length 2,
|
||||
32 requests should have output length 3,
|
||||
32 requests should have output length 4,
|
||||
31 requests should have output length 5,
|
||||
1 request should have output length 6.
|
||||
|
||||
Args:
|
||||
batch_size (int): Number of requests submitted for profile. This is
|
||||
args.batch_size.
|
||||
step_requests (list[int]): step_requests[i] is the number of requests
|
||||
that the ith engine step should process.
|
||||
|
||||
Returns:
|
||||
OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
|
||||
number of requests required to have that output-length as values.
|
||||
"""
|
||||
ol_nr: OutputLen_NumReqs_Map = {}
|
||||
|
||||
# Number of request that are assigned an output-length
|
||||
num_reqs_assigned: int = 0
|
||||
num_steps: int = len(step_requests)
|
||||
|
||||
# sanity check. The first step (prefill-step), must process all requests.
|
||||
assert step_requests[0] == batch_size
|
||||
|
||||
# Begin assignments from the last step.
|
||||
output_length: int = num_steps
|
||||
for num_requests_at_step in reversed(step_requests):
|
||||
if num_reqs_assigned == batch_size:
|
||||
break
|
||||
|
||||
assert num_reqs_assigned < batch_size
|
||||
|
||||
# Remove the number of requests that have been determined
|
||||
# to participate in this step and beyond.
|
||||
num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
|
||||
assert num_reqs_unassigned_at_step >= 0
|
||||
|
||||
if num_reqs_unassigned_at_step > 0:
|
||||
ol_nr[output_length] = num_reqs_unassigned_at_step
|
||||
num_reqs_assigned += num_reqs_unassigned_at_step
|
||||
|
||||
output_length -= 1
|
||||
|
||||
# sanity checks.
|
||||
assert sum(ol_nr.values()) == batch_size, (
|
||||
"Number of requests in output-length assignment does not match "
|
||||
f"batch-size.\n batch size {batch_size} - "
|
||||
f"step requests {step_requests} - assignments {ol_nr}"
|
||||
)
|
||||
|
||||
# Check that the output-length is in [1, num-steps]. Output length must be
|
||||
# at least 1 as all requests must participate in the prefill-step.
|
||||
assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
|
||||
"Output lengths of requests should be in range "
|
||||
f"[1, num-engine-steps].\n batch size {batch_size} - "
|
||||
f"step requests {step_requests} - assignments {ol_nr}"
|
||||
)
|
||||
|
||||
return ol_nr
|
||||
|
||||
|
||||
def determine_requests_per_step(context: "ProfileContext") -> list[int]:
    """
    Determine the number of requests each engine step should process.

    If context.num_steps is set, then all engine steps process the
    same number of requests and the output list is of length
    context.num_steps.

    If context.complete_num_requests_per_step is set, then each decode step
    processes fewer and fewer requests until there are no requests to
    process. In this case, the output list is as big as the number of steps
    required to process all requests.

    Args:
        context: ProfileContext object.

    Returns:
        list[int]: Number of requests to process for all engine-steps.
            output[i] contains the number of requests that the ith step
            should process.
    """
    if context.num_steps:
        # All requests must run until num_engine_steps. This implies
        # that their output lengths must be equal to num_engine_steps.
        return [context.batch_size] * context.num_steps

    completions_per_step = context.complete_num_requests_per_step
    # Fix: the original message glued two sentences together
    # ("argument.Instead") and carried f-prefixes on plain strings.
    assert completions_per_step and completions_per_step > 0, (
        "Expected a positive complete_num_requests_per_step argument. "
        f"Instead got {context.complete_num_requests_per_step}"
    )

    # Requests only start completing after the first decode step.
    step_requests = [
        context.batch_size,  # prefill
        context.batch_size,  # decode
    ]

    num_running_requests = context.batch_size - completions_per_step
    while num_running_requests > 0:
        step_requests.append(num_running_requests)
        num_running_requests -= completions_per_step

    if step_requests[-1] != 1:
        # Have 1 request running at the last step. Profiling a lone
        # request is often useful.
        step_requests.append(1)

    return step_requests
|
||||
|
||||
|
||||
def run_profile(
    context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
):
    """
    Run a layerwise profile of one prefill step and the subsequent decode
    steps, print model/summary tables, and optionally export CSV, JSON and
    chrome-trace artifacts.

    Args:
        context: profiling configuration (engine args, batch size, prompt
            length, step schedule, trace folder).
        csv_output: root filename for CSV export, or None to skip.
        json_output: filename for JSON export, or None to skip.
    """
    print("Run profile with:")
    for key, value in asdict(context).items():
        print(f" {key} = {value}")

    requests_per_step: list[int] = determine_requests_per_step(context)

    ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
        context.batch_size, requests_per_step
    )

    num_steps_to_profile: int = len(requests_per_step)
    max_output_len: int = max(ol_nr.keys())
    assert max_output_len >= 1

    # Create sampling params
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        # max_tokens is set on a per-request basis.
        max_tokens=None,
        ignore_eos=True,
    )

    # Create LLM
    llm = LLM(**asdict(context.engine_args))
    batch_size = context.batch_size
    prompt_len = context.prompt_len

    scheduler_config = llm.llm_engine.vllm_config.scheduler_config
    max_model_len = llm.llm_engine.model_config.max_model_len
    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
    max_num_seqs = scheduler_config.max_num_seqs

    # Validate the requested batch/prompt sizes against engine limits before
    # doing any work; exit early with a human-readable explanation.
    if batch_size * prompt_len > max_num_batched_tokens:
        print(
            "ERROR: chosen batch_size * prompt_len "
            f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
            f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
            "and therefore cannot be run in a single profile step, please "
            "choose a smaller batch size or prompt length, or increase "
            "--max-num-batched-tokens"
        )
        sys.exit(-1)
    if batch_size > max_num_seqs:
        print(
            f"ERROR: chosen batch_size ({batch_size}) is larger than "
            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
            "single profile step, please choose a smaller batch size"
        )
        sys.exit(-1)
    print(
        "llm.llm_engine.model_config.max_model_len: ",
        llm.llm_engine.model_config.max_model_len,
    )
    if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
        print(
            f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
            f"{max_output_len} = {prompt_len + max_output_len}) is larger "
            f"than the model's max_model_len ({max_model_len}), please "
            "choose a smaller prompt_len or max_output_len, or increase "
            "--max-model-len"
        )
        sys.exit(-1)

    def add_requests():
        # Submit batch_size requests with random prompts, handing out the
        # per-request output lengths computed in ol_nr.
        def get_output_len_generator() -> Generator[int, Any, Any]:
            for output_len, num_reqs in ol_nr.items():
                for _ in range(num_reqs):
                    yield output_len

        output_len_generator = get_output_len_generator()
        for i in range(batch_size):
            sampling_params.max_tokens = next(output_len_generator)
            assert isinstance(sampling_params.max_tokens, int)

            prompt_token_ids = torch.randint(
                llm.get_tokenizer().vocab_size, size=(prompt_len,)
            ).tolist()

            llm.llm_engine.add_request(
                request_id=f"seq{i}",
                prompt={"prompt_token_ids": prompt_token_ids},
                params=sampling_params,
            )

    def abort_requests():
        for i in range(batch_size):
            llm.llm_engine.abort_request(f"seq{i}")

    # Warm up run
    print("Warm up run ...")
    add_requests()
    llm.llm_engine.step()  # Prefill
    llm.llm_engine.step()  # Decode
    abort_requests()

    print("Profile run ...")
    add_requests()

    with layerwise_profile() as prefill_prof:
        llm.llm_engine.step()  # First step is prefill

    decode_profs = []
    for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
        num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
        with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
            llm.llm_engine.step()
        decode_profs.append(decode_prof)

    decode_results_list = [prof.results for prof in decode_profs]
    prefill_results = prefill_prof.results
    has_decode = len(decode_results_list) > 0

    LINE_WIDTH = 80
    print("=" * LINE_WIDTH)
    print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
    print("=" * LINE_WIDTH)
    print()
    prefill_results.print_model_table()

    if has_decode:
        print()
        print("=" * LINE_WIDTH)
        print(
            "= First Decode Step Model Table "
            f"(prompt_len={prompt_len}, batch_size={batch_size})"
        )
        print("=" * LINE_WIDTH)
        print()
        decode_results_list[0].print_model_table()

    print()
    print("=" * LINE_WIDTH)
    print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
    print("=" * LINE_WIDTH)
    print()
    prefill_results.print_summary_table()

    if has_decode:
        print()
        print("=" * LINE_WIDTH)
        print(
            "= First Decode Step Summary Table "
            f"(prompt_len={prompt_len}, batch_size={batch_size})"
        )
        print("=" * LINE_WIDTH)
        print()
        decode_results_list[0].print_summary_table()

    if csv_output:
        csv_filename_base = (
            csv_output[:-4] if csv_output.endswith(".csv") else csv_output
        )
        prefill_results.export_model_stats_table_csv(
            csv_filename_base + "_prefill_model_table.csv"
        )
        prefill_results.export_summary_stats_table_csv(
            csv_filename_base + "_prefill_summary_table.csv"
        )

        if has_decode:
            decode_results_list[0].export_model_stats_table_csv(
                csv_filename_base + "_decode_model_table.csv"
            )
            decode_results_list[0].export_summary_stats_table_csv(
                csv_filename_base + "_decode_summary_table.csv"
            )

    if json_output:
        cuda_devices = [
            torch.cuda.get_device_properties(dev_idx)
            for dev_idx in range(torch.cuda.device_count())
        ]

        json_dict = {
            "context": {
                "python_version": f"{sys.version}",
                "torch_version": f"{torch.__version__}",
                "torch_cuda_version": f"{torch.version.cuda}",
                "cuda_devices": f"{cuda_devices}",
                **asdict(context),
            },
            "prefill": prefill_results.convert_stats_to_dict(),
        }

        if has_decode:
            for idx, dr in enumerate(decode_results_list):
                json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()

        # Add .json to json_output filename if it doesn't exist already.
        json_output_file = (
            json_output if json_output.endswith(".json") else json_output + ".json"
        )
        with open(json_output_file, "w+") as f:
            json.dump(json_dict, f, indent=2)

    if context.save_chrome_traces_folder is not None:
        os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
        prefill_prof.profiler.export_chrome_trace(
            context.save_chrome_traces_folder + "/prefill.json"
        )
        for idx, decode_prof in enumerate(decode_profs):
            decode_prof.profiler.export_chrome_trace(
                context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
            )
        print(
            "Traces saved as prefill.json and decode_1.json, etc."
            f" in folder {context.save_chrome_traces_folder}"
        )
|
||||
|
||||
|
||||
def parse_args():
    """Build the CLI parser (profiler options + EngineArgs) and parse argv.

    Fixes: several help strings were built by implicit string concatenation
    with a missing separating space ("invocationsuntil", "step.For e.g.",
    "32,the profiler"), which rendered garbled --help output.
    """
    parser = FlexibleArgumentParser(
        description="""
Profile a model

    example:
    ```
    python examples/offline_inference/profiling.py \\
    --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
    --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
    --enforce-eager run_num_steps -n 2
    ```

    then you can use various tools to analyze the json output
    terminal ascii tables:
    ```
    python tools/profiler/print_layerwise_table.py \\
    --json-trace Llama31-8b-FP8.json --phase prefill --table summary
    ```
    or create matplotlib stacked bar charts:
    ```
    python tools/profiler/visualize_layerwise_profile.py \\
    --json-trace Llama31-8b-FP8.json \\
    --output-directory profile_breakdown --plot-metric pct_cuda_time
    ```
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "--csv",
        type=str,
        default=None,
        help="Export the results as multiple csv files. This should be the root "
        "filename, will create <filename>_prefill_model_table.csv, "
        "<filename>_prefill_summary_table.csv, "
        "<filename>_decode_model_table.csv, and "
        "<filename>_decode_summary_table.csv",
    )
    parser.add_argument(
        "--json",
        type=str,
        default=None,
        help="Export the results as a json file. This should be the filename",
    )
    parser.add_argument(
        "--save-chrome-traces-folder",
        type=str,
        help="Save chrome traces for the prefill and decode steps. "
        "Will save traces as prefill.json and decode_1.json, "
        "etc. inside this folder",
    )
    parser.add_argument(
        "--prompt-len",
        type=int,
        default=PROMPT_LEN_DEFAULT,
        help=f"Length of the random prompt to use when profiling, all batched "
        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=BATCH_SIZE_DEFAULT,
        help=f"Number of requests to run as a single batch, "
        f"default={BATCH_SIZE_DEFAULT}",
    )

    subparsers = parser.add_subparsers(dest="cmd")

    run_num_steps_parser = subparsers.add_parser(
        "run_num_steps", help="This variation profiles n engine.step() invocations."
    )
    run_num_steps_parser.add_argument(
        "-n",
        "--num-steps",
        type=int,
        help="Number of engine steps to profile.\n"
        "Setting it to 1, profiles only the prefill step.\n"
        "Setting it to 2, profiles the prefill and first decode step\n"
        "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
        "and so on ...",
    )

    run_to_completion_parser = subparsers.add_parser(
        "run_to_completion",
        help="This variation profiles all the engine.step() invocations "
        "until the engine exhausts all submitted requests.",
    )
    run_to_completion_parser.add_argument(
        "-n",
        "--complete-num-requests-per-step",
        type=int,
        help="Complete complete_num_requests_per_step requests every decode step. "
        "For e.g., with batch_size 128 and complete_num_requests_per_step 32, "
        "the profiler is run for 6 engine steps, with the steps processing "
        "128, 128, 96, 64, 32, 1 requests respectively.\n"
        "Note that we tack-on a one-request step at the end as it is often "
        "useful.",
    )

    # Expose all engine construction flags alongside the profiler flags.
    EngineArgs.add_cli_args(parser)

    return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
    """Build a ProfileContext from the parsed CLI namespace and profile."""
    # Only forward the CLI attributes that ProfileContext actually declares.
    accepted = inspect.signature(ProfileContext).parameters
    extra_kwargs = {
        name: value for name, value in vars(args).items() if name in accepted
    }
    context = ProfileContext(
        engine_args=EngineArgs.from_cli_args(args),
        **extra_kwargs,
    )
    run_profile(context, csv_output=args.csv, json_output=args.json)


if __name__ == "__main__":
    args = parse_args()
    main(args)
|
||||
@ -102,6 +102,7 @@ plugins:
|
||||
- https://numpy.org/doc/stable/objects.inv
|
||||
- https://pytorch.org/docs/stable/objects.inv
|
||||
- https://psutil.readthedocs.io/en/stable/objects.inv
|
||||
- https://huggingface.co/docs/transformers/main/en/objects.inv
|
||||
|
||||
markdown_extensions:
|
||||
- attr_list
|
||||
|
||||
@ -70,7 +70,6 @@ line-length = 80
|
||||
"vllm/_version.py" = ["ALL"]
|
||||
# Python 3.8 typing - skip V0 code
|
||||
"vllm/attention/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/core/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/engine/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/executor/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/worker/**/*.py" = ["UP006", "UP035"]
|
||||
@ -117,7 +116,6 @@ files = [
|
||||
"vllm/*.py",
|
||||
"vllm/assets",
|
||||
"vllm/entrypoints",
|
||||
"vllm/core",
|
||||
"vllm/inputs",
|
||||
"vllm/logging_utils",
|
||||
"vllm/multimodal",
|
||||
|
||||
@ -11,7 +11,7 @@ from unittest.mock import Mock
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, envs
|
||||
from vllm import LLM
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
|
||||
|
||||
from ..conftest import HfRunner, VllmRunner
|
||||
@ -26,14 +26,6 @@ MODELS = [
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def test_vllm_gc_ed():
|
||||
"""Verify vllm instance is GC'ed when it is deleted"""
|
||||
llm = LLM("distilbert/distilgpt2")
|
||||
@ -76,12 +68,6 @@ def test_models(
|
||||
model_executor: str,
|
||||
enable_prompt_embeds: bool,
|
||||
) -> None:
|
||||
if not envs.VLLM_USE_V1:
|
||||
if async_scheduling:
|
||||
pytest.skip("async_scheduling only supported in v1.")
|
||||
if model_executor != "uni":
|
||||
pytest.skip("only test uniproc executor for v0.")
|
||||
|
||||
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
|
||||
pytest.skip(
|
||||
f"{backend} does not support gemma2 with full context length.")
|
||||
|
||||
@ -122,11 +122,12 @@ def test_cumem_with_cudagraph():
|
||||
# sleep mode with safetensors
|
||||
("meta-llama/Llama-3.2-1B", True),
|
||||
# sleep mode with pytorch checkpoint
|
||||
("facebook/opt-125m", False),
|
||||
("facebook/opt-125m", True),
|
||||
])
|
||||
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
assert use_v1
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
free, total = torch.cuda.mem_get_info()
|
||||
used_bytes_baseline = total - free # in case other process is running
|
||||
llm = LLM(model, enable_sleep_mode=True)
|
||||
|
||||
@ -1,39 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compile a subset of hot vLLM modules into C extensions with Cython.

example usage: python3 build_cython.py build_ext --inplace
"""
import Cython.Compiler.Options
from Cython.Build import cythonize
from setuptools import setup

Cython.Compiler.Options.annotate = True

# Engine / output-processing hot path.
ENGINE_FILES = [
    "vllm/engine/llm_engine.py",
    "vllm/transformers_utils/detokenizer.py",
    "vllm/engine/output_processor/single_step.py",
    "vllm/outputs.py",
    "vllm/engine/output_processor/stop_checker.py",
]

# Scheduling hot path.
SCHEDULER_FILES = [
    "vllm/core/scheduler.py",
    "vllm/sequence.py",
    "vllm/core/block_manager.py",
]

# Sampling hot path.
SAMPLING_FILES = [
    "vllm/model_executor/layers/sampler.py",
    "vllm/sampling_params.py",
    "vllm/utils/__init__.py",
]

infiles = ENGINE_FILES + SCHEDULER_FILES + SAMPLING_FILES

setup(
    ext_modules=cythonize(
        infiles,
        annotate=False,
        force=True,
        compiler_directives={
            "language_level": "3",
            "infer_types": True,
        },
    )
)
|
||||
@ -54,8 +54,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
|
||||
# Use global backends
|
||||
global backend, backend_unfused
|
||||
|
||||
use_v1 = False # can be made a param once V1 support added
|
||||
monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1)))
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))
|
||||
|
||||
# Prompt 4 seems too open-ended, differs between fused and unfused
|
||||
|
||||
@ -19,6 +19,7 @@ import socket
|
||||
import tempfile
|
||||
import threading
|
||||
from collections.abc import Generator
|
||||
from contextlib import nullcontext
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
|
||||
|
||||
@ -45,14 +46,14 @@ from vllm.connections import global_http_connection
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
|
||||
to_enc_dec_tuple_list, zip_enc_dec_prompts)
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.sequence import Logprob
|
||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -159,26 +160,6 @@ def cleanup_VLLM_USE_V1(monkeypatch):
|
||||
monkeypatch.delenv("VLLM_USE_V1")
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def run_with_both_engines(request, monkeypatch):
|
||||
# Automatically runs tests twice, once with V1 and once without
|
||||
use_v1 = request.param
|
||||
# Tests decorated with `@skip_v1` are only run without v1
|
||||
skip_v0 = request.node.get_closest_marker("skip_v0")
|
||||
skip_v1 = request.node.get_closest_marker("skip_v1")
|
||||
|
||||
if use_v1:
|
||||
if skip_v1:
|
||||
pytest.skip("Skipping test on vllm V1")
|
||||
monkeypatch.setenv('VLLM_USE_V1', '1')
|
||||
else:
|
||||
if skip_v0:
|
||||
pytest.skip("Skipping test on vllm V0")
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def init_test_http_connection():
|
||||
# pytest_asyncio may use a different event loop per test
|
||||
@ -306,6 +287,35 @@ class HfRunner:
|
||||
is_cross_encoder: bool = False,
|
||||
skip_tokenizer_init: bool = False,
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: Optional[int] = None,
|
||||
) -> None:
|
||||
init_ctx = (nullcontext() if default_torch_num_threads is None else
|
||||
set_default_torch_num_threads(default_torch_num_threads))
|
||||
|
||||
with init_ctx:
|
||||
self._init(
|
||||
model_name=model_name,
|
||||
dtype=dtype,
|
||||
model_kwargs=model_kwargs,
|
||||
trust_remote_code=trust_remote_code,
|
||||
is_sentence_transformer=is_sentence_transformer,
|
||||
is_cross_encoder=is_cross_encoder,
|
||||
skip_tokenizer_init=skip_tokenizer_init,
|
||||
auto_cls=auto_cls,
|
||||
)
|
||||
|
||||
def _init(
|
||||
self,
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
trust_remote_code: bool = True,
|
||||
is_sentence_transformer: bool = False,
|
||||
is_cross_encoder: bool = False,
|
||||
skip_tokenizer_init: bool = False,
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||
) -> None:
|
||||
model_name = maybe_model_redirect(model_name)
|
||||
self.model_name = model_name
|
||||
@ -714,26 +724,32 @@ class VllmRunner:
|
||||
enable_chunked_prefill: Optional[bool] = False,
|
||||
swap_space: int = 4,
|
||||
enforce_eager: Optional[bool] = False,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
self.llm = LLM(
|
||||
model=model_name,
|
||||
runner=runner,
|
||||
convert=convert,
|
||||
tokenizer=tokenizer_name,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
seed=seed,
|
||||
swap_space=swap_space,
|
||||
enforce_eager=enforce_eager,
|
||||
disable_log_stats=disable_log_stats,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
max_model_len=max_model_len,
|
||||
block_size=block_size,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
**kwargs,
|
||||
)
|
||||
init_ctx = (nullcontext() if default_torch_num_threads is None else
|
||||
set_default_torch_num_threads(default_torch_num_threads))
|
||||
|
||||
with init_ctx:
|
||||
self.llm = LLM(
|
||||
model=model_name,
|
||||
runner=runner,
|
||||
convert=convert,
|
||||
tokenizer=tokenizer_name,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
seed=seed,
|
||||
swap_space=swap_space,
|
||||
enforce_eager=enforce_eager,
|
||||
disable_log_stats=disable_log_stats,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
max_model_len=max_model_len,
|
||||
block_size=block_size,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_inputs(
|
||||
self,
|
||||
|
||||
@ -32,10 +32,6 @@ def _test_stopping(llm: LLM,
|
||||
assert output.stop_reason == expected_reason
|
||||
|
||||
|
||||
def _set_async_mode(llm, is_async):
|
||||
llm.llm_engine.scheduler[0].use_async_output_proc = is_async
|
||||
|
||||
|
||||
def _stop_basic(llm):
|
||||
_test_stopping(llm,
|
||||
stop=["."],
|
||||
@ -103,40 +99,8 @@ def test_stop_strings():
|
||||
# async output processing below.
|
||||
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
_stop_basic(llm)
|
||||
else:
|
||||
_set_async_mode(llm, True)
|
||||
_stop_basic(llm)
|
||||
|
||||
_set_async_mode(llm, False)
|
||||
_stop_basic(llm)
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
_stop_multi_tokens(llm)
|
||||
else:
|
||||
_set_async_mode(llm, True)
|
||||
_stop_multi_tokens(llm)
|
||||
|
||||
_set_async_mode(llm, False)
|
||||
_stop_multi_tokens(llm)
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
_stop_partial_token(llm)
|
||||
else:
|
||||
_set_async_mode(llm, True)
|
||||
_stop_partial_token(llm)
|
||||
|
||||
_set_async_mode(llm, False)
|
||||
_stop_partial_token(llm)
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
# FIXME: this does not respect include_in_output=False
|
||||
# _stop_token_id(llm)
|
||||
pass
|
||||
else:
|
||||
_set_async_mode(llm, True)
|
||||
_stop_token_id(llm)
|
||||
|
||||
_set_async_mode(llm, False)
|
||||
_stop_token_id(llm)
|
||||
_stop_basic(llm)
|
||||
_stop_multi_tokens(llm)
|
||||
_stop_partial_token(llm)
|
||||
# FIXME: this does not respect include_in_output=False
|
||||
# _stop_token_id(llm)
|
||||
|
||||
@ -25,12 +25,6 @@ TOKEN_IDS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
"""We can run both engines for this test."""
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
|
||||
@ -6,14 +6,6 @@ import pytest
|
||||
from vllm import LLM
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def test_empty_prompt():
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
|
||||
|
||||
@ -60,6 +60,7 @@ def create_dummy_embeds(num_tokens: int = 5) -> str:
|
||||
return base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||
|
||||
|
||||
@pytest.mark.skip("This test is skipped because it is flaky.")
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_completions_with_prompt_embeds(
|
||||
|
||||
@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
|
||||
"--port",
|
||||
port,
|
||||
],
|
||||
env={"VLLM_USE_V1": "1" if use_v1 else "0"})
|
||||
env={"VLLM_USE_V1": "1"})
|
||||
|
||||
def is_server_up(url):
|
||||
try:
|
||||
|
||||
@ -69,28 +69,20 @@ def generate_params():
|
||||
|
||||
@pytest.mark.parametrize("device, name, use_mla, block_size",
|
||||
generate_params())
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_env(
|
||||
device: str,
|
||||
name: str,
|
||||
use_mla: bool,
|
||||
block_size: int,
|
||||
use_v1: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Test attention backend selection with valid device-backend pairs."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv(STR_BACKEND_ENV_VAR, name)
|
||||
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
|
||||
|
||||
if name == "FLASHINFER" and not use_v1:
|
||||
pytest.skip("FlashInfer backend is only available on V1 engine")
|
||||
|
||||
if device == "cpu":
|
||||
if not use_v1:
|
||||
pytest.skip("CPU backend only supports V1")
|
||||
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float16, None, block_size,
|
||||
@ -137,7 +129,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = f"{name}_VLLM_V1" if use_v1 else name
|
||||
expected = f"{name}_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(16,
|
||||
@ -146,7 +138,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||
expected = "TRITON_ATTN_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
|
||||
elif device == "cuda":
|
||||
@ -163,11 +155,7 @@ def test_env(
|
||||
# - TRITON_MLA: fallback for other cases
|
||||
|
||||
if name == "CUTLASS_MLA":
|
||||
if not use_v1:
|
||||
# CUTLASS_MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"CUTLASS_MLA only supported on V1 engine")
|
||||
elif block_size != 128:
|
||||
if block_size != 128:
|
||||
# CUTLASS_MLA only supports block_size == 128
|
||||
pytest.skip(
|
||||
"CUTLASS_MLA only supports block_size 128")
|
||||
@ -181,11 +169,7 @@ def test_env(
|
||||
expected = "CUTLASS_MLA_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASHINFER_MLA":
|
||||
if not use_v1:
|
||||
# FlashInfer MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"FlashInfer MLA only supported on V1 engine")
|
||||
elif block_size not in [32, 64]:
|
||||
if block_size not in [32, 64]:
|
||||
# FlashInfer MLA only supports block_size 32 or 64
|
||||
pytest.skip(
|
||||
"FlashInfer MLA only supports block_size 32 "
|
||||
@ -217,23 +201,17 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = f"{name}_VLLM_V1" if use_v1 else name
|
||||
expected = f"{name}_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASH_ATTN_MLA":
|
||||
if not use_v1:
|
||||
# FlashAttention MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"FlashAttention MLA only supported on V1 engine"
|
||||
)
|
||||
else:
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_MLA"
|
||||
assert backend.get_name() == expected
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_MLA"
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
# TRITON_MLA or other fallback
|
||||
backend = get_attn_backend(16,
|
||||
@ -242,8 +220,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = ("TRITON_MLA_VLLM_V1"
|
||||
if use_v1 else "TRITON_MLA")
|
||||
expected = "TRITON_MLA_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASHINFER":
|
||||
backend = get_attn_backend(16,
|
||||
@ -252,7 +229,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
|
||||
expected = "FLASHINFER_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(32,
|
||||
@ -261,36 +238,30 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
||||
expected = "FLASH_ATTN_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
|
||||
if use_v1:
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_fp32_fallback(
|
||||
device: str,
|
||||
use_v1: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Test attention backend selection with fp32."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
if device == "cpu":
|
||||
if not use_v1:
|
||||
pytest.skip("CPU backend only supports V1")
|
||||
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, None, 16, False)
|
||||
@ -300,8 +271,7 @@ def test_fp32_fallback(
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CudaPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, None, 16, False)
|
||||
assert (backend.get_name() == "FLEX_ATTENTION"
|
||||
if use_v1 else "XFORMERS")
|
||||
assert backend.get_name() == "FLEX_ATTENTION"
|
||||
|
||||
|
||||
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
@ -357,12 +327,11 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
|
||||
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test that invalid attention backend names raise ValueError."""
|
||||
with monkeypatch.context() as m, patch(
|
||||
"vllm.attention.selector.current_platform", CudaPlatform()):
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
|
||||
|
||||
# Should raise ValueError for invalid backend
|
||||
|
||||
@ -17,7 +17,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.models.interfaces import SupportsLoRA
|
||||
from vllm.platforms import current_platform
|
||||
@ -97,7 +96,6 @@ def dummy_model() -> nn.Module:
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
("sampler", Sampler())
|
||||
]))
|
||||
model.config = MagicMock()
|
||||
model.embedding_modules = {"lm_head": "lm_head"}
|
||||
@ -125,7 +123,6 @@ def dummy_model_gate_up() -> nn.Module:
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
("sampler", Sampler())
|
||||
]))
|
||||
model.config = MagicMock()
|
||||
model.packed_modules_mapping = {
|
||||
|
||||
@ -6,10 +6,10 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions.
|
||||
import pytest
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
|
||||
|
||||
@ -15,7 +15,8 @@ from ...utils import check_logprobs_close
|
||||
# have a clean way to fall back, so we fail with
|
||||
# a clear msg when it happens.
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
# NOTE(woosuk): Skipping these tests until V1 supports them.
|
||||
# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
|
||||
# This list contains the model that are using AITER kernel.
|
||||
# Skip model that are not using AITER tests.
|
||||
@ -113,9 +114,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
if model in REQUIRES_V0:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
if use_rocm_aiter and (model in AITER_MODEL_LIST):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
|
||||
|
||||
@ -8,7 +8,7 @@ from tests.utils import multi_gpu_test
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ...utils import check_logprobs_close, check_outputs_equal
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# Mark all tests as hybrid
|
||||
pytestmark = pytest.mark.hybrid_model
|
||||
@ -88,15 +88,6 @@ def test_models(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
if model not in V0_UNSUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
vllm_v0_outputs = None
|
||||
|
||||
if model in V1_SUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
@ -104,14 +95,6 @@ def test_models(
|
||||
else:
|
||||
vllm_v1_outputs = None
|
||||
|
||||
if vllm_v0_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
if model in V1_SUPPORTED_MODELS:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@ -157,45 +140,6 @@ def test_batching(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
|
||||
def test_chunked_prefill(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
chunked_prefill_token_size: int,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
max_num_seqs = chunked_prefill_token_size
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_num_seqs=max_num_seqs) as vllm_model:
|
||||
chunked = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model,
|
||||
enable_chunked_prefill=False,
|
||||
max_num_seqs=max_num_seqs) as vllm_model:
|
||||
non_chunked = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=chunked,
|
||||
outputs_1_lst=non_chunked,
|
||||
name_0="chunked",
|
||||
name_1="non_chunked",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [10])
|
||||
def test_chunked_prefill_with_parallel_sampling(
|
||||
@ -257,38 +201,6 @@ def test_mamba_cache_cg_padding(
|
||||
"Could be related to mamba cache not padded correctly")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [20])
|
||||
def test_models_preemption_recompute(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""
|
||||
Tests that outputs are identical with and w/o preemptions (recompute).
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
scheduler = vllm_model.llm.llm_engine.scheduler[0]
|
||||
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
|
||||
preempt_vllm_outputs = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=preempt_vllm_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="vllm_preepmtions",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
|
||||
vllm_runner,
|
||||
@ -386,27 +298,10 @@ def test_full_cuda_graph(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
if model not in V0_UNSUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
vllm_v0_outputs = None
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
if vllm_v0_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
@ -442,27 +337,12 @@ def test_fp32_cache_state(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -82,7 +81,7 @@ def test_prm_models(
|
||||
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
|
||||
max_transformers_version="4.53.2")
|
||||
|
||||
if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
|
||||
if current_platform.is_cpu():
|
||||
pytest.skip("CPU only supports V1")
|
||||
|
||||
if current_platform.is_rocm():
|
||||
|
||||
@ -32,13 +32,6 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
|
||||
if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
REQUIRES_V0_MODELS = [
|
||||
# V1 Test: not enough KV cache space in C1.
|
||||
"fuyu",
|
||||
# V1 Test: Deadlock issue when processing mm_inputs
|
||||
"llava-onevision-transformers",
|
||||
]
|
||||
|
||||
# yapf: disable
|
||||
COMMON_BROADCAST_SETTINGS = {
|
||||
"test_type": VLMTestType.IMAGE,
|
||||
@ -186,8 +179,11 @@ VLM_TEST_SETTINGS = {
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "transformers",
|
||||
"default_torch_num_threads": 1,
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
# FIXME: Investigate why the test hangs
|
||||
# when processing the 3rd prompt in vLLM
|
||||
marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")],
|
||||
),
|
||||
"idefics3-transformers": VLMTestInfo(
|
||||
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
||||
@ -320,6 +316,7 @@ VLM_TEST_SETTINGS = {
|
||||
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
|
||||
num_logprobs=10,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"gemma3": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
@ -861,13 +858,14 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
|
||||
test_type=VLMTestType.IMAGE,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_single_image_models(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_single_image_test(
|
||||
tmp_path=tmp_path,
|
||||
@ -886,13 +884,14 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_multi_image_models(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_multi_image_test(
|
||||
tmp_path=tmp_path,
|
||||
@ -911,13 +910,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_image_embedding_models(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_image_embedding_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_embedding_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -935,11 +934,13 @@ def test_image_embedding_models(model_type: str,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_video_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_video_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -957,11 +958,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
test_type=VLMTestType.AUDIO,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_audio_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_audio_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -984,10 +987,7 @@ def test_custom_inputs_models(
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch,
|
||||
):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_custom_inputs_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -1006,13 +1006,14 @@ def test_custom_inputs_models(
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@create_new_process_for_each_test()
|
||||
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_single_image_models_heavy(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_single_image_test(
|
||||
tmp_path=tmp_path,
|
||||
@ -1032,13 +1033,14 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@create_new_process_for_each_test()
|
||||
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_multi_image_models_heavy(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_multi_image_test(
|
||||
tmp_path=tmp_path,
|
||||
@ -1058,14 +1060,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@create_new_process_for_each_test()
|
||||
def test_image_embedding_models_heavy(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_image_embedding_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_embedding_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -1083,12 +1084,13 @@ def test_image_embedding_models_heavy(model_type: str,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_video_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_video_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -1106,12 +1108,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
test_type=VLMTestType.AUDIO,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
def test_audio_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets,
|
||||
):
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_audio_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -1135,10 +1138,7 @@ def test_custom_inputs_models_heavy(
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch,
|
||||
):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_custom_inputs_test(
|
||||
model_test_info=model_test_info,
|
||||
|
||||
@ -7,8 +7,8 @@ from typing import Optional
|
||||
import pytest
|
||||
from transformers import AutoModelForSpeechSeq2Seq
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
|
||||
VllmRunner)
|
||||
|
||||
@ -12,10 +12,10 @@ from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import convert_image_mode, rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||
PromptImageInput, VllmRunner)
|
||||
|
||||
@ -12,13 +12,12 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm import SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm.logprobs import Logprob, SampleLogprobs
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.multimodal.inputs import PlaceholderRange
|
||||
from vllm.sequence import Logprob, SampleLogprobs
|
||||
|
||||
from ....utils import VLLM_PATH, large_gpu_test
|
||||
from ...utils import check_logprobs_close, dummy_hf_overrides
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
@ -185,47 +184,3 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls,expected_ranges",
|
||||
[(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
|
||||
(IMG_URLS[1:4], [
|
||||
PlaceholderRange(offset=11, length=266),
|
||||
PlaceholderRange(offset=277, length=1056),
|
||||
PlaceholderRange(offset=1333, length=418)
|
||||
])])
|
||||
def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
|
||||
expected_ranges: list[PlaceholderRange],
|
||||
local_asset_server, monkeypatch) -> None:
|
||||
local_image_urls = [local_asset_server.url_for(u) for u in image_urls]
|
||||
prompt = _create_engine_inputs_hf(local_image_urls)
|
||||
|
||||
# This placeholder checking test only works with V0 engine
|
||||
# where `multi_modal_placeholders` is returned with `RequestOutput`
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(
|
||||
"mistral-community/pixtral-12b",
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
load_format="dummy",
|
||||
hf_overrides=dummy_hf_overrides,
|
||||
) as vllm_model:
|
||||
outputs = vllm_model.llm.generate(prompt)
|
||||
|
||||
assert len(outputs) == 1, f"{len(outputs)=}"
|
||||
output: RequestOutput = outputs[0]
|
||||
assert hasattr(output,
|
||||
"multi_modal_placeholders"), f"{output.__dict__=}"
|
||||
assert "image" in output.multi_modal_placeholders, \
|
||||
f"{output.multi_modal_placeholders.keys()=}"
|
||||
image_placeholder_ranges: list[
|
||||
PlaceholderRange] = output.multi_modal_placeholders["image"]
|
||||
assert len(image_placeholder_ranges) == len(
|
||||
expected_ranges), f"{image_placeholder_ranges=}"
|
||||
for real_range, expected_range in zip(image_placeholder_ranges,
|
||||
expected_ranges):
|
||||
assert real_range.offset == expected_range.offset, \
|
||||
f"{real_range=} {expected_range=}"
|
||||
assert real_range.length == expected_range.length, \
|
||||
f"{real_range=} {expected_range=}"
|
||||
|
||||
@ -10,7 +10,6 @@ from PIL import Image
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
|
||||
PromptVideoInput, VllmRunner)
|
||||
@ -264,8 +263,7 @@ def run_embedding_input_test(
|
||||
processor = AutoProcessor.from_pretrained(model)
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with set_default_torch_num_threads(1):
|
||||
vllm_model = vllm_runner(
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
@ -277,9 +275,8 @@ def run_embedding_input_test(
|
||||
},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
|
||||
with vllm_model:
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
outputs_per_case_for_original_input = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
|
||||
@ -19,7 +19,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig, GenerationMixin)
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
|
||||
|
||||
@ -12,7 +12,7 @@ from transformers import AutoModelForCausalLM
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config import RunnerOption
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
|
||||
|
||||
@ -4,8 +4,6 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
|
||||
|
||||
@ -30,19 +28,17 @@ def _run_test(
|
||||
} for _ in range(10)
|
||||
]
|
||||
|
||||
with (
|
||||
set_default_torch_num_threads(1),
|
||||
vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype=torch.float16,
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
) as vllm_model,
|
||||
):
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype="half",
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
vllm_model.encode(prompt)
|
||||
|
||||
|
||||
|
||||
@ -45,12 +45,15 @@ def run_awq_test(
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(source_model,
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
with vllm_runner(
|
||||
source_model,
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
source_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
@ -59,13 +62,16 @@ def run_awq_test(
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
with vllm_runner(quant_model,
|
||||
quantization="awq",
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
with vllm_runner(
|
||||
quant_model,
|
||||
quantization="awq",
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
quant_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
@ -108,12 +114,8 @@ def run_awq_test(
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@torch.inference_mode()
|
||||
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
|
||||
size_factors, dtype, max_tokens, num_logprobs,
|
||||
monkeypatch) -> None:
|
||||
size_factors, dtype, max_tokens, num_logprobs) -> None:
|
||||
|
||||
# Test V1: this test hangs during setup on single-scale input.
|
||||
# TODO: figure out why and re-enable this on V1.
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
run_awq_test(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
|
||||
@ -5,10 +5,7 @@
|
||||
Run `pytest tests/quantization/test_bitsandbytes.py`.
|
||||
'''
|
||||
|
||||
import gc
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import BitsAndBytesConfig
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
@ -131,12 +128,15 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
|
||||
))
|
||||
with vllm_runner(model_name,
|
||||
quantization='bitsandbytes',
|
||||
enforce_eager=False) as llm:
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1) as llm:
|
||||
vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
|
||||
max_tokens=32,
|
||||
num_logprobs=5)
|
||||
|
||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||
with hf_runner(model_name,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
default_torch_num_threads=1) as llm:
|
||||
transformers_outputs = llm.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens=32, num_logprobs=5)
|
||||
check_logprobs_close(
|
||||
@ -174,7 +174,8 @@ def test_4bit_bnb_embedding_model(
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
gpu_memory_utilization=0.5,
|
||||
quantization="bitsandbytes") as vllm_model:
|
||||
quantization="bitsandbytes",
|
||||
default_torch_num_threads=1) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
|
||||
@ -184,6 +185,7 @@ def test_4bit_bnb_embedding_model(
|
||||
dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
is_sentence_transformer=True,
|
||||
default_torch_num_threads=1,
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
@ -222,26 +224,22 @@ def validate_generated_texts(hf_runner,
|
||||
with vllm_runner(model_name,
|
||||
quantization=None if pre_quant else 'bitsandbytes',
|
||||
tensor_parallel_size=vllm_tp_size,
|
||||
enforce_eager=False) as llm:
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1) as llm:
|
||||
|
||||
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if hf_model_kwargs is None:
|
||||
hf_model_kwargs = {}
|
||||
|
||||
# Run with HF runner
|
||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||
with hf_runner(model_name,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
default_torch_num_threads=1) as llm:
|
||||
hf_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
# Compare the generated strings
|
||||
for hf_log, vllm_log in zip(hf_logs, vllm_logs):
|
||||
hf_str = hf_log["generated_text"]
|
||||
|
||||
@ -32,13 +32,10 @@ from ..utils import check_logprobs_close
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("enforce_eager", [True])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
|
||||
# NOTE: Increasing this in this suite will fail CI because we currently cannot
|
||||
# reset distributed env properly. Use a value > 1 just when you test.
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
# Due to low-precision numerical divergence, this test is too sensitive for
|
||||
# the async postprocessor
|
||||
@pytest.mark.parametrize("disable_async_output_proc", [True])
|
||||
def test_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@ -49,7 +46,6 @@ def test_models(
|
||||
enforce_eager: bool,
|
||||
backend: str,
|
||||
tensor_parallel_size: int,
|
||||
disable_async_output_proc: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""
|
||||
@ -61,6 +57,9 @@ def test_models(
|
||||
pytest.skip(
|
||||
f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
|
||||
|
||||
if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
|
||||
pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("TOKENIZERS_PARALLELISM", 'true')
|
||||
m.setenv(STR_BACKEND_ENV_VAR, backend)
|
||||
@ -74,7 +73,6 @@ def test_models(
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype="auto",
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -85,7 +83,6 @@ def test_models(
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -110,9 +107,6 @@ def test_models(
|
||||
])
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
# Due to low-precision numerical divergence, this test is too sensitive for
|
||||
# the async postprocessor
|
||||
@pytest.mark.parametrize("disable_async_output_proc", [True])
|
||||
def test_cpu_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@ -120,7 +114,6 @@ def test_cpu_models(
|
||||
base_model: str,
|
||||
test_model: str,
|
||||
max_tokens: int,
|
||||
disable_async_output_proc: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""
|
||||
@ -138,7 +131,6 @@ def test_cpu_models(
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype="auto",
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -148,7 +140,6 @@ def test_cpu_models(
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
|
||||
@ -7,7 +7,6 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
|
||||
from vllm.v1.engine.core import EngineCore as V1EngineCore
|
||||
@ -61,10 +60,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
False))
|
||||
|
||||
# Avoid calling model.forward()
|
||||
def _initialize_kv_caches_v0(self) -> None:
|
||||
self.cache_config.num_gpu_blocks = 0
|
||||
self.cache_config.num_cpu_blocks = 0
|
||||
|
||||
def _initialize_kv_caches_v1(self, vllm_config):
|
||||
kv_cache_specs = self.model_executor.get_kv_cache_specs()
|
||||
scheduler_kv_cache_config = get_kv_cache_configs(
|
||||
@ -76,12 +71,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
|
||||
return 1, 0, scheduler_kv_cache_config
|
||||
|
||||
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v0),
|
||||
patch.object(V1EngineCore, "_initialize_kv_caches",
|
||||
with (patch.object(V1EngineCore, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v1), monkeypatch.context() as m):
|
||||
if model_info.v0_only:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
# NOTE(woosuk): skip the test for V0-only models
|
||||
return
|
||||
|
||||
if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
|
||||
# Phi4FlashForCausalLM and MotifForCausalLM
|
||||
# only supports DIFFERENTIAL_FLASH_ATTN backend
|
||||
|
||||
@ -42,6 +42,7 @@ def test_oot_registration_text_generation(
|
||||
assert rest == ""
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_embedding(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
@ -62,6 +63,7 @@ def test_oot_registration_embedding(
|
||||
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_multimodal(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
|
||||
@ -5,7 +5,6 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -25,19 +24,17 @@ def test_inference(
|
||||
prompt = dict(prompt_token_ids=[1],
|
||||
multi_modal_data=dict(pixel_values=pixel_values,
|
||||
location_coords=location_coords))
|
||||
with (
|
||||
set_default_torch_num_threads(1),
|
||||
vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype=torch.float16,
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
) as vllm_model,
|
||||
):
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype="half",
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
|
||||
vllm_output = vllm_model.llm.encode(prompt)
|
||||
assert torch.equal(
|
||||
|
||||
@ -12,7 +12,7 @@ from transformers import PretrainedConfig
|
||||
|
||||
from vllm.config import ModelConfig, ModelDType, RunnerOption
|
||||
from vllm.inputs import InputContext
|
||||
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
||||
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
|
||||
@ -9,7 +9,6 @@ from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
|
||||
LlavaForConditionalGeneration,
|
||||
LlavaMultiModalProcessor,
|
||||
LlavaProcessingInfo)
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
|
||||
@ -18,11 +17,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
dummy_inputs=LlavaDummyInputsBuilder)
|
||||
class MyLlava(LlavaForConditionalGeneration):
|
||||
|
||||
def compute_logits(
|
||||
self, hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
|
||||
def compute_logits(self,
|
||||
hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
|
||||
# this dummy model always predicts the first token
|
||||
logits = super().compute_logits(hidden_states, sampling_metadata)
|
||||
logits = super().compute_logits(hidden_states)
|
||||
if logits is not None:
|
||||
logits.zero_()
|
||||
logits[:, 0] += 1.0
|
||||
|
||||
@ -6,16 +6,14 @@ from typing import Optional
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.models.opt import OPTForCausalLM
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
|
||||
|
||||
class MyOPTForCausalLM(OPTForCausalLM):
|
||||
|
||||
def compute_logits(
|
||||
self, hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
|
||||
def compute_logits(self,
|
||||
hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
|
||||
# this dummy model always predicts the first token
|
||||
logits = super().compute_logits(hidden_states, sampling_metadata)
|
||||
logits = super().compute_logits(hidden_states)
|
||||
if logits is not None:
|
||||
logits.zero_()
|
||||
logits[:, 0] += 1.0
|
||||
|
||||
@ -7,15 +7,6 @@ import torch
|
||||
from vllm.plugins import load_general_plugins
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||
all tests in the module.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
def test_platform_plugins():
|
||||
# simulate workload by running an example
|
||||
import runpy
|
||||
|
||||
@ -3,47 +3,18 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
|
||||
|
||||
class DummyV0Scheduler(Scheduler):
|
||||
|
||||
def schedule(self):
|
||||
raise Exception("Exception raised by DummyV0Scheduler")
|
||||
|
||||
|
||||
class DummyV1Scheduler(V1Scheduler):
|
||||
class DummyV1Scheduler(Scheduler):
|
||||
|
||||
def schedule(self):
|
||||
raise Exception("Exception raised by DummyV1Scheduler")
|
||||
|
||||
|
||||
def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with pytest.raises(Exception) as exception_info:
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="facebook/opt-125m",
|
||||
enforce_eager=True, # reduce test time
|
||||
scheduler_cls=DummyV0Scheduler,
|
||||
)
|
||||
|
||||
engine = LLMEngine.from_engine_args(engine_args=engine_args)
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
engine.add_request("0", "foo", sampling_params)
|
||||
engine.step()
|
||||
|
||||
assert str(
|
||||
exception_info.value) == "Exception raised by DummyV0Scheduler"
|
||||
|
||||
|
||||
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
@ -59,7 +30,7 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
|
||||
scheduler_cls=DummyV1Scheduler,
|
||||
)
|
||||
|
||||
engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
|
||||
engine = LLMEngine.from_engine_args(engine_args=engine_args)
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
engine.add_request("0", "foo", sampling_params)
|
||||
|
||||
@ -357,6 +357,9 @@ def test_compressed_tensors_fp8(vllm_runner):
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_kv_cache_dtype_supported("fp8", None),
|
||||
reason="FP8 KV cache is not supported on this device.")
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(),
|
||||
reason="This test is skipped on non-CUDA platform.")
|
||||
def test_compressed_tensors_kv_cache(vllm_runner):
|
||||
@ -738,4 +741,4 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
perplexity = llm.generate_prompt_perplexity([prompt])[0]
|
||||
print(perplexity)
|
||||
assert perplexity <= exp_perplexity
|
||||
assert perplexity <= exp_perplexity
|
||||
|
||||
@ -10,13 +10,6 @@ from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
"""We can run both engines for this test."""
|
||||
pass
|
||||
|
||||
|
||||
# FIXME(zhuohan): The test can not pass if we:
|
||||
# 1. Increase max_tokens to 256.
|
||||
# 2. Increase beam_width to 8.
|
||||
|
||||
@ -9,13 +9,6 @@ import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
"""We can run both engines for this test."""
|
||||
pass
|
||||
|
||||
|
||||
# We also test with llama because it has generation_config to specify EOS
|
||||
# (past regression).
|
||||
MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]
|
||||
|
||||
@ -8,12 +8,6 @@ from vllm import SamplingParams
|
||||
MODELS = ["distilbert/distilgpt2"]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
"""We can run both engines for this test."""
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_ranks(
|
||||
|
||||
@ -3,38 +3,52 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import SpeculativeConfig
|
||||
from vllm.model_executor.models.interfaces import supports_eagle3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_path",
|
||||
[("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")])
|
||||
def test_llama(vllm_runner, example_prompts, model_path, monkeypatch):
|
||||
@pytest.mark.parametrize("model_path", [
|
||||
pytest.param(
|
||||
"nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized",
|
||||
id="llama3-eagle3-speculator"),
|
||||
pytest.param(
|
||||
"nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized",
|
||||
id="qwen3-eagle3-speculator"),
|
||||
])
|
||||
def test_eagle3_speculators_model(vllm_runner, example_prompts, model_path,
|
||||
monkeypatch):
|
||||
"""
|
||||
Test Eagle3 speculators models properly initialize speculative decoding.
|
||||
|
||||
This test verifies:
|
||||
1. Eagle3 support is detected for the model
|
||||
2. Speculative config is automatically initialized from embedded config
|
||||
3. The draft model path is correctly set to the speculators model
|
||||
4. Speculative tokens count is valid
|
||||
5. Text generation works with speculative decoding enabled
|
||||
"""
|
||||
# Set environment variable for V1 engine serialization
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
|
||||
# Verify Eagle3 support is detected
|
||||
eagle3_supported = vllm_model.apply_model(supports_eagle3)
|
||||
assert eagle3_supported
|
||||
assert eagle3_supported, f"Eagle3 should be supported for {model_path}"
|
||||
|
||||
vllm_config = vllm_model.llm.llm_engine.vllm_config
|
||||
|
||||
assert isinstance(vllm_config.speculative_config, SpeculativeConfig), \
|
||||
"Speculative config should be initialized for speculators model"
|
||||
|
||||
spec_config = vllm_config.speculative_config
|
||||
assert spec_config.num_speculative_tokens > 0, \
|
||||
(f"Expected positive speculative tokens, "
|
||||
f"got {spec_config.num_speculative_tokens}")
|
||||
|
||||
assert spec_config.model == model_path, \
|
||||
f"Draft model should be {model_path}, got {spec_config.model}"
|
||||
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens=20)
|
||||
print(vllm_outputs)
|
||||
assert vllm_outputs
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_path",
|
||||
[("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")])
|
||||
def test_qwen(vllm_runner, example_prompts, model_path, monkeypatch):
|
||||
# Set environment variable for V1 engine serialization
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
|
||||
eagle3_supported = vllm_model.apply_model(supports_eagle3)
|
||||
assert eagle3_supported
|
||||
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens=20)
|
||||
print(vllm_outputs)
|
||||
assert vllm_outputs
|
||||
assert vllm_outputs, \
|
||||
f"No outputs generated for speculators model {model_path}"
|
||||
|
||||
@ -57,10 +57,19 @@ def llama_3p2_1b_files():
|
||||
|
||||
def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
|
||||
llm_sharded_writer = LLM(model=input_dir, **kwargs)
|
||||
|
||||
# Check which engine version is being used
|
||||
is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")
|
||||
# Dump worker states to output directory
|
||||
llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
|
||||
path=output_dir)
|
||||
if is_v1_engine:
|
||||
# For V1 engine, we need to use engine_core.save_sharded_state
|
||||
print("Using V1 engine save path")
|
||||
llm_sharded_writer.llm_engine.engine_core.save_sharded_state(
|
||||
path=output_dir)
|
||||
else:
|
||||
# For V0 engine
|
||||
print("Using V0 engine save path")
|
||||
model_executor = llm_sharded_writer.llm_engine.model_executor
|
||||
model_executor.save_sharded_state(path=output_dir)
|
||||
|
||||
# Copy metadata files to output directory
|
||||
for file in os.listdir(input_dir):
|
||||
@ -91,8 +100,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
|
||||
gpu_memory_utilization = 0.8
|
||||
input_dir = llama_3p2_1b_files
|
||||
ctx = mp.get_context("spawn")
|
||||
# The interface in v1 engine has changed, run in v1 engine will hang.
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
# Run in separate processes for memory & CUDA isolation
|
||||
with TemporaryDirectory() as output_dir:
|
||||
@ -100,7 +107,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
|
||||
args=(input_dir, output_dir, weights_patterns),
|
||||
kwargs=dict(
|
||||
tensor_parallel_size=tp_size,
|
||||
distributed_executor_backend="mp",
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
enforce_eager=True,
|
||||
))
|
||||
@ -112,7 +118,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
|
||||
p = ctx.Process(target=_run_generate,
|
||||
args=(input_dir, queue),
|
||||
kwargs=dict(
|
||||
distributed_executor_backend="mp",
|
||||
enable_lora=enable_lora,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
tensor_parallel_size=tp_size,
|
||||
@ -133,7 +138,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
|
||||
p = ctx.Process(target=_run_generate,
|
||||
args=(output_dir, queue),
|
||||
kwargs=dict(
|
||||
distributed_executor_backend="mp",
|
||||
enable_lora=enable_lora,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
tensor_parallel_size=tp_size,
|
||||
|
||||
@ -8,10 +8,7 @@ import pytest
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast)
|
||||
|
||||
from vllm.inputs import token_inputs
|
||||
from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer,
|
||||
@ -217,193 +214,3 @@ def test_oov_decode(tokenizer, fast):
|
||||
|
||||
assert decoded_text == ''
|
||||
assert out_ids == [len(tokenizer)]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def detokenizer(tokenizer_name: str) -> Detokenizer:
|
||||
tokenizer = get_tokenizer(
|
||||
tokenizer_name,
|
||||
tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto",
|
||||
trust_remote_code=False,
|
||||
revision=None,
|
||||
)
|
||||
|
||||
return Detokenizer(tokenizer)
|
||||
|
||||
|
||||
@pytest.fixture(name="complete_sequence_token_ids")
|
||||
def create_complete_sequence_token_ids(complete_sequence: str,
|
||||
tokenizer) -> list[int]:
|
||||
return tokenizer(complete_sequence, add_special_tokens=False).input_ids
|
||||
|
||||
|
||||
def create_sequence(prompt_token_ids=None):
|
||||
prompt_token_ids = prompt_token_ids or []
|
||||
return Sequence(
|
||||
seq_id=0,
|
||||
inputs=token_inputs(prompt_token_ids),
|
||||
block_size=16,
|
||||
)
|
||||
|
||||
|
||||
def create_dummy_logprobs(
|
||||
complete_sequence_token_ids: list[int]) -> list[dict[int, Logprob]]:
|
||||
return [{
|
||||
token_id: Logprob(logprob=0.0),
|
||||
token_id + 1: Logprob(logprob=0.1)
|
||||
} for token_id in complete_sequence_token_ids]
|
||||
|
||||
|
||||
def create_dummy_prompt_logprobs(
|
||||
complete_sequence_token_ids: list[int]
|
||||
) -> list[Optional[dict[int, Any]]]:
|
||||
# logprob for the first prompt token is None.
|
||||
logprobs: list[Optional[dict[int, Any]]] = [None]
|
||||
logprobs.extend(create_dummy_logprobs(complete_sequence_token_ids)[1:])
|
||||
return logprobs
|
||||
|
||||
|
||||
@pytest.mark.parametrize("complete_sequence", TRUTH)
|
||||
@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
|
||||
@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True)
|
||||
def test_decode_sequence_logprobs(complete_sequence: str,
|
||||
complete_sequence_token_ids: list[int],
|
||||
detokenizer: Detokenizer,
|
||||
skip_special_tokens: bool):
|
||||
"""Verify Detokenizer decodes logprobs correctly."""
|
||||
sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens,
|
||||
logprobs=2)
|
||||
|
||||
# Run sequentially.
|
||||
seq = create_sequence()
|
||||
dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids)
|
||||
sequential_logprobs_text_chosen_token: list[str] = []
|
||||
sequential_logprobs_text_other_token: list[str] = []
|
||||
for new_token, logprobs in zip(complete_sequence_token_ids,
|
||||
dummy_logprobs):
|
||||
seq.append_token_id(new_token, logprobs)
|
||||
detokenizer.decode_sequence_inplace(seq, sampling_params)
|
||||
sequential_logprobs_text_chosen_token.append(
|
||||
seq.output_logprobs[-1][new_token].decoded_token)
|
||||
sequential_logprobs_text_other_token.append(
|
||||
seq.output_logprobs[-1][new_token + 1].decoded_token)
|
||||
sequential_result = seq.output_text
|
||||
|
||||
assert sequential_result == "".join(sequential_logprobs_text_chosen_token)
|
||||
assert sequential_result != "".join(sequential_logprobs_text_other_token)
|
||||
|
||||
if not skip_special_tokens:
|
||||
# Text for logprobs for the chosen token should be the same as the
|
||||
# generated text. Note that this will only be true if we skip
|
||||
# special tokens.
|
||||
assert sequential_result == complete_sequence
|
||||
|
||||
|
||||
@pytest.mark.parametrize("complete_sequence", TRUTH)
|
||||
@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
|
||||
def test_decode_prompt_logprobs(complete_sequence: str,
|
||||
complete_sequence_token_ids: list[int],
|
||||
detokenizer: Detokenizer):
|
||||
|
||||
# We want to use skip_special_tokens=False here but Mistral tokenizers
|
||||
# don't support that.
|
||||
if complete_sequence not in SPECIAL_TOKS_TRUTH:
|
||||
skip_special_tokens = True
|
||||
elif not isinstance(detokenizer.tokenizer, MistralTokenizer):
|
||||
skip_special_tokens = False
|
||||
else:
|
||||
pytest.skip("MistralTokenizers don't support "
|
||||
"skip_special_tokens=False")
|
||||
return
|
||||
"""Verify Detokenizer decodes prompt logprobs correctly."""
|
||||
sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens,
|
||||
prompt_logprobs=1)
|
||||
|
||||
# Run sequentially.
|
||||
seq = create_sequence(complete_sequence_token_ids)
|
||||
seq_group = SequenceGroup(request_id="1",
|
||||
seqs=[seq],
|
||||
sampling_params=sampling_params,
|
||||
arrival_time=0.0)
|
||||
dummy_logprobs = create_dummy_prompt_logprobs(complete_sequence_token_ids)
|
||||
detokenizer.decode_prompt_logprobs_inplace(seq_group,
|
||||
dummy_logprobs,
|
||||
position_offset=0)
|
||||
# First logprob is None.
|
||||
decoded_prompt_logprobs: list[dict[int, Any]] = dummy_logprobs[
|
||||
1:] # type: ignore
|
||||
|
||||
# decoded_prompt_logprobs doesn't contain the first token.
|
||||
token_ids = complete_sequence_token_ids
|
||||
tokenizer = detokenizer.tokenizer
|
||||
text_full = tokenizer.decode(token_ids,
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
text_first = tokenizer.decode(token_ids[0],
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
text = text_full[len(text_first):]
|
||||
|
||||
# Text for logprobs for the chosen token should be the same as the
|
||||
# prompt text. Note that the first logprob is None.
|
||||
assert text == "".join([
|
||||
logprobs[token_id].decoded_token
|
||||
for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
|
||||
])
|
||||
assert text != "".join([
|
||||
logprobs[token_id + 1].decoded_token
|
||||
for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
|
||||
])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
|
||||
def test_decode_prompt_logprobs_chunked_prefill(
|
||||
vllm_runner,
|
||||
model,
|
||||
chunked_prefill_token_size: int,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
):
|
||||
# VLLM V1 does not use incremental detokenization for
|
||||
# prompt logprobs, so this test strategy is irrelevant.
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
max_num_seqs = 256
|
||||
enable_chunked_prefill = False
|
||||
max_num_batched_tokens = None
|
||||
if chunked_prefill_token_size != -1:
|
||||
enable_chunked_prefill = True
|
||||
max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
with vllm_runner(model,
|
||||
dtype="half",
|
||||
max_logprobs=5,
|
||||
gpu_memory_utilization=0.5,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_num_seqs=max_num_seqs) as vllm_model:
|
||||
|
||||
vllm_sampling_params = SamplingParams(max_tokens=10,
|
||||
logprobs=5,
|
||||
prompt_logprobs=5,
|
||||
temperature=0.0)
|
||||
vllm_results = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=vllm_sampling_params)
|
||||
|
||||
for idx, result in enumerate(vllm_results):
|
||||
assert result.prompt_logprobs is not None
|
||||
assert result.prompt_logprobs[0] is None
|
||||
|
||||
# Compared detokenized prompts ids to original prompt.
|
||||
generated_string = ""
|
||||
for (prompt_token,
|
||||
prompt_logprobs) in zip(result.prompt_token_ids[1:],
|
||||
result.prompt_logprobs[1:]):
|
||||
# prompt_logprobs is a dict of the token_id: logprob
|
||||
# We select the token_id corresponding to the actual prompt
|
||||
# Decoded token in the detokenized string corresponding to this
|
||||
# prompt token.
|
||||
generated_string += prompt_logprobs[prompt_token].decoded_token
|
||||
|
||||
assert generated_string == example_prompts[idx], (
|
||||
"Detokenized prompt logprobs do not match original prompt")
|
||||
|
||||
@ -12,7 +12,7 @@ from partial_json_parser.core.options import Allow
|
||||
from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall,
|
||||
ToolCall)
|
||||
from vllm.entrypoints.openai.tool_parsers import JambaToolParser
|
||||
from vllm.transformers_utils.detokenizer import detokenize_incrementally
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
MODEL = "ai21labs/Jamba-tiny-dev"
|
||||
|
||||
@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
ToolCall)
|
||||
from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import (
|
||||
Qwen3CoderToolParser)
|
||||
from vllm.transformers_utils.detokenizer import detokenize_incrementally
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
|
||||
|
||||
@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
DeltaMessage, FunctionCall,
|
||||
ToolCall)
|
||||
from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser
|
||||
from vllm.transformers_utils.detokenizer import detokenize_incrementally
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
|
||||
@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
DeltaMessage, FunctionCall,
|
||||
ToolCall)
|
||||
from vllm.entrypoints.openai.tool_parsers import xLAMToolParser
|
||||
from vllm.transformers_utils.detokenizer import detokenize_incrementally
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
|
||||
@ -1,15 +1,20 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for v1 attention backends without GPUModelRunner dependency."""
|
||||
from functools import partial
|
||||
from typing import Optional, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch.nn.attention.flex_attention import create_block_mask, flex_attention
|
||||
|
||||
from tests.v1.attention.utils import (BatchSpec, _Backend,
|
||||
create_common_attn_metadata,
|
||||
create_standard_kv_cache_spec,
|
||||
create_vllm_config,
|
||||
get_attention_backend)
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer
|
||||
from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
|
||||
set_kv_cache_layout)
|
||||
@ -183,13 +188,19 @@ class MockAttentionLayer:
|
||||
self._v_scale_float = 1.0
|
||||
|
||||
|
||||
def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
|
||||
layer_names: list[str], vllm_config,
|
||||
device: torch.device,
|
||||
common_attn_metadata: CommonAttentionMetadata,
|
||||
query: torch.Tensor, key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: torch.Tensor) -> torch.Tensor:
|
||||
def run_attention_backend(
|
||||
backend: _Backend,
|
||||
kv_cache_spec: FullAttentionSpec,
|
||||
layer_names: list[str],
|
||||
vllm_config,
|
||||
device: torch.device,
|
||||
common_attn_metadata: CommonAttentionMetadata,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
kv_cache: torch.Tensor,
|
||||
sliding_window: Optional[int] = None,
|
||||
) -> torch.Tensor:
|
||||
"""Run attention computation using the specified backend's AttentionImpl."""
|
||||
|
||||
# Handle special case for FLEX_ATTENTION_SLOW
|
||||
@ -253,7 +264,7 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=None,
|
||||
sliding_window=None,
|
||||
sliding_window=sliding_window,
|
||||
kv_cache_dtype="auto",
|
||||
)
|
||||
|
||||
@ -275,13 +286,16 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
|
||||
return output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_spec_name", [
|
||||
"small_decode", "small_prefill", "mixed_small", "medium_decode",
|
||||
"medium_prefill", "mixed_medium", "large_decode", "large_prefill",
|
||||
"single_decode", "single_prefill"
|
||||
])
|
||||
@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
|
||||
def test_backend_correctness(batch_spec_name: str, model: str):
|
||||
def _test_backend_correctness(
|
||||
batch_spec: BatchSpec,
|
||||
model: str,
|
||||
backend_to_test: list[Union[_Backend, str]],
|
||||
mask_mod,
|
||||
*,
|
||||
block_size: int = 16,
|
||||
atol: float = 1e-2,
|
||||
rtol: float = 1e-2,
|
||||
):
|
||||
"""
|
||||
Test that all backends produce similar outputs to a reference implementation
|
||||
using torch.nn.functional.scaled_dot_product_attention.
|
||||
@ -297,9 +311,10 @@ def test_backend_correctness(batch_spec_name: str, model: str):
|
||||
simulated paged KV cache.
|
||||
5. Comparing the vLLM backend's output to the ground-truth SDPA output.
|
||||
"""
|
||||
batch_spec = BATCH_SPECS[batch_spec_name]
|
||||
current_platform.seed_everything(42)
|
||||
vllm_config = create_vllm_config(model_name=model,
|
||||
max_model_len=max(batch_spec.seq_lens),
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=8192)
|
||||
device = torch.device("cuda:0")
|
||||
|
||||
@ -314,6 +329,7 @@ def test_backend_correctness(batch_spec_name: str, model: str):
|
||||
num_kv_heads = vllm_config.model_config.get_num_kv_heads(
|
||||
vllm_config.parallel_config)
|
||||
head_size = vllm_config.model_config.get_head_size()
|
||||
sliding_window = vllm_config.model_config.get_sliding_window()
|
||||
dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
|
||||
block_size = vllm_config.cache_config.block_size
|
||||
scale = 1.0 / (head_size**0.5)
|
||||
@ -361,22 +377,21 @@ def test_backend_correctness(batch_spec_name: str, model: str):
|
||||
# Create causal mask: query token i attends to positions 0 to
|
||||
# (context_len + i)
|
||||
kv_len = s_len
|
||||
offset = context_len
|
||||
attn_mask = torch.full((q_len, kv_len),
|
||||
float('-inf'),
|
||||
device=device,
|
||||
dtype=dtype)
|
||||
for i in range(q_len):
|
||||
attn_mask[i, :offset + i + 1] = 0.0
|
||||
|
||||
sdpa_out_i = torch.nn.functional.scaled_dot_product_attention(
|
||||
q_sdpa_in,
|
||||
k_sdpa_in,
|
||||
v_sdpa_in,
|
||||
attn_mask=attn_mask,
|
||||
scale=scale,
|
||||
enable_gqa=True)
|
||||
# Convert back to (L, H, D)
|
||||
final_mask_mod = partial(mask_mod, context_len=context_len)
|
||||
block_mask = create_block_mask(final_mask_mod,
|
||||
B=None,
|
||||
H=None,
|
||||
Q_LEN=q_len,
|
||||
KV_LEN=kv_len,
|
||||
device=device)
|
||||
sdpa_out_i = flex_attention(q_sdpa_in,
|
||||
k_sdpa_in,
|
||||
v_sdpa_in,
|
||||
block_mask=block_mask,
|
||||
scale=scale,
|
||||
enable_gqa=True)
|
||||
|
||||
all_sdpa_outputs.append(sdpa_out_i.transpose(1, 2).squeeze(0))
|
||||
|
||||
# Inputs for vLLM backends are just the new tokens
|
||||
@ -412,7 +427,7 @@ def test_backend_correctness(batch_spec_name: str, model: str):
|
||||
# 4. Run vLLM backends and compare
|
||||
# Note: flex_attention has known Triton kernel compatibility issues
|
||||
# with test infrastructures
|
||||
for backend_name in BACKENDS_TO_TEST:
|
||||
for backend_name in backend_to_test:
|
||||
# FlashAttentionm + FlexAttention:
|
||||
# [2, num_blocks, block_size, num_kv_heads, head_size]
|
||||
# FlashInfer:
|
||||
@ -427,12 +442,19 @@ def test_backend_correctness(batch_spec_name: str, model: str):
|
||||
2, 3).contiguous().transpose(2, 3)
|
||||
set_kv_cache_layout("HND")
|
||||
|
||||
backend_output = run_attention_backend(backend_name, kv_cache_spec,
|
||||
["placeholder"], vllm_config,
|
||||
device, common_attn_metadata,
|
||||
query_vllm, key_vllm,
|
||||
value_vllm,
|
||||
kv_cache_for_backend)
|
||||
backend_output = run_attention_backend(
|
||||
backend_name,
|
||||
kv_cache_spec,
|
||||
["placeholder"],
|
||||
vllm_config,
|
||||
device,
|
||||
common_attn_metadata,
|
||||
query_vllm,
|
||||
key_vllm,
|
||||
value_vllm,
|
||||
kv_cache_for_backend,
|
||||
sliding_window=sliding_window,
|
||||
)
|
||||
|
||||
# Check shape and dtype consistency
|
||||
assert backend_output.shape == sdpa_output.shape, (
|
||||
@ -446,18 +468,102 @@ def test_backend_correctness(batch_spec_name: str, model: str):
|
||||
f"[{backend_name}] produced non-finite values")
|
||||
|
||||
# Check numerical similarity
|
||||
rtol = 1e-2
|
||||
atol = 5e-3
|
||||
def error_msg(msg: str, backend_name: str):
|
||||
return (f"[{backend_name}] output differs from SDPA baseline. "
|
||||
f"{msg}")
|
||||
|
||||
max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item()
|
||||
max_rel_diff = torch.max(
|
||||
torch.abs(backend_output - sdpa_output) /
|
||||
torch.abs(sdpa_output)).item()
|
||||
all_close = torch.allclose(backend_output,
|
||||
torch.testing.assert_close(backend_output,
|
||||
sdpa_output,
|
||||
rtol=rtol,
|
||||
atol=atol)
|
||||
atol=atol,
|
||||
msg=partial(error_msg,
|
||||
backend_name=backend_name))
|
||||
|
||||
assert all_close, (
|
||||
f"[{backend_name}] output differs from SDPA baseline. "
|
||||
f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})")
|
||||
|
||||
@pytest.mark.parametrize("batch_spec_name", [
|
||||
"small_decode", "small_prefill", "mixed_small", "medium_decode",
|
||||
"medium_prefill", "mixed_medium", "large_decode", "large_prefill",
|
||||
"single_decode", "single_prefill"
|
||||
])
|
||||
@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
|
||||
def test_causal_backend_correctness(batch_spec_name: str, model: str):
|
||||
"""Test backend's correctness with causal attention."""
|
||||
|
||||
def causal_mask_mod(
|
||||
b: torch.Tensor,
|
||||
h: torch.Tensor,
|
||||
q_idx: torch.Tensor,
|
||||
kv_idx: torch.Tensor,
|
||||
*,
|
||||
context_len: int,
|
||||
):
|
||||
return (q_idx + context_len) >= kv_idx
|
||||
|
||||
batch_spec = BATCH_SPECS[batch_spec_name]
|
||||
LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION]
|
||||
if is_torch_equal_or_newer("2.9.0.dev0") else [])
|
||||
SMALL_BLOCK_BACKENDS = [
|
||||
x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
|
||||
]
|
||||
_test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS,
|
||||
causal_mask_mod)
|
||||
|
||||
# Fast FlexAttention needs to run with block_size=128
|
||||
if LARGE_BLOCK_BACKENDS:
|
||||
_test_backend_correctness(batch_spec,
|
||||
model,
|
||||
LARGE_BLOCK_BACKENDS,
|
||||
causal_mask_mod,
|
||||
block_size=128)
|
||||
|
||||
|
||||
SLIDING_WINDOW_BACKENDS_TO_TEST = [
|
||||
_Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION,
|
||||
_Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW"
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_spec_name", [
|
||||
"small_decode", "small_prefill", "mixed_medium", "large_decode",
|
||||
"large_prefill"
|
||||
])
|
||||
@pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"])
|
||||
def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
|
||||
"""Test backend's correctness with sliding window attention."""
|
||||
|
||||
def sliding_window_mask_mod(
|
||||
b: torch.Tensor,
|
||||
h: torch.Tensor,
|
||||
q_idx: torch.Tensor,
|
||||
kv_idx: torch.Tensor,
|
||||
*,
|
||||
context_len: int,
|
||||
sliding_window: int,
|
||||
):
|
||||
causal_mask = q_idx + context_len >= kv_idx
|
||||
window_mask = q_idx + context_len - kv_idx < sliding_window
|
||||
return causal_mask & window_mask
|
||||
|
||||
batch_spec = BATCH_SPECS[batch_spec_name]
|
||||
model_config = ModelConfig(model=model,
|
||||
max_model_len=max(batch_spec.seq_lens))
|
||||
sliding_window = model_config.get_sliding_window()
|
||||
sliding_window_mask_mod_fn = partial(sliding_window_mask_mod,
|
||||
sliding_window=sliding_window)
|
||||
|
||||
LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION]
|
||||
if is_torch_equal_or_newer("2.9.0.dev0") else [])
|
||||
SMALL_BLOCK_BACKENDS = [
|
||||
x for x in SLIDING_WINDOW_BACKENDS_TO_TEST
|
||||
if x not in LARGE_BLOCK_BACKENDS
|
||||
]
|
||||
_test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS,
|
||||
sliding_window_mask_mod_fn)
|
||||
|
||||
# Fast FlexAttention needs to run with block_size=128
|
||||
if LARGE_BLOCK_BACKENDS:
|
||||
_test_backend_correctness(batch_spec,
|
||||
model,
|
||||
LARGE_BLOCK_BACKENDS,
|
||||
sliding_window_mask_mod_fn,
|
||||
block_size=128)
|
||||
|
||||
@ -12,9 +12,9 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
|
||||
STOP_STRINGS,
|
||||
DummyOutputProcessorTestVectors,
|
||||
MockEngineCore)
|
||||
from vllm.logprobs import PromptLogprobs, SampleLogprobs
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.output_processor import (OutputProcessor,
|
||||
|
||||
@ -6,7 +6,6 @@ import pytest
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
||||
from vllm.platforms.interface import UnspecifiedPlatform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.engine import processor as processor_mod
|
||||
from vllm.v1.engine.processor import Processor
|
||||
@ -33,15 +32,6 @@ def _mk_processor(monkeypatch,
|
||||
"__post_init__",
|
||||
lambda self, *args: None,
|
||||
raising=True)
|
||||
monkeypatch.setattr(UnspecifiedPlatform,
|
||||
"is_async_output_supported",
|
||||
classmethod(lambda cls, enforce_eager: True),
|
||||
raising=True)
|
||||
monkeypatch.setattr(
|
||||
ModelConfig,
|
||||
"verify_async_output_proc",
|
||||
lambda self, parallel_config, speculative_config, device_config: None,
|
||||
raising=True)
|
||||
monkeypatch.setattr(ModelConfig,
|
||||
"verify_with_parallel_config",
|
||||
lambda self, parallel_config: None,
|
||||
|
||||
@ -29,24 +29,6 @@ def test_unsupported_configs(monkeypatch):
|
||||
},
|
||||
).create_engine_config()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
AsyncEngineArgs(
|
||||
model=MODEL,
|
||||
preemption_mode="swap",
|
||||
).create_engine_config()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
AsyncEngineArgs(
|
||||
model=MODEL,
|
||||
disable_async_output_proc=True,
|
||||
).create_engine_config()
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
AsyncEngineArgs(
|
||||
model=MODEL,
|
||||
scheduler_delay_factor=1.2,
|
||||
).create_engine_config()
|
||||
|
||||
|
||||
def test_enable_by_default_fallback(monkeypatch):
|
||||
with monkeypatch.context() as m:
|
||||
|
||||
@ -4,19 +4,14 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass, fields
|
||||
from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional,
|
||||
Protocol, Set, Tuple, Type, TypeVar)
|
||||
from typing import (Any, Dict, Generic, List, Optional, Protocol, Set, Tuple,
|
||||
Type, TypeVar)
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner_base import (ModelRunnerBase,
|
||||
ModelRunnerInputBase,
|
||||
ModelRunnerInputBuilderBase)
|
||||
|
||||
|
||||
class AttentionType:
|
||||
"""
|
||||
@ -170,7 +165,7 @@ class AttentionState(ABC, Generic[T]):
|
||||
lifetime of the model runner."""
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self, runner: "ModelRunnerBase"):
|
||||
def __init__(self, runner: Any):
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
@ -210,7 +205,7 @@ class AttentionState(ABC, Generic[T]):
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def begin_forward(self, model_input: "ModelRunnerInputBase") -> None:
|
||||
def begin_forward(self, model_input) -> None:
|
||||
"""Prepare state for forward pass."""
|
||||
...
|
||||
|
||||
@ -219,7 +214,7 @@ class AttentionMetadataBuilder(ABC, Generic[T]):
|
||||
"""Abstract class for attention metadata builders."""
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None:
|
||||
def __init__(self, input_builder) -> None:
|
||||
"""Create the builder, remember some configuration and parameters."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from itertools import accumulate
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
from einops import rearrange
|
||||
@ -34,9 +34,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad
|
||||
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
|
||||
flash_attn_with_kvcache)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUBuilder
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@ -329,7 +326,7 @@ class DifferentialFlashAttentionMetadata(AttentionMetadata):
|
||||
class DifferentialFlashAttentionMetadataBuilder(
|
||||
AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]):
|
||||
|
||||
def __init__(self, input_builder: "ModelInputForGPUBuilder"):
|
||||
def __init__(self, input_builder):
|
||||
self.input_builder = input_builder
|
||||
self.runner = input_builder.runner
|
||||
self.sliding_window = input_builder.sliding_window
|
||||
@ -350,9 +347,8 @@ class DifferentialFlashAttentionMetadataBuilder(
|
||||
self.num_decode_tokens = 0
|
||||
self.has_prefix_cache_hit = False
|
||||
|
||||
def _add_seq_group(
|
||||
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
|
||||
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
|
||||
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
|
||||
prefix_cache_hit: bool):
|
||||
"""Add a sequence group to the metadata. Specifically update/append
|
||||
1. context length.
|
||||
2. block table.
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
"""
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
import torch.distributed
|
||||
@ -22,9 +22,6 @@ from vllm.utils import async_tensor_h2d
|
||||
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
|
||||
flash_attn_with_kvcache, sparse_attn_func)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUBuilder
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@ -224,9 +221,8 @@ class DualChunkFlashAttentionMetadataBuilder(FlashAttentionMetadataBuilder):
|
||||
super().prepare()
|
||||
self.orig_seq_lens: List[int] = []
|
||||
|
||||
def _add_seq_group(
|
||||
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
|
||||
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
|
||||
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
|
||||
prefix_cache_hit: bool):
|
||||
super()._add_seq_group(inter_data, chunked_prefill_enabled,
|
||||
prefix_cache_hit)
|
||||
for prompt_len, seq_len in zip(inter_data.prompt_lens,
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from itertools import accumulate
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
|
||||
from typing import Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
|
||||
@ -31,9 +31,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad
|
||||
from vllm.vllm_flash_attn import (flash_attn_varlen_func,
|
||||
flash_attn_with_kvcache)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUBuilder
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@ -312,7 +309,7 @@ class FlashAttentionMetadata(AttentionMetadata):
|
||||
class FlashAttentionMetadataBuilder(
|
||||
AttentionMetadataBuilder[FlashAttentionMetadata]):
|
||||
|
||||
def __init__(self, input_builder: "ModelInputForGPUBuilder"):
|
||||
def __init__(self, input_builder):
|
||||
self.input_builder = input_builder
|
||||
self.runner = input_builder.runner
|
||||
self.sliding_window = input_builder.sliding_window
|
||||
@ -332,9 +329,8 @@ class FlashAttentionMetadataBuilder(
|
||||
self.num_decode_tokens = 0
|
||||
self.has_prefix_cache_hit = False
|
||||
|
||||
def _add_seq_group(
|
||||
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
|
||||
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
|
||||
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
|
||||
prefix_cache_hit: bool):
|
||||
"""Add a sequence group to the metadata. Specifically update/append
|
||||
1. context length.
|
||||
2. block table.
|
||||
|
||||
@ -193,8 +193,7 @@ from collections import defaultdict
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from itertools import accumulate
|
||||
from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
|
||||
Type, TypeVar)
|
||||
from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar
|
||||
|
||||
import torch
|
||||
|
||||
@ -233,9 +232,6 @@ except ImportError:
|
||||
except ImportError:
|
||||
flash_attn_varlen_func = None
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUBuilder
|
||||
|
||||
is_hip = current_platform.is_rocm()
|
||||
|
||||
|
||||
@ -638,7 +634,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
|
||||
"""
|
||||
BLOCK_TABLE_EXTENDER: list[list[int]] = []
|
||||
|
||||
def __init__(self, input_builder: "ModelInputForGPUBuilder"):
|
||||
def __init__(self, input_builder):
|
||||
self.input_builder = input_builder
|
||||
self.runner = input_builder.runner
|
||||
self.sliding_window = input_builder.sliding_window
|
||||
@ -668,9 +664,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
|
||||
self.num_decode_tokens = 0
|
||||
self.has_prefix_cache_hit = False
|
||||
|
||||
def _add_seq_group(
|
||||
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
|
||||
chunked_prefill_enabled: bool, prefix_cache_hit: bool):
|
||||
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
|
||||
prefix_cache_hit: bool):
|
||||
"""Add a sequence group to the metadata. Specifically update/append
|
||||
1. context length.
|
||||
2. block table.
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from itertools import accumulate
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
|
||||
from typing import Dict, List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
|
||||
@ -13,9 +13,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
AttentionMetadataBuilder)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder)
|
||||
from vllm.utils import async_tensor_h2d
|
||||
|
||||
# Placeholder attention backend for models like Mamba and pooling models that
|
||||
@ -204,7 +201,7 @@ class PlaceholderAttentionMetadata(AttentionMetadata):
|
||||
class PlaceholderAttentionMetadataBuilder(
|
||||
AttentionMetadataBuilder[PlaceholderAttentionMetadata]):
|
||||
|
||||
def __init__(self, input_builder: "ModelInputForGPUBuilder"):
|
||||
def __init__(self, input_builder):
|
||||
|
||||
self.input_builder = input_builder
|
||||
self.runner = input_builder.runner
|
||||
@ -220,9 +217,7 @@ class PlaceholderAttentionMetadataBuilder(
|
||||
self.num_prefill_tokens = 0
|
||||
self.num_decode_tokens = 0
|
||||
|
||||
def _add_seq_group(
|
||||
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
|
||||
chunked_prefill_enabled: bool):
|
||||
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
|
||||
"""Add a sequence group to the metadata. Specifically update/append
|
||||
1. context length.
|
||||
"""
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Optional, Type, Union
|
||||
from typing import Optional, Type, Union
|
||||
|
||||
import torch
|
||||
|
||||
@ -19,9 +19,6 @@ from vllm.attention.backends.utils import (compute_slot_mapping,
|
||||
from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd,
|
||||
get_aiter_mla_metadata)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUBuilder
|
||||
|
||||
|
||||
def is_aiter_mla_enabled() -> bool:
|
||||
return envs.VLLM_ROCM_USE_AITER \
|
||||
@ -110,7 +107,7 @@ class AiterMLAMetadata(MLACommonMetadata):
|
||||
class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
|
||||
BLOCK_TABLE_EXTENDER: list[list[int]] = [[]]
|
||||
|
||||
def __init__(self, input_builder: "ModelInputForGPUBuilder"):
|
||||
def __init__(self, input_builder):
|
||||
super().__init__(input_builder)
|
||||
assert self.block_size == 1, "AITER MLA requires only block size 1."
|
||||
|
||||
|
||||
@ -5,8 +5,7 @@ from collections import defaultdict
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from itertools import accumulate
|
||||
from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
|
||||
TypeVar, Union)
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -21,9 +20,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner_base import ModelRunnerBase
|
||||
|
||||
# Error string(s) for encoder/decoder
|
||||
# unsupported attention scenarios
|
||||
STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported "
|
||||
@ -35,9 +31,6 @@ PAD_SLOT_ID = -1
|
||||
# if we have at least this many elements. Could be tuned further.
|
||||
_COMPUTE_SLOT_MAPPING_NUMPY_NUMEL = 256
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUBuilder
|
||||
|
||||
|
||||
def is_block_tables_empty(block_tables: Union[None, Dict]):
|
||||
"""
|
||||
@ -129,7 +122,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):
|
||||
|
||||
_metadata_cls: Type[TAttentionMetadata]
|
||||
|
||||
def __init__(self, input_builder: "ModelInputForGPUBuilder"):
|
||||
def __init__(self, input_builder):
|
||||
self.input_builder = input_builder
|
||||
self.runner = input_builder.runner
|
||||
|
||||
@ -149,9 +142,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):
|
||||
self.num_prefill_tokens = 0
|
||||
self.num_decode_tokens = 0
|
||||
|
||||
def _add_seq_group(
|
||||
self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
|
||||
chunked_prefill_enabled: bool):
|
||||
def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
|
||||
is_prompt = inter_data.is_prompt
|
||||
block_tables = inter_data.block_tables
|
||||
|
||||
@ -291,7 +282,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):
|
||||
|
||||
class CommonAttentionState(AttentionState):
|
||||
|
||||
def __init__(self, runner: "ModelRunnerBase"):
|
||||
def __init__(self, runner):
|
||||
self.runner = runner
|
||||
self._is_graph_capturing = False
|
||||
|
||||
|
||||
@ -454,9 +454,6 @@ class VllmConfig:
|
||||
self.try_verify_and_update_config()
|
||||
|
||||
if self.model_config is not None:
|
||||
self.model_config.verify_async_output_proc(self.parallel_config,
|
||||
self.speculative_config,
|
||||
self.device_config)
|
||||
self.model_config.verify_with_parallel_config(self.parallel_config)
|
||||
self.model_config.verify_dual_chunk_attention_config(
|
||||
self.load_config)
|
||||
@ -877,7 +874,6 @@ class VllmConfig:
|
||||
f"served_model_name={self.model_config.served_model_name}, "
|
||||
f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
|
||||
f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa
|
||||
f"use_async_output_proc={self.model_config.use_async_output_proc}, "
|
||||
f"pooler_config={self.model_config.pooler_config!r}, "
|
||||
f"compilation_config={self.compilation_config!r}")
|
||||
|
||||
|
||||
@ -27,8 +27,7 @@ from vllm.transformers_utils.config import (
|
||||
ConfigFormat, get_config, get_hf_image_processor_config,
|
||||
get_hf_text_config, get_pooling_config,
|
||||
get_sentence_transformer_tokenizer_config, is_encoder_decoder,
|
||||
is_interleaved, maybe_override_with_speculators_target_model,
|
||||
try_get_generation_config, try_get_safetensors_metadata,
|
||||
is_interleaved, try_get_generation_config, try_get_safetensors_metadata,
|
||||
try_get_tokenizer_config, uses_mrope)
|
||||
from vllm.transformers_utils.runai_utils import (ObjectStorageModel,
|
||||
is_runai_obj_uri)
|
||||
@ -223,8 +222,6 @@ class ModelConfig:
|
||||
that this name(s) will also be used in `model_name` tag content of
|
||||
prometheus metrics, if multiple names provided, metrics tag will take the
|
||||
first one."""
|
||||
use_async_output_proc: bool = True
|
||||
"""Whether to use async output processor."""
|
||||
config_format: Union[str, ConfigFormat] = "auto"
|
||||
"""The format of the model config to load:\n
|
||||
- "auto" will try to load the config in hf format if available else it
|
||||
@ -418,15 +415,6 @@ class ModelConfig:
|
||||
|
||||
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
|
||||
|
||||
if self.runner != "draft":
|
||||
# If we're not running the draft model, check for speculators config
|
||||
# If speculators config, set model / tokenizer to be target model
|
||||
self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501
|
||||
model=self.model,
|
||||
tokenizer=self.tokenizer,
|
||||
revision=self.revision,
|
||||
trust_remote_code=self.trust_remote_code)
|
||||
|
||||
if (backend := envs.VLLM_ATTENTION_BACKEND
|
||||
) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
|
||||
raise ValueError(
|
||||
@ -1119,37 +1107,6 @@ class ModelConfig:
|
||||
raise ValueError("please set VLLM_ATTENTION_BACKEND to "
|
||||
f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")
|
||||
|
||||
def verify_async_output_proc(self, parallel_config, speculative_config,
|
||||
device_config) -> None:
|
||||
if not self.use_async_output_proc:
|
||||
# Nothing to check
|
||||
return
|
||||
|
||||
if parallel_config.pipeline_parallel_size > 1:
|
||||
self.use_async_output_proc = False
|
||||
return
|
||||
|
||||
# Reminder: Please update docs/features/compatibility_matrix.md
|
||||
# If the feature combo become valid
|
||||
from vllm.platforms import current_platform
|
||||
if not current_platform.is_async_output_supported(self.enforce_eager):
|
||||
self.use_async_output_proc = False
|
||||
return
|
||||
|
||||
if envs.VLLM_USE_RAY_SPMD_WORKER:
|
||||
self.use_async_output_proc = False
|
||||
return
|
||||
|
||||
# Async postprocessor is not necessary for pooling models
|
||||
# since there is no token generation
|
||||
if self.runner_type == "pooling":
|
||||
self.use_async_output_proc = False
|
||||
|
||||
# Reminder: Please update docs/features/compatibility_matrix.md
|
||||
# If the feature combo become valid
|
||||
if speculative_config:
|
||||
self.use_async_output_proc = False
|
||||
|
||||
def verify_with_parallel_config(
|
||||
self,
|
||||
parallel_config: ParallelConfig,
|
||||
@ -1173,15 +1130,12 @@ class ModelConfig:
|
||||
self._verify_with_expert_parallelism()
|
||||
|
||||
pipeline_parallel_size = parallel_config.pipeline_parallel_size
|
||||
if pipeline_parallel_size > 1:
|
||||
if not self.registry.is_pp_supported_model(self.architectures,
|
||||
self):
|
||||
raise NotImplementedError(
|
||||
"Pipeline parallelism is not supported for this model. "
|
||||
"Supported models implement the `SupportsPP` interface.")
|
||||
|
||||
if self.use_async_output_proc:
|
||||
self.use_async_output_proc = False
|
||||
if (pipeline_parallel_size > 1
|
||||
and not self.registry.is_pp_supported_model(
|
||||
self.architectures, self)):
|
||||
raise NotImplementedError(
|
||||
"Pipeline parallelism is not supported for this model. "
|
||||
"Supported models implement the `SupportsPP` interface.")
|
||||
|
||||
def get_sliding_window(self) -> Optional[int]:
|
||||
"""Get the sliding window size from the HF text config if present."""
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
import hashlib
|
||||
from dataclasses import field
|
||||
from typing import Any, Literal, Optional, Union
|
||||
from typing import Any, Literal, Union
|
||||
|
||||
from pydantic import SkipValidation, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
@ -18,7 +18,6 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
||||
logger = init_logger(__name__)
|
||||
|
||||
RunnerType = Literal["generate", "pooling", "draft"]
|
||||
PreemptionMode = Literal["swap", "recompute"]
|
||||
SchedulerPolicy = Literal["fcfs", "priority"]
|
||||
|
||||
|
||||
@ -78,10 +77,6 @@ class SchedulerConfig:
|
||||
3. more than one value (e.g. 1 2 128) is provided, then the capture list
|
||||
will follow the provided list."""
|
||||
|
||||
delay_factor: float = 0.0
|
||||
"""Apply a delay (of delay factor multiplied by previous
|
||||
prompt latency) before scheduling next prompt."""
|
||||
|
||||
enable_chunked_prefill: SkipValidation[bool] = None # type: ignore
|
||||
"""If True, prefill requests can be chunked based
|
||||
on the remaining max_num_batched_tokens."""
|
||||
@ -103,14 +98,6 @@ class SchedulerConfig:
|
||||
NOTE: This is not currently configurable. It will be overridden by
|
||||
max_num_batched_tokens in case max multimodal embedding size is larger."""
|
||||
|
||||
preemption_mode: Optional[PreemptionMode] = None
|
||||
"""Whether to perform preemption by swapping or
|
||||
recomputation. If not specified, we determine the mode as follows:
|
||||
We use recomputation by default since it incurs lower overhead than
|
||||
swapping. However, when the sequence group has multiple sequences
|
||||
(e.g., beam search), recomputation is not currently supported. In
|
||||
such a case, we use swapping instead."""
|
||||
|
||||
send_delta_data: bool = False
|
||||
"""Private API. If used, scheduler sends delta data to
|
||||
workers instead of an entire data. It should be enabled only
|
||||
|
||||
@ -1,399 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
from typing import List, Optional
|
||||
|
||||
from vllm.core.block.common import BlockList
|
||||
from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
|
||||
from vllm.utils import Device, cdiv, chunk_list
|
||||
|
||||
|
||||
class BlockTable:
|
||||
"""A class to manage blocks for a specific sequence.
|
||||
|
||||
The BlockTable maps a sequence of tokens to a list of blocks, where each
|
||||
block represents a contiguous memory allocation for a portion of the
|
||||
sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is
|
||||
responsible for allocating and freeing memory for the blocks.
|
||||
|
||||
Args:
|
||||
block_size (int): The maximum number of tokens that can be stored in a
|
||||
single block.
|
||||
block_allocator (DeviceAwareBlockAllocator): The block allocator used to
|
||||
manage memory for the blocks.
|
||||
_blocks (Optional[List[Block]], optional): An optional list of existing
|
||||
blocks to initialize the BlockTable with. If not provided, an empty
|
||||
BlockTable is created.
|
||||
max_block_sliding_window (Optional[int], optional): The number of
|
||||
blocks to keep around for each sequence. If None, all blocks
|
||||
are kept (eg., when sliding window is not used).
|
||||
It should at least fit the sliding window size of the model.
|
||||
|
||||
Attributes:
|
||||
_block_size (int): The maximum number of tokens that can be stored in a
|
||||
single block.
|
||||
_allocator (DeviceAwareBlockAllocator): The block allocator used to
|
||||
manage memory for the blocks.
|
||||
_blocks (Optional[List[Block]]): The list of blocks managed by this
|
||||
BlockTable.
|
||||
_num_full_slots (int): The number of tokens currently stored in the
|
||||
blocks.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
block_size: int,
|
||||
block_allocator: DeviceAwareBlockAllocator,
|
||||
_blocks: Optional[List[Block]] = None,
|
||||
max_block_sliding_window: Optional[int] = None,
|
||||
):
|
||||
self._block_size = block_size
|
||||
self._allocator = block_allocator
|
||||
if _blocks is None:
|
||||
_blocks = []
|
||||
self._blocks: BlockList = BlockList(_blocks)
|
||||
|
||||
self._max_block_sliding_window = max_block_sliding_window
|
||||
self._num_full_slots = self._get_num_token_ids()
|
||||
|
||||
@staticmethod
|
||||
def get_num_required_blocks(token_ids: List[int],
|
||||
block_size: int,
|
||||
num_lookahead_slots: int = 0) -> int:
|
||||
"""Calculates the minimum number of blocks required to store a given
|
||||
sequence of token IDs along with any look-ahead slots that may be
|
||||
required (like in multi-step + chunked-prefill).
|
||||
|
||||
This assumes worst-case scenario, where every block requires a new
|
||||
allocation (e.g. ignoring prefix caching).
|
||||
|
||||
Args:
|
||||
token_ids (List[int]): The sequence of token IDs to be stored.
|
||||
block_size (int): The maximum number of tokens that can be stored in
|
||||
a single block.
|
||||
num_lookahead_slots (int): look-ahead slots that the sequence may
|
||||
require.
|
||||
|
||||
Returns:
|
||||
int: The minimum number of blocks required to store the given
|
||||
sequence of token IDs along with any required look-ahead slots.
|
||||
"""
|
||||
return cdiv(len(token_ids) + num_lookahead_slots, block_size)
|
||||
|
||||
def allocate(self,
|
||||
token_ids: List[int],
|
||||
device: Device = Device.GPU,
|
||||
extra_hash: Optional[int] = None) -> None:
|
||||
"""Allocates memory blocks for storing the given sequence of token IDs.
|
||||
|
||||
This method allocates the required number of blocks to store the given
|
||||
sequence of token IDs.
|
||||
|
||||
Args:
|
||||
token_ids (List[int]): The sequence of token IDs to be stored.
|
||||
device (Device, optional): The device on which the blocks should be
|
||||
allocated. Defaults to Device.GPU.
|
||||
extra_hash (Optional[int]): The hash value of additional
|
||||
factors, such as adapters, that influence the block hash
|
||||
in the prefixcaching block.
|
||||
"""
|
||||
assert not self._is_allocated
|
||||
assert token_ids
|
||||
blocks = self._allocate_blocks_for_token_ids(prev_block=None,
|
||||
token_ids=token_ids,
|
||||
device=device,
|
||||
extra_hash=extra_hash)
|
||||
self.update(blocks)
|
||||
self._num_full_slots = len(token_ids)
|
||||
|
||||
def update(self, blocks: List[Block]) -> None:
|
||||
"""Resets the table to the newly provided blocks
|
||||
(with their corresponding block ids)
|
||||
"""
|
||||
self._blocks.update(blocks)
|
||||
|
||||
def append_token_ids(self,
|
||||
token_ids: List[int],
|
||||
num_lookahead_slots: int = 0,
|
||||
num_computed_slots: Optional[int] = None,
|
||||
extra_hash: Optional[int] = None) -> None:
|
||||
"""Appends a sequence of token IDs to the existing blocks in the
|
||||
BlockTable.
|
||||
|
||||
This method appends the given sequence of token IDs to the existing
|
||||
blocks in the BlockTable. If there is not enough space in the existing
|
||||
blocks, new blocks are allocated using the `ensure_num_empty_slots`
|
||||
method to accommodate the additional tokens.
|
||||
|
||||
The token IDs are divided into chunks of size `block_size` (except for
|
||||
the first chunk, which may be smaller), and each chunk is appended to a
|
||||
separate block.
|
||||
|
||||
Args:
|
||||
token_ids (List[int]): The sequence of token IDs to be appended.
|
||||
num_computed_slots (Optional[int]): The number of KV cache slots
|
||||
that are already filled (computed).
|
||||
When sliding window is enabled, this is used to compute how many
|
||||
blocks to drop at the front of the sequence.
|
||||
Without sliding window, None can be passed.
|
||||
Without chunked prefill, it should be the same as
|
||||
_num_full_slots.
|
||||
extra_hash (Optional[int]): The hash value of additional
|
||||
factors such as adapters that influence the block, apart
|
||||
from the token_ids.
|
||||
"""
|
||||
assert self._is_allocated, "no blocks have been allocated"
|
||||
assert len(self._blocks) > 0
|
||||
|
||||
# Drop blocks that are no longer needed due to sliding window
|
||||
if self._max_block_sliding_window is not None:
|
||||
null_block = self._allocator.allocate_or_get_null_block()
|
||||
assert num_computed_slots is not None
|
||||
end_block_idx = (num_computed_slots //
|
||||
self._block_size) - self._max_block_sliding_window
|
||||
for idx in range(0, end_block_idx):
|
||||
b = self._blocks[idx]
|
||||
if b is not null_block:
|
||||
self._allocator.free(b)
|
||||
self._blocks[idx] = null_block
|
||||
|
||||
# Ensure there are enough empty slots for the new tokens plus
|
||||
# lookahead slots
|
||||
self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
|
||||
num_lookahead_slots,
|
||||
extra_hash=extra_hash)
|
||||
|
||||
# Update the blocks with the new tokens
|
||||
first_block_idx = self._num_full_slots // self._block_size
|
||||
token_blocks = self._chunk_token_blocks_for_append(token_ids)
|
||||
|
||||
for i, token_block in enumerate(token_blocks):
|
||||
self._blocks.append_token_ids(first_block_idx + i, token_block)
|
||||
|
||||
self._num_full_slots += len(token_ids)
|
||||
|
||||
def ensure_num_empty_slots(self,
|
||||
num_empty_slots: int,
|
||||
extra_hash: Optional[int] = None) -> None:
|
||||
"""Ensures that the BlockTable has at least the specified number of
|
||||
empty slots available.
|
||||
|
||||
This method checks if the BlockTable has enough empty slots (i.e.,
|
||||
available space) to accommodate the requested number of tokens. If not,
|
||||
it allocates additional blocks on the GPU to ensure that the required
|
||||
number of empty slots is available.
|
||||
|
||||
Args:
|
||||
num_empty_slots (int): The minimum number of empty slots required.
|
||||
extra_hash (Optional[int]): The hash value of additional
|
||||
factors such as adapters that influence the block, apart
|
||||
from the token_ids.
|
||||
"""
|
||||
# Currently the block table only supports
|
||||
# appending tokens to GPU blocks.
|
||||
device = Device.GPU
|
||||
assert self._is_allocated
|
||||
|
||||
if self._num_empty_slots >= num_empty_slots:
|
||||
return
|
||||
|
||||
slots_to_allocate = num_empty_slots - self._num_empty_slots
|
||||
blocks_to_allocate = cdiv(slots_to_allocate, self._block_size)
|
||||
|
||||
for _ in range(blocks_to_allocate):
|
||||
assert len(self._blocks) > 0
|
||||
self._blocks.append(
|
||||
self._allocator.allocate_mutable_block(
|
||||
prev_block=self._blocks[-1],
|
||||
device=device,
|
||||
extra_hash=extra_hash))
|
||||
|
||||
def fork(self) -> "BlockTable":
|
||||
"""Creates a new BlockTable instance with a copy of the blocks from the
|
||||
current instance.
|
||||
|
||||
This method creates a new BlockTable instance with the same block size,
|
||||
block allocator, and a copy of the blocks from the current instance. The
|
||||
new BlockTable has its own independent set of blocks, but shares the
|
||||
same underlying memory allocation with the original BlockTable.
|
||||
|
||||
Returns:
|
||||
BlockTable: A new BlockTable instance with a copy of the blocks from
|
||||
the current instance.
|
||||
"""
|
||||
assert self._is_allocated
|
||||
assert len(self._blocks) > 0
|
||||
forked_blocks = self._allocator.fork(self._blocks[-1])
|
||||
return BlockTable(
|
||||
block_size=self._block_size,
|
||||
block_allocator=self._allocator,
|
||||
_blocks=forked_blocks,
|
||||
max_block_sliding_window=self._max_block_sliding_window,
|
||||
)
|
||||
|
||||
def free(self) -> None:
|
||||
"""Frees the memory occupied by the blocks in the BlockTable.
|
||||
|
||||
This method iterates over all the blocks in the `_blocks` list and calls
|
||||
the `free` method of the `_allocator` object to release the memory
|
||||
occupied by each block. After freeing all the blocks, the `_blocks` list
|
||||
is set to `None`.
|
||||
"""
|
||||
for block in self.blocks:
|
||||
self._allocator.free(block)
|
||||
self._blocks.reset()
|
||||
|
||||
@property
|
||||
def physical_block_ids(self) -> List[int]:
|
||||
"""Returns a list of physical block indices for the blocks in the
|
||||
BlockTable.
|
||||
|
||||
This property returns a list of integers, where each integer represents
|
||||
the physical block index of a corresponding block in the `_blocks` list.
|
||||
The physical block index is a unique identifier for the memory location
|
||||
occupied by the block.
|
||||
|
||||
Returns:
|
||||
List[int]: A list of physical block indices for the blocks in the
|
||||
BlockTable.
|
||||
"""
|
||||
return self._blocks.ids()
|
||||
|
||||
def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
|
||||
"""Get the number of "unseen" tokens in the sequence.
|
||||
|
||||
Unseen tokens are tokens in the sequence corresponding to this block
|
||||
table, but are not yet appended to this block table.
|
||||
|
||||
Args:
|
||||
sequence_token_ids (List[int]): The list of token ids in the
|
||||
sequence.
|
||||
|
||||
Returns:
|
||||
List[int]: The postfix of sequence_token_ids that has not yet been
|
||||
appended to the block table.
|
||||
"""
|
||||
|
||||
# Since the block table is append-only, the unseen token ids are the
|
||||
# ones after the appended ones.
|
||||
return sequence_token_ids[self.num_full_slots:]
|
||||
|
||||
def _allocate_blocks_for_token_ids(
|
||||
self,
|
||||
prev_block: Optional[Block],
|
||||
token_ids: List[int],
|
||||
device: Device,
|
||||
extra_hash: Optional[int] = None) -> List[Block]:
|
||||
blocks: List[Block] = []
|
||||
|
||||
block_token_ids = []
|
||||
tail_token_ids = []
|
||||
for cur_token_ids in chunk_list(token_ids, self._block_size):
|
||||
if len(cur_token_ids) == self._block_size:
|
||||
block_token_ids.append(cur_token_ids)
|
||||
else:
|
||||
tail_token_ids.append(cur_token_ids)
|
||||
|
||||
if block_token_ids:
|
||||
blocks.extend(
|
||||
self._allocator.allocate_immutable_blocks(
|
||||
prev_block,
|
||||
block_token_ids=block_token_ids,
|
||||
device=device,
|
||||
extra_hash=extra_hash))
|
||||
prev_block = blocks[-1]
|
||||
|
||||
if tail_token_ids:
|
||||
assert len(tail_token_ids) == 1
|
||||
cur_token_ids = tail_token_ids[0]
|
||||
|
||||
block = self._allocator.allocate_mutable_block(
|
||||
prev_block=prev_block, device=device, extra_hash=extra_hash)
|
||||
block.append_token_ids(cur_token_ids)
|
||||
|
||||
blocks.append(block)
|
||||
|
||||
return blocks
|
||||
|
||||
def _get_all_token_ids(self) -> List[int]:
|
||||
# NOTE: This function is O(seq_len); use sparingly.
|
||||
token_ids: List[int] = []
|
||||
|
||||
if not self._is_allocated:
|
||||
return token_ids
|
||||
|
||||
for block in self.blocks:
|
||||
token_ids.extend(block.token_ids)
|
||||
|
||||
return token_ids
|
||||
|
||||
def _get_num_token_ids(self) -> int:
|
||||
res = 0
|
||||
for block in self.blocks:
|
||||
res += len(block.token_ids)
|
||||
|
||||
return res
|
||||
|
||||
@property
|
||||
def _is_allocated(self) -> bool:
|
||||
return len(self._blocks) > 0
|
||||
|
||||
@property
|
||||
def blocks(self) -> List[Block]:
|
||||
return self._blocks.list()
|
||||
|
||||
@property
|
||||
def _num_empty_slots(self) -> int:
|
||||
assert self._is_allocated
|
||||
return len(self._blocks) * self._block_size - self._num_full_slots
|
||||
|
||||
@property
|
||||
def num_full_slots(self) -> int:
|
||||
"""Returns the total number of tokens currently stored in the
|
||||
BlockTable.
|
||||
|
||||
Returns:
|
||||
int: The total number of tokens currently stored in the BlockTable.
|
||||
"""
|
||||
return self._num_full_slots
|
||||
|
||||
def get_num_blocks_touched_by_append_slots(
|
||||
self, token_ids: List[int], num_lookahead_slots: int) -> int:
|
||||
"""Determine how many blocks will be "touched" by appending the token
|
||||
ids.
|
||||
|
||||
This is required for the scheduler to determine whether a sequence can
|
||||
continue generation, or if it must be preempted.
|
||||
"""
|
||||
# Math below is equivalent to:
|
||||
# all_token_ids = token_ids + [-1] * num_lookahead_slots
|
||||
# token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
|
||||
# return len(token_blocks)
|
||||
|
||||
num_token_ids = len(token_ids) + num_lookahead_slots
|
||||
first_chunk_size = self._block_size - (self._num_full_slots %
|
||||
self._block_size)
|
||||
num_token_blocks = (1 + math.ceil(
|
||||
(num_token_ids - first_chunk_size) / self._block_size))
|
||||
return num_token_blocks
|
||||
|
||||
def _chunk_token_blocks_for_append(
|
||||
self, token_ids: List[int]) -> List[List[int]]:
|
||||
"""Split the token ids into block-sized chunks so they can be easily
|
||||
appended to blocks. The first such "token block" may have less token ids
|
||||
than the block size, since the last allocated block may be partially
|
||||
full.
|
||||
|
||||
If no token ids are provided, then no chunks are returned.
|
||||
"""
|
||||
|
||||
if not token_ids:
|
||||
return []
|
||||
|
||||
first_chunk_size = self._block_size - (self._num_full_slots %
|
||||
self._block_size)
|
||||
token_blocks = [token_ids[:first_chunk_size]]
|
||||
token_blocks.extend(
|
||||
chunk_list(token_ids[first_chunk_size:], self._block_size))
|
||||
return token_blocks
|
||||
@ -1,371 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections import deque
|
||||
from dataclasses import dataclass
|
||||
from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
|
||||
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator
|
||||
|
||||
BlockId = int
|
||||
RefCount = int
|
||||
|
||||
|
||||
class RefCounterProtocol(Protocol):
|
||||
|
||||
def incr(self, block_id: BlockId) -> RefCount:
|
||||
raise NotImplementedError
|
||||
|
||||
def decr(self, block_id: BlockId) -> RefCount:
|
||||
raise NotImplementedError
|
||||
|
||||
def get(self, block_id: BlockId) -> RefCount:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class RefCounter(RefCounterProtocol):
    """A class for managing reference counts for a set of block indices.

    The RefCounter class maintains a dictionary that maps block indices to their
    corresponding reference counts. It provides methods to increment, decrement,
    and retrieve the reference count for a given block index.

    Args:
        all_block_indices (Iterable[BlockId]): An iterable of block indices
            to initialize the reference counter with.
    """

    def __init__(self, all_block_indices: Iterable["BlockId"]):
        # Deduplicate first, then start every index at a zero refcount.
        self._refcounts: Dict["BlockId", "RefCount"] = dict.fromkeys(
            set(all_block_indices), 0)

    def incr(self, block_id: "BlockId") -> "RefCount":
        """Increment the refcount of a known block id; returns the new count."""
        assert block_id in self._refcounts
        count = self._refcounts[block_id]
        assert count >= 0  # a tracked block can never go negative

        count += 1
        self._refcounts[block_id] = count
        return count

    def decr(self, block_id: "BlockId") -> "RefCount":
        """Decrement the refcount of a known block id; returns the new count."""
        assert block_id in self._refcounts
        count = self._refcounts[block_id]
        assert count > 0  # decrementing an unreferenced block is a bug

        count -= 1
        self._refcounts[block_id] = count
        return count

    def get(self, block_id: "BlockId") -> "RefCount":
        """Return the current refcount of a known block id."""
        assert block_id in self._refcounts
        return self._refcounts[block_id]

    def as_readonly(self) -> "ReadOnlyRefCounter":
        """Return a read-only view over this counter."""
        return ReadOnlyRefCounter(self)
class ReadOnlyRefCounter(RefCounterProtocol):
    """A read-only view of the RefCounter class.

    The ReadOnlyRefCounter class provides a read-only interface to access the
    reference counts maintained by a RefCounter instance. It does not allow
    modifications to the reference counts.

    Args:
        refcounter (RefCounter): The RefCounter instance to create a read-only
            view for.
    """

    def __init__(self, refcounter: "RefCounter"):
        self._refcounter = refcounter

    def incr(self, block_id: "BlockId") -> "RefCount":
        """Mutation is forbidden on a read-only view."""
        raise ValueError("Incr not allowed")

    def decr(self, block_id: "BlockId") -> "RefCount":
        """Mutation is forbidden on a read-only view."""
        raise ValueError("Decr not allowed")

    def get(self, block_id: "BlockId") -> "RefCount":
        """Delegate the lookup to the wrapped counter."""
        return self._refcounter.get(block_id)
class CopyOnWriteTracker:
    """A class for tracking and managing copy-on-write operations for blocks.

    The CopyOnWriteTracker class maintains a mapping of source block indices to
    their corresponding copy-on-write destination block indices. It works in
    conjunction with a RefCounter.

    Args:
        refcounter (RefCounter): The reference counter used to track block
            reference counts.
    """

    def __init__(self, refcounter: "RefCounterProtocol"):
        # Pending (src, dst) pairs accumulated until clear_cows() drains them.
        self._copy_on_writes: List[Tuple["BlockId", "BlockId"]] = []
        self._refcounter = refcounter

    def is_appendable(self, block: "Block") -> bool:
        """Checks if the block is shared or not. If shared, then it cannot
        be appended and needs to be duplicated via copy-on-write
        """
        # An unallocated block (no physical id yet) is trivially appendable.
        if block.block_id is None:
            return True
        return self._refcounter.get(block.block_id) <= 1

    def record_cow(self, src_block_id: Optional["BlockId"],
                   trg_block_id: Optional["BlockId"]) -> None:
        """Records a copy-on-write operation from source to target block id.

        Args:
            src_block_id (BlockId): The source block id from which to copy
                the data.
            trg_block_id (BlockId): The target block id to which the data
                is copied.
        """
        assert src_block_id is not None
        assert trg_block_id is not None
        self._copy_on_writes.append((src_block_id, trg_block_id))

    def clear_cows(self) -> List[Tuple["BlockId", "BlockId"]]:
        """Clears the copy-on-write tracking information and returns the
        current state.

        Returns:
            List[Tuple[BlockId, BlockId]]: A list mapping source block indices
                to destination block indices for the current copy-on-write
                operations.
        """
        pending, self._copy_on_writes = self._copy_on_writes, []
        return pending
class BlockPool:
    """Used to pre-allocate block objects, in order to avoid excessive python
    object allocations/deallocations.

    The pool starts from "pool_size" objects and will increase to more objects
    if necessary.

    Note that multiple block objects may point to the same physical block id,
    which is why this pool is needed, so that it will be easier to support
    prefix caching and more complicated sharing of physical blocks.
    """

    def __init__(self, block_size: int, create_block: "Block.Factory",
                 allocator: "BlockAllocator", pool_size: int):
        self._block_size = block_size
        self._create_block = create_block
        self._allocator = allocator
        self._pool_size = pool_size
        assert self._pool_size >= 0

        self._free_ids: Deque[int] = deque(range(pool_size))
        self._pool = [self._new_placeholder() for _ in range(pool_size)]

    def _new_placeholder(self) -> "Block":
        # Placeholder blocks carry no tokens and no physical block id; they
        # are re-__init__'ed with real contents in init_block().
        return self._create_block(prev_block=None,
                                  token_ids=[],
                                  block_size=self._block_size,
                                  allocator=self._allocator,
                                  block_id=None,
                                  extra_hash=None)

    def increase_pool(self):
        """Doubles the internal pool size."""
        old_size = self._pool_size
        self._pool_size = old_size * 2

        self._free_ids.extend(range(old_size, self._pool_size))
        self._pool.extend(
            self._new_placeholder() for _ in range(old_size, self._pool_size))

    def init_block(self,
                   prev_block: Optional["Block"],
                   token_ids: List[int],
                   block_size: int,
                   physical_block_id: Optional[int],
                   extra_hash: Optional[int] = None) -> "Block":
        """Take a pooled block object and re-initialize it in place."""
        if not self._free_ids:
            self.increase_pool()
            assert self._free_ids

        pool_id = self._free_ids.popleft()

        block = self._pool[pool_id]
        # Re-run __init__ on the recycled object rather than allocating a new
        # one; the allocator is taken from the pooled block itself.
        block.__init__(  # type: ignore[misc]
            prev_block=prev_block,
            token_ids=token_ids,
            block_size=block_size,
            allocator=block._allocator,  # type: ignore[attr-defined]
            block_id=physical_block_id,
            extra_hash=extra_hash)
        block.pool_id = pool_id  # type: ignore[attr-defined]
        return block

    def free_block(self, block: "Block") -> None:
        """Return a block's pool slot so it can be reused (LIFO order)."""
        self._free_ids.appendleft(block.pool_id)  # type: ignore[attr-defined]
class BlockList:
    """This class is an optimization to allow fast-access to physical
    block ids. It maintains a block id list that is updated with the
    block list and this avoids the need to reconstruct the block id
    list on every iteration of the block manager.
    """

    def __init__(self, blocks: List["Block"]):
        self._blocks: List["Block"] = []
        self._block_ids: List[int] = []
        self.update(blocks)

    def _add_block_id(self, block_id: Optional["BlockId"]) -> None:
        assert block_id is not None
        self._block_ids.append(block_id)

    def _update_block_id(self, block_index: int,
                         new_block_id: Optional["BlockId"]) -> None:
        assert new_block_id is not None
        self._block_ids[block_index] = new_block_id

    def update(self, blocks: List["Block"]):
        """Replace the tracked blocks and rebuild the cached id list."""
        self._blocks = blocks

        # Cache block ids for fast query.
        self._block_ids = []
        for blk in blocks:
            self._add_block_id(blk.block_id)

    def append_token_ids(self, block_index: int, token_ids: List[int]) -> None:
        """Append tokens to one block, refreshing its cached id if it moved."""
        target = self._blocks[block_index]
        old_block_id = target.block_id

        target.append_token_ids(token_ids)

        # CoW or promotion may update the internal block_id.
        if target.block_id != old_block_id:
            self._update_block_id(block_index, target.block_id)

    def append(self, new_block: "Block"):
        self._blocks.append(new_block)
        self._add_block_id(new_block.block_id)

    def __len__(self) -> int:
        return len(self._blocks)

    def __getitem__(self, block_index: int) -> "Block":
        return self._blocks[block_index]

    def __setitem__(self, block_index: int, new_block: "Block") -> None:
        self._blocks[block_index] = new_block
        self._update_block_id(block_index, new_block.block_id)

    def reset(self):
        self._blocks = []
        self._block_ids = []

    def list(self) -> List["Block"]:
        return self._blocks

    def ids(self) -> List[int]:
        return self._block_ids
@dataclass
class CacheMetricData:
    """A utility dataclass to maintain cache metrics.

    To avoid overflow, the hit rate is maintained at block granularity: one
    aggregated rate covers n_completed_block x block_size queries, and the
    real-time rate is derived as:
        BS = number of queries per block.
        nB = number of completed blocks.
        HR = hit rate of (nB x BS) queries.
        Q  = current number of queries (< BS).
        H  = current number of hits (< BS).
        hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
    """
    num_completed_blocks: int = 0
    completed_block_cache_hit_rate: float = 0.0
    num_incompleted_block_queries: int = 0
    num_incompleted_block_hit: int = 0
    block_size: int = 1000

    def query(self, hit: bool):
        """Record one cache query; fold into the aggregate every block_size."""
        self.num_incompleted_block_queries += 1
        if hit:
            self.num_incompleted_block_hit += 1

        # Once a block's worth of queries is collected, fold its hit rate
        # into the running average and reset the in-progress counters.
        if self.num_incompleted_block_queries == self.block_size:
            block_hit_rate = (self.num_incompleted_block_hit /
                              self.num_incompleted_block_queries)
            done = self.num_completed_blocks
            self.completed_block_cache_hit_rate = (
                self.completed_block_cache_hit_rate * done +
                block_hit_rate) / (done + 1)
            self.num_completed_blocks = done + 1
            self.num_incompleted_block_queries = 0
            self.num_incompleted_block_hit = 0

    def get_hit_rate(self):
        """Return the blended hit rate over completed and in-progress blocks."""
        partial = self.num_incompleted_block_queries / self.block_size
        total_blocks = self.num_completed_blocks + partial
        if total_blocks == 0:
            return 0.0

        weighted_hits = 0.0
        if self.num_completed_blocks > 0:
            weighted_hits += (self.completed_block_cache_hit_rate *
                              self.num_completed_blocks)
        if self.num_incompleted_block_queries > 0:
            partial_rate = (self.num_incompleted_block_hit /
                            self.num_incompleted_block_queries)
            weighted_hits += partial_rate * partial
        return weighted_hits / total_blocks
def get_all_blocks_recursively(last_block: "Block") -> List["Block"]:
    """Retrieves all the blocks in a sequence starting from the last block.

    Walks the ``prev_block`` chain backwards from the given last block and
    returns every block in the sequence, ordered from first to last.

    The traversal is iterative (the name is kept for backward compatibility
    with the previous recursive implementation): recursing once per block
    would hit Python's default recursion limit (~1000 frames) for long
    sequences, raising RecursionError.

    Args:
        last_block (Block): The last block in the sequence.

    Returns:
        List[Block]: A list of all the blocks in the sequence, in the order
            they appear.
    """
    chain: List["Block"] = []
    node: Optional["Block"] = last_block
    while node is not None:
        chain.append(node)
        node = node.prev_block
    # Collected last-to-first; callers expect first-to-last order.
    chain.reverse()
    return chain
@ -1,439 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Dict, FrozenSet, List, Optional, Tuple
|
||||
|
||||
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
|
||||
DeviceAwareBlockAllocator)
|
||||
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
|
||||
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
|
||||
from vllm.utils import Device
|
||||
|
||||
|
||||
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
    """A block allocator that can allocate blocks on both CPU and GPU memory.

    This class implements the `DeviceAwareBlockAllocator` interface and provides
    functionality for allocating and managing blocks of memory on both CPU and
    GPU devices.

    The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU
    blocks, and allows for allocation, deallocation, forking, and swapping of
    blocks across these memory pools.
    """

    @staticmethod
    def create(
        allocator_type: str,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        block_size: int,
    ) -> DeviceAwareBlockAllocator:
        """Creates a CpuGpuBlockAllocator instance with the specified
        configuration.

        This static method creates and returns a CpuGpuBlockAllocator instance
        based on the provided parameters. It initializes the CPU and GPU block
        allocators with the specified number of blocks, block size, and
        allocator type.

        Args:
            allocator_type (str): The type of block allocator to use for CPU
                and GPU blocks. Currently supported values are "naive" and
                "prefix_caching".
            num_gpu_blocks (int): The number of blocks to allocate for GPU
                memory.
            num_cpu_blocks (int): The number of blocks to allocate for CPU
                memory.
            block_size (int): The size of each block in number of tokens.

        Returns:
            DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the
                specified configuration.

        Notes:
            - The block IDs are assigned contiguously, with GPU block IDs coming
                before CPU block IDs.
        """
        reserved_blocks = 0
        block_ids = list(
            range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
        num_gpu_blocks -= reserved_blocks
        gpu_block_ids = block_ids[:num_gpu_blocks]
        cpu_block_ids = block_ids[num_gpu_blocks:]

        if allocator_type == "naive":
            gpu_allocator: BlockAllocator = NaiveBlockAllocator(
                create_block=NaiveBlock,  # type: ignore
                num_blocks=num_gpu_blocks,
                block_size=block_size,
                block_ids=gpu_block_ids,
            )

            cpu_allocator: BlockAllocator = NaiveBlockAllocator(
                create_block=NaiveBlock,  # type: ignore
                num_blocks=num_cpu_blocks,
                block_size=block_size,
                block_ids=cpu_block_ids,
            )
        elif allocator_type == "prefix_caching":
            gpu_allocator = PrefixCachingBlockAllocator(
                num_blocks=num_gpu_blocks,
                block_size=block_size,
                block_ids=gpu_block_ids,
            )

            cpu_allocator = PrefixCachingBlockAllocator(
                num_blocks=num_cpu_blocks,
                block_size=block_size,
                block_ids=cpu_block_ids,
            )
        else:
            raise ValueError(f"Unknown allocator type {allocator_type=}")

        return CpuGpuBlockAllocator(
            cpu_block_allocator=cpu_allocator,
            gpu_block_allocator=gpu_allocator,
        )

    def __init__(self, cpu_block_allocator: BlockAllocator,
                 gpu_block_allocator: BlockAllocator):
        assert not (
            cpu_block_allocator.all_block_ids
            & gpu_block_allocator.all_block_ids
        ), "cpu and gpu block allocators can't have intersection of block ids"

        self._allocators = {
            Device.CPU: cpu_block_allocator,
            Device.GPU: gpu_block_allocator,
        }

        # Accumulates src -> dst ids across swaps until get_and_reset_swaps().
        self._swap_mapping: Dict[int, int] = {}
        self._null_block: Optional[Block] = None

        # Reverse index: physical block id -> owning allocator.
        self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
        for _, allocator in self._allocators.items():
            for block_id in allocator.all_block_ids:
                self._block_ids_to_allocator[block_id] = allocator

    def allocate_or_get_null_block(self) -> Block:
        """Lazily allocate the single shared null block (see NullBlock)."""
        if self._null_block is None:
            self._null_block = NullBlock(
                self.allocate_mutable_block(None, Device.GPU))
        return self._null_block

    def allocate_mutable_block(self,
                               prev_block: Optional[Block],
                               device: Device,
                               extra_hash: Optional[int] = None) -> Block:
        """Allocates a new mutable block on the specified device.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
                Used for prefix hashing.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            Block: The newly allocated mutable block.
        """
        return self._allocators[device].allocate_mutable_block(
            prev_block, extra_hash=extra_hash)

    def allocate_immutable_blocks(
            self,
            prev_block: Optional[Block],
            block_token_ids: List[List[int]],
            device: Device,
            extra_hash: Optional[int] = None) -> List[Block]:
        """Allocates a new group of immutable blocks with the provided block
        token IDs on the specified device.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
                Used for prefix hashing.
            block_token_ids (List[int]): The list of block token IDs to be
                stored in the new blocks.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            List[Block]: The newly allocated list of immutable blocks
                containing the provided block token IDs.
        """
        return self._allocators[device].allocate_immutable_blocks(
            prev_block, block_token_ids, extra_hash=extra_hash)

    def allocate_immutable_block(self,
                                 prev_block: Optional[Block],
                                 token_ids: List[int],
                                 device: Device,
                                 extra_hash: Optional[int] = None) -> Block:
        """Allocates a new immutable block with the provided token IDs on the
        specified device.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
                Used for prefix hashing.
            token_ids (List[int]): The list of token IDs to be stored in the new
                block.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            Block: The newly allocated immutable block containing the provided
                token IDs.
        """
        return self._allocators[device].allocate_immutable_block(
            prev_block, token_ids, extra_hash=extra_hash)

    def free(self, block: Block) -> None:
        """Frees the memory occupied by the given block.

        Args:
            block (Block): The block to be freed.
        """
        # Null block should never be freed.
        if isinstance(block, NullBlock):
            return
        block_id = block.block_id
        assert block_id is not None
        allocator = self._block_ids_to_allocator[block_id]
        allocator.free(block)

    def fork(self, last_block: Block) -> List[Block]:
        """Creates a new sequence of blocks that shares the same underlying
        memory as the original sequence.

        Args:
            last_block (Block): The last block in the original sequence.

        Returns:
            List[Block]: A new list of blocks that shares the same memory as the
                original sequence.
        """
        # Do not attempt to fork the null block.
        assert not isinstance(last_block, NullBlock)
        block_id = last_block.block_id
        assert block_id is not None
        allocator = self._block_ids_to_allocator[block_id]
        return allocator.fork(last_block)

    def get_num_free_blocks(self, device: Device) -> int:
        """Returns the number of free blocks available on the specified device.

        Args:
            device (Device): The device for which to query the number of free
                blocks. AssertionError is raised if None is passed.

        Returns:
            int: The number of free blocks available on the specified device.
        """
        return self._allocators[device].get_num_free_blocks()

    def get_num_total_blocks(self, device: Device) -> int:
        return self._allocators[device].get_num_total_blocks()

    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
        """Returns the zero-offset block id on certain device given the
        absolute block id.

        Args:
            device (Device): The device for which to query relative block id.
            absolute_id (int): The absolute block id for the block in
                whole allocator.

        Returns:
            int: The zero-offset block id on certain device.
        """
        return self._allocators[device].get_physical_block_id(absolute_id)

    def swap(self, blocks: List[Block], src_device: Device,
             dst_device: Device) -> Dict[int, int]:
        """Execute the swap for the given blocks from source_device
        on to dest_device, save the current swap mapping and append
        them to the accumulated `self._swap_mapping` for each
        scheduling move.

        Args:
            blocks: List of blocks to be swapped.
            src_device (Device): Device to swap the 'blocks' from.
            dst_device (Device): Device to swap the 'blocks' to.

        Returns:
            Dict[int, int]: Swap mapping from source_device
                on to dest_device.
        """
        # Capture ids before/after: swap_in reassigns the blocks' ids.
        src_block_ids = [block.block_id for block in blocks]
        self._allocators[src_device].swap_out(blocks)
        self._allocators[dst_device].swap_in(blocks)
        dst_block_ids = [block.block_id for block in blocks]

        current_swap_mapping: Dict[int, int] = {}
        for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids):
            if src_block_id is not None and dst_block_id is not None:
                self._swap_mapping[src_block_id] = dst_block_id
                current_swap_mapping[src_block_id] = dst_block_id
        return current_swap_mapping

    def get_num_full_blocks_touched(self, blocks: List[Block],
                                    device: Device) -> int:
        """Returns the number of full blocks that will be touched by
        swapping in/out the given blocks on to the 'device'.

        Args:
            blocks: List of blocks to be swapped.
            device (Device): Device to swap the 'blocks' on.

        Returns:
            int: the number of full blocks that will be touched by
                swapping in/out the given blocks on to the 'device'.
                Non full blocks are ignored when deciding the number
                of blocks to touch.
        """
        return self._allocators[device].get_num_full_blocks_touched(blocks)

    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Clears the copy-on-write (CoW) state and returns the mapping of
        source to destination block IDs.

        Returns:
            List[Tuple[int, int]]: A list mapping source block IDs to
                destination block IDs.
        """
        # CoW only supported on GPU.
        device = Device.GPU
        return self._allocators[device].clear_copy_on_writes()

    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Mark blocks as accessed, only use for prefix caching."""
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].mark_blocks_as_accessed(block_ids, now)

    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark blocks as computed, only use for prefix caching."""
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].mark_blocks_as_computed(block_ids)

    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].get_common_computed_block_ids(
            computed_seq_block_ids)

    @property
    def all_block_ids(self) -> FrozenSet[int]:
        return frozenset(self._block_ids_to_allocator.keys())

    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        assert device in self._allocators
        return self._allocators[device].get_prefix_cache_hit_rate()

    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        """Reset prefix cache for specified or all devices."""
        # Compare against None explicitly: the sentinel for "all devices" is
        # None, and a truthiness test (`if device:`) would silently reset all
        # devices for any falsy Device value (e.g. an enum member backed by 0).
        if device is not None:
            return self._allocators[device].reset_prefix_cache()
        success = True
        for allocator in self._allocators.values():
            success = success and allocator.reset_prefix_cache()
        return success

    def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
        """Returns and clears the mapping of source to destination block IDs.
        Will be called after every swapping operations for now, and after every
        schedule when BlockManagerV2 become default. Currently not useful.

        Returns:
            List[Tuple[int, int]]: A mapping of source to destination block IDs.
        """
        mapping = self._swap_mapping.copy()
        self._swap_mapping.clear()
        return list(mapping.items())

    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
        device: Device = Device.GPU,
    ) -> List[int]:
        return self._allocators[device].find_cached_blocks_prefix(block_hashes)
class NullBlock(Block):
    """
    Null blocks are used as a placeholders for KV cache blocks that have
    been dropped due to sliding window.
    This implementation just wraps an ordinary block and prevents it from
    being modified. It also allows for testing if a block is NullBlock
    via isinstance().

    All read accessors delegate to the wrapped (proxy) block; all mutating
    entry points raise ValueError.
    """

    def __init__(self, proxy: Block):
        super().__init__()
        # The real block whose state this placeholder exposes read-only.
        self._proxy = proxy

    def append_token_ids(self, token_ids: List[BlockId]):
        # Null blocks are immutable placeholders.
        raise ValueError("null block should not be modified")

    @property
    def block_id(self):
        return self._proxy.block_id

    @block_id.setter
    def block_id(self, value: Optional[BlockId]):
        # Re-binding the physical id would defeat the placeholder semantics.
        raise ValueError("null block should not be modified")

    @property
    def token_ids(self) -> List[BlockId]:
        return self._proxy.token_ids

    @property
    def num_tokens_total(self) -> int:
        raise NotImplementedError(
            "num_tokens_total is not used for null block")

    @property
    def num_empty_slots(self) -> BlockId:
        return self._proxy.num_empty_slots

    @property
    def is_full(self):
        return self._proxy.is_full

    @property
    def prev_block(self):
        return self._proxy.prev_block

    @property
    def extra_hash(self):
        # Null blocks never participate in prefix-cache hashing.
        return None

    @property
    def computed(self):
        return self._proxy.computed

    @computed.setter
    def computed(self, value):
        # NOTE: unlike the other setters, `computed` writes through to the
        # proxy rather than raising.
        self._proxy.computed = value

    @property
    def last_accessed(self) -> float:
        return self._proxy.last_accessed

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        # Access-time bookkeeping also writes through to the proxy.
        self._proxy.last_accessed = last_accessed_ts

    @property
    def content_hash(self):
        return self._proxy.content_hash
@ -1,319 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
|
||||
|
||||
from vllm.utils import Device
|
||||
|
||||
BlockId = int
|
||||
|
||||
|
||||
class Block(ABC):
    """Abstract interface for a single KV-cache block.

    A block stores a list of token ids, may reference the previous block in
    its sequence via ``prev_block``, and may carry a physical ``block_id``
    assigned by an allocator (``None`` while unallocated).
    """

    @abstractmethod
    def append_token_ids(self, token_ids: List[int]) -> None:
        """Append the given token ids to this block."""
        pass

    @property
    @abstractmethod
    def block_id(self) -> Optional[int]:
        """Physical block id, or None if not yet allocated."""
        pass

    @block_id.setter
    @abstractmethod
    def block_id(self, value: Optional[int]) -> None:
        """NOTE: Do not use this API outside Block."""
        self._block_id = value

    @property
    @abstractmethod
    def token_ids(self) -> List[int]:
        """Token ids currently stored in this block."""
        pass

    @property
    @abstractmethod
    def num_tokens_total(self) -> int:
        """The number of tokens till the current block (inclusive)
        """
        pass

    @property
    @abstractmethod
    def num_empty_slots(self) -> int:
        """Number of token slots still free in this block."""
        pass

    @property
    @abstractmethod
    def is_full(self) -> bool:
        """Whether the block has no empty slots left."""
        pass

    @property
    @abstractmethod
    def prev_block(self) -> Optional["Block"]:
        """The preceding block in the sequence, or None for the first block."""
        pass

    @property
    @abstractmethod
    def extra_hash(self) -> Optional[int]:
        """Hash of additional factors (e.g. adapters) influencing the block
        hash; None when unused."""
        return None

    @property
    @abstractmethod
    def computed(self) -> bool:
        raise NotImplementedError

    @computed.setter
    @abstractmethod
    def computed(self, value) -> bool:
        """Should be only used by PrefixCachingAllocator"""
        raise NotImplementedError

    @property
    @abstractmethod
    def last_accessed(self) -> float:
        raise NotImplementedError

    @last_accessed.setter
    @abstractmethod
    def last_accessed(self, last_accessed_ts: float):
        raise NotImplementedError

    class Factory(Protocol):
        """Callable protocol for constructing concrete Block instances."""

        @abstractmethod
        def __call__(
            self,
            prev_block: Optional["Block"],
            token_ids: List[int],
            block_size: int,
            allocator: "BlockAllocator",
            block_id: Optional[int] = None,
            computed: bool = False,
            extra_hash: Optional[int] = None,
        ) -> "Block":
            pass

    @property
    @abstractmethod
    def content_hash(self) -> Optional[int]:
        """Return the content-based hash of the current block, or None if it is
        not yet defined or not supported.

        For the content-based hash to be defined, the current block must be
        full.
        """
        return None
class BlockAllocator(ABC):
    """Interface for allocating and tracking KV-cache blocks on one device.

    Implementations hand out :class:`Block` objects backed by physical block
    ids and manage freeing, forking (sharing), swapping, copy-on-write and —
    optionally — prefix-cache bookkeeping.
    """

    @abstractmethod
    def allocate_mutable_block(self, prev_block: Optional[Block],
                               extra_hash: Optional[int]) -> Block:
        """Allocate a writable block chained after ``prev_block``."""
        pass

    @abstractmethod
    def allocate_immutable_block(self, prev_block: Optional[Block],
                                 token_ids: List[int],
                                 extra_hash: Optional[int]) -> Block:
        """Allocate a block pre-filled with ``token_ids``."""
        pass

    @abstractmethod
    def allocate_immutable_blocks(self, prev_block: Optional[Block],
                                  block_token_ids: List[List[int]],
                                  extra_hash: Optional[int]) -> List[Block]:
        """Allocate a chain of immutable blocks, one per token-id list."""
        pass

    @abstractmethod
    def free(self, block: Block) -> None:
        """Release ``block`` back to the allocator."""
        pass

    @abstractmethod
    def fork(self, last_block: Block) -> List[Block]:
        """Create a new block chain sharing memory with the chain ending at
        ``last_block``."""
        pass

    @abstractmethod
    def get_num_total_blocks(self) -> int:
        """Total number of blocks managed by this allocator."""
        pass

    @abstractmethod
    def get_num_free_blocks(self) -> int:
        """Number of currently unallocated blocks."""
        pass

    @abstractmethod
    def get_physical_block_id(self, absolute_id: int) -> int:
        """Translate an absolute block id into this allocator's zero-offset
        id."""
        pass

    @abstractmethod
    def swap_out(self, blocks: List[Block]) -> None:
        """Release blocks as part of swapping them out of this device."""
        pass

    @abstractmethod
    def swap_in(self, blocks: List[Block]) -> None:
        """Re-allocate blocks as part of swapping them into this device."""
        pass

    @property
    @abstractmethod
    def all_block_ids(self) -> FrozenSet[int]:
        """All block ids (free or allocated) owned by this allocator."""
        pass

    @abstractmethod
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Return and clear pending (src, dst) copy-on-write mappings."""
        pass

    @abstractmethod
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Record access time ``now`` for the given blocks."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark the given blocks as having computed KV entries."""
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Return block ids already computed for all given sequences."""
        pass

    @abstractmethod
    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
        """NOTE: This should not be used besides Block"""
        pass

    @abstractmethod
    def promote_to_immutable_block(self, block: Block) -> BlockId:
        """NOTE: This should not be used besides Block"""
        pass

    @abstractmethod
    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
        """Number of full blocks that swapping ``blocks`` would touch."""
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache."""
        pass

    class NoFreeBlocksError(ValueError):
        """Raised by allocation paths when no free blocks remain."""
        pass

    @abstractmethod
    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
    ) -> List[int]:
        """Return the longest cached prefix of ``block_hashes``."""
        pass
|
||||
|
||||
|
||||
class DeviceAwareBlockAllocator(ABC):
    """Interface for block allocation across multiple devices.

    Mirrors :class:`BlockAllocator` but routes each operation to a specific
    :class:`Device` (e.g. GPU or CPU) and supports swapping blocks between
    devices.
    """

    @abstractmethod
    def allocate_mutable_block(self,
                               prev_block: Optional[Block],
                               device: Device,
                               extra_hash: Optional[int] = None) -> Block:
        """Allocate a writable block on ``device`` after ``prev_block``."""
        pass

    @abstractmethod
    def allocate_immutable_block(self,
                                 prev_block: Optional[Block],
                                 token_ids: List[int],
                                 device: Device,
                                 extra_hash: Optional[int] = None) -> Block:
        """Allocate a block on ``device`` pre-filled with ``token_ids``."""
        pass

    @abstractmethod
    def allocate_immutable_blocks(
        self,
        prev_block: Optional[Block],
        block_token_ids: List[List[int]],
        device: Device,
        extra_hash: Optional[int] = None,
    ) -> List[Block]:
        """Allocate a chain of immutable blocks on ``device``."""
        pass

    @abstractmethod
    def get_num_free_blocks(self, device: Device) -> int:
        """Number of unallocated blocks on ``device``."""
        pass

    @abstractmethod
    def get_num_total_blocks(self, device: Device) -> int:
        """Total number of blocks managed on ``device``."""
        pass

    @abstractmethod
    def free(self, block: Block) -> None:
        """Release ``block`` back to its owning allocator."""
        pass

    @abstractmethod
    def fork(self, last_block: Block) -> List[Block]:
        """Create a block chain sharing memory with the chain ending at
        ``last_block``."""
        pass

    @property
    @abstractmethod
    def all_block_ids(self) -> FrozenSet[int]:
        """All block ids across all devices."""
        pass

    @abstractmethod
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Return and clear pending (src, dst) copy-on-write mappings."""
        pass

    @abstractmethod
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Record access time ``now`` for the given blocks."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark the given blocks as having computed KV entries."""
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Return block ids already computed for all given sequences."""
        pass

    @abstractmethod
    def get_num_full_blocks_touched(self, blocks: List[Block],
                                    device: Device) -> int:
        """Number of full blocks on ``device`` that swapping would touch."""
        pass

    @abstractmethod
    def swap(self, blocks: List[Block], src_device: Device,
             dst_device: Device) -> Dict[int, int]:
        """Move blocks between devices; returns the src->dst block-id map."""
        pass

    @abstractmethod
    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
        """Translate an absolute block id into ``device``'s zero-offset id."""
        pass

    @abstractmethod
    def allocate_or_get_null_block(self) -> Block:
        """
        Null blocks are used as a placeholders for KV cache blocks that have
        been dropped due to sliding window.
        There is at most one null block per allocator.
        """
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        """Reset prefix cache."""
        pass

    @abstractmethod
    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
        device: Device = Device.GPU,
    ) -> List[int]:
        """Return the longest cached prefix of ``block_hashes`` on
        ``device``."""
        pass
|
||||
@ -1,466 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections import deque
|
||||
from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
|
||||
get_all_blocks_recursively)
|
||||
from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
|
||||
|
||||
Refcount = int
|
||||
|
||||
|
||||
class NaiveBlockAllocator(BlockAllocator):
    """A simple block allocator that manages blocks of memory without prefix
    caching.

    Args:
        create_block (Block.Factory): A factory function for creating new
            blocks. This is used when a NaiveBlockAllocator is composed within
            a prefix caching allocator -- the naive block allocator must
            construct prefix caching blocks (but shouldn't know anything else
            about them).
        num_blocks (int): The total number of blocks to manage.
        block_size (int): The size of each block in tokens.
        block_ids (Optional[Iterable[int]], optional): An optional iterable of
            block IDs. If not provided, block IDs will be assigned sequentially
            from 0 to num_blocks - 1.
    """

    def __init__(
        self,
        create_block: Block.Factory,
        num_blocks: int,
        block_size: int,
        block_ids: Optional[Iterable[int]] = None,
        block_ids_note: None = None,  # noqa: F841  -- see NOTE below
    ):
        # NOTE(review): signature kept verbatim below; placeholder above is
        # illustrative only.
        pass
|
||||
|
||||
|
||||
class NaiveBlock(Block):
    """An implementation of the Block class that does not support prefix
    caching.

    The NaiveBlock class represents a block of token IDs with a fixed size. It
    provides methods for appending token IDs to the block and manages copy-on
    -write operations when necessary.

    Args:
        prev_block (Block): The previous block in the sequence.
        token_ids (List[int]): The initial token IDs to be stored in the block.
        block_size (int): The maximum number of token IDs that can be stored in
            the block.
        allocator (BlockAllocator): The block allocator associated with this
            block.
        block_id (Optional[int], optional): The physical block index
            of this block. Defaults to None, which means no allocation has been
            made.
        _cow_target (Optional[Block], optional): The copy-on-write target block.
            If not provided, it defaults to self.
    """

    def __init__(self,
                 prev_block: Optional[Block],
                 token_ids: List[int],
                 block_size: int,
                 allocator: BlockAllocator,
                 block_id: Optional[int] = None,
                 _cow_target: Optional[Block] = None,
                 extra_hash: Optional[int] = None):
        self._token_ids: List[int] = []
        self._block_size = block_size
        self._prev_block = prev_block
        self._block_id = block_id
        self._allocator = allocator
        self._cow_target = _cow_target if _cow_target is not None else self

        # Seed the block with the initial tokens; no CoW can be needed yet.
        self._append_token_ids_no_cow(token_ids)

    def append_token_ids(self, token_ids: List[int]) -> None:
        """Appends the given token IDs to the block and performs a
        copy-on-write if necessary.

        Args:
            token_ids (Optional[List[int]]): The token IDs to be appended
                to the block.
        """
        self._append_token_ids_no_cow(token_ids)

        # Only a physically-backed block can require copy-on-write; the
        # allocator may hand back a new physical block id.
        if self._block_id is not None:
            self._block_id = (self._allocator.cow_block_if_not_appendable(
                self._cow_target))

    def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
        """Appends the given token IDs to the block

        Args:
            token_ids (List[int]): The token IDs to be appended to the block.
        """
        if len(token_ids) == 0:
            return

        assert len(token_ids) <= self.num_empty_slots

        self._token_ids.extend(token_ids)

    @property
    def computed(self) -> bool:
        # "computed" tracking is a prefix-caching feature; unsupported here.
        raise NotImplementedError

    @computed.setter
    def computed(self, value) -> None:
        raise NotImplementedError

    @property
    def last_accessed(self) -> float:
        # Access-time tracking is a prefix-caching feature; unsupported here.
        raise NotImplementedError

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        raise NotImplementedError

    @property
    def block_id(self) -> Optional[int]:
        """Physical block index, or None if no allocation has been made."""
        return self._block_id

    @block_id.setter
    def block_id(self, value: Optional[int]) -> None:
        self._block_id = value

    @property
    def is_full(self) -> bool:
        """True when the block has no empty token slots left."""
        return self.num_empty_slots == 0

    @property
    def num_empty_slots(self) -> int:
        """Number of token slots still available in this block."""
        return self._block_size - len(self.token_ids)

    @property
    def token_ids(self) -> List[int]:
        """Token IDs currently stored in this block."""
        return self._token_ids

    @property
    def num_tokens_total(self) -> int:
        raise NotImplementedError(
            "num_tokens_total is not used for naive block")

    @property
    def block_size(self) -> int:
        """Maximum number of tokens this block can hold."""
        return self._block_size

    @property
    def prev_block(self) -> Optional["Block"]:
        """Previous block in the sequence, or None for the first block."""
        return self._prev_block

    @property
    def extra_hash(self):
        # Extra-hash factors are only meaningful with prefix caching.
        return None

    @property
    def content_hash(self) -> Optional[int]:
        # Content hashing is only supported by prefix-caching blocks.
        return None
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,28 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Block manager utils."""
|
||||
from vllm.sequence import SequenceGroup
|
||||
from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
|
||||
STR_NOT_IMPL_ENC_DEC_SWA)
|
||||
|
||||
|
||||
def check_no_caching_or_swa_for_blockmgr_encdec(
        block_mgr, seq_group: SequenceGroup) -> None:
    """Reject block-manager features unsupported for encoder/decoder models.

    Prefix caching and sliding-window attention (SWA) are currently
    unsupported *specifically* for encoder/decoder models; this guard raises
    ``NotImplementedError`` when ``seq_group`` is an encoder/decoder request
    and ``block_mgr`` has either feature enabled.

    Arguments:

    * block_mgr: BlockSpaceManager instance
    * seq_group: SequenceGroup passed to block_mgr
    """
    # Decoder-only requests are unaffected; nothing to validate.
    if not seq_group.is_encoder_decoder():
        return

    if block_mgr.max_block_sliding_window is not None:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA)

    if block_mgr.enable_caching:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE)
|
||||
@ -1,523 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""A block manager that manages token blocks."""
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import Tuple
|
||||
|
||||
from vllm.core.block.block_table import BlockTable
|
||||
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
|
||||
from vllm.core.block.interfaces import Block
|
||||
from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
|
||||
LastAccessBlocksTracker)
|
||||
from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
|
||||
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
|
||||
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
|
||||
from vllm.utils import Device
|
||||
|
||||
SeqId = int
|
||||
EncoderSeqId = str
|
||||
|
||||
|
||||
class SelfAttnBlockSpaceManager(BlockSpaceManager):
|
||||
"""BlockSpaceManager which manages the allocation of KV cache.
|
||||
|
||||
It owns responsibility for allocation, swapping, allocating memory for
|
||||
autoregressively-generated tokens, and other advanced features such as
|
||||
prefix caching, forking/copy-on-write, and sliding-window memory allocation.
|
||||
|
||||
This class implements the design described in
|
||||
https://github.com/vllm-project/vllm/pull/3492.
|
||||
|
||||
Lookahead slots
|
||||
The block manager has the notion of a "lookahead slot". These are slots
|
||||
in the KV cache that are allocated for a sequence. Unlike the other
|
||||
allocated slots, the content of these slots is undefined -- the worker
|
||||
may use the memory allocations in any way.
|
||||
|
||||
In practice, a worker could use these lookahead slots to run multiple
|
||||
forward passes for a single scheduler invocation. Each successive
|
||||
forward pass would write KV activations to the corresponding lookahead
|
||||
slot. This allows low inter-token latency use-cases, where the overhead
|
||||
of continuous batching scheduling is amortized over >1 generated tokens.
|
||||
|
||||
Speculative decoding uses lookahead slots to store KV activations of
|
||||
proposal tokens.
|
||||
|
||||
See https://github.com/vllm-project/vllm/pull/3250 for more information
|
||||
on lookahead scheduling.
|
||||
|
||||
Args:
|
||||
block_size (int): The size of each memory block.
|
||||
num_gpu_blocks (int): The number of memory blocks allocated on GPU.
|
||||
num_cpu_blocks (int): The number of memory blocks allocated on CPU.
|
||||
watermark (float, optional): The threshold used for memory swapping.
|
||||
Defaults to 0.01.
|
||||
sliding_window (Optional[int], optional): The size of the sliding
|
||||
window. Defaults to None.
|
||||
enable_caching (bool, optional): Flag indicating whether caching is
|
||||
enabled. Defaults to False.
|
||||
"""
|
||||
|
||||
    def __init__(
        self,
        block_size: int,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        watermark: float = 0.01,
        sliding_window: Optional[int] = None,
        enable_caching: bool = False,
    ) -> None:
        self.block_size = block_size
        self.num_total_gpu_blocks = num_gpu_blocks
        self.num_total_cpu_blocks = num_cpu_blocks

        self.sliding_window = sliding_window
        # max_block_sliding_window is the max number of blocks that need to be
        # allocated
        self.max_block_sliding_window = None
        if sliding_window is not None:
            # +1 here because // rounds down
            num_blocks = sliding_window // block_size + 1
            # +1 here because the last block may not be full,
            # and so the sequence stretches one more block at the beginning
            # For example, if sliding_window is 3 and block_size is 4,
            # we may need 2 blocks when the second block only holds 1 token.
            self.max_block_sliding_window = num_blocks + 1

        self.watermark = watermark
        assert watermark >= 0.0

        self.enable_caching = enable_caching

        # Blocks kept free as headroom to avoid frequent cache eviction.
        self.watermark_blocks = int(watermark * num_gpu_blocks)

        self.block_allocator = CpuGpuBlockAllocator.create(
            allocator_type="prefix_caching" if enable_caching else "naive",
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=num_cpu_blocks,
            block_size=block_size,
        )

        # Per-sequence decoder block tables and per-request encoder
        # (cross-attention) block tables.
        self.block_tables: Dict[SeqId, BlockTable] = {}
        self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {}

        self._computed_blocks_tracker = ComputedBlocksTracker(
            self.block_allocator, self.block_size, self.enable_caching)
        self._last_access_blocks_tracker = LastAccessBlocksTracker(
            self.block_allocator)
|
||||
|
||||
    def can_allocate(self,
                     seq_group: SequenceGroup,
                     num_lookahead_slots: int = 0) -> AllocStatus:
        """Return whether ``seq_group`` can be allocated on GPU now (OK),
        later (LATER), or never (NEVER, i.e. it cannot fit at all)."""
        # FIXME(woosuk): Here we assume that all sequences in the group share
        # the same prompt. This may not be true for preempted sequences.

        check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group)

        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
        num_required_blocks = BlockTable.get_num_required_blocks(
            seq.get_token_ids(),
            block_size=self.block_size,
            num_lookahead_slots=num_lookahead_slots,
        )

        if seq_group.is_encoder_decoder():
            encoder_seq = seq_group.get_encoder_seq()
            assert encoder_seq is not None
            num_required_blocks += BlockTable.get_num_required_blocks(
                encoder_seq.get_token_ids(),
                block_size=self.block_size,
            )

        if self.max_block_sliding_window is not None:
            # A sliding-window sequence never needs more than the window's
            # worth of blocks.
            num_required_blocks = min(num_required_blocks,
                                      self.max_block_sliding_window)

        num_free_gpu_blocks = self.block_allocator.get_num_free_blocks(
            device=Device.GPU)

        # Use watermark to avoid frequent cache eviction.
        if (self.num_total_gpu_blocks - num_required_blocks
                < self.watermark_blocks):
            return AllocStatus.NEVER
        if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
            return AllocStatus.OK
        else:
            return AllocStatus.LATER
|
||||
|
||||
    def _allocate_sequence(self, seq: Sequence) -> BlockTable:
        """Create a block table and allocate blocks for ``seq``'s current
        tokens (no blocks are allocated for an empty sequence)."""
        block_table = BlockTable(
            block_size=self.block_size,
            block_allocator=self.block_allocator,
            max_block_sliding_window=self.max_block_sliding_window,
        )
        if seq.get_token_ids():
            # NOTE: If there are any factors affecting the block besides
            # token_ids, they should be added as input to extra_hash.
            extra_hash = seq.extra_hash()

            # Add blocks to the block table only if the sequence is non empty.
            block_table.allocate(token_ids=seq.get_token_ids(),
                                 extra_hash=extra_hash)

        return block_table
|
||||
|
||||
    def allocate(self, seq_group: SequenceGroup) -> None:
        """Allocate block tables for all waiting sequences in ``seq_group``,
        plus a cross-attention table for encoder/decoder requests."""

        # Allocate self-attention block tables for decoder sequences
        waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
        assert not (set(seq.seq_id for seq in waiting_seqs)
                    & self.block_tables.keys()), "block table already exists"

        # NOTE: Here we assume that all sequences in the group have the same
        # prompt.
        seq = waiting_seqs[0]
        block_table: BlockTable = self._allocate_sequence(seq)
        self.block_tables[seq.seq_id] = block_table

        # Track seq
        self._last_access_blocks_tracker.add_seq(seq.seq_id)

        # Assign the block table for each sequence.
        for seq in waiting_seqs[1:]:
            # Remaining sequences share the first sequence's blocks via fork.
            self.block_tables[seq.seq_id] = block_table.fork()

            # Track seq
            self._last_access_blocks_tracker.add_seq(seq.seq_id)

        # Allocate cross-attention block table for encoder sequence
        #
        # NOTE: Here we assume that all sequences in the group have the same
        # encoder prompt.
        request_id = seq_group.request_id

        assert (request_id
                not in self.cross_block_tables), \
            "block table already exists"

        check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group)

        if seq_group.is_encoder_decoder():
            encoder_seq = seq_group.get_encoder_seq()
            assert encoder_seq is not None
            block_table = self._allocate_sequence(encoder_seq)
            self.cross_block_tables[request_id] = block_table
|
||||
|
||||
    def can_append_slots(self, seq_group: SequenceGroup,
                         num_lookahead_slots: int) -> bool:
        """Determine if there is enough space in the GPU KV cache to continue
        generation of the specified sequence group.

        We use a worst-case heuristic: assume each touched block will require a
        new allocation (either via CoW or new block). We can append slots if the
        number of touched blocks is less than the number of free blocks.

        "Lookahead slots" are slots that are allocated in addition to the slots
        for known tokens. The contents of the lookahead slots are not defined.
        This is used by speculative decoding when speculating future tokens.
        """

        num_touched_blocks = 0
        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            block_table = self.block_tables[seq.seq_id]

            num_touched_blocks += (
                block_table.get_num_blocks_touched_by_append_slots(
                    token_ids=block_table.get_unseen_token_ids(
                        seq.get_token_ids()),
                    num_lookahead_slots=num_lookahead_slots,
                ))

        num_free_gpu_blocks = self.block_allocator.get_num_free_blocks(
            Device.GPU)
        return num_touched_blocks <= num_free_gpu_blocks
|
||||
|
||||
def append_slots(
|
||||
self,
|
||||
seq: Sequence,
|
||||
num_lookahead_slots: int,
|
||||
) -> List[Tuple[int, int]]:
|
||||
|
||||
block_table = self.block_tables[seq.seq_id]
|
||||
|
||||
block_table.append_token_ids(
|
||||
token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()),
|
||||
num_lookahead_slots=num_lookahead_slots,
|
||||
num_computed_slots=seq.data.get_num_computed_tokens(),
|
||||
extra_hash=seq.extra_hash(),
|
||||
)
|
||||
# Return any new copy-on-writes.
|
||||
new_cows = self.block_allocator.clear_copy_on_writes()
|
||||
return new_cows
|
||||
|
||||
def free(self, seq: Sequence) -> None:
|
||||
seq_id = seq.seq_id
|
||||
|
||||
if seq_id not in self.block_tables:
|
||||
# Already freed or haven't been scheduled yet.
|
||||
return
|
||||
|
||||
# Update seq block ids with the latest access time
|
||||
self._last_access_blocks_tracker.update_seq_blocks_last_access(
|
||||
seq_id, self.block_tables[seq.seq_id].physical_block_ids)
|
||||
|
||||
# Untrack seq
|
||||
self._last_access_blocks_tracker.remove_seq(seq_id)
|
||||
self._computed_blocks_tracker.remove_seq(seq_id)
|
||||
|
||||
# Free table/blocks
|
||||
self.block_tables[seq_id].free()
|
||||
del self.block_tables[seq_id]
|
||||
|
||||
def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
|
||||
seq_id = seq.seq_id
|
||||
self._computed_blocks_tracker.remove_seq(seq_id)
|
||||
|
||||
def free_cross(self, seq_group: SequenceGroup) -> None:
|
||||
request_id = seq_group.request_id
|
||||
if request_id not in self.cross_block_tables:
|
||||
# Already freed or hasn't been scheduled yet.
|
||||
return
|
||||
self.cross_block_tables[request_id].free()
|
||||
del self.cross_block_tables[request_id]
|
||||
|
||||
def get_block_table(self, seq: Sequence) -> List[int]:
|
||||
block_ids = self.block_tables[seq.seq_id].physical_block_ids
|
||||
return block_ids # type: ignore
|
||||
|
||||
def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]:
|
||||
request_id = seq_group.request_id
|
||||
assert request_id in self.cross_block_tables
|
||||
block_ids = self.cross_block_tables[request_id].physical_block_ids
|
||||
assert all(b is not None for b in block_ids)
|
||||
return block_ids # type: ignore
|
||||
|
||||
def access_all_blocks_in_seq(self, seq: Sequence, now: float):
|
||||
if self.enable_caching:
|
||||
# Record the latest access time for the sequence. The actual update
|
||||
# of the block ids is deferred to the sequence free(..) call, since
|
||||
# only during freeing of block ids, the blocks are actually added to
|
||||
# the evictor (which is when the most updated time is required)
|
||||
# (This avoids expensive calls to mark_blocks_as_accessed(..))
|
||||
self._last_access_blocks_tracker.update_last_access(
|
||||
seq.seq_id, now)
|
||||
|
||||
def mark_blocks_as_computed(self, seq_group: SequenceGroup,
|
||||
token_chunk_size: int):
|
||||
# If prefix caching is enabled, mark immutable blocks as computed
|
||||
# right after they have been scheduled (for prefill). This assumes
|
||||
# the scheduler is synchronous so blocks are actually computed when
|
||||
# scheduling the next batch.
|
||||
self.block_allocator.mark_blocks_as_computed([])
|
||||
|
||||
def get_common_computed_block_ids(
|
||||
self, seqs: List[Sequence]) -> GenericSequence[int]:
|
||||
"""Determine which blocks for which we skip prefill.
|
||||
|
||||
With prefix caching we can skip prefill for previously-generated blocks.
|
||||
Currently, the attention implementation only supports skipping cached
|
||||
blocks if they are a contiguous prefix of cached blocks.
|
||||
|
||||
This method determines which blocks can be safely skipped for all
|
||||
sequences in the sequence group.
|
||||
"""
|
||||
computed_seq_block_ids = []
|
||||
for seq in seqs:
|
||||
all_blocks = self.block_tables[seq.seq_id].physical_block_ids
|
||||
num_cached_tokens = (
|
||||
self._computed_blocks_tracker.get_num_cached_tokens(seq))
|
||||
assert num_cached_tokens % self.block_size == 0
|
||||
num_cached_blocks = num_cached_tokens // self.block_size
|
||||
computed_block_ids = all_blocks[:num_cached_blocks]
|
||||
computed_seq_block_ids.append(computed_block_ids)
|
||||
|
||||
# NOTE(sang): This assumes seq_block_ids doesn't contain any None.
|
||||
return self.block_allocator.get_common_computed_block_ids(
|
||||
computed_seq_block_ids) # type: ignore
|
||||
|
||||
def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
|
||||
if parent_seq.seq_id not in self.block_tables:
|
||||
# Parent sequence has either been freed or never existed.
|
||||
return
|
||||
src_block_table = self.block_tables[parent_seq.seq_id]
|
||||
self.block_tables[child_seq.seq_id] = src_block_table.fork()
|
||||
|
||||
# Track child seq
|
||||
self._last_access_blocks_tracker.add_seq(child_seq.seq_id)
|
||||
|
||||
def can_swap_in(self, seq_group: SequenceGroup,
|
||||
num_lookahead_slots: int) -> AllocStatus:
|
||||
"""Returns the AllocStatus for the given sequence_group
|
||||
with num_lookahead_slots.
|
||||
|
||||
Args:
|
||||
seq_group (SequenceGroup): The sequence group to swap in.
|
||||
num_lookahead_slots (int): Number of lookahead slots used in
|
||||
speculative decoding, default to 0.
|
||||
|
||||
Returns:
|
||||
AllocStatus: The AllocStatus for the given sequence group.
|
||||
"""
|
||||
return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED,
|
||||
num_lookahead_slots)
|
||||
|
||||
def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
||||
"""Returns the block id mapping (from CPU to GPU) generated by
|
||||
swapping in the given seq_group with num_lookahead_slots.
|
||||
|
||||
Args:
|
||||
seq_group (SequenceGroup): The sequence group to swap in.
|
||||
|
||||
Returns:
|
||||
List[Tuple[int, int]]: The mapping of swapping block from CPU
|
||||
to GPU.
|
||||
"""
|
||||
physical_block_id_mapping = []
|
||||
for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
|
||||
blocks = self.block_tables[seq.seq_id].blocks
|
||||
if len(blocks) == 0:
|
||||
continue
|
||||
|
||||
seq_swap_mapping = self.block_allocator.swap(blocks=blocks,
|
||||
src_device=Device.CPU,
|
||||
dst_device=Device.GPU)
|
||||
|
||||
# Refresh the block ids of the table (post-swap)
|
||||
self.block_tables[seq.seq_id].update(blocks)
|
||||
|
||||
seq_physical_block_id_mapping = {
|
||||
self.block_allocator.get_physical_block_id(
|
||||
Device.CPU, cpu_block_id):
|
||||
self.block_allocator.get_physical_block_id(
|
||||
Device.GPU, gpu_block_id)
|
||||
for cpu_block_id, gpu_block_id in seq_swap_mapping.items()
|
||||
}
|
||||
|
||||
physical_block_id_mapping.extend(
|
||||
list(seq_physical_block_id_mapping.items()))
|
||||
|
||||
return physical_block_id_mapping
|
||||
|
||||
def can_swap_out(self, seq_group: SequenceGroup) -> bool:
|
||||
"""Returns whether we can swap out the given sequence_group
|
||||
with num_lookahead_slots.
|
||||
|
||||
Args:
|
||||
seq_group (SequenceGroup): The sequence group to swap out.
|
||||
|
||||
Returns:
|
||||
bool: Whether it's possible to swap out current sequence group.
|
||||
"""
|
||||
alloc_status = self._can_swap(seq_group, Device.CPU,
|
||||
SequenceStatus.RUNNING)
|
||||
return alloc_status == AllocStatus.OK
|
||||
|
||||
def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
||||
"""Returns the block id mapping (from GPU to CPU) generated by
|
||||
swapping out the given sequence_group with num_lookahead_slots.
|
||||
|
||||
Args:
|
||||
seq_group (SequenceGroup): The sequence group to swap out.
|
||||
|
||||
Returns:
|
||||
List[Tuple[int, int]]: The mapping of swapping block from
|
||||
GPU to CPU.
|
||||
"""
|
||||
physical_block_id_mapping = []
|
||||
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
|
||||
blocks = self.block_tables[seq.seq_id].blocks
|
||||
if len(blocks) == 0:
|
||||
continue
|
||||
|
||||
seq_swap_mapping = self.block_allocator.swap(blocks=blocks,
|
||||
src_device=Device.GPU,
|
||||
dst_device=Device.CPU)
|
||||
|
||||
# Refresh the block ids of the table (post-swap)
|
||||
self.block_tables[seq.seq_id].update(blocks)
|
||||
|
||||
seq_physical_block_id_mapping = {
|
||||
self.block_allocator.get_physical_block_id(
|
||||
Device.GPU, gpu_block_id):
|
||||
self.block_allocator.get_physical_block_id(
|
||||
Device.CPU, cpu_block_id)
|
||||
for gpu_block_id, cpu_block_id in seq_swap_mapping.items()
|
||||
}
|
||||
|
||||
physical_block_id_mapping.extend(
|
||||
list(seq_physical_block_id_mapping.items()))
|
||||
|
||||
return physical_block_id_mapping
|
||||
|
||||
def get_num_free_gpu_blocks(self) -> int:
|
||||
return self.block_allocator.get_num_free_blocks(Device.GPU)
|
||||
|
||||
def get_num_free_cpu_blocks(self) -> int:
|
||||
return self.block_allocator.get_num_free_blocks(Device.CPU)
|
||||
|
||||
def get_prefix_cache_hit_rate(self, device: Device) -> float:
|
||||
return self.block_allocator.get_prefix_cache_hit_rate(device)
|
||||
|
||||
def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
|
||||
return self.block_allocator.reset_prefix_cache(device)
|
||||
|
||||
def _can_swap(self,
|
||||
seq_group: SequenceGroup,
|
||||
device: Device,
|
||||
status: SequenceStatus,
|
||||
num_lookahead_slots: int = 0) -> AllocStatus:
|
||||
"""Returns the AllocStatus for swapping in/out the given sequence_group
|
||||
on to the 'device'.
|
||||
|
||||
Args:
|
||||
seq_group (SequenceGroup): The sequence group to swap in/out.
|
||||
device (Device): device to swap the 'seq_group' on.
|
||||
status (SequenceStatus): The status of sequence which is needed
|
||||
for action. RUNNING for swap out and SWAPPED for swap in
|
||||
num_lookahead_slots (int): Number of lookahead slots used in
|
||||
speculative decoding, default to 0.
|
||||
|
||||
Returns:
|
||||
AllocStatus: The AllocStatus for swapping in/out the given
|
||||
sequence_group on to the 'device'.
|
||||
"""
|
||||
# First determine the number of blocks that will be touched by this
|
||||
# swap. Then verify if there are available blocks in the device
|
||||
# to perform the swap.
|
||||
num_blocks_touched = 0
|
||||
blocks: List[Block] = []
|
||||
for seq in seq_group.get_seqs(status=status):
|
||||
block_table = self.block_tables[seq.seq_id]
|
||||
if block_table.blocks is not None:
|
||||
# Compute the number blocks to touch for the tokens to be
|
||||
# appended. This does NOT include the full blocks that need
|
||||
# to be touched for the swap.
|
||||
num_blocks_touched += \
|
||||
block_table.get_num_blocks_touched_by_append_slots(
|
||||
block_table.get_unseen_token_ids(seq.get_token_ids()),
|
||||
num_lookahead_slots=num_lookahead_slots)
|
||||
blocks.extend(block_table.blocks)
|
||||
# Compute the number of full blocks to touch and add it to the
|
||||
# existing count of blocks to touch.
|
||||
num_blocks_touched += self.block_allocator.get_num_full_blocks_touched(
|
||||
blocks, device=device)
|
||||
|
||||
watermark_blocks = 0
|
||||
if device == Device.GPU:
|
||||
watermark_blocks = self.watermark_blocks
|
||||
|
||||
if self.block_allocator.get_num_total_blocks(
|
||||
device) < num_blocks_touched:
|
||||
return AllocStatus.NEVER
|
||||
elif self.block_allocator.get_num_free_blocks(
|
||||
device) - num_blocks_touched >= watermark_blocks:
|
||||
return AllocStatus.OK
|
||||
else:
|
||||
return AllocStatus.LATER
|
||||
|
||||
def get_num_cached_tokens(self, seq: Sequence) -> int:
|
||||
"""Get the number of tokens in blocks that are already computed and
|
||||
cached in the block manager for the sequence.
|
||||
"""
|
||||
return self._computed_blocks_tracker.get_num_cached_tokens(seq)
|
||||
@ -1,157 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import enum
|
||||
import heapq
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
|
||||
class EvictionPolicy(enum.Enum):
|
||||
"""Enum for eviction policy used by make_evictor to instantiate the correct
|
||||
Evictor subclass.
|
||||
"""
|
||||
LRU = enum.auto()
|
||||
|
||||
|
||||
class Evictor(ABC):
|
||||
"""The Evictor subclasses should be used by the BlockAllocator class to
|
||||
handle eviction of freed Blocks.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def __contains__(self, block_id: int) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def evict(self) -> Tuple[int, int]:
|
||||
"""Runs the eviction algorithm and returns the evicted block's
|
||||
content hash along with physical block id along with physical block id
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
|
||||
last_accessed: float):
|
||||
"""Adds block to the evictor, making it a candidate for eviction"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def update(self, block_id: int, last_accessed: float):
|
||||
"""Update corresponding block's access time in metadata"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def remove(self, block_id: int):
|
||||
"""Remove a given block id from the cache."""
|
||||
pass
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def num_blocks(self) -> int:
|
||||
pass
|
||||
|
||||
|
||||
class BlockMetaData:
|
||||
"""Data structure for storing key data describe cached block, so that
|
||||
evictor could use to make its decision which one to choose for eviction
|
||||
|
||||
Here we use physical block id as the dict key, as there maybe several
|
||||
blocks with the same content hash, but their physical id is unique.
|
||||
"""
|
||||
|
||||
def __init__(self, content_hash: int, num_hashed_tokens: int,
|
||||
last_accessed: float):
|
||||
self.content_hash = content_hash
|
||||
self.num_hashed_tokens = num_hashed_tokens
|
||||
self.last_accessed = last_accessed
|
||||
|
||||
|
||||
class LRUEvictor(Evictor):
|
||||
"""Evicts in a least-recently-used order using the last_accessed timestamp
|
||||
that's recorded in the Block. If there are multiple blocks with
|
||||
the same last_accessed time, then the one with the largest num_hashed_tokens
|
||||
will be evicted. If two blocks each have the lowest last_accessed time and
|
||||
highest num_hashed_tokens value, then one will be chosen arbitrarily
|
||||
"""
|
||||
|
||||
# CLEANUP_THRESHOLD determines the maximum allowable size of the priority
|
||||
# queue relative to the free table size. When this threshold is exceeded,
|
||||
# a cleanup operation is triggered to reduce memory usage.
|
||||
CLEANUP_THRESHOLD = 50
|
||||
|
||||
def __init__(self):
|
||||
self.free_table: Dict[int, BlockMetaData] = {}
|
||||
self.priority_queue = []
|
||||
|
||||
def __contains__(self, block_id: int) -> bool:
|
||||
return block_id in self.free_table
|
||||
|
||||
def evict(self) -> Tuple[int, int]:
|
||||
if len(self.free_table) == 0:
|
||||
raise ValueError("No usable cache memory left")
|
||||
|
||||
while self.priority_queue:
|
||||
# We do not remove outdated entries from the priority queue at the
|
||||
# time of updating the last_accessed timestamp. Instead, outdated
|
||||
# entries are filtered out here during eviction. Outdated entries
|
||||
# would either not in the free table, or have older last accessed
|
||||
# time.
|
||||
last_accessed, _, block_id, content_hash = heapq.heappop(
|
||||
self.priority_queue)
|
||||
if (block_id in self.free_table and
|
||||
self.free_table[block_id].last_accessed == last_accessed):
|
||||
self.free_table.pop(block_id)
|
||||
return block_id, content_hash
|
||||
|
||||
raise ValueError("No usable cache memory left")
|
||||
|
||||
def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
|
||||
last_accessed: float):
|
||||
self.free_table[block_id] = BlockMetaData(content_hash,
|
||||
num_hashed_tokens,
|
||||
last_accessed)
|
||||
heapq.heappush(
|
||||
self.priority_queue,
|
||||
(last_accessed, -num_hashed_tokens, block_id, content_hash))
|
||||
self._cleanup_if_necessary()
|
||||
|
||||
def update(self, block_id: int, last_accessed: float):
|
||||
self.free_table[block_id].last_accessed = last_accessed
|
||||
|
||||
def _cleanup_if_necessary(self):
|
||||
if len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len(
|
||||
self.free_table):
|
||||
self._cleanup()
|
||||
|
||||
def _cleanup(self):
|
||||
new_priority_queue: List[Tuple[float, int, int, int]] = []
|
||||
|
||||
for block_id, block in self.free_table.items():
|
||||
new_priority_queue.append(
|
||||
(block.last_accessed, -block.num_hashed_tokens, block_id,
|
||||
block.content_hash))
|
||||
heapq.heapify(new_priority_queue)
|
||||
|
||||
self.priority_queue = new_priority_queue
|
||||
|
||||
def remove(self, block_id: int):
|
||||
if block_id not in self.free_table:
|
||||
raise ValueError(
|
||||
"Attempting to remove block that's not in the evictor")
|
||||
self.free_table.pop(block_id)
|
||||
|
||||
@property
|
||||
def num_blocks(self) -> int:
|
||||
return len(self.free_table)
|
||||
|
||||
|
||||
def make_evictor(eviction_policy: EvictionPolicy) -> Evictor:
|
||||
if eviction_policy == EvictionPolicy.LRU:
|
||||
return LRUEvictor()
|
||||
else:
|
||||
raise ValueError(f"Unknown cache eviction policy: {eviction_policy}")
|
||||
@ -1,139 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import enum
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Optional
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import Tuple
|
||||
|
||||
from vllm.sequence import Sequence, SequenceGroup
|
||||
from vllm.utils import Device
|
||||
|
||||
|
||||
class AllocStatus(enum.Enum):
|
||||
"""Result for BlockSpaceManager.can_allocate
|
||||
|
||||
1. Ok: seq_group can be allocated now.
|
||||
2. Later: seq_group cannot be allocated.
|
||||
The capacity of allocator is larger than seq_group required.
|
||||
3. Never: seq_group can never be allocated.
|
||||
The seq_group is too large to allocated in GPU.
|
||||
"""
|
||||
OK = enum.auto()
|
||||
LATER = enum.auto()
|
||||
NEVER = enum.auto()
|
||||
|
||||
|
||||
class BlockSpaceManager(ABC):
|
||||
|
||||
@staticmethod
|
||||
def get_block_space_manager_class(version: str):
|
||||
version = version.lower()
|
||||
|
||||
if version == "selfattn":
|
||||
from vllm.core.block_manager import SelfAttnBlockSpaceManager
|
||||
return SelfAttnBlockSpaceManager
|
||||
|
||||
if version == "placeholder":
|
||||
from vllm.core.placeholder_block_space_manager import (
|
||||
PlaceholderBlockSpaceManager)
|
||||
return PlaceholderBlockSpaceManager
|
||||
|
||||
raise ValueError(f"Unknown version {version=}")
|
||||
|
||||
@abstractmethod
|
||||
def can_allocate(self,
|
||||
seq_group: SequenceGroup,
|
||||
num_lookahead_slots: int = 0) -> AllocStatus:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def allocate(self, seq_group: SequenceGroup) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def can_append_slots(self, seq_group: SequenceGroup,
|
||||
num_lookahead_slots: int) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def append_slots(
|
||||
self,
|
||||
seq: Sequence,
|
||||
num_lookahead_slots: int,
|
||||
) -> List[Tuple[int, int]]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def can_swap_in(self, seq_group: SequenceGroup,
|
||||
num_lookahead_slots: int) -> AllocStatus:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def can_swap_out(self, seq_group: SequenceGroup) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def free(self, seq: Sequence) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_block_table(self, seq: Sequence) -> List[int]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_num_free_gpu_blocks(self) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_num_free_cpu_blocks(self) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def access_all_blocks_in_seq(
|
||||
self,
|
||||
seq: Sequence,
|
||||
access_time: float,
|
||||
) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_common_computed_block_ids(
|
||||
self, seqs: List[Sequence]) -> GenericSequence[int]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def mark_blocks_as_computed(self, seq_group: SequenceGroup,
|
||||
token_chunk_size: int):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_prefix_cache_hit_rate(self, device: Device) -> float:
|
||||
"""Prefix cache hit rate. -1 means not supported or disabled."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
|
||||
"""Reset prefix cache for specified or all devices."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_num_cached_tokens(self, seq: Sequence) -> int:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
|
||||
pass
|
||||
@ -1,103 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
|
||||
from vllm.sequence import Sequence, SequenceGroup
|
||||
from vllm.utils import Device
|
||||
|
||||
|
||||
class PlaceholderBlockSpaceManager(BlockSpaceManager):
|
||||
"""A version of BlockSpaceManager for use in environments
|
||||
where block management is not required.
|
||||
For example: pooling models or attention-free models like Mamba.
|
||||
|
||||
This class provides the same interface as BlockSpaceManager, but its
|
||||
methods perform no actions or return simple values like True in specific
|
||||
actions. It's designed to be used in scenarios where the overhead of
|
||||
block management is unnecessary, such as in an embedding environment.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
pass
|
||||
|
||||
def can_allocate(self,
|
||||
seq_group: SequenceGroup,
|
||||
num_lookahead_slots: int = 0) -> AllocStatus:
|
||||
# Always return OK for dummy purposes
|
||||
return AllocStatus.OK
|
||||
|
||||
def allocate(self, seq_group: SequenceGroup) -> None:
|
||||
# No actual allocation logic needed
|
||||
pass
|
||||
|
||||
def can_append_slots(self, seq_group: SequenceGroup,
|
||||
num_lookahead_slots: int) -> bool:
|
||||
return True
|
||||
|
||||
def append_slots(
|
||||
self,
|
||||
seq: Sequence,
|
||||
num_lookahead_slots: int,
|
||||
) -> List[Tuple[int, int]]:
|
||||
return []
|
||||
|
||||
def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
|
||||
pass
|
||||
|
||||
def can_swap_in(self, seq_group: SequenceGroup,
|
||||
num_lookahead_slots: int) -> AllocStatus:
|
||||
return AllocStatus.OK
|
||||
|
||||
def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
||||
return None # type: ignore
|
||||
|
||||
def can_swap_out(self, seq_group: SequenceGroup) -> bool:
|
||||
return True
|
||||
|
||||
def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
||||
return None # type: ignore
|
||||
|
||||
def free(self, seq: Sequence) -> None:
|
||||
# No operation on free
|
||||
return
|
||||
|
||||
def get_block_table(self, seq: Sequence) -> List[int]:
|
||||
return None # type: ignore
|
||||
|
||||
def get_num_free_gpu_blocks(self) -> int:
|
||||
return 1
|
||||
|
||||
def get_num_free_cpu_blocks(self) -> int:
|
||||
return 1
|
||||
|
||||
def access_all_blocks_in_seq(
|
||||
self,
|
||||
seq: Sequence,
|
||||
access_time: float,
|
||||
) -> None:
|
||||
pass
|
||||
|
||||
def get_common_computed_block_ids(self,
|
||||
seq_group: List[Sequence]) -> List[int]:
|
||||
return []
|
||||
|
||||
def mark_blocks_as_computed(self, seq_group: SequenceGroup,
|
||||
token_chunk_size: int):
|
||||
pass
|
||||
|
||||
def get_prefix_cache_hit_rate(self, device: Device) -> float:
|
||||
return -1
|
||||
|
||||
def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
|
||||
return True
|
||||
|
||||
def get_num_cached_tokens(self, seq: Sequence) -> int:
|
||||
return 0
|
||||
|
||||
def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
|
||||
return
|
||||
File diff suppressed because it is too large
Load Diff
@ -41,7 +41,8 @@ from vllm.plugins import load_general_plugins
|
||||
from vllm.ray.lazy_utils import is_ray_initialized
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
|
||||
from vllm.transformers_utils.config import get_model_path, is_interleaved
|
||||
from vllm.transformers_utils.config import (get_model_path, is_interleaved,
|
||||
maybe_override_with_speculators)
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
|
||||
GiB_bytes, get_ip, is_in_ray_actor)
|
||||
@ -409,9 +410,7 @@ class EngineArgs:
|
||||
get_field(LoadConfig, "model_loader_extra_config")
|
||||
ignore_patterns: Optional[Union[str,
|
||||
List[str]]] = LoadConfig.ignore_patterns
|
||||
preemption_mode: Optional[str] = SchedulerConfig.preemption_mode
|
||||
|
||||
scheduler_delay_factor: float = SchedulerConfig.delay_factor
|
||||
enable_chunked_prefill: Optional[
|
||||
bool] = SchedulerConfig.enable_chunked_prefill
|
||||
disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
|
||||
@ -439,7 +438,6 @@ class EngineArgs:
|
||||
ObservabilityConfig.otlp_traces_endpoint
|
||||
collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
|
||||
ObservabilityConfig.collect_detailed_traces
|
||||
disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
|
||||
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
|
||||
scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
|
||||
|
||||
@ -561,14 +559,6 @@ class EngineArgs:
|
||||
**model_kwargs["enable_prompt_embeds"])
|
||||
model_group.add_argument("--served-model-name",
|
||||
**model_kwargs["served_model_name"])
|
||||
# This one is a special case because it is the
|
||||
# opposite of ModelConfig.use_async_output_proc
|
||||
model_group.add_argument(
|
||||
"--disable-async-output-proc",
|
||||
action="store_true",
|
||||
default=EngineArgs.disable_async_output_proc,
|
||||
help="Disable async output processing. This may result in "
|
||||
"lower performance.")
|
||||
model_group.add_argument("--config-format",
|
||||
**model_kwargs["config_format"])
|
||||
# This one is a special case because it can bool
|
||||
@ -897,10 +887,6 @@ class EngineArgs:
|
||||
**scheduler_kwargs["long_prefill_token_threshold"])
|
||||
scheduler_group.add_argument("--num-lookahead-slots",
|
||||
**scheduler_kwargs["num_lookahead_slots"])
|
||||
scheduler_group.add_argument("--scheduler-delay-factor",
|
||||
**scheduler_kwargs["delay_factor"])
|
||||
scheduler_group.add_argument("--preemption-mode",
|
||||
**scheduler_kwargs["preemption_mode"])
|
||||
# multi-step scheduling has been removed; corresponding arguments
|
||||
# are no longer supported.
|
||||
scheduler_group.add_argument("--scheduling-policy",
|
||||
@ -1029,7 +1015,6 @@ class EngineArgs:
|
||||
interleave_mm_strings=self.interleave_mm_strings,
|
||||
media_io_kwargs=self.media_io_kwargs,
|
||||
skip_mm_profiling=self.skip_mm_profiling,
|
||||
use_async_output_proc=not self.disable_async_output_proc,
|
||||
config_format=self.config_format,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
mm_processor_cache_gb=self.mm_processor_cache_gb,
|
||||
@ -1098,29 +1083,8 @@ class EngineArgs:
|
||||
provided as a JSON string input via CLI arguments or directly as a
|
||||
dictionary from the engine.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.transformers_utils.configs.speculators.base import (
|
||||
SpeculatorsConfig)
|
||||
|
||||
if self.speculative_config is None:
|
||||
hf_config = get_config(
|
||||
self.hf_config_path or target_model_config.model,
|
||||
self.trust_remote_code, self.revision, self.code_revision,
|
||||
self.config_format)
|
||||
|
||||
# if loading a SpeculatorsConfig, load the speculative_config
|
||||
# details from the config directly
|
||||
# no user input required / expected
|
||||
if isinstance(hf_config, SpeculatorsConfig):
|
||||
# We create one since we don't create one
|
||||
self.speculative_config = {}
|
||||
self.speculative_config[
|
||||
"num_speculative_tokens"] = hf_config.num_lookahead_tokens
|
||||
self.speculative_config["model"] = target_model_config.model
|
||||
self.speculative_config["method"] = hf_config.method
|
||||
else:
|
||||
return None
|
||||
return None
|
||||
|
||||
# Note(Shangming): These parameters are not obtained from the cli arg
|
||||
# '--speculative-config' and must be passed in when creating the engine
|
||||
@ -1155,6 +1119,15 @@ class EngineArgs:
|
||||
|
||||
device_config = DeviceConfig(
|
||||
device=cast(Device, current_platform.device_type))
|
||||
|
||||
(self.model, self.tokenizer,
|
||||
self.speculative_config) = maybe_override_with_speculators(
|
||||
model=self.model,
|
||||
tokenizer=self.tokenizer,
|
||||
revision=self.revision,
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
vllm_speculative_config=self.speculative_config,
|
||||
)
|
||||
model_config = self.create_model_config()
|
||||
|
||||
# * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
|
||||
@ -1395,11 +1368,9 @@ class EngineArgs:
|
||||
max_model_len=model_config.max_model_len,
|
||||
cuda_graph_sizes=self.cuda_graph_sizes,
|
||||
num_lookahead_slots=num_lookahead_slots,
|
||||
delay_factor=self.scheduler_delay_factor,
|
||||
enable_chunked_prefill=self.enable_chunked_prefill,
|
||||
disable_chunked_mm_input=self.disable_chunked_mm_input,
|
||||
is_multimodal_model=model_config.is_multimodal_model,
|
||||
preemption_mode=self.preemption_mode,
|
||||
send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
and parallel_config.use_ray),
|
||||
policy=self.scheduling_policy,
|
||||
@ -1486,42 +1457,12 @@ class EngineArgs:
|
||||
#############################################################
|
||||
# Unsupported Feature Flags on V1.
|
||||
|
||||
if self.load_format == "sharded_state":
|
||||
_raise_or_fallback(
|
||||
feature_name=f"--load_format {self.load_format}",
|
||||
recommend_to_remove=False)
|
||||
return False
|
||||
|
||||
if (self.logits_processor_pattern
|
||||
!= EngineArgs.logits_processor_pattern):
|
||||
_raise_or_fallback(feature_name="--logits-processor-pattern",
|
||||
recommend_to_remove=False)
|
||||
return False
|
||||
|
||||
if self.preemption_mode != SchedulerConfig.preemption_mode:
|
||||
_raise_or_fallback(feature_name="--preemption-mode",
|
||||
recommend_to_remove=True)
|
||||
return False
|
||||
|
||||
if (self.disable_async_output_proc
|
||||
!= EngineArgs.disable_async_output_proc):
|
||||
_raise_or_fallback(feature_name="--disable-async-output-proc",
|
||||
recommend_to_remove=True)
|
||||
return False
|
||||
|
||||
if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
|
||||
_raise_or_fallback(feature_name="--scheduler-delay-factor",
|
||||
recommend_to_remove=True)
|
||||
return False
|
||||
|
||||
if self.kv_cache_dtype != "auto":
|
||||
supported = current_platform.is_kv_cache_dtype_supported(
|
||||
self.kv_cache_dtype, model_config)
|
||||
if not supported:
|
||||
_raise_or_fallback(feature_name="--kv-cache-dtype",
|
||||
recommend_to_remove=False)
|
||||
return False
|
||||
|
||||
# No Mamba or Encoder-Decoder so far.
|
||||
if not model_config.is_v1_compatible:
|
||||
_raise_or_fallback(feature_name=model_config.architectures,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,59 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
|
||||
from vllm.config import SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.sequence import SequenceGroup, SequenceGroupOutput
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.utils import Counter
|
||||
|
||||
|
||||
class SequenceGroupOutputProcessor(ABC):
|
||||
"""Interface for logic that processes new token ids in sequence groups,
|
||||
managing detokenization, stop checking, and freeing/forking sequences with
|
||||
the scheduler.
|
||||
|
||||
This is highly coupled with the LLMEngine and should be seen as an extension
|
||||
of it. The logic is separated to simplify the LLMEngine class and allow
|
||||
separate implementations for single-step decoding (which supports beam
|
||||
search sequence forking) and multi-step decoding (which does not support
|
||||
beam search, but does support speculative decoding).
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def create_output_processor(
|
||||
scheduler_config: SchedulerConfig,
|
||||
detokenizer: Detokenizer,
|
||||
scheduler: List[Scheduler],
|
||||
seq_counter: Counter,
|
||||
stop_checker: "StopChecker",
|
||||
):
|
||||
"""Create an output processor.
|
||||
|
||||
Multi-step scheduling is no longer supported. Always return a
|
||||
single-step output processor.
|
||||
"""
|
||||
from vllm.engine.output_processor.single_step import (
|
||||
SingleStepOutputProcessor)
|
||||
return SingleStepOutputProcessor(scheduler_config, detokenizer,
|
||||
scheduler, seq_counter, stop_checker)
|
||||
|
||||
@abstractmethod
|
||||
def process_outputs(self, sequence_group: SequenceGroup,
|
||||
outputs: List[SequenceGroupOutput],
|
||||
is_async: bool) -> None:
|
||||
"""Process new token ids for the sequence group. Handles logic such as
|
||||
detokenization, stop checking, and freeing/forking sequences in the
|
||||
scheduler.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def process_prompt_logprob(self, seq_group: SequenceGroup,
|
||||
outputs: List[SequenceGroupOutput]) -> None:
|
||||
"""Update prompt logprobs received from outputs to seq_group."""
|
||||
pass
|
||||
@ -1,145 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import List
|
||||
|
||||
from vllm.config import SchedulerConfig
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.output_processor.interfaces import (
|
||||
SequenceGroupOutputProcessor)
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sequence import (CompletionSequenceGroupOutput, SequenceGroup,
|
||||
SequenceGroupOutput)
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.utils import Counter
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def single_step_process_prompt_logprob(
|
||||
sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
|
||||
output: CompletionSequenceGroupOutput) -> None:
|
||||
"""Process prompt logprobs associated with the
|
||||
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.
|
||||
|
||||
Do nothing if the output has no prompt logprobs.
|
||||
|
||||
Account for the fact that transformers do not compute first-token logprobs.
|
||||
|
||||
Args:
|
||||
sg_output_proc:
|
||||
[`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
|
||||
instance
|
||||
seq_group: the output is associated with this
|
||||
[`SequenceGroup`][vllm.sequence.SequenceGroup]
|
||||
output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
|
||||
for a single scheduler step
|
||||
"""
|
||||
prompt_logprobs = output.prompt_logprobs
|
||||
|
||||
# If this is the first (or only) "chunk" of the prefill, we need
|
||||
# to prepend None to the list of prompt logprobs. The reason for this
|
||||
# is that for N prompt tokens, the Sampler will generate N-1 total
|
||||
# prompt logprobs during prefill since the token at idx 0 will not
|
||||
# have a logprob associated with it.
|
||||
if prompt_logprobs is not None:
|
||||
if not seq_group.prompt_logprobs:
|
||||
prompt_logprobs = [None] + prompt_logprobs
|
||||
seq_group.prompt_logprobs = []
|
||||
|
||||
assert hasattr(sg_output_proc, 'detokenizer')
|
||||
if (seq_group.sampling_params.detokenize
|
||||
and sg_output_proc.detokenizer):
|
||||
sg_output_proc.detokenizer.decode_prompt_logprobs_inplace(
|
||||
seq_group,
|
||||
prompt_logprobs,
|
||||
position_offset=len(seq_group.prompt_logprobs))
|
||||
|
||||
seq_group.prompt_logprobs.extend(prompt_logprobs)
|
||||
|
||||
|
||||
class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
|
||||
"""SequenceGroupOutputProcessor which handles "output processing" logic,
|
||||
which happens after the model returns generated token ids and before
|
||||
scheduling of the next batch. Output processing logic includes
|
||||
detokenization, and determining if a sequence is finished (e.g. via max len
|
||||
or eos token).
|
||||
|
||||
The SingleStepOutputProcessor is specialized to the case where the model
|
||||
emits at most a single token per invocation, which precludes configurations
|
||||
such as speculative decoding or multi-step decoding. This enables beam
|
||||
search sampling, which requires forking/finishing/freeing sequences in a way
|
||||
that is currently difficult to schedule multiple steps ahead of time.
|
||||
"""
|
||||
|
||||
def __init__(self, scheduler_config: SchedulerConfig,
|
||||
detokenizer: Detokenizer, scheduler: List[Scheduler],
|
||||
seq_counter: Counter, stop_checker: StopChecker):
|
||||
self.scheduler_config = scheduler_config
|
||||
self.detokenizer = detokenizer
|
||||
self.scheduler = scheduler
|
||||
self.seq_counter = seq_counter
|
||||
self.stop_checker = stop_checker
|
||||
|
||||
def process_outputs(self, sequence_group: SequenceGroup,
|
||||
outputs: List[SequenceGroupOutput],
|
||||
is_async: bool) -> None:
|
||||
"""Append all new tokens to sequences in the sequence group. Fork any
|
||||
surviving beam candidates; free any unsurviving ones.
|
||||
|
||||
Invokes detokenizer to detokenize new tokens, and also marks sequences
|
||||
as finished if they meet stop conditions.
|
||||
|
||||
is_async - Indicates whether this postprocessor runs in
|
||||
parallel with the GPU forward pass and is processing
|
||||
tokens from the previous step. If this is true, then
|
||||
no tokens need to be appended since it is already done
|
||||
externally (before the next schedule() call)
|
||||
"""
|
||||
assert (len(outputs) == 1
|
||||
), f"{type(self)} does not support multiple outputs per step"
|
||||
return self._process_sequence_group_outputs(sequence_group, outputs[0],
|
||||
is_async)
|
||||
|
||||
def process_prompt_logprob(self, seq_group: SequenceGroup,
|
||||
outputs: List[SequenceGroupOutput]) -> None:
|
||||
"""Process prompt logprobs associated with one step of a single-step-
|
||||
scheduled computation.
|
||||
|
||||
Args:
|
||||
seq_group: the output is associated with this
|
||||
[`SequenceGroup`][vllm.sequence.SequenceGroup]
|
||||
outputs: the
|
||||
[`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
|
||||
for a single scheduler step
|
||||
"""
|
||||
assert len(outputs) == 1, "Single step should only have 1 output."
|
||||
output = outputs[0]
|
||||
assert isinstance(output, CompletionSequenceGroupOutput)
|
||||
single_step_process_prompt_logprob(self, seq_group, output)
|
||||
|
||||
def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
|
||||
outputs: SequenceGroupOutput,
|
||||
is_async: bool) -> None:
|
||||
sampling_params = seq_group.sampling_params
|
||||
|
||||
sample = outputs.samples[0]
|
||||
seq = seq_group.first_seq
|
||||
if not is_async:
|
||||
seq.append_token_id(sample.output_token, sample.logprobs,
|
||||
sample.output_embed)
|
||||
if sampling_params.detokenize and self.detokenizer:
|
||||
new_char_count = self.detokenizer.decode_sequence_inplace(
|
||||
seq, sampling_params)
|
||||
else:
|
||||
new_char_count = 0
|
||||
self.stop_checker.maybe_stop_sequence(
|
||||
seq,
|
||||
new_char_count,
|
||||
sampling_params,
|
||||
lora_req=seq_group.lora_request,
|
||||
)
|
||||
if seq.is_finished():
|
||||
for scheduler in self.scheduler:
|
||||
scheduler.free_seq(seq)
|
||||
@ -1,139 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.reasoning import ReasoningParser
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import Sequence, SequenceStatus
|
||||
|
||||
|
||||
class StopChecker:
|
||||
"""LLMEngine helper class which separates out the logic involving stop
|
||||
checking. This checks things such as: whether the eos token was emitted,
|
||||
whether the max_tokens has been consumed, whether a stop string has been
|
||||
emitted, or if we have exceeded the max model len.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_model_len: int,
|
||||
reasoner: Optional[ReasoningParser] = None,
|
||||
):
|
||||
# Do not use it directly, but use `self._get_max_model_len`.
|
||||
self._max_model_len = max_model_len
|
||||
self.reasoner = reasoner
|
||||
|
||||
def _get_max_model_len(self, lora_req: Optional[LoRARequest]):
|
||||
if lora_req and lora_req.long_lora_max_len:
|
||||
return lora_req.long_lora_max_len
|
||||
else:
|
||||
return self._max_model_len
|
||||
|
||||
def maybe_stop_sequence(
|
||||
self,
|
||||
seq: Sequence,
|
||||
new_char_count: int,
|
||||
sampling_params: SamplingParams,
|
||||
lora_req: Optional[LoRARequest] = None,
|
||||
) -> None:
|
||||
"""Stop the finished sequences.
|
||||
|
||||
new_char_count is the number of chars added to the
|
||||
sequence's output text for the newly generated token
|
||||
"""
|
||||
|
||||
# Check if the minimum number of tokens has been generated yet;
|
||||
# skip the stop string/token checks if not
|
||||
if seq.get_output_len() < sampling_params.min_tokens:
|
||||
return
|
||||
|
||||
# Check if the sequence has generated the EOS token.
|
||||
if ((not sampling_params.ignore_eos)
|
||||
and seq.get_last_token_id() == seq.eos_token_id):
|
||||
# Remove the last EOS token unless explicitly specified
|
||||
# This prevents unintended exposure of the EOS token
|
||||
if new_char_count and (
|
||||
not sampling_params.include_stop_str_in_output):
|
||||
seq.output_text = seq.output_text[:-new_char_count]
|
||||
seq.status = SequenceStatus.FINISHED_STOPPED
|
||||
return
|
||||
|
||||
# Skip stop string/token checks if in reasoning content generation
|
||||
if self.reasoner is not None and \
|
||||
not self.reasoner.is_reasoning_end(seq.get_token_ids()):
|
||||
return
|
||||
|
||||
# Check if a stop token was encountered.
|
||||
# This assumes a single token produced per step.
|
||||
last_token_id = seq.get_last_token_id()
|
||||
if last_token_id in (sampling_params.stop_token_ids or ()):
|
||||
if new_char_count and (
|
||||
not sampling_params.include_stop_str_in_output):
|
||||
# Remove last token
|
||||
seq.output_text = seq.output_text[:-new_char_count]
|
||||
seq.status = SequenceStatus.FINISHED_STOPPED
|
||||
seq.stop_reason = last_token_id
|
||||
return
|
||||
|
||||
# Check if any stop strings are matched.
|
||||
stop = self.check_stop_strings(
|
||||
seq.output_text, new_char_count, sampling_params.stop,
|
||||
sampling_params.include_stop_str_in_output)
|
||||
if stop is not None:
|
||||
stop_str, truncate_to = stop
|
||||
if truncate_to != -1:
|
||||
seq.output_text = seq.output_text[:truncate_to]
|
||||
seq.status = SequenceStatus.FINISHED_STOPPED
|
||||
seq.stop_reason = stop_str
|
||||
return
|
||||
|
||||
# Check if the sequence has reached max_model_len.
|
||||
if seq.get_len() >= self._get_max_model_len(lora_req):
|
||||
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
|
||||
return
|
||||
|
||||
# Check if the sequence has reached max_tokens.
|
||||
if seq.get_output_len() == sampling_params.max_tokens:
|
||||
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
|
||||
return
|
||||
|
||||
@staticmethod
|
||||
def check_stop_strings(
|
||||
output_text: str,
|
||||
new_char_count: int,
|
||||
stop: List[str],
|
||||
include_in_output: bool,
|
||||
) -> Optional[Tuple[str, int]]:
|
||||
"""Check if any stop strings are matched and truncate sequence
|
||||
output text accordingly.
|
||||
|
||||
Returns tuple (stop_string, offset) if matched or else None.
|
||||
|
||||
Where stop_string is the matched stop string and offset is the
|
||||
length to which output_text should be truncated, or -1 for no
|
||||
truncation.
|
||||
"""
|
||||
if not new_char_count or not stop:
|
||||
return None
|
||||
|
||||
for stop_str in stop:
|
||||
stop_string_len = len(stop_str)
|
||||
# Avoid searching already-searched text.
|
||||
stop_index = output_text.find(stop_str,
|
||||
1 - new_char_count - stop_string_len)
|
||||
if stop_index == -1:
|
||||
continue
|
||||
|
||||
if include_in_output:
|
||||
# Truncate to end of stop string.
|
||||
stop_index += stop_string_len
|
||||
if stop_index >= len(output_text):
|
||||
# No truncation required.
|
||||
return stop_str, -1
|
||||
|
||||
# Truncate the output text to either the beginning
|
||||
# or end of the stop string.
|
||||
return stop_str, stop_index
|
||||
return None
|
||||
@ -7,13 +7,11 @@ from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union
|
||||
|
||||
from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.core.scheduler import SchedulerOutputs
|
||||
from vllm.inputs.data import PromptType, TokensPrompt
|
||||
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
|
||||
from vllm.inputs.preprocess import InputPreprocessor
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
|
||||
from vllm.plugins.io_processors.interface import IOProcessor
|
||||
from vllm.pooling_params import PoolingParams
|
||||
@ -266,11 +264,7 @@ class EngineClient(ABC):
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
async def do_log_stats(
|
||||
self,
|
||||
scheduler_outputs: Optional[SchedulerOutputs] = None,
|
||||
model_output: Optional[list[SamplerOutput]] = None,
|
||||
) -> None:
|
||||
async def do_log_stats(self) -> None:
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@ -421,6 +421,51 @@ def resolve_mistral_chat_template(
|
||||
return None
|
||||
|
||||
|
||||
_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]()
|
||||
"""
|
||||
Used in `_try_get_processor_chat_template` to avoid calling
|
||||
`cached_get_processor` again if the processor fails to be loaded.
|
||||
|
||||
This is needed because `lru_cache` does not cache when an exception happens.
|
||||
"""
|
||||
|
||||
|
||||
def _try_get_processor_chat_template(
|
||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
||||
model_config: ModelConfig,
|
||||
) -> Optional[str]:
|
||||
cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
|
||||
if cache_key in _PROCESSOR_CHAT_TEMPLATES:
|
||||
return _PROCESSOR_CHAT_TEMPLATES[cache_key]
|
||||
|
||||
try:
|
||||
processor = cached_get_processor(
|
||||
tokenizer.name_or_path,
|
||||
processor_cls=(
|
||||
PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast,
|
||||
ProcessorMixin,
|
||||
),
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
if (
|
||||
isinstance(processor, ProcessorMixin)
|
||||
and hasattr(processor, "chat_template")
|
||||
and (chat_template := processor.chat_template) is not None
|
||||
):
|
||||
_PROCESSOR_CHAT_TEMPLATES[cache_key] = chat_template
|
||||
return chat_template
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Failed to load AutoProcessor chat template for %s",
|
||||
tokenizer.name_or_path,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
_PROCESSOR_CHAT_TEMPLATES[cache_key] = None
|
||||
return None
|
||||
|
||||
|
||||
def resolve_hf_chat_template(
|
||||
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
|
||||
chat_template: Optional[str],
|
||||
@ -434,28 +479,10 @@ def resolve_hf_chat_template(
|
||||
|
||||
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
|
||||
if tools is None:
|
||||
try:
|
||||
processor = cached_get_processor(
|
||||
tokenizer.name_or_path,
|
||||
processor_cls=(
|
||||
PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast,
|
||||
ProcessorMixin,
|
||||
),
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
if (
|
||||
isinstance(processor, ProcessorMixin)
|
||||
and hasattr(processor, "chat_template")
|
||||
and processor.chat_template is not None
|
||||
):
|
||||
return processor.chat_template
|
||||
except Exception:
|
||||
logger.debug(
|
||||
"Failed to load AutoProcessor chat template for %s",
|
||||
tokenizer.name_or_path,
|
||||
exc_info=True,
|
||||
) # noqa: E501
|
||||
chat_template = _try_get_processor_chat_template(tokenizer,
|
||||
model_config)
|
||||
if chat_template is not None:
|
||||
return chat_template
|
||||
|
||||
# 3rd priority: AutoTokenizer chat template
|
||||
try:
|
||||
|
||||
@ -11,7 +11,6 @@ from pydantic import ValidationError
|
||||
from tqdm.auto import tqdm
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
|
||||
BeamSearchSequence,
|
||||
create_sort_beams_key_function)
|
||||
@ -19,7 +18,6 @@ from vllm.config import (CompilationConfig, ModelDType,
|
||||
StructuredOutputsConfig, TokenizerMode, is_init_field)
|
||||
from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
|
||||
PoolerConfig, RunnerOption)
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
|
||||
ChatTemplateContentFormatOption,
|
||||
apply_hf_chat_template,
|
||||
@ -54,6 +52,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
||||
get_cached_tokenizer)
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import Counter, Device, as_iter, is_list_of
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessor
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -138,8 +137,6 @@ class LLM:
|
||||
back to the eager mode.
|
||||
disable_custom_all_reduce: See
|
||||
[ParallelConfig][vllm.config.ParallelConfig].
|
||||
disable_async_output_proc: Disable async output processing.
|
||||
This may result in lower performance.
|
||||
hf_token: The token to use as HTTP bearer authorization for remote files
|
||||
. If `True`, will use the token generated when running
|
||||
`huggingface-cli login` (stored in `~/.huggingface`).
|
||||
@ -189,7 +186,6 @@ class LLM:
|
||||
enforce_eager: bool = False,
|
||||
max_seq_len_to_capture: int = 8192,
|
||||
disable_custom_all_reduce: bool = False,
|
||||
disable_async_output_proc: bool = False,
|
||||
hf_token: Optional[Union[bool, str]] = None,
|
||||
hf_overrides: Optional[HfOverrides] = None,
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
@ -287,7 +283,6 @@ class LLM:
|
||||
enforce_eager=enforce_eager,
|
||||
max_seq_len_to_capture=max_seq_len_to_capture,
|
||||
disable_custom_all_reduce=disable_custom_all_reduce,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
hf_token=hf_token,
|
||||
hf_overrides=hf_overrides,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
@ -309,11 +304,7 @@ class LLM:
|
||||
self.request_counter = Counter()
|
||||
self.default_sampling_params: Union[dict[str, Any], None] = None
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
supported_tasks = self.llm_engine \
|
||||
.get_supported_tasks() # type: ignore
|
||||
else:
|
||||
supported_tasks = self.llm_engine.model_config.supported_tasks
|
||||
supported_tasks = self.llm_engine.get_supported_tasks() # type: ignore
|
||||
|
||||
logger.info("Supported_tasks: %s", supported_tasks)
|
||||
|
||||
@ -1473,8 +1464,6 @@ class LLM:
|
||||
Note:
|
||||
This method is only available with the V1 LLM engine.
|
||||
"""
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
||||
assert isinstance(self.llm_engine, V1LLMEngine)
|
||||
return self.llm_engine.get_metrics()
|
||||
|
||||
def _validate_and_add_requests(
|
||||
|
||||
@ -15,10 +15,10 @@ from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest, PoolerOutput
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.utils import make_async
|
||||
from vllm.v1.outputs import SamplerOutput
|
||||
from vllm.worker.worker_base import WorkerBase
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -1,244 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Any, Callable, List, Optional, Union
|
||||
|
||||
import cloudpickle
|
||||
|
||||
from vllm.executor.executor_base import DistributedExecutorBase
|
||||
from vllm.executor.multiproc_worker_utils import (
|
||||
ProcessWorkerWrapper, ResultHandler, WorkerMonitor,
|
||||
set_multiprocessing_worker_envs)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
|
||||
get_distributed_init_method, get_ip, get_open_port,
|
||||
make_async, run_method, update_environment_variables)
|
||||
from vllm.worker.worker_base import WorkerWrapperBase
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class MultiprocessingDistributedExecutor(DistributedExecutorBase):
|
||||
"""Python multiprocessing-based distributed executor"""
|
||||
|
||||
uses_ray: bool = False
|
||||
|
||||
def _check_cuda(self) -> None:
|
||||
"""Check that the number of GPUs is sufficient for the parallel
|
||||
configuration. Separate from _init_executor to reduce the number of
|
||||
indented blocks.
|
||||
"""
|
||||
parallel_config = self.parallel_config
|
||||
world_size = parallel_config.world_size
|
||||
tensor_parallel_size = parallel_config.tensor_parallel_size
|
||||
|
||||
cuda_device_count = cuda_device_count_stateless()
|
||||
# Use confusing message for more common TP-only case.
|
||||
if tensor_parallel_size > cuda_device_count:
|
||||
raise RuntimeError(
|
||||
f"please set tensor_parallel_size ({tensor_parallel_size}) "
|
||||
f"to less than max local gpu count ({cuda_device_count})")
|
||||
|
||||
if world_size > cuda_device_count:
|
||||
raise RuntimeError(
|
||||
f"please ensure that world_size ({world_size}) "
|
||||
f"is less than than max local gpu count ({cuda_device_count})")
|
||||
|
||||
# Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
|
||||
if "CUDA_VISIBLE_DEVICES" not in os.environ:
|
||||
update_environment_variables({
|
||||
"CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
|
||||
})
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
if current_platform.is_cuda_alike():
|
||||
self._check_cuda()
|
||||
|
||||
# Create the parallel GPU workers.
|
||||
world_size = self.parallel_config.world_size
|
||||
tensor_parallel_size = self.parallel_config.tensor_parallel_size
|
||||
|
||||
# Set multiprocessing envs that are common to V0 and V1
|
||||
set_multiprocessing_worker_envs(self.parallel_config)
|
||||
|
||||
# Multiprocessing-based executor does not support multi-node setting.
|
||||
# Since it only works for single node, we can use the loopback address
|
||||
# 127.0.0.1 for communication.
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
"127.0.0.1", get_open_port())
|
||||
|
||||
self.workers: List[ProcessWorkerWrapper] = []
|
||||
# This is the list of workers that are rank 0 of each TP group EXCEPT
|
||||
# global rank 0. These are the workers that will broadcast to the
|
||||
# rest of the workers.
|
||||
self.tp_driver_workers: List[ProcessWorkerWrapper] = []
|
||||
# This is the list of workers that are not drivers and not the first
|
||||
# worker in a TP group. These are the workers that will be
|
||||
# broadcasted to.
|
||||
self.non_driver_workers: List[ProcessWorkerWrapper] = []
|
||||
|
||||
if world_size == 1:
|
||||
self.worker_monitor = None
|
||||
else:
|
||||
result_handler = ResultHandler()
|
||||
for rank in range(1, world_size):
|
||||
worker = ProcessWorkerWrapper(result_handler,
|
||||
WorkerWrapperBase,
|
||||
self.vllm_config, rank)
|
||||
self.workers.append(worker)
|
||||
if rank % tensor_parallel_size == 0:
|
||||
self.tp_driver_workers.append(worker)
|
||||
else:
|
||||
self.non_driver_workers.append(worker)
|
||||
|
||||
self.worker_monitor = WorkerMonitor(self.workers, result_handler)
|
||||
result_handler.start()
|
||||
self.worker_monitor.start()
|
||||
|
||||
# Set up signal handlers to shut down the executor cleanly
|
||||
# sometimes gc does not work well
|
||||
|
||||
self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
|
||||
|
||||
all_kwargs = []
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
get_ip(), get_open_port())
|
||||
for i in range(world_size):
|
||||
local_rank = i
|
||||
rank = i
|
||||
kwargs = dict(
|
||||
vllm_config=self.vllm_config,
|
||||
local_rank=local_rank,
|
||||
rank=rank,
|
||||
distributed_init_method=distributed_init_method,
|
||||
is_driver_worker=(not self.parallel_config)
|
||||
or (rank % self.parallel_config.tensor_parallel_size == 0),
|
||||
)
|
||||
all_kwargs.append(kwargs)
|
||||
self._run_workers("init_worker", all_kwargs)
|
||||
self._run_workers("init_device")
|
||||
self._run_workers("load_model",
|
||||
max_concurrent_workers=self.parallel_config.
|
||||
max_parallel_loading_workers)
|
||||
self.driver_exec_model = make_async(self.driver_worker.execute_model)
|
||||
self.pp_locks: Optional[List[asyncio.Lock]] = None
|
||||
|
||||
def shutdown(self):
|
||||
if (worker_monitor := getattr(self, "worker_monitor",
|
||||
None)) is not None:
|
||||
worker_monitor.close()
|
||||
|
||||
def _driver_execute_model(
|
||||
self, execute_model_req: Optional[ExecuteModelRequest]
|
||||
) -> Optional[List[SamplerOutput]]:
|
||||
"""Run execute_model in the driver worker.
|
||||
|
||||
Passing None will cause the driver to stop the model execution
|
||||
loop running in each of the remote workers.
|
||||
"""
|
||||
return self.driver_worker.execute_model(execute_model_req)
|
||||
|
||||
def _run_workers(
|
||||
self,
|
||||
method: Union[str, Callable],
|
||||
*args,
|
||||
async_run_tensor_parallel_workers_only: bool = False,
|
||||
max_concurrent_workers: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> List[Any]:
|
||||
"""Runs the given method on all workers.
|
||||
|
||||
Args:
|
||||
async_run_tensor_parallel_workers_only: If True the method will be
|
||||
run only in the remote TP workers, not the driver worker.
|
||||
It will also be run asynchronously and return a list of futures
|
||||
rather than blocking on the results.
|
||||
"""
|
||||
if isinstance(method, str):
|
||||
sent_method = method
|
||||
else:
|
||||
sent_method = cloudpickle.dumps(method)
|
||||
del method
|
||||
|
||||
if max_concurrent_workers:
|
||||
raise NotImplementedError(
|
||||
"max_concurrent_workers is not supported yet.")
|
||||
|
||||
if async_run_tensor_parallel_workers_only:
|
||||
# Run only non-driver workers and just return futures.
|
||||
return [
|
||||
worker.execute_method(sent_method, *args, **kwargs)
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
|
||||
# Start all remote workers first.
|
||||
worker_outputs = [
|
||||
worker.execute_method(sent_method, *args, **kwargs)
|
||||
for worker in self.workers
|
||||
]
|
||||
|
||||
driver_worker_output = run_method(self.driver_worker, sent_method,
|
||||
args, kwargs)
|
||||
|
||||
# Get the results of the workers.
|
||||
return [driver_worker_output
|
||||
] + [output.get() for output in worker_outputs]
|
||||
|
||||
def check_health(self) -> None:
|
||||
"""Raises an error if engine is unhealthy."""
|
||||
if self.worker_monitor is not None and not self.worker_monitor.is_alive(
|
||||
):
|
||||
raise RuntimeError("Worker processes are not running")
|
||||
|
||||
def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
|
||||
"""Wait for futures returned from _run_workers() with
|
||||
async_run_remote_workers_only to complete."""
|
||||
for result in parallel_worker_tasks:
|
||||
result.get()
|
||||
|
||||
async def _driver_execute_model_async(
|
||||
self,
|
||||
execute_model_req: Optional[ExecuteModelRequest] = None
|
||||
) -> List[SamplerOutput]:
|
||||
if not self.tp_driver_workers:
|
||||
return await self.driver_exec_model(execute_model_req)
|
||||
|
||||
if self.pp_locks is None:
|
||||
# This locks each pipeline parallel stage so multiple virtual
|
||||
# engines can't execute on the same stage at the same time
|
||||
# We create the locks here to avoid creating them in the constructor
|
||||
# which uses a different asyncio loop.
|
||||
self.pp_locks = [
|
||||
asyncio.Lock()
|
||||
for _ in range(self.parallel_config.pipeline_parallel_size)
|
||||
]
|
||||
|
||||
tasks = [
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(self.driver_exec_model, self.pp_locks[0],
|
||||
execute_model_req))
|
||||
]
|
||||
for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
|
||||
start=1):
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
_run_task_with_lock(driver_worker.execute_method_async,
|
||||
self.pp_locks[pp_rank],
|
||||
"execute_model", execute_model_req)))
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# Only the last PP stage has the final results.
|
||||
return results[-1]
|
||||
|
||||
async def _start_worker_execution_loop(self):
|
||||
coros = [
|
||||
worker.execute_method_async("start_worker_execution_loop")
|
||||
for worker in self.non_driver_workers
|
||||
]
|
||||
return await asyncio.gather(*coros)
|
||||
@ -1,279 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import threading
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing import Queue
|
||||
from multiprocessing.connection import wait
|
||||
from multiprocessing.process import BaseProcess
|
||||
from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context,
|
||||
run_method)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
_TERMINATE = "TERMINATE" # sentinel
|
||||
|
||||
JOIN_TIMEOUT_S = 2
|
||||
|
||||
|
||||
@dataclass
class Result(Generic[T]):
    """Result of task dispatched to worker"""

    # Correlates this result with the future registered for the task.
    task_id: uuid.UUID
    # Return value of the task; left as None when the task raised.
    value: Optional[T] = None
    # Exception raised by the task, if any.
    exception: Optional[BaseException] = None
|
||||
|
||||
class ResultFuture(threading.Event, Generic[T]):
    """Synchronous future for non-async case"""

    def __init__(self):
        super().__init__()
        # Populated exactly once by set_result() before the event fires.
        self.result: Optional[Result[T]] = None

    def set_result(self, result: Result[T]):
        # Store the payload first so any waiter woken by set() sees it.
        self.result = result
        self.set()

    def get(self) -> T:
        # Block until set_result() has been called.
        self.wait()
        assert self.result is not None
        exc = self.result.exception
        if exc is not None:
            raise exc
        return self.result.value  # type: ignore[return-value]
||||
|
||||
|
||||
def _set_future_result(future: Union[ResultFuture, asyncio.Future],
                       result: Result):
    """Deliver *result* into either a sync or an async future.

    Runs on the result-handler thread, so asyncio futures are resolved via
    call_soon_threadsafe on their owning loop. If that loop is already
    closed the result is silently dropped.
    """
    if isinstance(future, ResultFuture):
        future.set_result(result)
        return
    loop = future.get_loop()
    if loop.is_closed():
        # Owning loop is gone; there is no one left to notify.
        return
    if result.exception is None:
        loop.call_soon_threadsafe(future.set_result, result.value)
    else:
        loop.call_soon_threadsafe(future.set_exception, result.exception)
||||
|
||||
|
||||
class ResultHandler(threading.Thread):
    """Handle results from all workers (in background thread)"""

    def __init__(self) -> None:
        super().__init__(daemon=True)
        # Workers push Result objects here; _TERMINATE ends the loop.
        self.result_queue = get_mp_context().Queue()
        # Outstanding futures keyed by task id.
        self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {}

    def run(self):
        while True:
            result = self.result_queue.get()
            if result == _TERMINATE:
                break
            _set_future_result(self.tasks.pop(result.task_id), result)
        # Ensure that all waiters will receive an exception
        for task_id, future in self.tasks.items():
            failure = Result(task_id=task_id,
                             exception=ChildProcessError("worker died"))
            _set_future_result(future, failure)

    def close(self):
        """Ask the handler loop to exit (thread-safe)."""
        self.result_queue.put(_TERMINATE)
||||
|
||||
|
||||
class WorkerMonitor(threading.Thread):
    """Monitor worker status (in background thread)"""

    def __init__(self, workers: List['ProcessWorkerWrapper'],
                 result_handler: ResultHandler):
        super().__init__(daemon=True)
        self.workers = workers
        self.result_handler = result_handler
        # Set once teardown has started (by run() or close()) so the two
        # paths do not both try to clean up the workers.
        self._close = False

    def run(self) -> None:
        """Wait for any worker to die, then tear down all of them.

        If close() ran first, skips cleanup and only joins the processes.
        """
        # Blocks until any worker exits
        dead_sentinels = wait([w.process.sentinel for w in self.workers])
        if not self._close:
            self._close = True

            # Kill / cleanup all workers
            for worker in self.workers:
                process = worker.process
                if process.sentinel in dead_sentinels:
                    process.join(JOIN_TIMEOUT_S)
                if process.exitcode is not None and process.exitcode != 0:
                    logger.error("Worker %s pid %s died, exit code: %s",
                                 process.name, process.pid, process.exitcode)
            # Cleanup any remaining workers
            # NOTE(review): truthiness guard presumably protects against
            # `logger` being cleared during interpreter shutdown — confirm.
            if logger:
                logger.info("Killing local vLLM worker processes")
            for worker in self.workers:
                worker.kill_worker()
            # Must be done after worker task queues are all closed
            self.result_handler.close()

        # Reap all worker processes, bounded by JOIN_TIMEOUT_S each.
        for worker in self.workers:
            worker.process.join(JOIN_TIMEOUT_S)

    def close(self):
        """Gracefully terminate all workers; no-op if teardown started."""
        if self._close:
            return
        self._close = True
        logger.info("Terminating local vLLM worker processes")
        for worker in self.workers:
            worker.terminate_worker()
        # Must be done after worker task queues are all closed
        self.result_handler.close()
||||
|
||||
|
||||
class ProcessWorkerWrapper:
    """Local process wrapper for vllm.worker.Worker,
    for handling single-node multi-GPU tensor parallel."""

    def __init__(self, result_handler: ResultHandler,
                 worker_factory: Callable[[VllmConfig, int], Any],
                 vllm_config: VllmConfig, rank: int) -> None:
        self.mp = get_mp_context()
        # Parent -> child: (task_id, method, args, kwargs) tuples.
        self._task_queue = self.mp.Queue()
        # Child -> parent: shared with the ResultHandler thread, which
        # routes each Result back to the future stored in self.tasks.
        self.result_queue = result_handler.result_queue
        self.tasks = result_handler.tasks
        self.process: BaseProcess = self.mp.Process(  # type: ignore[attr-defined]
            target=_run_worker_process,
            name="VllmWorkerProcess",
            kwargs=dict(
                worker_factory=worker_factory,
                task_queue=self._task_queue,
                result_queue=self.result_queue,
                vllm_config=vllm_config,
                rank=rank,
            ),
            daemon=True)

        self.process.start()

    def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
                      method: Union[str, bytes], args, kwargs):
        """Register *future* under a fresh task id and ship the task to
        the worker process."""
        task_id = uuid.uuid4()
        self.tasks[task_id] = future
        try:
            self._task_queue.put((task_id, method, args, kwargs))
        except SystemExit:
            raise
        except BaseException as e:
            # Enqueue failed (e.g. queue closed because the worker died):
            # undo the registration and surface a uniform error.
            del self.tasks[task_id]
            raise ChildProcessError("worker died") from e

    def execute_method(self, method: Union[str, bytes], *args, **kwargs):
        """Dispatch *method* to the worker; returns a ResultFuture whose
        .get() blocks for the result."""
        future: ResultFuture = ResultFuture()
        self._enqueue_task(future, method, args, kwargs)
        return future

    async def execute_method_async(self, method: Union[str, bytes], *args,
                                   **kwargs):
        """Dispatch *method* to the worker and await the result on the
        caller's event loop."""
        future = asyncio.get_running_loop().create_future()
        self._enqueue_task(future, method, args, kwargs)
        return await future

    def terminate_worker(self):
        """Graceful shutdown: ask the worker loop to exit; fall back to
        killing the process if the task queue is already closed."""
        try:
            self._task_queue.put(_TERMINATE)
        except ValueError:
            self.process.kill()
        self._task_queue.close()

    def kill_worker(self):
        """Forceful shutdown: close the task queue and kill the process."""
        self._task_queue.close()
        self.process.kill()
||||
|
||||
|
||||
def _run_worker_process(
    worker_factory: Callable[[VllmConfig, int], Any],
    task_queue: Queue,
    result_queue: Queue,
    vllm_config: VllmConfig,
    rank: int,
) -> None:
    """Worker process event loop.

    Builds a worker via *worker_factory*, then serves
    (task_id, method, args, kwargs) tuples from *task_queue* until the
    _TERMINATE sentinel (or KeyboardInterrupt), pushing a Result for each
    task onto *result_queue*.
    """

    # Add process-specific prefix to stdout and stderr
    process_name = get_mp_context().current_process().name
    decorate_logs(process_name)

    # Initialize worker
    worker = worker_factory(vllm_config, rank)
    # Drop the factory reference so anything it captured can be GC'd.
    del worker_factory

    # Accept tasks from the engine in task_queue
    # and return task output in result_queue
    logger.info("Worker ready; awaiting tasks")
    try:
        for items in iter(task_queue.get, _TERMINATE):
            output = None
            exception = None
            task_id, method, args, kwargs = items
            try:
                output = run_method(worker, method, args, kwargs)
            except SystemExit:
                # Deliberate process exit must not be swallowed.
                raise
            except KeyboardInterrupt:
                break
            except BaseException as e:
                # Any other failure is reported back to the caller via the
                # Result's exception field; the loop keeps serving tasks.
                logger.exception(
                    "Exception in worker %s while processing method %s.",
                    process_name, method)
                exception = e
            result_queue.put(
                Result(task_id=task_id, value=output, exception=exception))
    except KeyboardInterrupt:
        pass
    except Exception:
        logger.exception("Worker failed")

    # Flush TunableOp results when TunableOp is enabled and
    # online (in situ) tuning is enabled.
    # Offline tuning API (record_untuned_is_enabled()) only
    # available in PyTorch 2.6 or later.
    if torch.cuda.is_available():
        import torch.cuda.tunable as tunable
        if (tunable.is_enabled() and tunable.tuning_is_enabled()
                and not tunable.record_untuned_is_enabled()):
            tunable.write_file()

    logger.info("Worker exiting")
||||
|
||||
|
||||
def set_multiprocessing_worker_envs(parallel_config):
    """ Set up environment variables that should be used when there are workers
    in a multiprocessing environment. This should be called by the parent
    process before worker processes are created"""

    _maybe_force_spawn()

    # Configure thread parallelism if OMP_NUM_THREADS isn't set
    #
    # Helps to avoid CPU contention. The default of spawning a thread per
    # core combined with multiprocessing for each GPU can have a negative
    # impact on performance. The contention is amplified when running in a
    # container where CPU limits can cause throttling.
    default_omp_num_threads = 1
    if "OMP_NUM_THREADS" in os.environ:
        # User has tuned this explicitly; leave it alone.
        return
    current_parallelism = torch.get_num_threads()
    if current_parallelism <= default_omp_num_threads:
        return
    logger.warning(
        "Reducing Torch parallelism from %d threads to %d to avoid "
        "unnecessary CPU contention. Set OMP_NUM_THREADS in the "
        "external environment to tune this value as needed.",
        current_parallelism, default_omp_num_threads)
    os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
    torch.set_num_threads(default_omp_num_threads)
||||
@ -17,12 +17,12 @@ from vllm.executor.msgspec_utils import encode_hook
|
||||
from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster,
|
||||
ray)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.ray.ray_env import get_env_vars_to_copy
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
|
||||
get_ip, get_open_port, make_async)
|
||||
from vllm.v1.outputs import SamplerOutput
|
||||
|
||||
if ray is not None:
|
||||
from ray.actor import ActorHandle
|
||||
|
||||
@ -137,10 +137,6 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
|
||||
def _init_executor(self) -> None:
|
||||
"""Initialize the worker and load the model.
|
||||
"""
|
||||
assert self.vllm_config.scheduler_config.delay_factor == 0.0, \
|
||||
("ExecutorWithExternalLauncher needs deterministic "
|
||||
"execution, so it"
|
||||
"does not support delay_factor in scheduling")
|
||||
if envs.VLLM_USE_V1:
|
||||
assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \
|
||||
("To get deterministic execution in V1, "
|
||||
|
||||
@ -7,15 +7,7 @@ from .data import (DataPrompt, DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
|
||||
SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
|
||||
build_explicit_enc_dec_prompt, embeds_inputs,
|
||||
to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts)
|
||||
from .registry import (DummyData, InputContext, InputProcessingContext,
|
||||
InputRegistry)
|
||||
|
||||
INPUT_REGISTRY = InputRegistry()
|
||||
"""
|
||||
The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used
|
||||
by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the
|
||||
target model.
|
||||
"""
|
||||
from .registry import InputContext, InputProcessingContext
|
||||
|
||||
__all__ = [
|
||||
"DataPrompt",
|
||||
@ -36,9 +28,6 @@ __all__ = [
|
||||
"build_explicit_enc_dec_prompt",
|
||||
"to_enc_dec_tuple_list",
|
||||
"zip_enc_dec_prompts",
|
||||
"INPUT_REGISTRY",
|
||||
"DummyData",
|
||||
"InputContext",
|
||||
"InputProcessingContext",
|
||||
"InputRegistry",
|
||||
]
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Mapping
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Union
|
||||
|
||||
import torch
|
||||
from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
|
||||
@ -15,16 +15,9 @@ from vllm.utils.jsontree import JSONTree, json_map_leaves
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
|
||||
MultiModalRegistry)
|
||||
from vllm.sequence import SequenceData
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
else:
|
||||
ModelConfig = Any
|
||||
MultiModalDataDict = Any
|
||||
MultiModalPlaceholderDict = Any
|
||||
MultiModalRegistry = Any
|
||||
SequenceData = Any
|
||||
AnyTokenizer = Any
|
||||
|
||||
_T = TypeVar("_T")
|
||||
@ -191,61 +184,3 @@ class InputProcessingContext(InputContext):
|
||||
f"on data={data} with kwargs={allowed_kwargs}")
|
||||
|
||||
raise ValueError(msg) from exc
|
||||
|
||||
|
||||
class DummyData(NamedTuple):
    """
    Dummy data used for profiling.

    Note: This is only used in V0.
    """

    # Dummy token sequence used to exercise the model during profiling.
    seq_data: SequenceData
    # Dummy multi-modal inputs; None for text-only profiling.
    multi_modal_data: Optional[MultiModalDataDict] = None
    # Placeholder ranges for the multi-modal items, if any.
    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
||||
|
||||
|
||||
class InputRegistry:
    """
    Note: This is only used in V0.
    """

    def dummy_data_for_profiling(
        self,
        model_config: ModelConfig,
        seq_len: int,
        mm_registry: MultiModalRegistry,
        is_encoder_data: bool = False,
    ) -> DummyData:
        """
        Create dummy data for profiling the memory usage of a model.

        The model is identified by ``model_config``.

        Args:
            model_config: Configuration of the model being profiled.
            seq_len: Target sequence length for the dummy data.
            mm_registry: Registry used to build multi-modal dummy inputs.
            is_encoder_data: When True, build encoder-side (text-only)
                dummy data instead of decoder-side data.
        """
        # Avoid circular import
        from vllm.multimodal.cache import processor_only_cache_from_config
        from vllm.sequence import SequenceData

        # Text-only models: a zero-token prompt padded to seq_len suffices.
        if not model_config.is_multimodal_model:
            seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
            return DummyData(seq_data=seq_data)

        cache = processor_only_cache_from_config(model_config, mm_registry)

        # Encoder dummy data does not contain multi-modal data
        if is_encoder_data:
            enc_data = mm_registry.get_encoder_dummy_data(model_config,
                                                          seq_len,
                                                          cache=cache)
            seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids)
            return DummyData(seq_data=seq_data)

        dec_data = mm_registry.get_decoder_dummy_data(model_config,
                                                      seq_len,
                                                      cache=cache)

        return DummyData(
            seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids),
            multi_modal_data=dec_data.multi_modal_data.get_data(),
            multi_modal_placeholders=dec_data.multi_modal_placeholders,
        )
||||
|
||||
@ -2,7 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.logging_utils.formatter import NewLineFormatter
|
||||
from vllm.logging_utils.log_time import logtime
|
||||
|
||||
__all__ = [
|
||||
"NewLineFormatter",
|
||||
"logtime",
|
||||
]
|
||||
|
||||
32
vllm/logging_utils/log_time.py
Normal file
32
vllm/logging_utils/log_time.py
Normal file
@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Provides a timeslice logging decorator
|
||||
"""
|
||||
|
||||
import functools
|
||||
import time
|
||||
|
||||
|
||||
def logtime(logger, msg=None):
    """
    Logs the execution time of the decorated function.
    Always place it beneath other decorators.
    """

    def _inner(func):

        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            t0 = time.perf_counter()
            out = func(*args, **kwargs)
            dt = time.perf_counter() - t0

            if msg is None:
                label = f"Function '{func.__module__}.{func.__qualname__}'"
            else:
                label = msg
            logger.debug("%s: Elapsed time %.7f secs", label, dt)
            return out

        return _wrapper

    return _inner
||||
@ -3,13 +3,9 @@
|
||||
|
||||
from vllm.model_executor.parameter import (BasevLLMParameter,
|
||||
PackedvLLMParameter)
|
||||
from vllm.model_executor.sampling_metadata import (SamplingMetadata,
|
||||
SamplingMetadataCache)
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
|
||||
__all__ = [
|
||||
"SamplingMetadata",
|
||||
"SamplingMetadataCache",
|
||||
"set_random_seed",
|
||||
"BasevLLMParameter",
|
||||
"PackedvLLMParameter",
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user