mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-05-06 11:37:59 +08:00)

commit 0fa9111e82

update

Signed-off-by: bk-201 <joy25810@foxmail.com>
@@ -25,25 +25,28 @@ function cpu_tests() {
 
 # offline inference
 podman exec -it "$container_id" bash -c "
-set -e
+set -xve
-python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
 # Run basic model test
 podman exec -it "$container_id" bash -c "
-set -e
+set -evx
 pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
 pip install sentence-transformers datamodel_code_generator
-pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+# Note: disable Bart until supports V1
+# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
 pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
 pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
 pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
 pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 
 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
 
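Note on the shell changes above: the test scripts now run with tracing enabled and append their output to log files. A minimal illustrative sketch of what the new flags do (plain bash; the command and log path are the ones used in the script):

    set -e   # abort on the first failing command
    set -v   # echo each script line as it is read
    set -x   # trace each command after argument expansion
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m >> "$HOME/test_basic.log"   # append stdout to the log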
@@ -44,6 +44,5 @@ docker run \
 pytest -v -s v1/structured_output
 pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
 pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-pytest -v -s v1/test_metrics
 pytest -v -s v1/test_serial_utils.py
 '
.github/CODEOWNERS (vendored): 8 changed lines

@@ -121,3 +121,11 @@ mkdocs.yaml @hmellor
 
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
 
 import aiohttp
 import huggingface_hub.constants

@@ -28,13 +27,13 @@ class RequestFuncInput:
 prompt_len: int
 output_len: int
 model: str
-model_name: Optional[str] = None
+model_name: str | None = None
-logprobs: Optional[int] = None
+logprobs: int | None = None
-extra_body: Optional[dict] = None
+extra_body: dict | None = None
-multi_modal_content: Optional[dict | list[dict]] = None
+multi_modal_content: dict | list[dict] | None = None
 ignore_eos: bool = False
-language: Optional[str] = None
+language: str | None = None
-request_id: Optional[str] = None
+request_id: str | None = None
 
 
 @dataclass

@@ -52,7 +51,7 @@ class RequestFuncOutput:
 
 async def async_request_tgi(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith("generate_stream")

@@ -133,7 +132,7 @@ async def async_request_tgi(
 
 async def async_request_trt_llm(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith("generate_stream")

@@ -204,7 +203,7 @@ async def async_request_trt_llm(
 
 async def async_request_deepspeed_mii(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("completions", "profile")), (

@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
 
 async def async_request_openai_completions(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("completions", "profile")), (

@@ -367,7 +366,7 @@ async def async_request_openai_completions(
 
 async def async_request_openai_chat_completions(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("chat/completions", "profile")), (

@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
 
 async def async_request_openai_audio(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 # Lazy import without PlaceholderModule to avoid vllm dep.
 import soundfile

@@ -610,7 +609,7 @@ def get_tokenizer(
 tokenizer_mode: str = "auto",
 trust_remote_code: bool = False,
 **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
 if pretrained_model_name_or_path is not None and not os.path.exists(
 pretrained_model_name_or_path
 ):
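The typing changes in this and the following files all follow the same pattern: PEP 604 union syntax (Python 3.10+) replaces typing.Optional and typing.Union, and Callable is imported from collections.abc instead of typing. An illustrative before/after sketch (the function and variable names here are hypothetical, not taken from the diff):

    # Before: typing-module spellings
    from typing import Callable, Optional, Union

    def fetch(url: str, timeout: Optional[float] = None) -> Union[str, bytes]: ...
    on_done: Optional[Callable[[str], None]] = None

    # After: PEP 604 unions and collections.abc.Callable (requires Python >= 3.10)
    from collections.abc import Callable

    def fetch(url: str, timeout: float | None = None) -> str | bytes: ...
    on_done: Callable[[str], None] | None = None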
@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import PreTrainedTokenizerBase
 

@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
 input_length_range: tuple[int, int],
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 ) -> list[Request]:
 if fixed_output_len is not None and fixed_output_len < 4:
 raise ValueError("output_len too small")

@@ -128,7 +127,7 @@ def sample_requests_from_random(
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
 input_length_range: tuple[int, int],
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 prefix_len: int,
 ) -> list[Request]:
 requests = []
@@ -7,7 +7,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 

@@ -24,7 +23,7 @@ def sample_requests(
 dataset_path: str,
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
 if fixed_output_len is not None and fixed_output_len < 4:
 raise ValueError("output_len too small")
@@ -32,7 +32,6 @@ import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from typing import Optional
 
 import datasets
 import numpy as np

@@ -316,7 +315,7 @@ def calculate_metrics(
 tokenizer: PreTrainedTokenizerBase,
 selected_percentile_metrics: list[str],
 selected_percentiles: list[float],
-goodput_config_dict: Optional[dict[str, float]] = None,
+goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
 actual_output_lens: list[int] = []
 total_input = 0

@@ -436,9 +435,9 @@ async def benchmark(
 selected_percentile_metrics: list[str],
 selected_percentiles: list[str],
 ignore_eos: bool,
-max_concurrency: Optional[int],
+max_concurrency: int | None,
 structured_output_ratio: float,
-goodput_config_dict: Optional[dict[str, float]] = None,
+goodput_config_dict: dict[str, float] | None = None,
 ):
 if backend in ASYNC_REQUEST_FUNCS:
 request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any
 
 
 def convert_to_pytorch_benchmark_format(

@@ -92,7 +92,7 @@ class TimeCollector:
 def __init__(self, scale: int) -> None:
 self.cnt: int = 0
 self._sum: int = 0
-self._max: Optional[int] = None
+self._max: int | None = None
 self.scale = scale
 self.start_time: int = time.monotonic_ns()
 

@@ -104,13 +104,13 @@ class TimeCollector:
 else:
 self._max = max(self._max, v)
 
-def avg(self) -> Union[float, str]:
+def avg(self) -> float | str:
 return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
 
-def max(self) -> Union[float, str]:
+def max(self) -> float | str:
 return self._max / self.scale if self._max else "N/A"
 
-def dump_avg_max(self) -> list[Union[float, str]]:
+def dump_avg_max(self) -> list[float | str]:
 return [self.avg(), self.max()]
 
 def __enter__(self) -> None:

@@ -118,8 +118,8 @@ class TimeCollector:
 
 def __exit__(
 self,
-exc_type: Optional[type[BaseException]],
+exc_type: type[BaseException] | None,
-exc_value: Optional[BaseException],
+exc_value: BaseException | None,
-exc_traceback: Optional[TracebackType],
+exc_traceback: TracebackType | None,
 ) -> None:
 self.collect(time.monotonic_ns() - self.start_time)
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
-from typing import Callable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
-from typing import Callable, Optional
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -53,7 +52,7 @@ def bench_int8(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 """Benchmark INT8-based kernels."""
 assert dtype == torch.int8

@@ -108,7 +107,7 @@ def bench_fp8(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 """Benchmark FP8-based kernels."""
 assert dtype == torch.float8_e4m3fn

@@ -183,7 +182,7 @@ def bench(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 if dtype == torch.int8:
 return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)

@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
 dtype: torch.dtype,
 MKNs: Iterable[tuple[int, int, int]],
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 results = []
 for m, k, n in MKNs:
@@ -3,10 +3,9 @@
 
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
 rms_norm_layer: RMSNorm,
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 # Norm

@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
 rms_norm_layer: RMSNorm,
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 # Norm

@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
 rms_norm_layer: RMSNorm,  # this stores the weights
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 out, _ = ops.rms_norm_dynamic_per_token_quant(
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch
 
 import pandas as pd
@@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional
 
 import torch
 import torch.distributed as dist

@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
 def benchmark_allreduce_single(
 self,
 sequence_length: int,
-allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
 should_use_fn: Callable[[torch.Tensor], bool],
 context,
 num_warmup: int,
 num_trials: int,
-) -> Optional[float]:
+) -> float | None:
 """Benchmark method with CUDA graph optimization."""
 try:
 # Create test tensor (2D: sequence_length x hidden_size)
@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -158,7 +159,7 @@ def ref_group_gemm(
 seq_lens_cpu: torch.Tensor,
 prompt_lora_mapping_cpu: torch.Tensor,
 scaling: float,
-add_inputs: Optional[bool],
+add_inputs: bool | None,
 ):
 """
 Torch group gemm reference implementation to test correctness of

@@ -316,8 +317,8 @@ class BenchmarkContext:
 lora_rank: int
 sort_by_lora_id: bool
 dtype: torch.dtype
-seq_length: Optional[int] = None
+seq_length: int | None = None
-num_slices: Optional[int] = None  # num_slices for slice based ops
+num_slices: int | None = None  # num_slices for slice based ops
 
 def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
 ctx = copy.copy(self)

@@ -561,7 +562,7 @@ class BenchmarkTensors:
 }
 
 def bench_fn_kwargs(
-self, op_type: OpType, add_inputs: Optional[bool] = None
+self, op_type: OpType, add_inputs: bool | None = None
 ) -> dict[str, Any]:
 if op_type.is_shrink_fn():
 assert add_inputs is None

@@ -575,7 +576,7 @@ class BenchmarkTensors:
 raise ValueError(f"Unrecognized optype {self}")
 
 def test_correctness(
-self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+self, op_type: OpType, expand_fn_add_inputs: bool | None
 ) -> bool:
 """
 Test correctness of op_type implementation against a grouped gemm

@@ -611,8 +612,8 @@ def bench_optype(
 ctx: BenchmarkContext,
 arg_pool_size: int,
 op_type: OpType,
-cuda_graph_nops: Optional[int] = None,
+cuda_graph_nops: int | None = None,
-expand_fn_add_inputs: Optional[bool] = None,
+expand_fn_add_inputs: bool | None = None,
 test_correctness: bool = False,
 ) -> TMeasurement:
 assert arg_pool_size >= 1

@@ -679,7 +680,7 @@ def bench_torch_mm(
 ctx: BenchmarkContext,
 arg_pool_size: int,
 op_type: OpType,
-cuda_graph_nops: Optional[int] = None,
+cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
 """
 Benchmark basic torch.mm as a roofline.

@@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
 """
 
 
-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
 compare = TBenchmark.Compare(timers)
 compare.print()
 
@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import pandas as pd
 import torch

@@ -63,23 +62,23 @@ class BenchmarkTensors:
 a: torch.Tensor
 
 w_q: torch.Tensor
-group_size: Optional[int]
+group_size: int | None
 wtype: ScalarType
 w_g_s: torch.Tensor
-w_g_zp: Optional[torch.Tensor]
+w_g_zp: torch.Tensor | None
-w_ch_s: Optional[torch.Tensor]
+w_ch_s: torch.Tensor | None
-w_tok_s: Optional[torch.Tensor]
+w_tok_s: torch.Tensor | None
 
 
 @dataclass
 class TypeConfig:
 act_type: torch.dtype
 weight_type: ScalarType
-output_type: Optional[torch.dtype]
+output_type: torch.dtype | None
-group_scale_type: Optional[torch.dtype]
+group_scale_type: torch.dtype | None
-group_zero_type: Optional[torch.dtype]
+group_zero_type: torch.dtype | None
-channel_scale_type: Optional[torch.dtype]
+channel_scale_type: torch.dtype | None
-token_scale_type: Optional[torch.dtype]
+token_scale_type: torch.dtype | None
 
 
 def rand_data(shape, dtype=torch.float16, scale=1):

@@ -93,8 +92,8 @@ def quantize_and_pack(
 atype: torch.dtype,
 w: torch.Tensor,
 wtype: ScalarType,
-stype: Optional[torch.dtype],
+stype: torch.dtype | None,
-group_size: Optional[int],
+group_size: int | None,
 zero_points: bool = False,
 ):
 assert wtype.is_integer(), "TODO: support floating point weights"

@@ -113,7 +112,7 @@ def quantize_and_pack(
 
 
 def create_bench_tensors(
-shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
 m, n, k = shape
 

@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
 return res
 
 
-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
 
 
 def bench(
@@ -3,7 +3,6 @@
 
 import random
 import time
-from typing import Optional
 
 import torch
 

@@ -37,7 +36,7 @@ def main(
 seed: int,
 do_profile: bool,
 device: str = "cuda",
-kv_cache_dtype: Optional[str] = None,
+kv_cache_dtype: str | None = None,
 ) -> None:
 current_platform.seed_everything(seed)
 
@@ -3,8 +3,8 @@
 
 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch
 
 import torch
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from typing import Optional, Union
 
 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm

@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
 def forward(
 self,
 x: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
-) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
 orig_dtype = x.dtype
 x = x.to(torch.float32)
 if residual is not None:

@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)

@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 orig_shape = x.shape

@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 orig_shape = x.shape
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate
-from typing import Optional
 
 import nvtx
 import torch

@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
 seq_len: int,
 num_heads: int,
 head_size: int,
-rotary_dim: Optional[int],
+rotary_dim: int | None,
 dtype: torch.dtype,
 seed: int,
 device: str,
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional
 
 import flashinfer
 import torch

@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_decode(
 dtype: torch.dtype,
-quant_dtypes: tuple[
-Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-],
+quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
 batch_size: int,
 max_seq_len: int,
 num_heads: tuple[int, int] = (64, 8),
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional
 
 import flashinfer
 import torch

@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
 dtype: torch.dtype,
-quant_dtypes: tuple[
-Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-],
+quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
 batch_size: int,
 max_seq_len: int,
 num_heads: tuple[int, int] = (64, 8),
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -55,7 +55,7 @@ class Bench:
 
 def __init__(
 self,
-cuda_graph_params: Optional[CudaGraphBenchParams],
+cuda_graph_params: CudaGraphBenchParams | None,
 label: str,
 sub_label: str,
 description: str,
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore

@@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
 def __init__(
 self,
-min_val: Union[int, float],
+min_val: int | float,
-max_val: Union[int, float],
+max_val: int | float,
 is_integer: bool = True,
 ) -> None:
 self.min_val = min_val

@@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
 
 
 class ConstantDistribution(Distribution):
-def __init__(self, value: Union[int, float]) -> None:
+def __init__(self, value: int | float) -> None:
 self.value = value
 self.max_val = value
 

@@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
 
 
 class ZipfDistribution(Distribution):
-def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+def __init__(self, alpha: float, max_val: int | None = None) -> None:
 self.alpha = alpha
 self.max_val = max_val
 

@@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
 
 
 class PoissonDistribution(Distribution):
-def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+def __init__(self, alpha: float, max_val: int | None = None) -> None:
 self.alpha = alpha
 self.max_val = max_val
 

@@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
 def __init__(
 self,
-mean: Optional[float] = None,
+mean: float | None = None,
-sigma: Optional[float] = None,
+sigma: float | None = None,
-average: Optional[int] = None,
+average: int | None = None,
-median_ratio: Optional[float] = None,
+median_ratio: float | None = None,
-max_val: Optional[int] = None,
+max_val: int | None = None,
 ) -> None:
 self.average = average
 self.median_ratio = median_ratio
@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Union
+from typing import NamedTuple
 
 import aiohttp  # type: ignore
 import numpy as np  # type: ignore

@@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
 def __init__(self, logger: logging.Logger, window_size: int) -> None:
 self.logger = logger
-self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+self.metrics: dict[str, MovingAverage | MetricStats] = {
 "moving_avg_ttft_ms": MovingAverage(window_size),
 "moving_avg_tpot_ms": MovingAverage(window_size),
 "ttft_ms": MetricStats(),

@@ -636,7 +636,7 @@ async def client_main(
 
 if args.verbose:
 curr_time_sec: float = time.perf_counter()
-time_since_last_turn: Union[str, float] = "N/A"
+time_since_last_turn: str | float = "N/A"
 if conv_id in time_of_last_turn:
 time_since_last_turn = round(
 curr_time_sec - time_of_last_turn[conv_id], 3

@@ -928,13 +928,13 @@ async def main_mp(
 f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501
 )
 
-rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+rps: str | float = round(len(client_metrics) / runtime_sec, 3)
 if len(client_metrics) < (5 * bench_args.num_clients):
 # Do not estimate the RPS if the number of samples is very low
 # (threshold can be tuned if needed)
 rps = "N/A"
 
-runtime_left_sec: Union[str, float] = round(
+runtime_left_sec: str | float = round(
 (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
 )
 if percent < 0.05:
@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any
 
 import pandas as pd  # type: ignore
 import tqdm  # type: ignore

@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
 
 
 def content_is_valid(
-content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
 if min_content_len and len(content) < min_content_len:
 return False

@@ -37,7 +37,7 @@ def content_is_valid(
 
 
 def print_stats(
-conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
 # Collect statistics
 stats = []

@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
 seed: int,
 input_file: str,
 output_file: str,
-max_items: Optional[int],
+max_items: int | None,
-min_content_len: Optional[int] = None,
+min_content_len: int | None = None,
-max_content_len: Optional[int] = None,
+max_content_len: int | None = None,
-min_turns: Optional[int] = None,
+min_turns: int | None = None,
-max_turns: Optional[int] = None,
+max_turns: int | None = None,
-model: Optional[str] = None,
+model: str | None = None,
 ) -> None:
 if min_turns and max_turns:
 assert min_turns <= max_turns
@@ -198,13 +198,24 @@ else()
 endif()
 
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
-FetchContent_Declare(
-oneDNN
-GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-GIT_TAG v3.9
-GIT_PROGRESS TRUE
-GIT_SHALLOW TRUE
-)
+set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+FetchContent_Declare(
+oneDNN
+SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+)
+else()
+message(STATUS "Downloading oneDNN from GitHub")
+FetchContent_Declare(
+oneDNN
+GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+GIT_TAG v3.9
+GIT_PROGRESS TRUE
+GIT_SHALLOW TRUE
+)
+endif()
 
 if(USE_ACL)
 find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)

@@ -227,7 +238,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
 set(ONEDNN_ENABLE_ITT_TASKS "OFF")
 set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
 set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-set(ONEDNN_VERBOSE "ON")
+set(ONEDNN_VERBOSE "OFF")
 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
 FetchContent_MakeAvailable(oneDNN)

@@ -309,4 +320,4 @@ define_gpu_extension_target(
 WITH_SOABI
 )
 
 message(STATUS "Enabling C extension.")
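With the change above, the oneDNN sources can be supplied from a local checkout instead of being cloned from GitHub at configure time. A hedged usage sketch (the checkout path is hypothetical; either the environment variable read by the new set() call or an explicit cache entry at configure time should select the local copy):

    # Hypothetical local clone of oneDNN (e.g. checked out at the v3.9 tag)
    export FETCHCONTENT_SOURCE_DIR_ONEDNN=$HOME/src/oneDNN
    # ...then run the usual CPU build; the new if() branch passes SOURCE_DIR to FetchContent instead of GIT_REPOSITORY.
    # Equivalently, the cache variable can be set directly when CMake is invoked:
    #   cmake -DFETCHCONTENT_SOURCE_DIR_ONEDNN=$HOME/src/oneDNN <other options>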
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
-from typing import Union
 
 from cutlass_library import *
 

@@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
 TmaWarpSpecializedCooperative = enum_auto()
 
 
-VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
 **DataTypeNames,  # type: ignore
 **{
 VLLMDataType.u4b8: "u4b8",

@@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
 },
 }
 
-VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
 **DataTypeTag,  # type: ignore
 **{
 VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",

@@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 },
 }
 
-VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
+VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
 **DataTypeSize,  # type: ignore
 **{
 VLLMDataType.u4b8: 4,

@@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
 },
 }
 
-VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
 VLLMDataType.u4b8: "vllm::kU4B8",
 VLLMDataType.u8b128: "vllm::kU8B128",
 DataType.u4: "vllm::kU4",

@@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 DataType.bf16: "vllm::kBfloat16",
 }
 
-VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
 DataType.u8: "at::ScalarType::Byte",
 DataType.s8: "at::ScalarType::Char",
 DataType.e4m3: "at::ScalarType::Float8_e4m3fn",

@@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 DataType.f32: "at::ScalarType::Float",
 }
 
-VLLMKernelScheduleTag: dict[
-Union[MixedInputKernelScheduleType, KernelScheduleType], str
-] = {
+VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
 **KernelScheduleTag,  # type: ignore
 **{
 MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from copy import deepcopy
 from dataclasses import dataclass, fields
 from functools import reduce
-from typing import Optional, Union
 
 import jinja2
 from vllm_cutlass_library_extension import (

@@ -259,7 +258,7 @@ class ScheduleConfig:
 @dataclass(frozen=True)
 class TypeConfig:
 a: DataType
-b: Union[DataType, VLLMDataType]
+b: DataType | VLLMDataType
 b_group_scale: DataType
 b_group_zeropoint: DataType
 b_channel_scale: DataType

@@ -280,7 +279,7 @@ class PrepackTypeConfig:
 class ImplConfig:
 types: TypeConfig
 schedules: list[ScheduleConfig]
-heuristic: list[tuple[Optional[str], ScheduleConfig]]
+heuristic: list[tuple[str | None, ScheduleConfig]]
 
 
 def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
@@ -1,4 +1,4 @@
-ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
+ARG BASE_UBI_IMAGE_TAG=9.6-1754584681

 ###############################################################
 # Stage to build openblas
@@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder

 ARG MAX_JOBS
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
 && source /opt/rh/gcc-toolset-13/enable \
 && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
@@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel
 FROM centos-deps-builder AS base-builder

 ARG PYTHON_VERSION=3.12
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30

 # Set Environment Variables for venv, cargo & openblas
 ENV VIRTUAL_ENV=/opt/vllm
@@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
 pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
 libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
 harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
-python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
+python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \
 && dnf clean all \
 && PREFIX=/usr/local make -C /openblas install \
 && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
@@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
 FROM base-builder AS torch-builder

 ARG MAX_JOBS
-ARG TORCH_VERSION=2.6.0
+ARG TORCH_VERSION=2.7.0
 ARG _GLIBCXX_USE_CXX11_ABI=1
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30

 RUN --mount=type=cache,target=/root/.cache/uv \
 source /opt/rh/gcc-toolset-13/enable && \
@@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 MAX_JOBS=${MAX_JOBS:-$(nproc)} \
 PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/

-ARG TORCHVISION_VERSION=0.21.0
+ARG TORCHVISION_VERSION=0.22.0
 ARG TORCHVISION_USE_NVJPEG=0
 ARG TORCHVISION_USE_FFMPEG=0
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 BUILD_VERSION=${TORCHVISION_VERSION} \
 uv build --wheel --out-dir /torchwheels/ --no-build-isolation

-ARG TORCHAUDIO_VERSION=2.6.0
+ARG TORCHAUDIO_VERSION=2.7.0
 ARG BUILD_SOX=1
 ARG BUILD_KALDI=1
 ARG BUILD_RNNT=1
@@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder

 ARG MAX_JOBS
 ARG PYARROW_PARALLEL
-ARG PYARROW_VERSION=19.0.1
+ARG PYARROW_VERSION=21.0.0
 RUN --mount=type=cache,target=/root/.cache/uv \
 source /opt/rh/gcc-toolset-13/enable && \
 git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
@@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 make install -j ${MAX_JOBS:-$(nproc)} && \
 cd ../../python/ && \
 uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
-pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
 PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
 python setup.py build_ext \
 --build-type=release --bundle-arrow-cpp \
@@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
 && make -j ${MAX_JOBS:-$(nproc)}


+###############################################################
+# Stage to build numba
+###############################################################
+
+FROM base-builder AS numba-builder
+
+ARG MAX_JOBS
+ARG NUMBA_VERSION=0.61.2
+
+# Clone all required dependencies
+RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
+git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
+cd ./numba && \
+if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
+sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
+fi && python -m build --wheel --installer=uv --outdir /numbawheels/
+
 ###############################################################
 # Stage to build vllm - this stage builds and installs
 # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
@@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null
 COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
+COPY --from=numba-builder /tmp/control /dev/null

 ARG VLLM_TARGET_DEVICE=cpu
 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
@@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
 # this step installs vllm and populates uv cache
 # with all the transitive dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
+dnf install llvm15 llvm15-devel -y && \
+rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
 source /opt/rh/gcc-toolset-13/enable && \
 git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
 uv pip install maturin && \
@@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
 --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
 --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
+--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
 --mount=type=bind,src=.,dst=/src/,rw \
 source /opt/rh/gcc-toolset-13/enable && \
-uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
+export PATH=$PATH:/usr/lib64/llvm15/bin && \
+uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
 sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
-uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
+sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \
+uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \
 make -C /numactl install && \
 # sentencepiece.pc is in some pkgconfig inside uv cache
 export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
-uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
+nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
 cd /src/ && \
 uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
 uv pip install /vllmwheel/*.whl
@@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai

 ARG PYTHON_VERSION=3.12
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30

 # Set Environment Variables for venv & openblas
 ENV VIRTUAL_ENV=/opt/vllm
@@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
 COPY --from=lapack-builder /tmp/control /dev/null
 COPY --from=openblas-builder /tmp/control /dev/null
+COPY --from=numba-builder /tmp/control /dev/null

 # install gcc-11, python, openblas, numactl, lapack
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
 rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
 microdnf install --nodocs -y \
-tar findutils openssl \
+libomp tar findutils openssl llvm15 llvm15-devel \
 pkgconfig xsimd g++ gcc-fortran libsndfile \
 libtiff libjpeg openjpeg2 zlib zeromq \
 freetype lcms2 libwebp tcl tk utf8proc \
-harfbuzz fribidi libraqm libimagequant libxcb \
+harfbuzz fribidi libraqm libimagequant libxcb util-linux \
 python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
-&& microdnf clean all \
+&& export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \
 && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
 && python -m pip install -U pip uv --no-cache \
 && make -C /numactl install \
@@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
 --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
 --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
-HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
+--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
+export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \
+HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl


 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
@@ -314,4 +340,4 @@ WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

 ENTRYPOINT ["vllm", "serve"]
@@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils

+# install nixl from source code
+RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
+
 ENTRYPOINT ["vllm", "serve"]
@@ -16,7 +16,7 @@ Declare supported languages and capabilities:

 ??? code "supported_languages and supports_transcription_only"
 ```python
-from typing import ClassVar, Mapping, Optional, Literal
+from typing import ClassVar, Mapping, Literal
 import numpy as np
 import torch
 from torch import nn
@@ -81,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
 audio: np.ndarray,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-language: Optional[str],
+language: str | None,
 task_type: Literal["transcribe", "translate"],
 request_prompt: str,
-to_language: Optional[str],
+to_language: str | None,
 ) -> PromptType:
 # Example with a free-form instruction prompt
 task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@@ -117,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
 audio: np.ndarray,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-language: Optional[str],
+language: str | None,
 task_type: Literal["transcribe", "translate"],
 request_prompt: str,
-to_language: Optional[str],
+to_language: str | None,
 ) -> PromptType:
 if language is None:
 raise ValueError("Language must be specified")
@@ -150,7 +150,7 @@ If your model requires a language and you want a default, override this method (
 ??? code "validate_language()"
 ```python
 @classmethod
-def validate_language(cls, language: Optional[str]) -> Optional[str]:
+def validate_language(cls, language: str | None) -> str | None:
 if language is None:
 logger.warning(
 "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
@@ -175,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
 audio_duration_s: float,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-) -> Optional[int]:
+) -> int | None:
 # Return None if unknown; otherwise return an estimate.
 return int(audio_duration_s * stt_config.sample_rate // 320)  # example
 ```
@@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
 from collections.abc import Sequence
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 import torch

@@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
 @abstractmethod
 def update_state(
 self,
-batch_update: Optional["BatchUpdate"],
+batch_update: "BatchUpdate" | None,
 ) -> None:
 """Called when there are new output tokens, prior
 to each forward pass.
@@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
 * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
 * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling

-* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
 * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
 * Use the `BatchUpdate` members to update logits processor internal state
 * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
@@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes
 ??? code "Example custom logits processor definition"

 ``` python
-from typing import Optional
 import torch
 from vllm.config import VllmConfig
 from vllm.sampling_params import SamplingParams
@@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes
 """Never impacts greedy sampling"""
 return False

-def update_state(self, batch_update: Optional[BatchUpdate]):
+def update_state(self, batch_update: BatchUpdate | None):
 if not batch_update:
 return

@@ -10,7 +10,7 @@ on HuggingFace model repository.

 import os
 from dataclasses import asdict
-from typing import Any, NamedTuple, Optional
+from typing import Any, NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -30,11 +30,11 @@ question_per_audio_count = {

 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
-prompt: Optional[str] = None
+prompt: str | None = None
-prompt_token_ids: Optional[dict[str, list[int]]] = None
+prompt_token_ids: dict[str, list[int]] | None = None
-multi_modal_data: Optional[dict[str, Any]] = None
+multi_modal_data: dict[str, Any] | None = None
-stop_token_ids: Optional[list[int]] = None
+stop_token_ids: list[int] | None = None
-lora_requests: Optional[list[LoRARequest]] = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import logging
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@@ -81,7 +81,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):

 def get_finished(
 self, finished_req_ids: set[str]
-) -> tuple[Optional[set[str]], Optional[set[str]]]:
+) -> tuple[set[str] | None, set[str] | None]:
 if self._async_load:
 meta = self._get_connector_metadata()
 assert isinstance(meta, RogueSharedStorageConnectorMetadata)
@@ -33,8 +33,6 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """

-from typing import Optional
-
 import torch

 from vllm import LLM, SamplingParams
@@ -58,7 +56,7 @@ class DummyLogitsProcessor(LogitsProcessor):
 def is_argmax_invariant(self) -> bool:
 return False

-def update_state(self, batch_update: Optional[BatchUpdate]):
+def update_state(self, batch_update: BatchUpdate | None):
 process_dict_updates(
 self.req_info,
 batch_update,
@@ -39,7 +39,7 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """

-from typing import Any, Optional
+from typing import Any

 import torch

@@ -82,7 +82,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 def new_req_logits_processor(
 self,
 params: SamplingParams,
-) -> Optional[RequestLogitsProcessor]:
+) -> RequestLogitsProcessor | None:
 """This method returns a new request-level logits processor, customized
 to the `target_token` value associated with a particular request.

@@ -96,7 +96,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 Returns:
 `Callable` request logits processor, or None
 """
-target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+target_token: Any | None = params.extra_args and params.extra_args.get(
 "target_token"
 )
 if target_token is None:
@@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda"
 device, the first and third requests would not repeat the same token.
 """

-from typing import Optional
-
 import torch

 from vllm import LLM, SamplingParams
@@ -91,7 +89,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 def new_req_logits_processor(
 self,
 params: SamplingParams,
-) -> Optional[RequestLogitsProcessor]:
+) -> RequestLogitsProcessor | None:
 """This method returns a new request-level logits processor, customized
 to the `target_token` value associated with a particular request.

@@ -8,7 +8,6 @@ Requires HuggingFace credentials for access.
 """

 import gc
-from typing import Optional

 import torch
 from huggingface_hub import snapshot_download
@@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
 lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
 return [
 # this is an example of using quantization without LoRA
 (
@@ -56,7 +55,7 @@ def create_test_prompts(

 def process_requests(
 engine: LLMEngine,
-test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
 """Continuously process a list of prompts and handle the outputs."""
 request_id = 0
@@ -78,7 +77,7 @@ def process_requests(


 def initialize_engine(
-model: str, quantization: str, lora_repo: Optional[str]
+model: str, quantization: str, lora_repo: str | None
 ) -> LLMEngine:
 """Initialize the LLMEngine."""

@@ -7,8 +7,6 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """

-from typing import Optional
-
 from huggingface_hub import snapshot_download

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
 lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
 """Create a list of test prompts with their sampling parameters.

 2 requests for base model, 4 requests for the LoRA. We define 2
@@ -68,7 +66,7 @@ def create_test_prompts(

 def process_requests(
 engine: LLMEngine,
-test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
 """Continuously process a list of prompts and handle the outputs."""
 request_id = 0
@@ -3,7 +3,6 @@
 import argparse
 import datetime
 import os
-from typing import Union

 import albumentations
 import numpy as np
@@ -160,7 +159,7 @@ def load_example(
 file_paths: list[str],
 mean: list[float] = None,
 std: list[float] = None,
-indices: Union[list[int], None] = None,
+indices: list[int] | None = None,
 ):
 """Build an input example by loading images in *file_paths*.

@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-from typing import Callable, Optional, TypedDict
+from collections.abc import Callable
+from typing import TypedDict

 import torch
 import zmq
@@ -71,7 +72,7 @@ class WorkerExtension:


 def rebuild_ipc(
-handle: tuple[Callable, tuple], device_id: Optional[int] = None
+handle: tuple[Callable, tuple], device_id: int | None = None
 ) -> torch.Tensor:
 func, args = handle
 list_args = list(args)
@@ -109,7 +110,7 @@ class ColocateWorkerExtension:
 self._zmq_ctx = zmq.Context()
 socket = self._zmq_ctx.socket(zmq.REP)
 socket.connect(zmq_handles[self.report_device_id()])
-buffer: Optional[torch.Tensor] = None
+buffer: torch.Tensor | None = None
 while True:
 payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
 socket.recv_pyobj()
@@ -12,7 +12,7 @@ import os
 import random
 from contextlib import contextmanager
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -28,8 +28,8 @@ from vllm.utils import FlexibleArgumentParser
 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
 prompts: list[str]
-stop_token_ids: Optional[list[int]] = None
+stop_token_ids: list[int] | None = None
-lora_requests: Optional[list[LoRARequest]] = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -9,7 +9,7 @@ using the chat template defined by the model.
 import os
 from argparse import Namespace
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from PIL.Image import Image
@@ -41,9 +41,9 @@ class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
 prompt: str
 image_data: list[Image]
-stop_token_ids: Optional[list[int]] = None
+stop_token_ids: list[int] | None = None
-chat_template: Optional[str] = None
+chat_template: str | None = None
-lora_requests: Optional[list[LoRARequest]] = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -1251,7 +1251,7 @@ model_example_map = {
 }


-def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
+def run_generate(model, question: str, image_urls: list[str], seed: int | None):
 req_data = model_example_map[model](question, image_urls)

 engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
@@ -1277,7 +1277,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int
 print("-" * 50)


-def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
+def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
 req_data = model_example_map[model](question, image_urls)

 # Disable other modalities to save memory
@@ -11,7 +11,7 @@ on HuggingFace model repository.
 from argparse import Namespace
 from dataclasses import asdict
 from pathlib import Path
-from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args

 from PIL.Image import Image

@@ -47,15 +47,15 @@ class TextImagesQuery(TypedDict):


 QueryModality = Literal["text", "image", "text+image", "text+images"]
-Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
+Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery


 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
-prompt: Optional[str] = None
+prompt: str | None = None
-image: Optional[Image] = None
+image: Image | None = None
-query: Optional[str] = None
+query: str | None = None
-documents: Optional[ScoreMultiModalParam] = None
+documents: ScoreMultiModalParam | None = None


 def run_clip(query: Query) -> ModelRequestData:
@@ -281,7 +281,7 @@ def get_query(modality: QueryModality):
 raise ValueError(msg)


-def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
+def run_encode(model: str, modality: QueryModality, seed: int | None):
 query = get_query(modality)
 req_data = model_example_map[model](query)

@@ -311,7 +311,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
 print("-" * 50)


-def run_score(model: str, modality: QueryModality, seed: Optional[int]):
+def run_score(model: str, modality: QueryModality, seed: int | None):
 query = get_query(modality)
 req_data = model_example_map[model](query)

@@ -23,7 +23,7 @@ import logging
 import os
 import sys
 from abc import ABC, abstractmethod
-from typing import Callable, Optional
+from collections.abc import Callable

 import aiohttp
 import requests
@@ -49,12 +49,9 @@ class Proxy:
 decode_instances: list[str],
 model: str,
 scheduling_policy: SchedulingPolicy,
-custom_create_completion: Optional[
-Callable[[Request], StreamingResponse]
-] = None,
-custom_create_chat_completion: Optional[
-Callable[[Request], StreamingResponse]
-] = None,
+custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
+custom_create_chat_completion: Callable[[Request], StreamingResponse]
+| None = None,
 ):
 self.prefill_instances = prefill_instances
 self.decode_instances = decode_instances
@@ -348,9 +345,9 @@ class ProxyServer:
 def __init__(
 self,
 args: argparse.Namespace,
-scheduling_policy: Optional[SchedulingPolicy] = None,
+scheduling_policy: SchedulingPolicy | None = None,
-create_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+create_completion: Callable[[Request], StreamingResponse] | None = None,
-create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
 ):
 self.validate_parsed_serve_args(args)
 self.port = args.port
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union
+from typing import Any

 import msgspec
 import zmq
@@ -25,16 +25,16 @@ class KVCacheEvent(

 class BlockStored(KVCacheEvent):
 block_hashes: list[ExternalBlockHash]
-parent_block_hash: Optional[ExternalBlockHash]
+parent_block_hash: ExternalBlockHash | None
 token_ids: list[int]
 block_size: int
-lora_id: Optional[int]
+lora_id: int | None
-medium: Optional[str]
+medium: str | None


 class BlockRemoved(KVCacheEvent):
 block_hashes: list[ExternalBlockHash]
-medium: Optional[str]
+medium: str | None


 class AllBlocksCleared(KVCacheEvent):
@@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent):


 class KVEventBatch(EventBatch):
-events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+events: list[BlockStored | BlockRemoved | AllBlocksCleared]


 def process_event(event_batch):
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-from typing import Optional

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -43,7 +42,7 @@ async def main():
 )

 prompt = "Who won the 2004 World Series?"
-final_output: Optional[RequestOutput] = None
+final_output: RequestOutput | None = None
 async for output in engine_client.generate(
 prompt=prompt,
 sampling_params=sampling_params,
@@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example.
 run: vllm serve BAAI/bge-reranker-base
 """

-from typing import Union
-
 import cohere
 from cohere import Client, ClientV2

@@ -25,7 +23,7 @@ documents = [


 def cohere_rerank(
-client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
+client: Client | ClientV2, model: str, query: str, documents: list[str]
 ) -> dict:
 return client.rerank(model=model, query=query, documents=documents)

@@ -9,7 +9,7 @@ Refer to each `run_*` function for the command to run the server for that model.
 import argparse
 import base64
 import io
-from typing import Literal, Union
+from typing import Literal

 from openai import OpenAI
 from openai._types import NOT_GIVEN, NotGiven
@@ -29,7 +29,7 @@ def create_chat_embeddings(
 *,
 messages: list[ChatCompletionMessageParam],
 model: str,
-encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
 ) -> CreateEmbeddingResponse:
 """
 Convenience function for accessing vLLM's Chat Embeddings API,
@@ -1,21 +1,15 @@
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from __future__ import annotations
-
 import argparse
 import asyncio
 import enum
 import os
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal

 import openai
 import pydantic
+from openai.types.chat import ChatCompletionChunk
-if TYPE_CHECKING:
-from openai.types.chat import ChatCompletionChunk
-

 ConstraintsFormat = Literal[
 "choice",
@@ -84,12 +84,6 @@ ignore = [
 "B007",
 # f-string format
 "UP032",
-# Can remove once 3.10+ is the minimum Python version
-"UP007",
-"UP027",
-"UP035",
-"UP038",
-"UP045",
 ]

 [tool.ruff.format]
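The hunk above explains the pattern repeated throughout this commit: with Python 3.10+ as the minimum, the pyupgrade rules previously ignored here (UP007, UP035, UP045, and friends) are re-enabled, so `Optional[X]` and `Union[X, Y]` annotations get rewritten as PEP 604 unions. A minimal before/after sketch, purely illustrative (the function names below are invented, not from the diff):

```python
# Before: typing.Optional / typing.Union spellings
from typing import Optional, Union

def lookup_old(key: str, default: Optional[int] = None) -> Union[int, str]:
    return default if default is not None else key

# After: PEP 604 unions; no typing imports needed for these annotations
def lookup_new(key: str, default: int | None = None) -> int | str:
    return default if default is not None else key
```

Both spellings are equivalent at runtime on Python 3.10+; the `X | None` form is what the re-enabled ruff rules now enforce.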
@@ -10,7 +10,6 @@ wheel
 jinja2>=3.1.6
 datasets   # for benchmark scripts
 numba == 0.61.2 # Required for N-gram speculative decoding
-nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision
setup.py

@@ -540,6 +540,11 @@ def get_gaudi_sw_version():


 def get_vllm_version() -> str:
+# Allow overriding the version. This is useful to build platform-specific
+# wheels (e.g. CPU, TPU) without modifying the source.
+if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
+return env_version
+
 version = get_version(write_to="vllm/_version.py")
 sep = "+" if "+" not in version else "."  # dev versions might contain +

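The new `VLLM_VERSION_OVERRIDE` check takes precedence over the setuptools-scm-derived version. A hedged sketch of that precedence (the helper name and version strings below are invented for illustration; in practice the variable would simply be exported before building a platform-specific wheel):

```python
import os

def resolve_version(scm_version: str) -> str:
    # Mirrors the new lookup order: an explicit override wins,
    # otherwise the version derived from git metadata is used.
    if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
        return env_version
    return scm_version

os.environ["VLLM_VERSION_OVERRIDE"] = "1.2.3+cpu"   # made-up value
print(resolve_version("1.2.3.dev0+gabcdef0"))       # -> "1.2.3+cpu"
```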
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-from typing import Any, NamedTuple, Optional, cast
+from typing import Any, NamedTuple, cast

 import numpy as np
 import pytest
@@ -185,8 +185,8 @@ def _collect_mm_samples(
 output_len: int = 5,
 base_items_per_request: int = 2,
 num_mm_items_range_ratio: float = 0.0,
-limit_mm_per_prompt: Optional[dict[str, int]] = None,
+limit_mm_per_prompt: dict[str, int] | None = None,
-bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
+bucket_config: dict[tuple[int, int, int], float] | None = None,
 enable_multimodal_chat: bool = False,
 ) -> list[SampleRequest]:
 if limit_mm_per_prompt is None:
@@ -5,13 +5,14 @@ These envs only work for a small part of the tests, fix what you need!
 """

 import os
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any

 if TYPE_CHECKING:
 VLLM_CI_NO_SKIP: bool = False
-VLLM_CI_DTYPE: Optional[str] = None
+VLLM_CI_DTYPE: str | None = None
-VLLM_CI_HEAD_DTYPE: Optional[str] = None
+VLLM_CI_HEAD_DTYPE: str | None = None
-VLLM_CI_HF_DTYPE: Optional[str] = None
+VLLM_CI_HF_DTYPE: str | None = None

 environment_variables: dict[str, Callable[[], Any]] = {
 # A model family has many models with the same architecture.
@@ -2,9 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import weakref
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from copy import deepcopy
-from typing import Callable, Union

 from torch import fx
 from torch._ops import OpOverload
@@ -44,7 +43,7 @@ class TestBackend:
 Inductor config is default-initialized from VllmConfig.CompilationConfig.
 """

-def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]):
+def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
 self.custom_passes = list(passes)
 compile_config = get_current_vllm_config().compilation_config
 self.inductor_config = compile_config.inductor_compile_config
@@ -10,7 +10,7 @@ initialized randomly with a fixed seed.
 """

 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any

 import pytest
 import torch
@@ -162,7 +162,7 @@ class LlamaDecoderLayer(nn.Module):
 self,
 positions: torch.Tensor,
 hidden_states: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
 """
 For tractable computation:
@@ -217,7 +217,7 @@ class LlamaModel(nn.Module):

 def forward(
 self,
-input_ids: Optional[torch.Tensor],
+input_ids: torch.Tensor | None,
 positions: torch.Tensor,
 ) -> torch.Tensor:
 hidden_states = self.embedding_tokens(input_ids)
@@ -142,7 +142,7 @@ class TestScaledMMRSModel(_BaseScaledMMModel):
 return [torch.ops.vllm.reduce_scatter.default]

 def ops_in_model_after(self):
-return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default]
+return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default]


 class TestAGScaledMMModel(_BaseScaledMMModel):
@@ -195,7 +195,7 @@ class TestCutlassScaledMMRSModel(_BaseScaledMMModel):
 return [torch.ops.vllm.reduce_scatter.default]

 def ops_in_model_after(self):
-return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default]
+return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default]


 class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
@@ -243,9 +243,15 @@ class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dynamic", [True, False])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_async_tp_pass_replace(
-test_model: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype
+test_model: str,
+batch_size: int,
+seq_len: int,
+hidden_size: int,
+dtype: torch.dtype,
+dynamic: bool,
 ):
 if (
 test_model
@@ -269,7 +275,15 @@ def test_async_tp_pass_replace(
 # torch.distributed and cuda
 torch.multiprocessing.spawn(
 fn,
-args=(num_processes, test_model, batch_size, seq_len, hidden_size, dtype),
+args=(
+num_processes,
+test_model,
+batch_size,
+seq_len,
+hidden_size,
+dtype,
+dynamic,
+),
 nprocs=nprocs,
 )

@@ -284,6 +298,7 @@ def async_tp_pass_on_test_model(
 seq_len: int,
 hidden_size: int,
 dtype: torch.dtype,
+dynamic: bool,
 ):
 current_platform.seed_everything(0)

@@ -331,6 +346,9 @@ def async_tp_pass_on_test_model(
 (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
 )

+if dynamic:
+torch._dynamo.mark_dynamic(hidden_states, 0)
+
 compiled_model = torch.compile(model, backend=backend)
 compiled_model(hidden_states)

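The new `dynamic` parametrization above exercises the async-TP pass with a symbolic batch dimension rather than a fixed one. A hedged, standalone sketch of the same mechanism (the model and shapes below are invented for illustration):

```python
import torch

model = torch.nn.Linear(16, 16)
x = torch.randn(8, 16)

# Mark dim 0 as dynamic so torch.compile traces a symbolic batch size
# instead of specializing on the first observed size and recompiling later.
torch._dynamo.mark_dynamic(x, 0)

compiled = torch.compile(model)
compiled(x)
compiled(torch.randn(32, 16))  # served by the same dynamic-shape graph
```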
@ -1,7 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||

@@ -1,11 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from __future__ import annotations
-
import logging
import tempfile
-from typing import Any, Union
+from typing import Any

import pytest
import torch
@@ -217,7 +215,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):


def run_model(
-compile_config: Union[int, CompilationConfig],
+compile_config: int | CompilationConfig,
model: str,
model_kwargs: dict[str, Any],
):

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
-from typing import Optional

import pytest
import torch._dynamo
@@ -41,8 +40,8 @@ FP8_DTYPE = current_platform.fp8_dtype()
FP4_DTYPE = torch.uint8

# globals needed for string-import custom Dynamo backend field
-backend: Optional[TestBackend] = None
-backend_unfused: Optional[TestBackend] = None
+backend: TestBackend | None = None
+backend_unfused: TestBackend | None = None


class AttentionQuantPatternModel(torch.nn.Module):

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional

import torch

@@ -10,7 +9,7 @@ from vllm.config import CompilationLevel


class MyMod(torch.nn.Module):
-def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
if cache is not None:
return x + cache
return x * 2
@@ -24,11 +23,11 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
)

-def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# this is the function to be compiled
return self.model(x, cache)

-def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# let torch.compile compile twice
if len(self.compiled_codes) == 2:
dispatch_id = 0 if cache is None else 1

@@ -21,7 +21,7 @@ import threading
from collections.abc import Generator
from contextlib import nullcontext
from enum import Enum
-from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
+from typing import Any, Callable, TypedDict, TypeVar, cast

import numpy as np
import pytest
@@ -68,7 +68,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

_M = TypeVar("_M")

-_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
+_PromptMultiModalInput = list[_M] | list[list[_M]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
@@ -267,7 +267,7 @@ class HfRunner:

return "cpu" if current_platform.is_cpu() else current_platform.device_type

-def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+def wrap_device(self, x: _T, device: str | None = None) -> _T:
if x is None or isinstance(x, (bool,)):
return x

@@ -287,14 +287,14 @@ class HfRunner:
model_name: str,
dtype: str = "auto",
*,
-model_kwargs: Optional[dict[str, Any]] = None,
+model_kwargs: dict[str, Any] | None = None,
trust_remote_code: bool = True,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
# Set this to avoid hanging issue
-default_torch_num_threads: Optional[int] = None,
+default_torch_num_threads: int | None = None,
) -> None:
init_ctx = (
nullcontext()
@@ -319,7 +319,7 @@ class HfRunner:
model_name: str,
dtype: str = "auto",
*,
-model_kwargs: Optional[dict[str, Any]] = None,
+model_kwargs: dict[str, Any] | None = None,
trust_remote_code: bool = True,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
@@ -406,11 +406,11 @@ class HfRunner:

def get_inputs(
self,
-prompts: Union[list[str], list[list[int]]],
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
-) -> list[Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]]:
+prompts: list[str] | list[list[int]],
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
+) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
if images is not None:
assert len(prompts) == len(images)

@@ -420,9 +420,7 @@ class HfRunner:
if audios is not None:
assert len(prompts) == len(audios)

-all_inputs: list[
-Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]
-] = []
+all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
for i, prompt in enumerate(prompts):
if isinstance(prompt, str):
processor_kwargs: dict[str, Any] = {
@@ -494,10 +492,10 @@ class HfRunner:

def generate(
self,
-prompts: Union[list[str], list[list[int]]],
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+prompts: list[str] | list[list[int]],
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[list[int]], list[str]]]:
all_inputs = self.get_inputs(
@@ -522,11 +520,11 @@ class HfRunner:

def generate_greedy(
self,
-prompts: Union[list[str], list[list[int]]],
+prompts: list[str] | list[list[int]],
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[int], str]]:
outputs = self.generate(
@@ -546,9 +544,9 @@ class HfRunner:
prompts: list[str],
beam_width: int,
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
) -> list[tuple[list[list[int]], list[str]]]:
outputs = self.generate(
prompts,
@@ -574,9 +572,9 @@ class HfRunner:
self,
prompts: list[str],
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[list[torch.Tensor]]:
all_inputs = self.get_inputs(
@@ -624,7 +622,7 @@ class HfRunner:
def _hidden_states_to_logprobs(
self,
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
-num_logprobs: Optional[int],
+num_logprobs: int | None,
) -> tuple[list[dict[int, float]], int]:
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
output_len = len(hidden_states)
@@ -652,10 +650,10 @@ class HfRunner:
self,
prompts: list[str],
max_tokens: int,
-num_logprobs: Optional[int],
-images: Optional[PromptImageInput] = None,
-audios: Optional[PromptAudioInput] = None,
-videos: Optional[PromptVideoInput] = None,
+num_logprobs: int | None,
+images: PromptImageInput | None = None,
+audios: PromptAudioInput | None = None,
+videos: PromptVideoInput | None = None,
**kwargs: Any,
) -> list[TokensTextLogprobs]:
all_inputs = self.get_inputs(
@@ -734,20 +732,20 @@ class VllmRunner:
model_name: str,
runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
-tokenizer_name: Optional[str] = None,
+tokenizer_name: str | None = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
-seed: Optional[int] = 0,
-max_model_len: Optional[int] = 1024,
+seed: int | None = 0,
+max_model_len: int | None = 1024,
dtype: str = "auto",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64,
-enable_chunked_prefill: Optional[bool] = False,
+enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
-enforce_eager: Optional[bool] = False,
+enforce_eager: bool | None = False,
# Set this to avoid hanging issue
-default_torch_num_threads: Optional[int] = None,
+default_torch_num_threads: int | None = None,
**kwargs,
) -> None:
init_ctx = (
@@ -785,10 +783,10 @@ class VllmRunner:

def get_inputs(
self,
-prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+prompts: list[str] | list[torch.Tensor] | list[list[int]],
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
) -> list[dict[str, Any]]:
if any(
x is not None and len(x) != len(prompts) for x in [images, videos, audios]
@@ -824,11 +822,11 @@ class VllmRunner:

def generate(
self,
-prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
+prompts: list[str] | list[torch.Tensor] | list[list[int]],
sampling_params: SamplingParams,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
@@ -871,11 +869,11 @@ class VllmRunner:
self,
prompts: list[str],
sampling_params: SamplingParams,
-images: Optional[PromptImageInput] = None,
-audios: Optional[PromptAudioInput] = None,
-videos: Optional[PromptVideoInput] = None,
+images: PromptImageInput | None = None,
+audios: PromptAudioInput | None = None,
+videos: PromptVideoInput | None = None,
**kwargs: Any,
-) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
+) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

req_outputs = self.llm.generate(
@@ -894,11 +892,11 @@ class VllmRunner:

def generate_greedy(
self,
-prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
+prompts: list[str] | list[torch.Tensor] | list[list[int]],
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
@@ -916,15 +914,15 @@ class VllmRunner:
self,
prompts: list[str],
max_tokens: int,
-num_logprobs: Optional[int],
-num_prompt_logprobs: Optional[int] = None,
-images: Optional[PromptImageInput] = None,
-audios: Optional[PromptAudioInput] = None,
-videos: Optional[PromptVideoInput] = None,
-stop_token_ids: Optional[list[int]] = None,
-stop: Optional[list[str]] = None,
+num_logprobs: int | None,
+num_prompt_logprobs: int | None = None,
+images: PromptImageInput | None = None,
+audios: PromptAudioInput | None = None,
+videos: PromptVideoInput | None = None,
+stop_token_ids: list[int] | None = None,
+stop: list[str] | None = None,
**kwargs: Any,
-) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
+) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
@@ -957,7 +955,7 @@ class VllmRunner:
perplexities = []
for output in outputs:
output = cast(TokensTextLogprobsPromptLogprobs, output)
-token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+token_datas = cast(list[dict[int, Logprob] | None], output[3])
assert token_datas[0] is None
token_log_probs = []
for token_data in token_datas[1:]:
@@ -976,10 +974,10 @@ class VllmRunner:
prompts: list[str],
beam_width: int,
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
-concurrency_limit: Optional[int] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
+concurrency_limit: int | None = None,
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

@@ -1002,9 +1000,9 @@ class VllmRunner:
def embed(
self,
prompts: list[str],
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
*args,
**kwargs,
) -> list[list[float]]:
@@ -1023,8 +1021,8 @@ class VllmRunner:

def score(
self,
-text_1: Union[str, list[str]],
-text_2: Union[str, list[str]],
+text_1: list[str] | str,
+text_2: list[str] | str,
*args,
**kwargs,
) -> list[float]:
@@ -1226,8 +1224,8 @@ def _find_free_port() -> int:
class LocalAssetServer:
address: str
port: int
-server: Optional[http.server.ThreadingHTTPServer]
-thread: Optional[threading.Thread]
+server: http.server.ThreadingHTTPServer | None
+thread: threading.Thread | None

def __init__(self, address: str = "127.0.0.1") -> None:
self.address = address

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Any, Optional
+from typing import Any

import pytest

@@ -15,8 +15,8 @@ def _test_stopping(
llm: LLM,
expected_output: str,
expected_reason: Any,
-stop: Optional[list[str]] = None,
-stop_token_ids: Optional[list[int]] = None,
+stop: list[str] | None = None,
+stop_token_ids: list[int] | None = None,
include_in_output: bool = False,
) -> None:
output = llm.generate(

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
-from typing import Optional, Union

import msgspec
import msgspec.msgpack
@@ -78,8 +77,8 @@ class MockSubscriber:

def __init__(
self,
-pub_endpoints: Union[str, list[str]],
-replay_endpoints: Optional[Union[str, list[str]]] = None,
+pub_endpoints: str | list[str],
+replay_endpoints: str | list[str] | None = None,
topic: str = "",
decode_type=SampleBatch,
):
@@ -111,7 +110,7 @@ class MockSubscriber:
self.last_seq = -1
self.decoder = msgspec.msgpack.Decoder(type=decode_type)

-def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]:
+def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None:
"""Receive a single message with timeout"""
if not self.sub.poll(timeout):
return None

@@ -5,9 +5,8 @@
Run `pytest tests/distributed/test_comm_ops.py`.
"""

-from __future__ import annotations
-
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any

import pytest
import ray

@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple

import pytest

@@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):

class CPTestOptions(NamedTuple):
multi_node_only: bool
-load_format: Optional[str] = None
+load_format: str | None = None


@dataclass
@@ -54,7 +54,7 @@ class CPTestSettings:
dcp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False]:

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple

import pytest

@@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple):

class EPTestOptions(NamedTuple):
trust_remote_code: bool
-tokenizer_mode: Optional[str]
-load_format: Optional[str] = None
-hf_overrides: Optional[str] = None
+tokenizer_mode: str | None
+load_format: str | None = None
+hf_overrides: str | None = None


@dataclass
@@ -40,9 +40,9 @@ class EPTestSettings:
tp_base: int = 2,
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
-tokenizer_mode: Optional[str] = None,
-load_format: Optional[str] = None,
-hf_overrides: Optional[str] = None,
+tokenizer_mode: str | None = None,
+load_format: str | None = None,
+hf_overrides: str | None = None,
):
return EPTestSettings(
parallel_setups=[
@@ -72,9 +72,9 @@ class EPTestSettings:
tp_base: int = 2,
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
-tokenizer_mode: Optional[str] = None,
-load_format: Optional[str] = None,
-hf_overrides: Optional[str] = None,
+tokenizer_mode: str | None = None,
+load_format: str | None = None,
+hf_overrides: str | None = None,
):
return EPTestSettings(
parallel_setups=[

@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple

import pytest

@@ -35,7 +35,7 @@ class ParallelSetup(NamedTuple):

class PPTestOptions(NamedTuple):
multi_node_only: bool
-load_format: Optional[str] = None
+load_format: str | None = None


@dataclass
@@ -52,7 +52,7 @@ class PPTestSettings:
pp_base: int = 2,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
return PPTestSettings(
parallel_setups=[
@@ -76,7 +76,7 @@ class PPTestSettings:
pp_base: int = 2,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
return PPTestSettings(
parallel_setups=[

@@ -1,16 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
import pytest
+from typing_extensions import LiteralString

from ..utils import compare_two_settings, create_new_process_for_each_test

-if TYPE_CHECKING:
-from typing_extensions import LiteralString
-

@pytest.mark.parametrize(
"PP_SIZE, MODEL_NAME",

@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple

import pytest

@@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):

class SPTestOptions(NamedTuple):
multi_node_only: bool
-load_format: Optional[str] = None
+load_format: str | None = None


@dataclass
@@ -53,7 +53,7 @@ class SPTestSettings:
pp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False, True]:
@@ -84,7 +84,7 @@ class SPTestSettings:
pp_base: int = 1,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False, True]:
@@ -115,7 +115,7 @@ class SPTestSettings:
pp_base: int = 1,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
parallel_setups = []
for fusion_val in [False, True]:

@@ -5,7 +5,7 @@ import json
from argparse import ArgumentError
from contextlib import nullcontext
from dataclasses import dataclass, field
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal

import pytest

@@ -115,9 +115,9 @@ class NestedConfig:
class DummyConfig:
regular_bool: bool = True
"""Regular bool with default True"""
-optional_bool: Optional[bool] = None
+optional_bool: bool | None = None
"""Optional bool with default None"""
-optional_literal: Optional[Literal["x", "y"]] = None
+optional_literal: Literal["x", "y"] | None = None
"""Optional literal with default None"""
tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
"""Tuple with variable length"""
@@ -127,7 +127,7 @@ class DummyConfig:
"""List with variable length"""
list_literal: list[Literal[1, 2]] = field(default_factory=list)
"""List with literal choices"""
-list_union: list[Union[str, type[object]]] = field(default_factory=list)
+list_union: list[str | type[object]] = field(default_factory=list)
"""List with union type"""
literal_literal: Literal[Literal[1], Literal[2]] = 1
"""Literal of literals with default 1"""
@@ -152,11 +152,11 @@ def test_is_not_builtin(type_hint, expected):
("type_hint", "expected"),
[
(Annotated[int, "annotation"], {int}),
-(Optional[int], {int, type(None)}),
-(Annotated[Optional[int], "annotation"], {int, type(None)}),
-(Optional[Annotated[int, "annotation"]], {int, type(None)}),
+(int | None, {int, type(None)}),
+(Annotated[int | None, "annotation"], {int, type(None)}),
+(Annotated[int, "annotation"] | None, {int, type(None)}),
],
-ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"],
+ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"],
)
def test_get_type_hints(type_hint, expected):
assert get_type_hints(type_hint) == expected

@@ -3,7 +3,7 @@

import asyncio
import random
-from typing import Callable
+from collections.abc import Callable

import openai
import pytest

@@ -3,7 +3,6 @@

# imports for structured outputs tests
import json
-from typing import Optional

import jsonschema
import openai  # use the official client for correctness check
@@ -176,7 +175,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: st
[(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
)
async def test_prompt_logprobs_chat(
-client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]
+client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
):
params: dict = {
"messages": [

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
-from typing import Union

import openai  # use the official client for correctness check
import pytest
@@ -166,7 +165,7 @@ async def test_function_tool_use(
client: openai.AsyncOpenAI,
model_name: str,
stream: bool,
-tool_choice: Union[str, dict],
+tool_choice: str | dict,
enable_thinking: bool,
):
if not stream:

@@ -4,7 +4,6 @@
from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
-from typing import Optional
from unittest.mock import AsyncMock, MagicMock

import pytest
@@ -38,13 +37,13 @@ class MockModelConfig:
trust_remote_code: bool = False
tokenizer_mode: str = "auto"
max_model_len: int = 100
-tokenizer_revision: Optional[str] = None
+tokenizer_revision: str | None = None
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
-logits_processor_pattern: Optional[str] = None
-diff_sampling_param: Optional[dict] = None
+logits_processor_pattern: str | None = None
+diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
-allowed_media_domains: Optional[list[str]] = None
+allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
skip_tokenizer_init: bool = False
@@ -56,7 +55,7 @@ class MockModelConfig:
class MockLoRAResolver(LoRAResolver):
async def resolve_lora(
self, base_model_name: str, lora_name: str
-) -> Optional[LoRARequest]:
+) -> LoRARequest | None:
if lora_name == "test-lora":
return LoRARequest(
lora_name="test-lora",

@@ -1,16 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
import asyncio
from contextlib import suppress
from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any
+from typing import Any
from unittest.mock import AsyncMock, MagicMock

import pytest
import pytest_asyncio
+from openai import OpenAI

from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
@@ -21,9 +19,6 @@ from vllm.v1.engine.async_llm import AsyncLLM

from ...utils import RemoteOpenAIServer

-if TYPE_CHECKING:
-from openai import OpenAI
-
GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"



@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Iterable
-from typing import Union

from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
@@ -84,10 +83,10 @@ class StreamingToolReconstructor:
def run_tool_extraction(
tool_parser: ToolParser,
model_output: str,
-request: Union[ChatCompletionRequest, None] = None,
+request: ChatCompletionRequest | None = None,
streaming: bool = False,
assert_one_tool_per_delta: bool = True,
-) -> tuple[Union[str, None], list[ToolCall]]:
+) -> tuple[str | None, list[ToolCall]]:
if streaming:
reconstructor = run_tool_extraction_streaming(
tool_parser,
@@ -105,7 +104,7 @@ def run_tool_extraction(
def run_tool_extraction_nonstreaming(
tool_parser: ToolParser,
model_output: str,
-request: Union[ChatCompletionRequest, None] = None,
+request: ChatCompletionRequest | None = None,
) -> ExtractedToolCallInformation:
request = request or ChatCompletionRequest(messages=[], model="test-model")
return tool_parser.extract_tool_calls(model_output, request)
@@ -114,7 +113,7 @@ def run_tool_extraction_nonstreaming(
def run_tool_extraction_streaming(
tool_parser: ToolParser,
model_deltas: Iterable[str],
-request: Union[ChatCompletionRequest, None] = None,
+request: ChatCompletionRequest | None = None,
assert_one_tool_per_delta: bool = True,
) -> StreamingToolReconstructor:
request = request or ChatCompletionRequest(messages=[], model="test-model")

@@ -4,8 +4,6 @@
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""

-from typing import Optional
-
import openai
import pytest

@@ -103,14 +101,14 @@ async def test_matryoshka(
run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)

if model_info.is_matryoshka:
-valid_dimensions: list[Optional[int]] = [None]
+valid_dimensions: list[int | None] = [None]
if model_info.matryoshka_dimensions is not None:
valid_dimensions += model_info.matryoshka_dimensions[:2]

for dimensions in valid_dimensions:
await make_request_and_correctness_test(dimensions)

-invalid_dimensions: list[Optional[int]] = [-1]
+invalid_dimensions: list[int | None] = [-1]
if model_info.matryoshka_dimensions is not None:
assert 5 not in model_info.matryoshka_dimensions
invalid_dimensions.append(5)

@@ -5,7 +5,6 @@ import multiprocessing
import socket
import threading
import time
-from typing import Optional
from unittest.mock import patch

import pytest
@@ -105,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args):
assert len(manager.processes) == 3

# Create a result capture for the thread
-result: dict[str, Optional[Exception]] = {"exception": None}
+result: dict[str, Exception | None] = {"exception": None}

def run_with_exception_capture():
try:
@@ -218,7 +217,7 @@ def test_external_process_monitoring(api_server_args):
assert len(manager.processes) == 3

# Create a result capture for the thread
-result: dict[str, Optional[Exception]] = {"exception": None}
+result: dict[str, Exception | None] = {"exception": None}

def run_with_exception_capture():
try:

@@ -3,7 +3,7 @@

import warnings
from collections.abc import Mapping
-from typing import Literal, Optional
+from typing import Literal

import pytest
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
@@ -152,9 +152,9 @@ def audio_url():


def _assert_mm_data_is_image_input(
-mm_data: Optional[MultiModalDataDict],
+mm_data: MultiModalDataDict | None,
image_count: int,
-skipped_image_indices: Optional[list] = None,
+skipped_image_indices: list | None = None,
) -> None:
assert mm_data is not None
assert set(mm_data.keys()) == {"image"}
@@ -169,9 +169,9 @@ def _assert_mm_data_is_image_input(


def _assert_mm_uuids(
-mm_uuids: Optional[MultiModalUUIDDict],
+mm_uuids: MultiModalUUIDDict | None,
media_count: int,
-expected_uuids: list[Optional[str]],
+expected_uuids: list[str | None],
modality: str = "image",
) -> None:
if len(expected_uuids) > 0:
@@ -193,9 +193,9 @@ MultiModalDataCounts = Mapping[ModalityType, int]


def _assert_mm_data_inputs(
-mm_data: Optional[MultiModalDataDict],
+mm_data: MultiModalDataDict | None,
data_count: MultiModalDataCounts,
-skipped_media_indices: Optional[dict[str, list]] = None, # modality -> list[int]
+skipped_media_indices: dict[str, list] | None = None, # modality -> list[int]
) -> None:
assert mm_data is not None
assert set(data_count.keys()) == (set(mm_data.keys()))

@@ -3,7 +3,6 @@

import io
from dataclasses import dataclass
-from typing import Optional
from unittest.mock import AsyncMock, MagicMock

import pybase64
@@ -17,7 +16,7 @@ from vllm.inputs.data import is_embeds_prompt
@dataclass
class MockModelConfig:
max_model_len: int = 100
-encoder_config: Optional[dict] = None
+encoder_config: dict | None = None


class MockTokenizerResult:

@@ -12,7 +12,6 @@ import json
import os
import time
from collections.abc import Generator
-from typing import Optional, Union

import aiohttp
import numpy as np
@@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm
INVALID = -9999999


-def download_and_cache_file(url: str, filename: Optional[str] = None) -> str:
+def download_and_cache_file(url: str, filename: str | None = None) -> str:
"""Download and cache a file from a URL."""
if filename is None:
filename = os.path.join("/tmp", url.split("/")[-1])
@@ -81,9 +80,9 @@ async def call_vllm_api(
prompt: str,
temperature: float,
max_tokens: int,
-stop: Optional[list[str]] = None,
-url: Optional[str] = None,
-seed: Optional[int] = None,
+stop: list[str] | None = None,
+url: str | None = None,
+seed: int | None = None,
) -> str:
"""Call vLLM's OpenAI-compatible completions endpoint."""
data = {
@@ -112,8 +111,8 @@ def evaluate_gsm8k(
host: str = "http://127.0.0.1",
port: int = 8000,
temperature: float = 0.0,
-seed: Optional[int] = 42,
-) -> dict[str, Union[float, int]]:
+seed: int | None = 42,
+) -> dict[str, float | int]:
"""
Evaluate GSM8K accuracy using vLLM serve endpoint.


@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional

import pytest
import torch
@@ -27,8 +26,8 @@ def ref_paged_attn(
kv_lens: list[int],
block_tables: torch.Tensor,
scale: float,
-sliding_window: Optional[int] = None,
-soft_cap: Optional[float] = None,
+sliding_window: int | None = None,
+soft_cap: float | None = None,
) -> torch.Tensor:
num_seqs = len(query_lens)
block_tables = block_tables.cpu().numpy()
@@ -94,12 +93,12 @@ def test_varlen_with_paged_kv(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
head_size: int,
-sliding_window: Optional[int],
+sliding_window: int | None,
dtype: torch.dtype,
block_size: int,
-soft_cap: Optional[float],
+soft_cap: float | None,
num_blocks: int,
-q_dtype: Optional[torch.dtype],
+q_dtype: torch.dtype | None,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import random
-from typing import Optional

import pytest
import torch
@@ -50,7 +49,7 @@ def ref_masked_attention(
key: torch.Tensor,
value: torch.Tensor,
scale: float,
-attn_mask: Optional[torch.Tensor] = None,
+attn_mask: torch.Tensor | None = None,
) -> torch.Tensor:
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
if attn_mask is not None:
@@ -69,7 +68,7 @@ def ref_single_query_cached_kv_attention(
block_tables: torch.Tensor,
seq_lens: torch.Tensor,
scale: float,
-alibi_slopes: Optional[torch.Tensor],
+alibi_slopes: torch.Tensor | None,
) -> None:
num_query_heads = query.shape[1]
num_kv_heads = value_cache.shape[1]
@@ -415,7 +414,7 @@ def ref_multi_query_kv_attention(
key: torch.Tensor,
value: torch.Tensor,
scale: float,
-alibi_bias: Optional[list[torch.Tensor]],
+alibi_bias: list[torch.Tensor] | None,
dtype: torch.dtype,
) -> torch.Tensor:
num_seqs = len(cu_seq_lens) - 1

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional

import pytest
import torch
@@ -85,7 +84,7 @@ def test_cascade(
head_size: int,
dtype: torch.dtype,
block_size: int,
-soft_cap: Optional[float],
+soft_cap: float | None,
num_blocks: int,
fa_version: int,
) -> None:

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import random
-from typing import Optional

import pytest
import torch
@@ -17,7 +16,7 @@ def cal_diff(
y: torch.Tensor,
name: str,
use_fp8: bool = False,
-diff_threshold: Optional[float] = None,
+diff_threshold: float | None = None,
) -> None:
x, y = x.double(), y.double()
cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -34,8 +33,8 @@ def ref_paged_attn(
|
|||||||
kv_lens: list[int],
|
kv_lens: list[int],
|
||||||
block_tables: torch.Tensor,
|
block_tables: torch.Tensor,
|
||||||
scale: float,
|
scale: float,
|
||||||
sliding_window: Optional[int] = None,
|
sliding_window: int | None = None,
|
||||||
soft_cap: Optional[float] = None,
|
soft_cap: float | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
num_seqs = len(query_lens)
|
num_seqs = len(query_lens)
|
||||||
block_tables = block_tables.cpu().numpy()
|
block_tables = block_tables.cpu().numpy()
|
||||||
@ -103,11 +102,11 @@ def test_flash_attn_with_paged_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
num_blocks: int,
|
num_blocks: int,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
fa_version: int,
|
fa_version: int,
|
||||||
q_dtype: Optional[torch.dtype],
|
q_dtype: torch.dtype | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
if not is_fa_version_supported(fa_version):
|
if not is_fa_version_supported(fa_version):
|
||||||
@ -221,13 +220,13 @@ def test_varlen_with_paged_kv(
|
|||||||
seq_lens: list[tuple[int, int]],
|
seq_lens: list[tuple[int, int]],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
head_size: int,
|
head_size: int,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
num_blocks: int,
|
num_blocks: int,
|
||||||
fa_version: int,
|
fa_version: int,
|
||||||
q_dtype: Optional[torch.dtype],
|
q_dtype: torch.dtype | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
if not is_fa_version_supported(fa_version):
|
if not is_fa_version_supported(fa_version):
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import flashinfer
|
import flashinfer
|
||||||
import pytest
|
import pytest
|
||||||
@ -26,8 +25,8 @@ def ref_paged_attn(
|
|||||||
kv_lens: list[int],
|
kv_lens: list[int],
|
||||||
block_tables: torch.Tensor,
|
block_tables: torch.Tensor,
|
||||||
scale: float,
|
scale: float,
|
||||||
sliding_window: Optional[int] = None,
|
sliding_window: int | None = None,
|
||||||
soft_cap: Optional[float] = None,
|
soft_cap: float | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
num_seqs = len(query_lens)
|
num_seqs = len(query_lens)
|
||||||
block_tables = block_tables.cpu().numpy()
|
block_tables = block_tables.cpu().numpy()
|
||||||
@ -90,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
current_platform.seed_everything(0)
|
current_platform.seed_everything(0)
|
||||||
@ -185,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
current_platform.seed_everything(0)
|
current_platform.seed_everything(0)
|
||||||
@ -288,7 +287,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
pytest.skip("TODO: fix the accuracy issue")
|
pytest.skip("TODO: fix the accuracy issue")
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
# test doesn't work for num_heads = (16,16)
|
# test doesn't work for num_heads = (16,16)
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import flashinfer
|
import flashinfer
|
||||||
import pytest
|
import pytest
|
||||||
@ -68,9 +67,7 @@ NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation.
|
|||||||
@torch.inference_mode
|
@torch.inference_mode
|
||||||
def test_flashinfer_trtllm_decode_with_baseline(
|
def test_flashinfer_trtllm_decode_with_baseline(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
quant_dtypes: tuple[
|
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
|
||||||
],
|
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
max_seq_lens: tuple[int, int],
|
max_seq_lens: tuple[int, int],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
@ -78,7 +75,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
|||||||
kv_layout: str,
|
kv_layout: str,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
window_left: int,
|
window_left: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
has_sinks: bool,
|
has_sinks: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
@ -267,9 +264,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
|||||||
@torch.inference_mode
|
@torch.inference_mode
|
||||||
def test_flashinfer_trtllm_prefill_with_baseline(
|
def test_flashinfer_trtllm_prefill_with_baseline(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
quant_dtypes: tuple[
|
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
|
||||||
],
|
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
max_seq_lens: tuple[int, int],
|
max_seq_lens: tuple[int, int],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
@ -277,7 +272,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
|
|||||||
kv_layout: str,
|
kv_layout: str,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
window_left: int,
|
window_left: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
has_sinks: bool,
|
has_sinks: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -20,7 +19,7 @@ def merge_attn_states_torch(
|
|||||||
prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||||
suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
||||||
suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||||
output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS]
|
output_lse: torch.Tensor | None = None, # [NUM_HEADS, NUM_TOKENS]
|
||||||
):
|
):
|
||||||
p_lse = prefix_lse
|
p_lse = prefix_lse
|
||||||
s_lse = suffix_lse
|
s_lse = suffix_lse
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -32,8 +31,8 @@ def ref_paged_attn(
|
|||||||
kv_lens: list[int],
|
kv_lens: list[int],
|
||||||
block_tables: torch.Tensor,
|
block_tables: torch.Tensor,
|
||||||
scale: float,
|
scale: float,
|
||||||
sliding_window: Optional[int] = None,
|
sliding_window: int | None = None,
|
||||||
soft_cap: Optional[float] = None,
|
soft_cap: float | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
num_seqs = len(query_lens)
|
num_seqs = len(query_lens)
|
||||||
block_tables = block_tables.cpu().numpy()
|
block_tables = block_tables.cpu().numpy()
|
||||||
@ -98,12 +97,12 @@ def test_triton_unified_attn(
|
|||||||
seq_lens: list[tuple[int, int]],
|
seq_lens: list[tuple[int, int]],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
head_size: int,
|
head_size: int,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
num_blocks: int,
|
num_blocks: int,
|
||||||
q_dtype: Optional[torch.dtype],
|
q_dtype: torch.dtype | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional, Union
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -31,13 +30,13 @@ EPS = 1e-6
|
|||||||
## Helpers
|
## Helpers
|
||||||
|
|
||||||
|
|
||||||
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
|
def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
|
||||||
return torch.as_tensor(x, dtype=torch.float32, device="cuda")
|
return torch.as_tensor(x, dtype=torch.float32, device="cuda")
|
||||||
|
|
||||||
|
|
||||||
def ref_rms_norm(
|
def ref_rms_norm(
|
||||||
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]
|
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None
|
||||||
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||||
if residual is not None:
|
if residual is not None:
|
||||||
residual = residual.clone()
|
residual = residual.clone()
|
||||||
out, residual = rms_norm_layer.forward_native(x, residual)
|
out, residual = rms_norm_layer.forward_native(x, residual)
|
||||||
@ -51,9 +50,9 @@ def ref_dynamic_per_token_quant(
|
|||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
residual: Optional[torch.Tensor],
|
residual: torch.Tensor | None,
|
||||||
scale_ub: Optional[torch.Tensor],
|
scale_ub: torch.Tensor | None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||||
if scale_ub is not None:
|
if scale_ub is not None:
|
||||||
assert quant_dtype == torch.float8_e4m3fn
|
assert quant_dtype == torch.float8_e4m3fn
|
||||||
|
|
||||||
@ -76,9 +75,9 @@ def ref_impl(
|
|||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
residual: Optional[torch.Tensor],
|
residual: torch.Tensor | None,
|
||||||
scale_ub: Optional[torch.Tensor],
|
scale_ub: torch.Tensor | None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||||
return ref_dynamic_per_token_quant(
|
return ref_dynamic_per_token_quant(
|
||||||
rms_norm_layer, x, quant_dtype, residual, scale_ub
|
rms_norm_layer, x, quant_dtype, residual, scale_ub
|
||||||
)
|
)
|
||||||
@ -88,9 +87,9 @@ def ops_dynamic_per_token_quant(
|
|||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
residual: Optional[torch.Tensor],
|
residual: torch.Tensor | None,
|
||||||
scale_ub: Optional[torch.Tensor],
|
scale_ub: torch.Tensor | None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||||
if residual is not None:
|
if residual is not None:
|
||||||
residual = residual.clone()
|
residual = residual.clone()
|
||||||
out, scales = ops.rms_norm_dynamic_per_token_quant(
|
out, scales = ops.rms_norm_dynamic_per_token_quant(
|
||||||
@ -103,9 +102,9 @@ def ops_impl(
|
|||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
residual: Optional[torch.Tensor],
|
residual: torch.Tensor | None,
|
||||||
scale_ub: Optional[torch.Tensor],
|
scale_ub: torch.Tensor | None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||||
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
|
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,8 +1,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from collections.abc import Callable
|
||||||
from itertools import product
|
from itertools import product
|
||||||
from typing import Callable, Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -68,7 +68,7 @@ def test_rotary_embedding(
|
|||||||
seq_len: int,
|
seq_len: int,
|
||||||
num_heads: int,
|
num_heads: int,
|
||||||
head_size: int,
|
head_size: int,
|
||||||
rotary_dim: Optional[int],
|
rotary_dim: int | None,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
seed: int,
|
seed: int,
|
||||||
device: str,
|
device: str,
|
||||||
|
|||||||
@ -4,8 +4,6 @@
|
|||||||
Tests for miscellaneous utilities
|
Tests for miscellaneous utilities
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -17,7 +15,7 @@ def rotary_embedding_opcheck(
|
|||||||
rot,
|
rot,
|
||||||
positions: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
query: torch.Tensor,
|
query: torch.Tensor,
|
||||||
key: Optional[torch.Tensor] = None,
|
key: torch.Tensor | None = None,
|
||||||
):
|
):
|
||||||
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
|
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -19,11 +18,11 @@ from vllm.platforms import current_platform
|
|||||||
def causal_conv1d_ref(
|
def causal_conv1d_ref(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
bias: Optional[torch.Tensor] = None,
|
bias: torch.Tensor | None = None,
|
||||||
initial_states: Optional[torch.Tensor] = None,
|
initial_states: torch.Tensor | None = None,
|
||||||
return_final_states: bool = False,
|
return_final_states: bool = False,
|
||||||
final_states_out: Optional[torch.Tensor] = None,
|
final_states_out: torch.Tensor | None = None,
|
||||||
activation: Optional[str] = "silu",
|
activation: str | None = "silu",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
x: (batch, dim, seqlen)
|
x: (batch, dim, seqlen)
|
||||||
@ -117,12 +116,12 @@ def causal_conv1d_update_ref(
|
|||||||
def causal_conv1d_opcheck_fn(
|
def causal_conv1d_opcheck_fn(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
bias: Optional[torch.Tensor] = None,
|
bias: torch.Tensor | None = None,
|
||||||
cu_seq_len: Optional[torch.Tensor] = None,
|
cu_seq_len: torch.Tensor | None = None,
|
||||||
cache_indices: Optional[torch.Tensor] = None,
|
cache_indices: torch.Tensor | None = None,
|
||||||
has_initial_state: Optional[torch.Tensor] = None,
|
has_initial_state: torch.Tensor | None = None,
|
||||||
conv_states: Optional[torch.Tensor] = None,
|
conv_states: torch.Tensor | None = None,
|
||||||
activation: Optional[str] = "silu",
|
activation: str | None = "silu",
|
||||||
pad_slot_id: int = PAD_SLOT_ID,
|
pad_slot_id: int = PAD_SLOT_ID,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, Optional, Union
|
from typing import Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ from .mk_objects import (
|
|||||||
from .parallel_utils import ProcessGroupInfo
|
from .parallel_utils import ProcessGroupInfo
|
||||||
|
|
||||||
|
|
||||||
def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
|
def _describe_tensor(t: torch.Tensor | None, name: str) -> str:
|
||||||
if t is None:
|
if t is None:
|
||||||
return f"{name} : None"
|
return f"{name} : None"
|
||||||
else:
|
else:
|
||||||
@ -44,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Config:
|
class Config:
|
||||||
Ms: Union[list[int], int]
|
Ms: list[int] | int
|
||||||
K: int
|
K: int
|
||||||
N: int
|
N: int
|
||||||
E: int
|
E: int
|
||||||
topks: Union[list[int], int]
|
topks: list[int] | int
|
||||||
dtype: torch.dtype
|
dtype: torch.dtype
|
||||||
quant_config: Optional[TestMoEQuantConfig]
|
quant_config: TestMoEQuantConfig | None
|
||||||
|
|
||||||
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
|
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
|
||||||
fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute
|
fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute
|
||||||
|
|
||||||
fused_moe_chunk_size: Optional[int]
|
fused_moe_chunk_size: int | None
|
||||||
world_size: int
|
world_size: int
|
||||||
|
|
||||||
torch_trace_dir_path: Optional[str] = None
|
torch_trace_dir_path: str | None = None
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
if self.quant_config is None:
|
if self.quant_config is None:
|
||||||
@ -93,7 +93,7 @@ class Config:
|
|||||||
return self.Ms
|
return self.Ms
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def quant_dtype(self) -> Union[torch.dtype, str, None]:
|
def quant_dtype(self) -> torch.dtype | str | None:
|
||||||
assert self.quant_config is not None
|
assert self.quant_config is not None
|
||||||
return self.quant_config.quant_dtype
|
return self.quant_config.quant_dtype
|
||||||
|
|
||||||
@ -112,7 +112,7 @@ class Config:
|
|||||||
return self.quant_config.per_out_ch_quant
|
return self.quant_config.per_out_ch_quant
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def quant_block_shape(self) -> Optional[list[int]]:
|
def quant_block_shape(self) -> list[int] | None:
|
||||||
assert self.quant_config is not None
|
assert self.quant_config is not None
|
||||||
return self.quant_config.block_shape
|
return self.quant_config.block_shape
|
||||||
|
|
||||||
@ -209,7 +209,7 @@ class Config:
|
|||||||
info = prepare_finalize_info(self.prepare_finalize_type)
|
info = prepare_finalize_info(self.prepare_finalize_type)
|
||||||
return info.backend
|
return info.backend
|
||||||
|
|
||||||
def is_valid(self) -> tuple[bool, Optional[str]]:
|
def is_valid(self) -> tuple[bool, str | None]:
|
||||||
# Check prepare-finalize and fused-experts compatibility
|
# Check prepare-finalize and fused-experts compatibility
|
||||||
if self.is_batched_prepare_finalize():
|
if self.is_batched_prepare_finalize():
|
||||||
if not self.is_batched_fused_experts():
|
if not self.is_batched_fused_experts():
|
||||||
@ -280,10 +280,10 @@ class Config:
|
|||||||
class WeightTensors:
|
class WeightTensors:
|
||||||
w1: torch.Tensor
|
w1: torch.Tensor
|
||||||
w2: torch.Tensor
|
w2: torch.Tensor
|
||||||
w1_scale: Optional[torch.Tensor]
|
w1_scale: torch.Tensor | None
|
||||||
w2_scale: Optional[torch.Tensor]
|
w2_scale: torch.Tensor | None
|
||||||
w1_gs: Optional[torch.Tensor] = None
|
w1_gs: torch.Tensor | None = None
|
||||||
w2_gs: Optional[torch.Tensor] = None
|
w2_gs: torch.Tensor | None = None
|
||||||
|
|
||||||
def describe(self):
|
def describe(self):
|
||||||
s = ""
|
s = ""
|
||||||
@ -351,11 +351,11 @@ class WeightTensors:
|
|||||||
@dataclass
|
@dataclass
|
||||||
class RankTensors:
|
class RankTensors:
|
||||||
hidden_states: torch.Tensor
|
hidden_states: torch.Tensor
|
||||||
hidden_states_scale: Optional[torch.Tensor]
|
hidden_states_scale: torch.Tensor | None
|
||||||
|
|
||||||
topk_weights: torch.Tensor
|
topk_weights: torch.Tensor
|
||||||
topk_ids: torch.Tensor
|
topk_ids: torch.Tensor
|
||||||
expert_map: Optional[torch.Tensor]
|
expert_map: torch.Tensor | None
|
||||||
|
|
||||||
def describe(self):
|
def describe(self):
|
||||||
s = ""
|
s = ""
|
||||||
@ -370,7 +370,7 @@ class RankTensors:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def make_hidden_states(
|
def make_hidden_states(
|
||||||
config: Config,
|
config: Config,
|
||||||
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||||
"""
|
"""
|
||||||
Return hidden_states
|
Return hidden_states
|
||||||
"""
|
"""
|
||||||
|
|||||||
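Note: the hunks above all apply the same typing cleanup: Optional[X] and Union[A, B] annotations are rewritten as PEP 604 unions (X | None, A | B), and Callable is imported from collections.abc instead of typing. A minimal sketch of the resulting style, using a hypothetical helper that is not part of this diff:

    # Hypothetical example -- illustrates the annotation style only, not vLLM code.
    from collections.abc import Callable  # previously: from typing import Callable

    import torch


    def scaled_masked_attention(
        query: torch.Tensor,  # [num_q, head_dim]
        key: torch.Tensor,  # [num_k, head_dim]
        scale: float,
        attn_mask: torch.Tensor | None = None,  # previously: Optional[torch.Tensor]
        post_process: Callable[[torch.Tensor], torch.Tensor] | None = None,
    ) -> torch.Tensor:
        # Toy 2-D reference attention used only to exercise the annotations.
        weights = scale * torch.einsum("qd,kd->qk", query.float(), key.float())
        if attn_mask is not None:
            weights = weights + attn_mask.float()
        out = torch.softmax(weights, dim=-1)
        return post_process(out) if post_process is not None else out

The X | None syntax needs Python 3.10+ at runtime (or from __future__ import annotations), which is consistent with dropping the typing imports throughout these files.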
Some files were not shown because too many files have changed in this diff.