diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index ba7c733be0b25..4021fede72153 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -8,7 +8,6 @@ import sys import time import traceback from dataclasses import dataclass, field -from typing import Optional, Union import aiohttp import huggingface_hub.constants @@ -28,13 +27,13 @@ class RequestFuncInput: prompt_len: int output_len: int model: str - model_name: Optional[str] = None - logprobs: Optional[int] = None - extra_body: Optional[dict] = None - multi_modal_content: Optional[dict | list[dict]] = None + model_name: str | None = None + logprobs: int | None = None + extra_body: dict | None = None + multi_modal_content: dict | list[dict] | None = None ignore_eos: bool = False - language: Optional[str] = None - request_id: Optional[str] = None + language: str | None = None + request_id: str | None = None @dataclass @@ -52,7 +51,7 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") @@ -133,7 +132,7 @@ async def async_request_tgi( async def async_request_trt_llm( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") @@ -204,7 +203,7 @@ async def async_request_trt_llm( async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("completions", "profile")), ( @@ -267,7 +266,7 @@ async def async_request_deepspeed_mii( async def async_request_openai_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: 
tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("completions", "profile")), ( @@ -367,7 +366,7 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("chat/completions", "profile")), ( @@ -476,7 +475,7 @@ async def async_request_openai_chat_completions( async def async_request_openai_audio( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: # Lazy import without PlaceholderModule to avoid vllm dep. import soundfile @@ -610,7 +609,7 @@ def get_tokenizer( tokenizer_mode: str = "auto", trust_remote_code: bool = False, **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +) -> PreTrainedTokenizer | PreTrainedTokenizerFast: if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path ): diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index b5e2613de1cd4..d7dc0e991c4d1 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -32,7 +32,6 @@ import dataclasses import json import random import time -from typing import Optional from transformers import PreTrainedTokenizerBase @@ -80,7 +79,7 @@ def sample_requests_from_dataset( num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: tuple[int, int], - fixed_output_len: Optional[int], + fixed_output_len: int | None, ) -> list[Request]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -128,7 +127,7 @@ def sample_requests_from_random( num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: tuple[int, int], - fixed_output_len: Optional[int], + 
fixed_output_len: int | None, prefix_len: int, ) -> list[Request]: requests = [] diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index bb453791c1862..769f52dbab6ea 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -7,7 +7,6 @@ import dataclasses import json import random import time -from typing import Optional from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -24,7 +23,7 @@ def sample_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], + fixed_output_len: int | None, ) -> list[tuple[str, int, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 58b9767d09390..059668f1789cc 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -32,7 +32,6 @@ import uuid import warnings from collections.abc import AsyncGenerator from dataclasses import dataclass -from typing import Optional import datasets import numpy as np @@ -316,7 +315,7 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: list[str], selected_percentiles: list[float], - goodput_config_dict: Optional[dict[str, float]] = None, + goodput_config_dict: dict[str, float] | None = None, ) -> tuple[BenchmarkMetrics, list[int]]: actual_output_lens: list[int] = [] total_input = 0 @@ -436,9 +435,9 @@ async def benchmark( selected_percentile_metrics: list[str], selected_percentiles: list[str], ignore_eos: bool, - max_concurrency: Optional[int], + max_concurrency: int | None, structured_output_ratio: float, - goodput_config_dict: Optional[dict[str, float]] = None, + goodput_config_dict: dict[str, float] | None = None, ): if backend in ASYNC_REQUEST_FUNCS: request_func = 
ASYNC_REQUEST_FUNCS[backend] diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 98624abdf49fb..f0d661f9d5349 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -6,7 +6,7 @@ import math import os import time from types import TracebackType -from typing import Any, Optional, Union +from typing import Any def convert_to_pytorch_benchmark_format( @@ -92,7 +92,7 @@ class TimeCollector: def __init__(self, scale: int) -> None: self.cnt: int = 0 self._sum: int = 0 - self._max: Optional[int] = None + self._max: int | None = None self.scale = scale self.start_time: int = time.monotonic_ns() @@ -104,13 +104,13 @@ class TimeCollector: else: self._max = max(self._max, v) - def avg(self) -> Union[float, str]: + def avg(self) -> float | str: return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" - def max(self) -> Union[float, str]: + def max(self) -> float | str: return self._max / self.scale if self._max else "N/A" - def dump_avg_max(self) -> list[Union[float, str]]: + def dump_avg_max(self) -> list[float | str]: return [self.avg(), self.max()] def __enter__(self) -> None: @@ -118,8 +118,8 @@ class TimeCollector: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ) -> None: self.collect(time.monotonic_ns() - self.start_time) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 9ec270bbd2e98..22fc2678fd1c9 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -6,8 +6,7 @@ import copy import itertools import pickle as pkl import time -from collections.abc import Iterable -from typing import Callable +from collections.abc import Callable, Iterable import torch 
import torch.utils.benchmark as TBenchmark diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 02f8c593392c4..2deebf3ddb7ae 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -6,8 +6,7 @@ import copy import itertools import pickle as pkl import time -from collections.abc import Iterable -from typing import Callable, Optional +from collections.abc import Callable, Iterable import torch import torch.utils.benchmark as TBenchmark @@ -53,7 +52,7 @@ def bench_int8( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: """Benchmark INT8-based kernels.""" assert dtype == torch.int8 @@ -108,7 +107,7 @@ def bench_fp8( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn @@ -183,7 +182,7 @@ def bench( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: if dtype == torch.int8: return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) @@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]): def run( dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]], - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 901524214469e..d809bf1db8cbc 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -3,10 +3,9 @@ import pickle as pkl import time -from collections.abc import 
Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]: def unfused_int8_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): # Norm @@ -68,7 +67,7 @@ def unfused_int8_impl( def unfused_fp8_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): # Norm @@ -85,7 +84,7 @@ def unfused_fp8_impl( def fused_impl( rms_norm_layer: RMSNorm, # this stores the weights x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): out, _ = ops.rms_norm_dynamic_per_token_quant( diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py index e08e5680c191e..9a52ea7f47e3a 100644 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ b/benchmarks/kernels/bench_per_token_quant_fp8.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Callable +from collections.abc import Callable from unittest.mock import patch import pandas as pd diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index 4cbdde5a5b2ca..df06a940e6d41 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -22,8 +22,8 @@ Example: import json import os import time +from collections.abc import Callable from contextlib import nullcontext -from typing import Callable, Optional import torch import torch.distributed as dist @@ -264,12 +264,12 @@ class 
CommunicatorBenchmark: def benchmark_allreduce_single( self, sequence_length: int, - allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]], + allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None], should_use_fn: Callable[[torch.Tensor], bool], context, num_warmup: int, num_trials: int, - ) -> Optional[float]: + ) -> float | None: """Benchmark method with CUDA graph optimization.""" try: # Create test tensor (2D: sequence_length x hidden_size) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 799b16999873f..39338f3387613 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -6,11 +6,12 @@ import copy import json import pickle import time +from collections.abc import Callable from dataclasses import dataclass from enum import Enum, auto from itertools import product from pathlib import Path -from typing import Any, Callable, Optional +from typing import Any import torch import torch.utils.benchmark as TBenchmark @@ -158,7 +159,7 @@ def ref_group_gemm( seq_lens_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor, scaling: float, - add_inputs: Optional[bool], + add_inputs: bool | None, ): """ Torch group gemm reference implementation to test correctness of @@ -316,8 +317,8 @@ class BenchmarkContext: lora_rank: int sort_by_lora_id: bool dtype: torch.dtype - seq_length: Optional[int] = None - num_slices: Optional[int] = None # num_slices for slice based ops + seq_length: int | None = None + num_slices: int | None = None # num_slices for slice based ops def with_seq_length(self, seq_length: int) -> "BenchmarkContext": ctx = copy.copy(self) @@ -561,7 +562,7 @@ class BenchmarkTensors: } def bench_fn_kwargs( - self, op_type: OpType, add_inputs: Optional[bool] = None + self, op_type: OpType, add_inputs: bool | None = None ) -> dict[str, Any]: if op_type.is_shrink_fn(): assert add_inputs is None @@ -575,7 +576,7 @@ class BenchmarkTensors: raise ValueError(f"Unrecognized 
optype {self}") def test_correctness( - self, op_type: OpType, expand_fn_add_inputs: Optional[bool] + self, op_type: OpType, expand_fn_add_inputs: bool | None ) -> bool: """ Test correctness of op_type implementation against a grouped gemm @@ -611,8 +612,8 @@ def bench_optype( ctx: BenchmarkContext, arg_pool_size: int, op_type: OpType, - cuda_graph_nops: Optional[int] = None, - expand_fn_add_inputs: Optional[bool] = None, + cuda_graph_nops: int | None = None, + expand_fn_add_inputs: bool | None = None, test_correctness: bool = False, ) -> TMeasurement: assert arg_pool_size >= 1 @@ -679,7 +680,7 @@ def bench_torch_mm( ctx: BenchmarkContext, arg_pool_size: int, op_type: OpType, - cuda_graph_nops: Optional[int] = None, + cuda_graph_nops: int | None = None, ) -> TMeasurement: """ Benchmark basic torch.mm as a roofline. @@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str: """ -def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): +def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None): compare = TBenchmark.Compare(timers) compare.print() diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 1b1c3b321cce4..e1d5239f5cc97 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -8,10 +8,9 @@ import math import os import pickle as pkl import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Optional import pandas as pd import torch @@ -63,23 +62,23 @@ class BenchmarkTensors: a: torch.Tensor w_q: torch.Tensor - group_size: Optional[int] + group_size: int | None wtype: ScalarType w_g_s: torch.Tensor - w_g_zp: Optional[torch.Tensor] - w_ch_s: Optional[torch.Tensor] - w_tok_s: Optional[torch.Tensor] + w_g_zp: torch.Tensor | None + w_ch_s: torch.Tensor | None + w_tok_s: 
torch.Tensor | None @dataclass class TypeConfig: act_type: torch.dtype weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - group_zero_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] + output_type: torch.dtype | None + group_scale_type: torch.dtype | None + group_zero_type: torch.dtype | None + channel_scale_type: torch.dtype | None + token_scale_type: torch.dtype | None def rand_data(shape, dtype=torch.float16, scale=1): @@ -93,8 +92,8 @@ def quantize_and_pack( atype: torch.dtype, w: torch.Tensor, wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], + stype: torch.dtype | None, + group_size: int | None, zero_points: bool = False, ): assert wtype.is_integer(), "TODO: support floating point weights" @@ -113,7 +112,7 @@ def quantize_and_pack( def create_bench_tensors( - shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] + shape: tuple[int, int, int], types: TypeConfig, group_size: int | None ) -> list[BenchmarkTensors]: m, n, k = shape @@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]) return res -_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None -_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None +_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None +_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None def bench( diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 7e0376c18ecc7..8f9907952d24d 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -3,7 +3,6 @@ import random import time -from typing import Optional import torch @@ -37,7 +36,7 @@ def main( seed: int, do_profile: bool, device: str = "cuda", - kv_cache_dtype: Optional[str] = None, + kv_cache_dtype: str | None = None, ) -> None: 
current_platform.seed_everything(seed) diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index 1ccb5e08b3d57..bdc1eb733084e 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -3,8 +3,8 @@ import argparse import math +from collections.abc import Callable from contextlib import contextmanager -from typing import Callable from unittest.mock import patch import torch diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py index af9841daadf24..d4b564d2ec6c9 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import random import time diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index 0aace571064a0..93df14f0d95cc 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import random import time diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index 4cf633a81358d..d8d7f5bcf9dad 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Optional, Union import torch from flashinfer.norm import fused_add_rmsnorm, rmsnorm @@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module): def forward( self, x: torch.Tensor, - residual: 
Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: orig_dtype = x.dtype x = x.to(torch.float32) if residual is not None: @@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module): def rmsnorm_naive( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) @@ -65,7 +64,7 @@ def rmsnorm_naive( def rmsnorm_flashinfer( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): orig_shape = x.shape @@ -89,7 +88,7 @@ def rmsnorm_flashinfer( def rmsnorm_vllm( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): orig_shape = x.shape diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index b81baf17a8c67..24869c91a8d70 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate -from typing import Optional import nvtx import torch @@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora( seq_len: int, num_heads: int, head_size: int, - rotary_dim: Optional[int], + rotary_dim: int | None, dtype: torch.dtype, seed: int, device: str, diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 6ddab46214577..f7cdc25794cae 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -4,7 +4,6 @@ import csv import os from datetime import datetime -from typing import Optional import flashinfer import torch @@ 
-28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn): @torch.no_grad() def benchmark_decode( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_len: int, num_heads: tuple[int, int] = (64, 8), diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 131df74c7de1b..7993354475fcc 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -4,7 +4,6 @@ import csv import os from datetime import datetime -from typing import Optional import flashinfer import torch @@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn): @torch.no_grad() def benchmark_prefill( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_len: int, num_heads: tuple[int, int] = (64, 8), diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 4bbb36bb43592..a9af811bbe9ca 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses -from collections.abc import Iterable -from typing import Any, Callable, Optional +from collections.abc import Callable, Iterable +from typing import Any import torch import torch.utils.benchmark as TBenchmark @@ -55,7 +55,7 @@ class Bench: def __init__( self, - cuda_graph_params: Optional[CudaGraphBenchParams], + cuda_graph_params: CudaGraphBenchParams | None, label: str, sub_label: str, description: str, diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py index 67b937930d58c..2674899d1cc56 
100644 --- a/benchmarks/multi_turn/bench_dataset.py +++ b/benchmarks/multi_turn/bench_dataset.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from statistics import mean -from typing import Any, NamedTuple, Optional, Union +from typing import Any, NamedTuple import numpy as np # type: ignore import pandas as pd # type: ignore @@ -35,8 +35,8 @@ class Distribution(ABC): class UniformDistribution(Distribution): def __init__( self, - min_val: Union[int, float], - max_val: Union[int, float], + min_val: int | float, + max_val: int | float, is_integer: bool = True, ) -> None: self.min_val = min_val @@ -56,7 +56,7 @@ class UniformDistribution(Distribution): class ConstantDistribution(Distribution): - def __init__(self, value: Union[int, float]) -> None: + def __init__(self, value: int | float) -> None: self.value = value self.max_val = value @@ -68,7 +68,7 @@ class ConstantDistribution(Distribution): class ZipfDistribution(Distribution): - def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + def __init__(self, alpha: float, max_val: int | None = None) -> None: self.alpha = alpha self.max_val = max_val @@ -83,7 +83,7 @@ class ZipfDistribution(Distribution): class PoissonDistribution(Distribution): - def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + def __init__(self, alpha: float, max_val: int | None = None) -> None: self.alpha = alpha self.max_val = max_val @@ -100,11 +100,11 @@ class PoissonDistribution(Distribution): class LognormalDistribution(Distribution): def __init__( self, - mean: Optional[float] = None, - sigma: Optional[float] = None, - average: Optional[int] = None, - median_ratio: Optional[float] = None, - max_val: Optional[int] = None, + mean: float | None = None, + sigma: float | None = None, + average: int | None = None, + median_ratio: float | None = None, + max_val: int | None = None, ) -> None: self.average = average self.median_ratio 
= median_ratio diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 233ed460fc8d5..2b0a6da60c256 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -13,7 +13,7 @@ from datetime import datetime from enum import Enum from http import HTTPStatus from statistics import mean -from typing import NamedTuple, Union +from typing import NamedTuple import aiohttp # type: ignore import numpy as np # type: ignore @@ -169,7 +169,7 @@ class MovingAverage: class DebugStats: def __init__(self, logger: logging.Logger, window_size: int) -> None: self.logger = logger - self.metrics: dict[str, Union[MovingAverage, MetricStats]] = { + self.metrics: dict[str, MovingAverage | MetricStats] = { "moving_avg_ttft_ms": MovingAverage(window_size), "moving_avg_tpot_ms": MovingAverage(window_size), "ttft_ms": MetricStats(), @@ -636,7 +636,7 @@ async def client_main( if args.verbose: curr_time_sec: float = time.perf_counter() - time_since_last_turn: Union[str, float] = "N/A" + time_since_last_turn: str | float = "N/A" if conv_id in time_of_last_turn: time_since_last_turn = round( curr_time_sec - time_of_last_turn[conv_id], 3 @@ -928,13 +928,13 @@ async def main_mp( f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 ) - rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3) + rps: str | float = round(len(client_metrics) / runtime_sec, 3) if len(client_metrics) < (5 * bench_args.num_clients): # Do not estimate the RPS if the number of samples is very low # (threshold can be tuned if needed) rps = "N/A" - runtime_left_sec: Union[str, float] = round( + runtime_left_sec: str | float = round( (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 ) if percent < 0.05: diff --git 
a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py index c3622c99a2e53..fccab4d0ce21a 100644 --- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py +++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py @@ -13,7 +13,7 @@ import argparse import json import random from statistics import mean -from typing import Any, Optional +from typing import Any import pandas as pd # type: ignore import tqdm # type: ignore @@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool: def content_is_valid( - content: str, min_content_len: Optional[int], max_content_len: Optional[int] + content: str, min_content_len: int | None, max_content_len: int | None ) -> bool: if min_content_len and len(content) < min_content_len: return False @@ -37,7 +37,7 @@ def content_is_valid( def print_stats( - conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None + conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None ) -> None: # Collect statistics stats = [] @@ -109,12 +109,12 @@ def convert_sharegpt_to_openai( seed: int, input_file: str, output_file: str, - max_items: Optional[int], - min_content_len: Optional[int] = None, - max_content_len: Optional[int] = None, - min_turns: Optional[int] = None, - max_turns: Optional[int] = None, - model: Optional[str] = None, + max_items: int | None, + min_content_len: int | None = None, + max_content_len: int | None = None, + min_turns: int | None = None, + max_turns: int | None = None, + model: str | None = None, ) -> None: if min_turns and max_turns: assert min_turns <= max_turns diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index 5e742d0b02932..34fb64c413db2 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project import enum -from typing import Union from cutlass_library import * @@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum): TmaWarpSpecializedCooperative = enum_auto() -VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = { **DataTypeNames, # type: ignore **{ VLLMDataType.u4b8: "u4b8", @@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { }, } -VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = { **DataTypeTag, # type: ignore **{ VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", @@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { }, } -VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { +VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = { **DataTypeSize, # type: ignore **{ VLLMDataType.u4b8: 4, @@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { }, } -VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u8b128: "vllm::kU8B128", DataType.u4: "vllm::kU4", @@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { DataType.bf16: "vllm::kBfloat16", } -VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = { DataType.u8: "at::ScalarType::Byte", DataType.s8: "at::ScalarType::Char", DataType.e4m3: "at::ScalarType::Float8_e4m3fn", @@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { DataType.f32: "at::ScalarType::Float", } -VLLMKernelScheduleTag: dict[ - Union[MixedInputKernelScheduleType, KernelScheduleType], str -] = { +VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = { 
**KernelScheduleTag, # type: ignore **{ MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index d29a199c5d32f..8bd17ba69cec1 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -9,7 +9,6 @@ from collections.abc import Iterable from copy import deepcopy from dataclasses import dataclass, fields from functools import reduce -from typing import Optional, Union import jinja2 from vllm_cutlass_library_extension import ( @@ -259,7 +258,7 @@ class ScheduleConfig: @dataclass(frozen=True) class TypeConfig: a: DataType - b: Union[DataType, VLLMDataType] + b: DataType | VLLMDataType b_group_scale: DataType b_group_zeropoint: DataType b_channel_scale: DataType @@ -280,7 +279,7 @@ class PrepackTypeConfig: class ImplConfig: types: TypeConfig schedules: list[ScheduleConfig] - heuristic: list[tuple[Optional[str], ScheduleConfig]] + heuristic: list[tuple[str | None, ScheduleConfig]] def generate_sch_sig(schedule_config: ScheduleConfig) -> str: diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index 62e58e5c6ac58..4ce748ce1fed4 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -16,7 +16,7 @@ Declare supported languages and capabilities: ??? 
code "supported_languages and supports_transcription_only" ```python - from typing import ClassVar, Mapping, Optional, Literal + from typing import ClassVar, Mapping, Literal import numpy as np import torch from torch import nn @@ -81,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt audio: np.ndarray, stt_config: SpeechToTextConfig, model_config: ModelConfig, - language: Optional[str], + language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, - to_language: Optional[str], + to_language: str | None, ) -> PromptType: # Example with a free-form instruction prompt task_word = "Transcribe" if task_type == "transcribe" else "Translate" @@ -117,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: audio: np.ndarray, stt_config: SpeechToTextConfig, model_config: ModelConfig, - language: Optional[str], + language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, - to_language: Optional[str], + to_language: str | None, ) -> PromptType: if language is None: raise ValueError("Language must be specified") @@ -150,7 +150,7 @@ If your model requires a language and you want a default, override this method ( ??? code "validate_language()" ```python @classmethod - def validate_language(cls, language: Optional[str]) -> Optional[str]: + def validate_language(cls, language: str | None) -> str | None: if language is None: logger.warning( "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") @@ -175,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: audio_duration_s: float, stt_config: SpeechToTextConfig, model_config: ModelConfig, - ) -> Optional[int]: + ) -> int | None: # Return None if unknown; otherwise return an estimate. 
return int(audio_duration_s * stt_config.sample_rate // 320) # example ``` diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index 20d78ca3aae2c..da61d2a85e466 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus from collections.abc import Sequence from dataclasses import dataclass from enum import Enum, auto - from typing import TYPE_CHECKING, Optional + from typing import TYPE_CHECKING import torch @@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus @abstractmethod def update_state( self, - batch_update: Optional["BatchUpdate"], + batch_update: "BatchUpdate | None", ) -> None: """Called when there are new output tokens, prior to each forward pass. @@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling -* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`: +* `update_state(self, batch_update: "BatchUpdate | None") -> None`: * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step * Use the `BatchUpdate` members to update logits processor internal state * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 201b340c5972c..b8ad53863cd7a 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes ??? code "Example custom logits processor definition" ``` python - from typing import Optional import torch from vllm.config import VllmConfig from vllm.sampling_params import SamplingParams @@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes """Never impacts greedy sampling""" return False - def update_state(self, batch_update: Optional[BatchUpdate]): + def update_state(self, batch_update: BatchUpdate | None): if not batch_update: return diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index a36664e470450..c4eed2037781a 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -10,7 +10,7 @@ on HuggingFace model repository. 
import os from dataclasses import asdict -from typing import Any, NamedTuple, Optional +from typing import Any, NamedTuple from huggingface_hub import snapshot_download from transformers import AutoTokenizer @@ -30,11 +30,11 @@ question_per_audio_count = { class ModelRequestData(NamedTuple): engine_args: EngineArgs - prompt: Optional[str] = None - prompt_token_ids: Optional[dict[str, list[int]]] = None - multi_modal_data: Optional[dict[str, Any]] = None - stop_token_ids: Optional[list[int]] = None - lora_requests: Optional[list[LoRARequest]] = None + prompt: str | None = None + prompt_token_ids: dict[str, list[int]] | None = None + multi_modal_data: dict[str, Any] | None = None + stop_token_ids: list[int] | None = None + lora_requests: list[LoRARequest] | None = None # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on diff --git a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py b/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py index 0abe7d1612610..5b2acea4c9457 100644 --- a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py +++ b/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py @@ -3,7 +3,7 @@ # ruff: noqa: E501 import logging from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( @@ -81,7 +81,7 @@ class RogueSharedStorageConnector(SharedStorageConnector): def get_finished( self, finished_req_ids: set[str] - ) -> tuple[Optional[set[str]], Optional[set[str]]]: + ) -> tuple[set[str] | None, set[str] | None]: if self._async_load: meta = self._get_connector_metadata() assert isinstance(meta, RogueSharedStorageConnectorMetadata) diff --git a/examples/offline_inference/logits_processor/custom.py 
b/examples/offline_inference/logits_processor/custom.py index 4112a498f37ab..72e7ce24d7cc8 100644 --- a/examples/offline_inference/logits_processor/custom.py +++ b/examples/offline_inference/logits_processor/custom.py @@ -33,8 +33,6 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the' ------------------------------------------------------------ """ -from typing import Optional - import torch from vllm import LLM, SamplingParams @@ -58,7 +56,7 @@ class DummyLogitsProcessor(LogitsProcessor): def is_argmax_invariant(self) -> bool: return False - def update_state(self, batch_update: Optional[BatchUpdate]): + def update_state(self, batch_update: BatchUpdate | None): process_dict_updates( self.req_info, batch_update, diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/offline_inference/logits_processor/custom_req.py index 4c19bb4ce2bae..87cd7473fa9f1 100644 --- a/examples/offline_inference/logits_processor/custom_req.py +++ b/examples/offline_inference/logits_processor/custom_req.py @@ -39,7 +39,7 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the' ------------------------------------------------------------ """ -from typing import Any, Optional +from typing import Any import torch @@ -82,7 +82,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): def new_req_logits_processor( self, params: SamplingParams, - ) -> Optional[RequestLogitsProcessor]: + ) -> RequestLogitsProcessor | None: """This method returns a new request-level logits processor, customized to the `target_token` value associated with a particular request. 
@@ -96,7 +96,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): Returns: `Callable` request logits processor, or None """ - target_token: Optional[Any] = params.extra_args and params.extra_args.get( + target_token: Any | None = params.extra_args and params.extra_args.get( "target_token" ) if target_token is None: diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/offline_inference/logits_processor/custom_req_init.py index 62947d122e01c..3bb82a786040b 100644 --- a/examples/offline_inference/logits_processor/custom_req_init.py +++ b/examples/offline_inference/logits_processor/custom_req_init.py @@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda" device, the first and third requests would not repeat the same token. """ -from typing import Optional - import torch from vllm import LLM, SamplingParams @@ -91,7 +89,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): def new_req_logits_processor( self, params: SamplingParams, - ) -> Optional[RequestLogitsProcessor]: + ) -> RequestLogitsProcessor | None: """This method returns a new request-level logits processor, customized to the `target_token` value associated with a particular request. diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index 00d4cb9eb4c41..dc5c6202fa57b 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -8,7 +8,6 @@ Requires HuggingFace credentials for access. 
""" import gc -from typing import Optional import torch from huggingface_hub import snapshot_download @@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest def create_test_prompts( lora_path: str, -) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: +) -> list[tuple[str, SamplingParams, LoRARequest | None]]: return [ # this is an example of using quantization without LoRA ( @@ -56,7 +55,7 @@ def create_test_prompts( def process_requests( engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]], + test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]], ): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -78,7 +77,7 @@ def process_requests( def initialize_engine( - model: str, quantization: str, lora_repo: Optional[str] + model: str, quantization: str, lora_repo: str | None ) -> LLMEngine: """Initialize the LLMEngine.""" diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 6040683c68bcd..6c23cf342e06b 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -7,8 +7,6 @@ for offline inference. Requires HuggingFace credentials for access to Llama2. """ -from typing import Optional - from huggingface_hub import snapshot_download from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams @@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest def create_test_prompts( lora_path: str, -) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: +) -> list[tuple[str, SamplingParams, LoRARequest | None]]: """Create a list of test prompts with their sampling parameters. 2 requests for base model, 4 requests for the LoRA. 
We define 2 @@ -68,7 +66,7 @@ def create_test_prompts( def process_requests( engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]], + test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]], ): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 1a5879a6d35f5..2c73ed6aa6083 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -3,7 +3,6 @@ import argparse import datetime import os -from typing import Union import albumentations import numpy as np @@ -160,7 +159,7 @@ def load_example( file_paths: list[str], mean: list[float] = None, std: list[float] = None, - indices: Union[list[int], None] = None, + indices: list[int] | None = None, ): """Build an input example by loading images in *file_paths*. diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index c0e60b9793407..13def88439ef2 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc -from typing import Callable, Optional, TypedDict +from collections.abc import Callable +from typing import TypedDict import torch import zmq @@ -71,7 +72,7 @@ class WorkerExtension: def rebuild_ipc( - handle: tuple[Callable, tuple], device_id: Optional[int] = None + handle: tuple[Callable, tuple], device_id: int | None = None ) -> torch.Tensor: func, args = handle list_args = list(args) @@ -109,7 +110,7 @@ class ColocateWorkerExtension: self._zmq_ctx = zmq.Context() socket = self._zmq_ctx.socket(zmq.REP) socket.connect(zmq_handles[self.report_device_id()]) - buffer: Optional[torch.Tensor] = None + buffer: torch.Tensor | None = None while 
True: payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = ( socket.recv_pyobj() diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 9fd9da3b0855e..1f09dabaf74c8 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -12,7 +12,7 @@ import os import random from contextlib import contextmanager from dataclasses import asdict -from typing import NamedTuple, Optional +from typing import NamedTuple from huggingface_hub import snapshot_download from transformers import AutoTokenizer @@ -28,8 +28,8 @@ from vllm.utils import FlexibleArgumentParser class ModelRequestData(NamedTuple): engine_args: EngineArgs prompts: list[str] - stop_token_ids: Optional[list[int]] = None - lora_requests: Optional[list[LoRARequest]] = None + stop_token_ids: list[int] | None = None + lora_requests: list[LoRARequest] | None = None # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index c37d40a23ac20..accb6c742a2b6 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -9,7 +9,7 @@ using the chat template defined by the model. 
import os from argparse import Namespace from dataclasses import asdict -from typing import NamedTuple, Optional +from typing import NamedTuple from huggingface_hub import snapshot_download from PIL.Image import Image @@ -41,9 +41,9 @@ class ModelRequestData(NamedTuple): engine_args: EngineArgs prompt: str image_data: list[Image] - stop_token_ids: Optional[list[int]] = None - chat_template: Optional[str] = None - lora_requests: Optional[list[LoRARequest]] = None + stop_token_ids: list[int] | None = None + chat_template: str | None = None + lora_requests: list[LoRARequest] | None = None # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on @@ -1251,7 +1251,7 @@ model_example_map = { } -def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]): +def run_generate(model, question: str, image_urls: list[str], seed: int | None): req_data = model_example_map[model](question, image_urls) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} @@ -1277,7 +1277,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int print("-" * 50) -def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]): +def run_chat(model: str, question: str, image_urls: list[str], seed: int | None): req_data = model_example_map[model](question, image_urls) # Disable other modalities to save memory diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py index 33ffb59014d8f..1ce2cdc436d6a 100644 --- a/examples/offline_inference/vision_language_pooling.py +++ b/examples/offline_inference/vision_language_pooling.py @@ -11,7 +11,7 @@ on HuggingFace model repository. 
from argparse import Namespace from dataclasses import asdict from pathlib import Path -from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args +from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args from PIL.Image import Image @@ -47,15 +47,15 @@ class TextImagesQuery(TypedDict): QueryModality = Literal["text", "image", "text+image", "text+images"] -Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery] +Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery class ModelRequestData(NamedTuple): engine_args: EngineArgs - prompt: Optional[str] = None - image: Optional[Image] = None - query: Optional[str] = None - documents: Optional[ScoreMultiModalParam] = None + prompt: str | None = None + image: Image | None = None + query: str | None = None + documents: ScoreMultiModalParam | None = None def run_clip(query: Query) -> ModelRequestData: @@ -281,7 +281,7 @@ def get_query(modality: QueryModality): raise ValueError(msg) -def run_encode(model: str, modality: QueryModality, seed: Optional[int]): +def run_encode(model: str, modality: QueryModality, seed: int | None): query = get_query(modality) req_data = model_example_map[model](query) @@ -311,7 +311,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): print("-" * 50) -def run_score(model: str, modality: QueryModality, seed: Optional[int]): +def run_score(model: str, modality: QueryModality, seed: int | None): query = get_query(modality) req_data = model_example_map[model](query) diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index 1df11d9d84957..2b8482ec717af 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -23,7 +23,7 @@ import logging import os import sys from abc import ABC, abstractmethod -from typing import 
Callable, Optional +from collections.abc import Callable import aiohttp import requests @@ -49,12 +49,9 @@ class Proxy: decode_instances: list[str], model: str, scheduling_policy: SchedulingPolicy, - custom_create_completion: Optional[ - Callable[[Request], StreamingResponse] - ] = None, - custom_create_chat_completion: Optional[ - Callable[[Request], StreamingResponse] - ] = None, + custom_create_completion: Callable[[Request], StreamingResponse] | None = None, + custom_create_chat_completion: Callable[[Request], StreamingResponse] + | None = None, ): self.prefill_instances = prefill_instances self.decode_instances = decode_instances @@ -348,9 +345,9 @@ class ProxyServer: def __init__( self, args: argparse.Namespace, - scheduling_policy: Optional[SchedulingPolicy] = None, - create_completion: Optional[Callable[[Request], StreamingResponse]] = None, - create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None, + scheduling_policy: SchedulingPolicy | None = None, + create_completion: Callable[[Request], StreamingResponse] | None = None, + create_chat_completion: Callable[[Request], StreamingResponse] | None = None, ): self.validate_parsed_serve_args(args) self.port = args.port diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py index f4b79b5e13020..19f6bd5726102 100644 --- a/examples/online_serving/kv_events_subscriber.py +++ b/examples/online_serving/kv_events_subscriber.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional, Union +from typing import Any import msgspec import zmq @@ -25,16 +25,16 @@ class KVCacheEvent( class BlockStored(KVCacheEvent): block_hashes: list[ExternalBlockHash] - parent_block_hash: Optional[ExternalBlockHash] + parent_block_hash: ExternalBlockHash | None token_ids: list[int] block_size: int - lora_id: Optional[int] - medium: Optional[str] + lora_id: int | 
None + medium: str | None class BlockRemoved(KVCacheEvent): block_hashes: list[ExternalBlockHash] - medium: Optional[str] + medium: str | None class AllBlocksCleared(KVCacheEvent): @@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent): class KVEventBatch(EventBatch): - events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]] + events: list[BlockStored | BlockRemoved | AllBlocksCleared] def process_event(event_batch): diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py index cb230913a422f..b46cea5619671 100644 --- a/examples/online_serving/multi_instance_data_parallel.py +++ b/examples/online_serving/multi_instance_data_parallel.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -from typing import Optional from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -43,7 +42,7 @@ async def main(): ) prompt = "Who won the 2004 World Series?" - final_output: Optional[RequestOutput] = None + final_output: RequestOutput | None = None async for output in engine_client.generate( prompt=prompt, sampling_params=sampling_params, diff --git a/examples/online_serving/pooling/cohere_rerank_client.py b/examples/online_serving/pooling/cohere_rerank_client.py index 63c9ff9e93980..b32209967be9a 100644 --- a/examples/online_serving/pooling/cohere_rerank_client.py +++ b/examples/online_serving/pooling/cohere_rerank_client.py @@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example. 
run: vllm serve BAAI/bge-reranker-base """ -from typing import Union - import cohere from cohere import Client, ClientV2 @@ -25,7 +23,7 @@ documents = [ def cohere_rerank( - client: Union[Client, ClientV2], model: str, query: str, documents: list[str] + client: Client | ClientV2, model: str, query: str, documents: list[str] ) -> dict: return client.rerank(model=model, query=query, documents=documents) diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py index 16ac4378c6863..25ab865a4ee43 100644 --- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py @@ -9,7 +9,7 @@ Refer to each `run_*` function for the command to run the server for that model. import argparse import base64 import io -from typing import Literal, Union +from typing import Literal from openai import OpenAI from openai._types import NOT_GIVEN, NotGiven @@ -29,7 +29,7 @@ def create_chat_embeddings( *, messages: list[ChatCompletionMessageParam], model: str, - encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN, + encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN, ) -> CreateEmbeddingResponse: """ Convenience function for accessing vLLM's Chat Embeddings API, diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py index 3ea6c73e90e8f..02853a95469a6 100644 --- a/examples/online_serving/structured_outputs/structured_outputs.py +++ b/examples/online_serving/structured_outputs/structured_outputs.py @@ -1,21 +1,15 @@ # ruff: noqa: E501 # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - import argparse import asyncio import enum import os -from typing import 
TYPE_CHECKING, Any, Literal +from typing import Any, Literal import openai import pydantic - -if TYPE_CHECKING: - from openai.types.chat import ChatCompletionChunk - +from openai.types.chat import ChatCompletionChunk ConstraintsFormat = Literal[ "choice", diff --git a/pyproject.toml b/pyproject.toml index 49a7a0b8b1210..eb9bdb593baac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,12 +84,6 @@ ignore = [ "B007", # f-string format "UP032", - # Can remove once 3.10+ is the minimum Python version - "UP007", - "UP027", - "UP035", - "UP038", - "UP045", ] [tool.ruff.format] diff --git a/tests/benchmarks/test_random_dataset.py b/tests/benchmarks/test_random_dataset.py index 90527dbeae28c..68e4afdcbe521 100644 --- a/tests/benchmarks/test_random_dataset.py +++ b/tests/benchmarks/test_random_dataset.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random -from typing import Any, NamedTuple, Optional, cast +from typing import Any, NamedTuple, cast import numpy as np import pytest @@ -185,8 +185,8 @@ def _collect_mm_samples( output_len: int = 5, base_items_per_request: int = 2, num_mm_items_range_ratio: float = 0.0, - limit_mm_per_prompt: Optional[dict[str, int]] = None, - bucket_config: Optional[dict[tuple[int, int, int], float]] = None, + limit_mm_per_prompt: dict[str, int] | None = None, + bucket_config: dict[tuple[int, int, int], float] | None = None, enable_multimodal_chat: bool = False, ) -> list[SampleRequest]: if limit_mm_per_prompt is None: diff --git a/tests/ci_envs.py b/tests/ci_envs.py index d16ecce1ef8dd..596a05b9e5f33 100644 --- a/tests/ci_envs.py +++ b/tests/ci_envs.py @@ -5,13 +5,14 @@ These envs only work for a small part of the tests, fix what you need! 
""" import os -from typing import TYPE_CHECKING, Any, Callable, Optional +from collections.abc import Callable +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: VLLM_CI_NO_SKIP: bool = False - VLLM_CI_DTYPE: Optional[str] = None - VLLM_CI_HEAD_DTYPE: Optional[str] = None - VLLM_CI_HF_DTYPE: Optional[str] = None + VLLM_CI_DTYPE: str | None = None + VLLM_CI_HEAD_DTYPE: str | None = None + VLLM_CI_HF_DTYPE: str | None = None environment_variables: dict[str, Callable[[], Any]] = { # A model family has many models with the same architecture. diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 36bc832a1329e..ef1fdd4f9daef 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -2,9 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref -from collections.abc import Sequence +from collections.abc import Callable, Sequence from copy import deepcopy -from typing import Callable, Union from torch import fx from torch._ops import OpOverload @@ -44,7 +43,7 @@ class TestBackend: Inductor config is default-initialized from VllmConfig.CompilationConfig. """ - def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]): + def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]): self.custom_passes = list(passes) compile_config = get_current_vllm_config().compilation_config self.inductor_config = compile_config.inductor_compile_config diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 08f59283a6db5..45317b456af48 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -10,7 +10,7 @@ initialized randomly with a fixed seed. 
""" from dataclasses import dataclass -from typing import Any, Optional +from typing import Any import pytest import torch @@ -162,7 +162,7 @@ class LlamaDecoderLayer(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, ) -> tuple[torch.Tensor, torch.Tensor]: """ For tractable computation: @@ -217,7 +217,7 @@ class LlamaModel(nn.Module): def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor | None, positions: torch.Tensor, ) -> torch.Tensor: hidden_states = self.embedding_tokens(input_ids) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 4bcefb30b2e6e..9bfd72260436b 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import dataclasses import pytest diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 8ccae4cfb9df2..2f3794c90b204 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,11 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import logging import tempfile -from typing import Any, Union +from typing import Any import pytest import torch @@ -217,7 +215,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm): def run_model( - compile_config: Union[int, CompilationConfig], + compile_config: int | CompilationConfig, model: str, model_kwargs: dict[str, Any], ): diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 0f2e3bffbd311..d1ab85cfb875c 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: 
Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -from typing import Optional import pytest import torch._dynamo @@ -41,8 +40,8 @@ FP8_DTYPE = current_platform.fp8_dtype() FP4_DTYPE = torch.uint8 # globals needed for string-import custom Dynamo backend field -backend: Optional[TestBackend] = None -backend_unfused: Optional[TestBackend] = None +backend: TestBackend | None = None +backend_unfused: TestBackend | None = None class AttentionQuantPatternModel(torch.nn.Module): diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 34db5a999cbd8..b2fff822bbbb5 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import torch @@ -10,7 +9,7 @@ from vllm.config import CompilationLevel class MyMod(torch.nn.Module): - def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): + def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): if cache is not None: return x + cache return x * 2 @@ -24,11 +23,11 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE ) - def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): + def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): # this is the function to be compiled return self.model(x, cache) - def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): + def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None): # let torch.compile compile twice if len(self.compiled_codes) == 2: dispatch_id = 0 if cache is None else 1 diff --git a/tests/conftest.py b/tests/conftest.py index 4713e12385965..2fde7f97836d6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,7 @@ import threading from collections.abc import Generator 
from contextlib import nullcontext from enum import Enum -from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast +from typing import Any, Callable, TypedDict, TypeVar, cast import numpy as np import pytest @@ -68,7 +68,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") _M = TypeVar("_M") -_PromptMultiModalInput = Union[list[_M], list[list[_M]]] +_PromptMultiModalInput = list[_M] | list[list[_M]] PromptImageInput = _PromptMultiModalInput[Image.Image] PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]] @@ -267,7 +267,7 @@ class HfRunner: return "cpu" if current_platform.is_cpu() else current_platform.device_type - def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + def wrap_device(self, x: _T, device: str | None = None) -> _T: if x is None or isinstance(x, (bool,)): return x @@ -287,14 +287,14 @@ class HfRunner: model_name: str, dtype: str = "auto", *, - model_kwargs: Optional[dict[str, Any]] = None, + model_kwargs: dict[str, Any] | None = None, trust_remote_code: bool = True, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, # Set this to avoid hanging issue - default_torch_num_threads: Optional[int] = None, + default_torch_num_threads: int | None = None, ) -> None: init_ctx = ( nullcontext() @@ -319,7 +319,7 @@ class HfRunner: model_name: str, dtype: str = "auto", *, - model_kwargs: Optional[dict[str, Any]] = None, + model_kwargs: dict[str, Any] | None = None, trust_remote_code: bool = True, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, @@ -406,11 +406,11 @@ class HfRunner: def get_inputs( self, - prompts: Union[list[str], list[list[int]]], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, - ) -> list[Union[BatchFeature, BatchEncoding, dict[str, 
torch.Tensor]]]: + prompts: list[str] | list[list[int]], + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, + ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]: if images is not None: assert len(prompts) == len(images) @@ -420,9 +420,7 @@ class HfRunner: if audios is not None: assert len(prompts) == len(audios) - all_inputs: list[ - Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]] - ] = [] + all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = [] for i, prompt in enumerate(prompts): if isinstance(prompt, str): processor_kwargs: dict[str, Any] = { @@ -494,10 +492,10 @@ class HfRunner: def generate( self, - prompts: Union[list[str], list[list[int]]], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + prompts: list[str] | list[list[int]], + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[tuple[list[list[int]], list[str]]]: all_inputs = self.get_inputs( @@ -522,11 +520,11 @@ class HfRunner: def generate_greedy( self, - prompts: Union[list[str], list[list[int]]], + prompts: list[str] | list[list[int]], max_tokens: int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[tuple[list[int], str]]: outputs = self.generate( @@ -546,9 +544,9 @@ class HfRunner: prompts: list[str], beam_width: int, max_tokens: int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: 
PromptAudioInput | None = None, ) -> list[tuple[list[list[int]], list[str]]]: outputs = self.generate( prompts, @@ -574,9 +572,9 @@ class HfRunner: self, prompts: list[str], max_tokens: int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[list[torch.Tensor]]: all_inputs = self.get_inputs( @@ -624,7 +622,7 @@ class HfRunner: def _hidden_states_to_logprobs( self, hidden_states: tuple[tuple[torch.Tensor, ...], ...], - num_logprobs: Optional[int], + num_logprobs: int | None, ) -> tuple[list[dict[int, float]], int]: seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) output_len = len(hidden_states) @@ -652,10 +650,10 @@ class HfRunner: self, prompts: list[str], max_tokens: int, - num_logprobs: Optional[int], - images: Optional[PromptImageInput] = None, - audios: Optional[PromptAudioInput] = None, - videos: Optional[PromptVideoInput] = None, + num_logprobs: int | None, + images: PromptImageInput | None = None, + audios: PromptAudioInput | None = None, + videos: PromptVideoInput | None = None, **kwargs: Any, ) -> list[TokensTextLogprobs]: all_inputs = self.get_inputs( @@ -734,20 +732,20 @@ class VllmRunner: model_name: str, runner: RunnerOption = "auto", convert: ConvertOption = "auto", - tokenizer_name: Optional[str] = None, + tokenizer_name: str | None = None, tokenizer_mode: str = "auto", trust_remote_code: bool = True, - seed: Optional[int] = 0, - max_model_len: Optional[int] = 1024, + seed: int | None = 0, + max_model_len: int | None = 1024, dtype: str = "auto", disable_log_stats: bool = True, tensor_parallel_size: int = 1, block_size: int = 16 if not torch.xpu.is_available() else 64, - enable_chunked_prefill: Optional[bool] = False, + enable_chunked_prefill: bool | None = False, swap_space: int = 4, - enforce_eager: 
Optional[bool] = False, + enforce_eager: bool | None = False, # Set this to avoid hanging issue - default_torch_num_threads: Optional[int] = None, + default_torch_num_threads: int | None = None, **kwargs, ) -> None: init_ctx = ( @@ -785,10 +783,10 @@ class VllmRunner: def get_inputs( self, - prompts: Union[list[str], list[torch.Tensor], list[list[int]]], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + prompts: list[str] | list[torch.Tensor] | list[list[int]], + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, ) -> list[dict[str, Any]]: if any( x is not None and len(x) != len(prompts) for x in [images, videos, audios] @@ -824,11 +822,11 @@ class VllmRunner: def generate( self, - prompts: Union[list[str], list[torch.Tensor], list[list[int]]], + prompts: list[str] | list[torch.Tensor] | list[list[int]], sampling_params: SamplingParams, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) @@ -871,11 +869,11 @@ class VllmRunner: self, prompts: list[str], sampling_params: SamplingParams, - images: Optional[PromptImageInput] = None, - audios: Optional[PromptAudioInput] = None, - videos: Optional[PromptVideoInput] = None, + images: PromptImageInput | None = None, + audios: PromptAudioInput | None = None, + videos: PromptVideoInput | None = None, **kwargs: Any, - ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: + ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]: inputs = self.get_inputs(prompts, images=images, 
videos=videos, audios=audios) req_outputs = self.llm.generate( @@ -894,11 +892,11 @@ class VllmRunner: def generate_greedy( self, - prompts: Union[list[str], list[torch.Tensor], list[list[int]]], + prompts: list[str] | list[torch.Tensor] | list[list[int]], max_tokens: int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[tuple[list[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) @@ -916,15 +914,15 @@ class VllmRunner: self, prompts: list[str], max_tokens: int, - num_logprobs: Optional[int], - num_prompt_logprobs: Optional[int] = None, - images: Optional[PromptImageInput] = None, - audios: Optional[PromptAudioInput] = None, - videos: Optional[PromptVideoInput] = None, - stop_token_ids: Optional[list[int]] = None, - stop: Optional[list[str]] = None, + num_logprobs: int | None, + num_prompt_logprobs: int | None = None, + images: PromptImageInput | None = None, + audios: PromptAudioInput | None = None, + videos: PromptVideoInput | None = None, + stop_token_ids: list[int] | None = None, + stop: list[str] | None = None, **kwargs: Any, - ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: + ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]: greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, @@ -957,7 +955,7 @@ class VllmRunner: perplexities = [] for output in outputs: output = cast(TokensTextLogprobsPromptLogprobs, output) - token_datas = cast(list[Optional[dict[int, Logprob]]], output[3]) + token_datas = cast(list[dict[int, Logprob] | None], output[3]) assert token_datas[0] is None token_log_probs = [] for token_data in token_datas[1:]: @@ -976,10 +974,10 @@ class VllmRunner: prompts: list[str], beam_width: int, max_tokens: 
int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, - concurrency_limit: Optional[int] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, + concurrency_limit: int | None = None, ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) @@ -1002,9 +1000,9 @@ class VllmRunner: def embed( self, prompts: list[str], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, *args, **kwargs, ) -> list[list[float]]: @@ -1023,8 +1021,8 @@ class VllmRunner: def score( self, - text_1: Union[str, list[str]], - text_2: Union[str, list[str]], + text_1: list[str] | str, + text_2: list[str] | str, *args, **kwargs, ) -> list[float]: @@ -1226,8 +1224,8 @@ def _find_free_port() -> int: class LocalAssetServer: address: str port: int - server: Optional[http.server.ThreadingHTTPServer] - thread: Optional[threading.Thread] + server: http.server.ThreadingHTTPServer | None + thread: threading.Thread | None def __init__(self, address: str = "127.0.0.1") -> None: self.address = address diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index d59b394393e34..6b829c2610359 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Any import pytest @@ -15,8 +15,8 @@ def _test_stopping( llm: LLM, expected_output: str, expected_reason: Any, - stop: Optional[list[str]] = None, - stop_token_ids: 
Optional[list[int]] = None, + stop: list[str] | None = None, + stop_token_ids: list[int] | None = None, include_in_output: bool = False, ) -> None: output = llm.generate( diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py index 47ceb45057c97..9c146a3323d90 100644 --- a/tests/distributed/conftest.py +++ b/tests/distributed/conftest.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random -from typing import Optional, Union import msgspec import msgspec.msgpack @@ -78,8 +77,8 @@ class MockSubscriber: def __init__( self, - pub_endpoints: Union[str, list[str]], - replay_endpoints: Optional[Union[str, list[str]]] = None, + pub_endpoints: str | list[str], + replay_endpoints: str | list[str] | None = None, topic: str = "", decode_type=SampleBatch, ): @@ -111,7 +110,7 @@ class MockSubscriber: self.last_seq = -1 self.decoder = msgspec.msgpack.Decoder(type=decode_type) - def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]: + def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None: """Receive a single message with timeout""" if not self.sub.poll(timeout): return None diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index c61c4584d8376..ba80ee6fb83ba 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -5,9 +5,8 @@ Run `pytest tests/distributed/test_comm_ops.py`. 
""" -from __future__ import annotations - -from typing import Any, Callable +from collections.abc import Callable +from typing import Any import pytest import ray diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 89c2c9f8badeb..149b502a85a75 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node import json import os from dataclasses import dataclass -from typing import Literal, NamedTuple, Optional +from typing import Literal, NamedTuple import pytest @@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple): class CPTestOptions(NamedTuple): multi_node_only: bool - load_format: Optional[str] = None + load_format: str | None = None @dataclass @@ -54,7 +54,7 @@ class CPTestSettings: dcp_base: int = 1, multi_node_only: bool = False, runner: RunnerOption = "auto", - load_format: Optional[str] = None, + load_format: str | None = None, ): parallel_setups = [] for eager_mode_val in [False]: diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py index 8a9ddcd58cfce..0228d42a76a0f 100644 --- a/tests/distributed/test_expert_parallel.py +++ b/tests/distributed/test_expert_parallel.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Literal, NamedTuple, Optional +from typing import Literal, NamedTuple import pytest @@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple): class EPTestOptions(NamedTuple): trust_remote_code: bool - tokenizer_mode: Optional[str] - load_format: Optional[str] = None - hf_overrides: Optional[str] = None + tokenizer_mode: str | None + load_format: str | None = None + hf_overrides: str | None = None @dataclass @@ -40,9 +40,9 @@ class EPTestSettings: tp_base: int = 2, runner: RunnerOption = "auto", trust_remote_code: bool = False, - 
tokenizer_mode: Optional[str] = None, - load_format: Optional[str] = None, - hf_overrides: Optional[str] = None, + tokenizer_mode: str | None = None, + load_format: str | None = None, + hf_overrides: str | None = None, ): return EPTestSettings( parallel_setups=[ @@ -72,9 +72,9 @@ class EPTestSettings: tp_base: int = 2, runner: RunnerOption = "auto", trust_remote_code: bool = False, - tokenizer_mode: Optional[str] = None, - load_format: Optional[str] = None, - hf_overrides: Optional[str] = None, + tokenizer_mode: str | None = None, + load_format: str | None = None, + hf_overrides: str | None = None, ): return EPTestSettings( parallel_setups=[ diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 43f0c9dd1a85a..24f62cff299a0 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node import json import os from dataclasses import dataclass -from typing import Literal, NamedTuple, Optional +from typing import Literal, NamedTuple import pytest @@ -35,7 +35,7 @@ class ParallelSetup(NamedTuple): class PPTestOptions(NamedTuple): multi_node_only: bool - load_format: Optional[str] = None + load_format: str | None = None @dataclass @@ -52,7 +52,7 @@ class PPTestSettings: pp_base: int = 2, multi_node_only: bool = False, runner: RunnerOption = "auto", - load_format: Optional[str] = None, + load_format: str | None = None, ): return PPTestSettings( parallel_setups=[ @@ -76,7 +76,7 @@ class PPTestSettings: pp_base: int = 2, runner: RunnerOption = "auto", multi_node_only: bool = False, - load_format: Optional[str] = None, + load_format: str | None = None, ): return PPTestSettings( parallel_setups=[ diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 2c9f474640088..2f2b43cb4cc2b 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ 
b/tests/distributed/test_pp_cudagraph.py @@ -1,16 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - -from typing import TYPE_CHECKING - import pytest +from typing_extensions import LiteralString from ..utils import compare_two_settings, create_new_process_for_each_test -if TYPE_CHECKING: - from typing_extensions import LiteralString - @pytest.mark.parametrize( "PP_SIZE, MODEL_NAME", diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 0847687cf2f9a..a431bf30fc890 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node import json import os from dataclasses import dataclass -from typing import Literal, NamedTuple, Optional +from typing import Literal, NamedTuple import pytest @@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple): class SPTestOptions(NamedTuple): multi_node_only: bool - load_format: Optional[str] = None + load_format: str | None = None @dataclass @@ -53,7 +53,7 @@ class SPTestSettings: pp_base: int = 1, multi_node_only: bool = False, runner: RunnerOption = "auto", - load_format: Optional[str] = None, + load_format: str | None = None, ): parallel_setups = [] for eager_mode_val in [False, True]: @@ -84,7 +84,7 @@ class SPTestSettings: pp_base: int = 1, runner: RunnerOption = "auto", multi_node_only: bool = False, - load_format: Optional[str] = None, + load_format: str | None = None, ): parallel_setups = [] for eager_mode_val in [False, True]: @@ -115,7 +115,7 @@ class SPTestSettings: pp_base: int = 1, runner: RunnerOption = "auto", multi_node_only: bool = False, - load_format: Optional[str] = None, + load_format: str | None = None, ): parallel_setups = [] for fusion_val in [False, True]: diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py 
index 9d367349fc2e5..78928a53942f9 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -5,7 +5,7 @@ import json from argparse import ArgumentError from contextlib import nullcontext from dataclasses import dataclass, field -from typing import Annotated, Literal, Optional, Union +from typing import Annotated, Literal import pytest @@ -115,9 +115,9 @@ class NestedConfig: class DummyConfig: regular_bool: bool = True """Regular bool with default True""" - optional_bool: Optional[bool] = None + optional_bool: bool | None = None """Optional bool with default None""" - optional_literal: Optional[Literal["x", "y"]] = None + optional_literal: Literal["x", "y"] | None = None """Optional literal with default None""" tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3)) """Tuple with variable length""" @@ -127,7 +127,7 @@ class DummyConfig: """List with variable length""" list_literal: list[Literal[1, 2]] = field(default_factory=list) """List with literal choices""" - list_union: list[Union[str, type[object]]] = field(default_factory=list) + list_union: list[str | type[object]] = field(default_factory=list) """List with union type""" literal_literal: Literal[Literal[1], Literal[2]] = 1 """Literal of literals with default 1""" @@ -152,11 +152,11 @@ def test_is_not_builtin(type_hint, expected): ("type_hint", "expected"), [ (Annotated[int, "annotation"], {int}), - (Optional[int], {int, type(None)}), - (Annotated[Optional[int], "annotation"], {int, type(None)}), - (Optional[Annotated[int, "annotation"]], {int, type(None)}), + (int | None, {int, type(None)}), + (Annotated[int | None, "annotation"], {int, type(None)}), + (Annotated[int, "annotation"] | None, {int, type(None)}), ], - ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"], + ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"], ) def test_get_type_hints(type_hint, expected): assert get_type_hints(type_hint) == expected diff --git 
a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index 5df859df42da7..682420a83a442 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -3,7 +3,7 @@ import asyncio import random -from typing import Callable +from collections.abc import Callable import openai import pytest diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 14181c6b8b16b..fa8ae55d14a23 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -3,7 +3,6 @@ # imports for structured outputs tests import json -from typing import Optional import jsonschema import openai # use the official client for correctness check @@ -176,7 +175,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: st [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)], ) async def test_prompt_logprobs_chat( - client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int] + client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None ): params: dict = { "messages": [ diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index e64f68cad7c83..44d4176655375 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime -from typing import Union import openai # use the official client for correctness check import pytest @@ -166,7 +165,7 @@ async def test_function_tool_use( client: openai.AsyncOpenAI, model_name: str, stream: bool, - tool_choice: Union[str, dict], + tool_choice: str | dict, enable_thinking: bool, ): if not stream: diff --git 
a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index aa4ee603647e4..a85418d5b5f4e 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -4,7 +4,6 @@ from contextlib import suppress from dataclasses import dataclass, field from http import HTTPStatus -from typing import Optional from unittest.mock import AsyncMock, MagicMock import pytest @@ -38,13 +37,13 @@ class MockModelConfig: trust_remote_code: bool = False tokenizer_mode: str = "auto" max_model_len: int = 100 - tokenizer_revision: Optional[str] = None + tokenizer_revision: str | None = None multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) - logits_processor_pattern: Optional[str] = None - diff_sampling_param: Optional[dict] = None + logits_processor_pattern: str | None = None + diff_sampling_param: dict | None = None allowed_local_media_path: str = "" - allowed_media_domains: Optional[list[str]] = None + allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" skip_tokenizer_init: bool = False @@ -56,7 +55,7 @@ class MockModelConfig: class MockLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str - ) -> Optional[LoRARequest]: + ) -> LoRARequest | None: if lora_name == "test-lora": return LoRARequest( lora_name="test-lora", diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 10224dee0efe8..d1367b4eeaf62 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -1,16 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - import asyncio from contextlib import suppress from dataclasses import dataclass, field 
-from typing import TYPE_CHECKING, Any +from typing import Any from unittest.mock import AsyncMock, MagicMock import pytest import pytest_asyncio +from openai import OpenAI from vllm.config.multimodal import MultiModalConfig from vllm.entrypoints.openai.protocol import ChatCompletionRequest @@ -21,9 +19,6 @@ from vllm.v1.engine.async_llm import AsyncLLM from ...utils import RemoteOpenAIServer -if TYPE_CHECKING: - from openai import OpenAI - GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b" diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index cfa4d3584e709..7489a406224a5 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Union from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, @@ -84,10 +83,10 @@ class StreamingToolReconstructor: def run_tool_extraction( tool_parser: ToolParser, model_output: str, - request: Union[ChatCompletionRequest, None] = None, + request: ChatCompletionRequest | None = None, streaming: bool = False, assert_one_tool_per_delta: bool = True, -) -> tuple[Union[str, None], list[ToolCall]]: +) -> tuple[str | None, list[ToolCall]]: if streaming: reconstructor = run_tool_extraction_streaming( tool_parser, @@ -105,7 +104,7 @@ def run_tool_extraction( def run_tool_extraction_nonstreaming( tool_parser: ToolParser, model_output: str, - request: Union[ChatCompletionRequest, None] = None, + request: ChatCompletionRequest | None = None, ) -> ExtractedToolCallInformation: request = request or ChatCompletionRequest(messages=[], model="test-model") return tool_parser.extract_tool_calls(model_output, request) @@ -114,7 +113,7 @@ def run_tool_extraction_nonstreaming( def run_tool_extraction_streaming( tool_parser: ToolParser, model_deltas: Iterable[str], - request: 
Union[ChatCompletionRequest, None] = None, + request: ChatCompletionRequest | None = None, assert_one_tool_per_delta: bool = True, ) -> StreamingToolReconstructor: request = request or ChatCompletionRequest(messages=[], model="test-model") diff --git a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py index 92df43d7dbdcf..ba9fb64262772 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py @@ -4,8 +4,6 @@ Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. """ -from typing import Optional - import openai import pytest @@ -103,14 +101,14 @@ async def test_matryoshka( run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions) if model_info.is_matryoshka: - valid_dimensions: list[Optional[int]] = [None] + valid_dimensions: list[int | None] = [None] if model_info.matryoshka_dimensions is not None: valid_dimensions += model_info.matryoshka_dimensions[:2] for dimensions in valid_dimensions: await make_request_and_correctness_test(dimensions) - invalid_dimensions: list[Optional[int]] = [-1] + invalid_dimensions: list[int | None] = [-1] if model_info.matryoshka_dimensions is not None: assert 5 not in model_info.matryoshka_dimensions invalid_dimensions.append(5) diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index e548f52e1e94d..3fadbf2ef0dd0 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -5,7 +5,6 @@ import multiprocessing import socket import threading import time -from typing import Optional from unittest.mock import patch import pytest @@ -105,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args): assert len(manager.processes) == 3 # Create a result capture for the thread - result: dict[str, 
Optional[Exception]] = {"exception": None} + result: dict[str, Exception | None] = {"exception": None} def run_with_exception_capture(): try: @@ -218,7 +217,7 @@ def test_external_process_monitoring(api_server_args): assert len(manager.processes) == 3 # Create a result capture for the thread - result: dict[str, Optional[Exception]] = {"exception": None} + result: dict[str, Exception | None] = {"exception": None} def run_with_exception_capture(): try: diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index dcd196ebdd772..224b68412e60a 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -3,7 +3,7 @@ import warnings from collections.abc import Mapping -from typing import Literal, Optional +from typing import Literal import pytest from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy @@ -152,9 +152,9 @@ def audio_url(): def _assert_mm_data_is_image_input( - mm_data: Optional[MultiModalDataDict], + mm_data: MultiModalDataDict | None, image_count: int, - skipped_image_indices: Optional[list] = None, + skipped_image_indices: list | None = None, ) -> None: assert mm_data is not None assert set(mm_data.keys()) == {"image"} @@ -169,9 +169,9 @@ def _assert_mm_data_is_image_input( def _assert_mm_uuids( - mm_uuids: Optional[MultiModalUUIDDict], + mm_uuids: MultiModalUUIDDict | None, media_count: int, - expected_uuids: list[Optional[str]], + expected_uuids: list[str | None], modality: str = "image", ) -> None: if len(expected_uuids) > 0: @@ -193,9 +193,9 @@ MultiModalDataCounts = Mapping[ModalityType, int] def _assert_mm_data_inputs( - mm_data: Optional[MultiModalDataDict], + mm_data: MultiModalDataDict | None, data_count: MultiModalDataCounts, - skipped_media_indices: Optional[dict[str, list]] = None, # modality -> list[int] + skipped_media_indices: dict[str, list] | None = None, # modality -> list[int] ) -> None: assert mm_data is not None assert set(data_count.keys()) == 
(set(mm_data.keys())) diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py index f93978c3e6e72..c811a6ba63cb5 100644 --- a/tests/entrypoints/test_renderer.py +++ b/tests/entrypoints/test_renderer.py @@ -3,7 +3,6 @@ import io from dataclasses import dataclass -from typing import Optional from unittest.mock import AsyncMock, MagicMock import pybase64 @@ -17,7 +16,7 @@ from vllm.inputs.data import is_embeds_prompt @dataclass class MockModelConfig: max_model_len: int = 100 - encoder_config: Optional[dict] = None + encoder_config: dict | None = None class MockTokenizerResult: diff --git a/tests/evals/gsm8k/gsm8k_eval.py b/tests/evals/gsm8k/gsm8k_eval.py index 9edec7a78ca23..c7799607912b6 100644 --- a/tests/evals/gsm8k/gsm8k_eval.py +++ b/tests/evals/gsm8k/gsm8k_eval.py @@ -12,7 +12,6 @@ import json import os import time from collections.abc import Generator -from typing import Optional, Union import aiohttp import numpy as np @@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm INVALID = -9999999 -def download_and_cache_file(url: str, filename: Optional[str] = None) -> str: +def download_and_cache_file(url: str, filename: str | None = None) -> str: """Download and cache a file from a URL.""" if filename is None: filename = os.path.join("/tmp", url.split("/")[-1]) @@ -81,9 +80,9 @@ async def call_vllm_api( prompt: str, temperature: float, max_tokens: int, - stop: Optional[list[str]] = None, - url: Optional[str] = None, - seed: Optional[int] = None, + stop: list[str] | None = None, + url: str | None = None, + seed: int | None = None, ) -> str: """Call vLLM's OpenAI-compatible completions endpoint.""" data = { @@ -112,8 +111,8 @@ def evaluate_gsm8k( host: str = "http://127.0.0.1", port: int = 8000, temperature: float = 0.0, - seed: Optional[int] = 42, -) -> dict[str, Union[float, int]]: + seed: int | None = 42, +) -> dict[str, float | int]: """ Evaluate GSM8K accuracy using vLLM serve endpoint. 
diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index 88b21a9b84d64..1dec46e33f22e 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -27,8 +26,8 @@ def ref_paged_attn( kv_lens: list[int], block_tables: torch.Tensor, scale: float, - sliding_window: Optional[int] = None, - soft_cap: Optional[float] = None, + sliding_window: int | None = None, + soft_cap: float | None = None, ) -> torch.Tensor: num_seqs = len(query_lens) block_tables = block_tables.cpu().numpy() @@ -94,12 +93,12 @@ def test_varlen_with_paged_kv( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, - sliding_window: Optional[int], + sliding_window: int | None, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, ) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 16e544eb3cf9f..15cdb950a7db5 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random -from typing import Optional import pytest import torch @@ -50,7 +49,7 @@ def ref_masked_attention( key: torch.Tensor, value: torch.Tensor, scale: float, - attn_mask: Optional[torch.Tensor] = None, + attn_mask: torch.Tensor | None = None, ) -> torch.Tensor: attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() if attn_mask is not None: @@ -69,7 +68,7 @@ def ref_single_query_cached_kv_attention( block_tables: 
torch.Tensor, seq_lens: torch.Tensor, scale: float, - alibi_slopes: Optional[torch.Tensor], + alibi_slopes: torch.Tensor | None, ) -> None: num_query_heads = query.shape[1] num_kv_heads = value_cache.shape[1] @@ -415,7 +414,7 @@ def ref_multi_query_kv_attention( key: torch.Tensor, value: torch.Tensor, scale: float, - alibi_bias: Optional[list[torch.Tensor]], + alibi_bias: list[torch.Tensor] | None, dtype: torch.dtype, ) -> torch.Tensor: num_seqs = len(cu_seq_lens) - 1 diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index 58e8bd592ba43..4295f852f95bb 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -85,7 +84,7 @@ def test_cascade( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, fa_version: int, ) -> None: diff --git a/tests/kernels/attention/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py index dad1510ce532b..a60f4e385a893 100644 --- a/tests/kernels/attention/test_cutlass_mla_decode.py +++ b/tests/kernels/attention/test_cutlass_mla_decode.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random -from typing import Optional import pytest import torch @@ -17,7 +16,7 @@ def cal_diff( y: torch.Tensor, name: str, use_fp8: bool = False, - diff_threshold: Optional[float] = None, + diff_threshold: float | None = None, ) -> None: x, y = x.double(), y.double() cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12) diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index d39f0a593ed41..18995545552ea 100644 --- 
a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -34,8 +33,8 @@ def ref_paged_attn( kv_lens: list[int], block_tables: torch.Tensor, scale: float, - sliding_window: Optional[int] = None, - soft_cap: Optional[float] = None, + sliding_window: int | None = None, + soft_cap: float | None = None, ) -> torch.Tensor: num_seqs = len(query_lens) block_tables = block_tables.cpu().numpy() @@ -103,11 +102,11 @@ def test_flash_attn_with_paged_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, - sliding_window: Optional[int], + sliding_window: int | None, fa_version: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, ) -> None: torch.set_default_device("cuda") if not is_fa_version_supported(fa_version): @@ -221,13 +220,13 @@ def test_varlen_with_paged_kv( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, - sliding_window: Optional[int], + sliding_window: int | None, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, fa_version: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, ) -> None: torch.set_default_device("cuda") if not is_fa_version_supported(fa_version): diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 52cd10fdc5be0..82ec2ef14e56c 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import flashinfer import pytest @@ -26,8 +25,8 @@ def ref_paged_attn( kv_lens: list[int], block_tables: 
torch.Tensor, scale: float, - sliding_window: Optional[int] = None, - soft_cap: Optional[float] = None, + sliding_window: int | None = None, + soft_cap: float | None = None, ) -> torch.Tensor: num_seqs = len(query_lens) block_tables = block_tables.cpu().numpy() @@ -90,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], - sliding_window: Optional[int], + soft_cap: float | None, + sliding_window: int | None, ) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) @@ -185,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], - sliding_window: Optional[int], + soft_cap: float | None, + sliding_window: int | None, ) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) @@ -288,7 +287,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, ) -> None: pytest.skip("TODO: fix the accuracy issue") torch.set_default_device("cuda") @@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, ) -> None: # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 61157429ec9cc..00f06da5a47b4 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import flashinfer import pytest @@ -68,9 +67,7 @@ NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. 
@torch.inference_mode def test_flashinfer_trtllm_decode_with_baseline( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_lens: tuple[int, int], num_heads: tuple[int, int], @@ -78,7 +75,7 @@ def test_flashinfer_trtllm_decode_with_baseline( kv_layout: str, block_size: int, window_left: int, - soft_cap: Optional[float], + soft_cap: float | None, has_sinks: bool, ) -> None: torch.set_default_device("cuda") @@ -267,9 +264,7 @@ def test_flashinfer_trtllm_decode_with_baseline( @torch.inference_mode def test_flashinfer_trtllm_prefill_with_baseline( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_lens: tuple[int, int], num_heads: tuple[int, int], @@ -277,7 +272,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( kv_layout: str, block_size: int, window_left: int, - soft_cap: Optional[float], + soft_cap: float | None, has_sinks: bool, ) -> None: torch.set_default_device("cuda") diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index eb9204dfaf158..9b084f2f660b2 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -20,7 +19,7 @@ def merge_attn_states_torch( prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] - output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS] + output_lse: torch.Tensor 
| None = None, # [NUM_HEADS, NUM_TOKENS] ): p_lse = prefix_lse s_lse = suffix_lse diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index fba82cfdadbdf..bf4d2179af5f9 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -32,8 +31,8 @@ def ref_paged_attn( kv_lens: list[int], block_tables: torch.Tensor, scale: float, - sliding_window: Optional[int] = None, - soft_cap: Optional[float] = None, + sliding_window: int | None = None, + soft_cap: float | None = None, ) -> torch.Tensor: num_seqs = len(query_lens) block_tables = block_tables.cpu().numpy() @@ -98,12 +97,12 @@ def test_triton_unified_attn( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, - sliding_window: Optional[int], + sliding_window: int | None, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, ) -> None: torch.set_default_device("cuda") diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index 52133ec53d1d7..418c700bbf003 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union import pytest import torch @@ -31,13 +30,13 @@ EPS = 1e-6 ## Helpers -def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: +def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor: return torch.as_tensor(x, dtype=torch.float32, device="cuda") def 
ref_rms_norm( - rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor] -) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None +) -> tuple[torch.Tensor, torch.Tensor | None]: if residual is not None: residual = residual.clone() out, residual = rms_norm_layer.forward_native(x, residual) @@ -51,9 +50,9 @@ def ref_dynamic_per_token_quant( rms_norm_layer: RMSNorm, x: torch.Tensor, quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor], -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + residual: torch.Tensor | None, + scale_ub: torch.Tensor | None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: if scale_ub is not None: assert quant_dtype == torch.float8_e4m3fn @@ -76,9 +75,9 @@ def ref_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor], -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + residual: torch.Tensor | None, + scale_ub: torch.Tensor | None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: return ref_dynamic_per_token_quant( rms_norm_layer, x, quant_dtype, residual, scale_ub ) @@ -88,9 +87,9 @@ def ops_dynamic_per_token_quant( weight: torch.Tensor, x: torch.Tensor, quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor], -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + residual: torch.Tensor | None, + scale_ub: torch.Tensor | None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: if residual is not None: residual = residual.clone() out, scales = ops.rms_norm_dynamic_per_token_quant( @@ -103,9 +102,9 @@ def ops_impl( weight: torch.Tensor, x: torch.Tensor, quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor], -) -> tuple[torch.Tensor, torch.Tensor, 
Optional[torch.Tensor]]: + residual: torch.Tensor | None, + scale_ub: torch.Tensor | None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 799e0a3f2a2bd..e1ddc5de067bb 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable from itertools import product -from typing import Callable, Optional import pytest import torch @@ -68,7 +68,7 @@ def test_rotary_embedding( seq_len: int, num_heads: int, head_size: int, - rotary_dim: Optional[int], + rotary_dim: int | None, dtype: torch.dtype, seed: int, device: str, diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index 0a292a3e2ae70..30c64e0bd72a7 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -4,8 +4,6 @@ Tests for miscellaneous utilities """ -from typing import Optional - import pytest import torch @@ -17,7 +15,7 @@ def rotary_embedding_opcheck( rot, positions: torch.Tensor, query: torch.Tensor, - key: Optional[torch.Tensor] = None, + key: torch.Tensor | None = None, ): cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype) diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index fea6b94481b60..d9023490d7fc2 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -19,11 +18,11 @@ from vllm.platforms import current_platform def 
causal_conv1d_ref( x: torch.Tensor, weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - initial_states: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, + initial_states: torch.Tensor | None = None, return_final_states: bool = False, - final_states_out: Optional[torch.Tensor] = None, - activation: Optional[str] = "silu", + final_states_out: torch.Tensor | None = None, + activation: str | None = "silu", ): """ x: (batch, dim, seqlen) @@ -117,12 +116,12 @@ def causal_conv1d_update_ref( def causal_conv1d_opcheck_fn( x: torch.Tensor, weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - cu_seq_len: Optional[torch.Tensor] = None, - cache_indices: Optional[torch.Tensor] = None, - has_initial_state: Optional[torch.Tensor] = None, - conv_states: Optional[torch.Tensor] = None, - activation: Optional[str] = "silu", + bias: torch.Tensor | None = None, + cu_seq_len: torch.Tensor | None = None, + cache_indices: torch.Tensor | None = None, + has_initial_state: torch.Tensor | None = None, + conv_states: torch.Tensor | None = None, + activation: str | None = "silu", pad_slot_id: int = PAD_SLOT_ID, ): """ diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index ff12d1fb9a805..94a305a063c3a 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Any import torch @@ -35,7 +35,7 @@ from .mk_objects import ( from .parallel_utils import ProcessGroupInfo -def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str: +def _describe_tensor(t: torch.Tensor | None, name: str) -> str: if t is None: return f"{name} : None" else: @@ -44,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str: 
@dataclass class Config: - Ms: Union[list[int], int] + Ms: list[int] | int K: int N: int E: int - topks: Union[list[int], int] + topks: list[int] | int dtype: torch.dtype - quant_config: Optional[TestMoEQuantConfig] + quant_config: TestMoEQuantConfig | None prepare_finalize_type: mk.FusedMoEPrepareAndFinalize fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute - fused_moe_chunk_size: Optional[int] + fused_moe_chunk_size: int | None world_size: int - torch_trace_dir_path: Optional[str] = None + torch_trace_dir_path: str | None = None def __post_init__(self): if self.quant_config is None: @@ -93,7 +93,7 @@ class Config: return self.Ms @property - def quant_dtype(self) -> Union[torch.dtype, str, None]: + def quant_dtype(self) -> torch.dtype | str | None: assert self.quant_config is not None return self.quant_config.quant_dtype @@ -112,7 +112,7 @@ class Config: return self.quant_config.per_out_ch_quant @property - def quant_block_shape(self) -> Optional[list[int]]: + def quant_block_shape(self) -> list[int] | None: assert self.quant_config is not None return self.quant_config.block_shape @@ -209,7 +209,7 @@ class Config: info = prepare_finalize_info(self.prepare_finalize_type) return info.backend - def is_valid(self) -> tuple[bool, Optional[str]]: + def is_valid(self) -> tuple[bool, str | None]: # Check prepare-finalize and fused-experts compatibility if self.is_batched_prepare_finalize(): if not self.is_batched_fused_experts(): @@ -280,10 +280,10 @@ class Config: class WeightTensors: w1: torch.Tensor w2: torch.Tensor - w1_scale: Optional[torch.Tensor] - w2_scale: Optional[torch.Tensor] - w1_gs: Optional[torch.Tensor] = None - w2_gs: Optional[torch.Tensor] = None + w1_scale: torch.Tensor | None + w2_scale: torch.Tensor | None + w1_gs: torch.Tensor | None = None + w2_gs: torch.Tensor | None = None def describe(self): s = "" @@ -351,11 +351,11 @@ class WeightTensors: @dataclass class RankTensors: hidden_states: torch.Tensor - hidden_states_scale: 
Optional[torch.Tensor] + hidden_states_scale: torch.Tensor | None topk_weights: torch.Tensor topk_ids: torch.Tensor - expert_map: Optional[torch.Tensor] + expert_map: torch.Tensor | None def describe(self): s = "" @@ -370,7 +370,7 @@ class RankTensors: @staticmethod def make_hidden_states( config: Config, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, torch.Tensor | None]: """ Return hidden_states """ diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py index 7d555202afe6a..95db6327c4f10 100644 --- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py +++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py @@ -4,7 +4,6 @@ import copy from enum import Enum from itertools import product -from typing import Optional import torch from tqdm import tqdm @@ -82,7 +81,7 @@ def make_feature_matrix(csv_file_path: str): import pandas as pd def add_to_results( - config: Config, success: Result, results_df: Optional[pd.DataFrame] = None + config: Config, success: Result, results_df: pd.DataFrame | None = None ): config_dict = asdict(config) config_dict["prepare_finalize_type"] = config_dict[ @@ -121,7 +120,7 @@ def make_feature_matrix(csv_file_path: str): product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES) ) - results_df: Optional[pd.DataFrame] = None + results_df: pd.DataFrame | None = None for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm( combinations ): diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index 174b2d1781ae0..aa41f89cae7dc 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing 
import Optional, Union import torch @@ -43,25 +42,25 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe @dataclass class TestMoEQuantConfig: - quant_dtype: Union[torch.dtype, str, None] + quant_dtype: torch.dtype | str | None per_out_ch_quant: bool per_act_token_quant: bool - block_shape: Optional[list[int]] + block_shape: list[int] | None @dataclass class PrepareFinalizeInfo: activation_format: mk.FusedMoEActivationFormat - supported_dtypes: list[Union[torch.dtype, str]] + supported_dtypes: list[torch.dtype | str] blocked_quantization_support: bool - backend: Optional[str] + backend: str | None supports_apply_weight_on_input: bool = True @dataclass class ExpertInfo: activation_format: mk.FusedMoEActivationFormat - supported_dtypes: list[Union[torch.dtype, str]] + supported_dtypes: list[torch.dtype | str] blocked_quantization_support: bool supports_chunking: bool supports_expert_map: bool @@ -78,7 +77,7 @@ MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = [] standard_format = mk.FusedMoEActivationFormat.Standard batched_format = mk.FusedMoEActivationFormat.BatchedExperts -common_float_types: list[Union[torch.dtype, str]] = [ +common_float_types: list[torch.dtype | str] = [ torch.float8_e4m3fn, torch.bfloat16, torch.float16, @@ -92,9 +91,9 @@ fp8_types = [torch.float8_e4m3fn] def register_prepare_and_finalize( kind, activation_format: mk.FusedMoEActivationFormat, - supported_dtypes: list[Union[torch.dtype, str]], + supported_dtypes: list[torch.dtype | str], blocked_quantization_support: bool, - backend: Optional[str], + backend: str | None, force_multigpu: bool = False, supports_apply_weight_on_input: bool = True, ): @@ -121,7 +120,7 @@ def register_prepare_and_finalize( def register_experts( kind, activation_format: mk.FusedMoEActivationFormat, - supported_dtypes: list[Union[torch.dtype, str]], + supported_dtypes: list[torch.dtype | str], blocked_quantization_support: bool, supports_chunking: bool, supports_expert_map: bool, @@ 
-340,7 +339,7 @@ if cutlass_fp4_supported(): supports_expert_map=False, ) -MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [ +MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [ None, # per-channel / per-column weights and per-tensor activations TestMoEQuantConfig( @@ -395,7 +394,7 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe(): def make_prepare_finalize( prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, - backend: Optional[str], + backend: str | None, moe: FusedMoEConfig, quant_config: FusedMoEQuantConfig, ) -> mk.FusedMoEPrepareAndFinalize: diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 7802129d3d48f..4aad820635ad7 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -3,11 +3,12 @@ import dataclasses import os import traceback -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any, Concatenate import torch from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage] -from typing_extensions import Concatenate, ParamSpec +from typing_extensions import ParamSpec from vllm.config import VllmConfig, set_current_vllm_config from vllm.distributed import init_distributed_environment, initialize_model_parallel @@ -58,9 +59,9 @@ def _worker_parallel_launch( world_local_size: int, node_rank: int, init_method: str, - worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, P], None], - vllm_config: Optional[VllmConfig], - env_dict: Optional[dict], + worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig | None, Any, P], None], + vllm_config: VllmConfig | None, + env_dict: dict | None, *args: P.args, **kwargs: P.kwargs, ) -> None: diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index 
48e5c4659b49a..a3e264c5f5e28 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy +from collections.abc import Callable from itertools import product -from typing import Any, Callable +from typing import Any import torch diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py index fb9e5df281f1d..d83b63e187c2f 100644 --- a/tests/kernels/moe/parallel_utils.py +++ b/tests/kernels/moe/parallel_utils.py @@ -7,12 +7,13 @@ DeepEP test utilities import dataclasses import os import traceback -from typing import Callable, Optional +from collections.abc import Callable +from typing import Concatenate import torch from torch.distributed import ProcessGroup from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage] -from typing_extensions import Concatenate, ParamSpec +from typing_extensions import ParamSpec from vllm.utils import get_open_port, has_deep_ep @@ -126,8 +127,8 @@ def make_deepep_ht_a2a( pgi: ProcessGroupInfo, dp_size: int, ht_args: DeepEPHTArgs, - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None, + q_dtype: torch.dtype | None = None, + block_shape: list[int] | None = None, ): import deep_ep @@ -153,8 +154,8 @@ def make_deepep_ll_a2a( pg: ProcessGroup, pgi: ProcessGroupInfo, deepep_ll_args: DeepEPLLArgs, - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None, + q_dtype: torch.dtype | None = None, + block_shape: list[int] | None = None, ): import deep_ep @@ -185,10 +186,10 @@ def make_deepep_a2a( pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, - deepep_ht_args: Optional[DeepEPHTArgs], - deepep_ll_args: Optional[DeepEPLLArgs], - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None, + deepep_ht_args: DeepEPHTArgs | None, + 
deepep_ll_args: DeepEPLLArgs | None, + q_dtype: torch.dtype | None = None, + block_shape: list[int] | None = None, ): if deepep_ht_args is not None: assert deepep_ll_args is None diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 09cede3fbcc77..2dce099770f08 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional import pytest import torch @@ -55,7 +54,7 @@ vllm_config.scheduler_config.max_model_len = 8192 @dataclass class BatchedMMConfig: in_dtype: torch.dtype - quant_dtype: Optional[torch.dtype] + quant_dtype: torch.dtype | None out_dtype: torch.dtype num_experts: int max_tokens_per_expert: int @@ -115,7 +114,7 @@ def test_batched_mm( K: int, N: int, dtype: torch.dtype, - block_shape: Optional[list[int]], + block_shape: list[int] | None, per_act_token_quant: bool, ): current_platform.seed_everything(7) @@ -242,7 +241,7 @@ def test_fused_moe_batched_experts( topk: int, dtype: torch.dtype, per_act_token_quant: bool, - block_shape: Optional[list[int]], + block_shape: list[int] | None, input_scales: bool, ): current_platform.seed_everything(7) diff --git a/tests/kernels/moe/test_count_expert_num_tokens.py b/tests/kernels/moe/test_count_expert_num_tokens.py index 996a4538d1054..39138be83bccb 100644 --- a/tests/kernels/moe/test_count_expert_num_tokens.py +++ b/tests/kernels/moe/test_count_expert_num_tokens.py @@ -5,7 +5,6 @@ Tests compute_expert_num_tokens kernels """ import dataclasses -from typing import Optional import pytest import torch @@ -16,7 +15,7 @@ from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens @dataclasses.dataclass class TestTensors: topk_ids: torch.Tensor - expert_map: Optional[torch.Tensor] = None + expert_map: torch.Tensor | None = None def to_device(self, device: str): self.topk_ids = 
self.topk_ids.to(device=device) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index b82cea61bd4ea..4330eda251f75 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -3,7 +3,6 @@ import copy import dataclasses from math import prod -from typing import Optional import pytest import torch @@ -85,16 +84,16 @@ class MOETensors: @dataclasses.dataclass class MOETensors8Bit(MOETensors): # quantized - a_q: Optional[torch.Tensor] = None # a -> a_q - w1_q: Optional[torch.Tensor] = None # w1 -> w1_q - w2_q: Optional[torch.Tensor] = None # w2 -> w2_q - a_scale: Optional[torch.Tensor] = None - w1_scale: Optional[torch.Tensor] = None - w2_scale: Optional[torch.Tensor] = None + a_q: torch.Tensor | None = None # a -> a_q + w1_q: torch.Tensor | None = None # w1 -> w1_q + w2_q: torch.Tensor | None = None # w2 -> w2_q + a_scale: torch.Tensor | None = None + w1_scale: torch.Tensor | None = None + w2_scale: torch.Tensor | None = None # dequantized - a_d: Optional[torch.Tensor] = None # a -> a_q -> a_d - w1_d: Optional[torch.Tensor] = None # w1 -> w1_q -> w1_d - w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d + a_d: torch.Tensor | None = None # a -> a_q -> a_d + w1_d: torch.Tensor | None = None # w1 -> w1_q -> w1_d + w2_d: torch.Tensor | None = None # w2 -> w2_q -> w2_d @staticmethod def make_moe_tensors_8bit( @@ -209,7 +208,7 @@ def run_8_bit( topk_ids: torch.Tensor, per_act_token: bool, per_out_ch: bool, - num_local_experts: Optional[int] = None, + num_local_experts: int | None = None, ) -> torch.Tensor: assert not any( [ @@ -280,7 +279,7 @@ def test_cutlass_moe_8_bit_no_graph( per_act_token: bool, per_out_ch: bool, monkeypatch, - ep_size: Optional[int] = None, + ep_size: int | None = None, ): current_platform.seed_everything(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py 
b/tests/kernels/moe/test_deepep_deepgemm_moe.py index e68c5bfa5946f..65cd3e110a0fa 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -7,7 +7,6 @@ fp8 block-quantized case. """ import dataclasses -from typing import Optional import pytest import torch.distributed @@ -92,13 +91,13 @@ class TestConfig: block_size: list[int] # configs for testing low-latency kernels low_latency: bool - use_fp8_dispatch: Optional[bool] = False + use_fp8_dispatch: bool | None = False @dataclasses.dataclass class TestTensors: rank_tokens: torch.Tensor # all ranks make this many tokens - rank_token_scales: Optional[torch.Tensor] + rank_token_scales: torch.Tensor | None topk: torch.Tensor topk_weights: torch.Tensor config: TestConfig @@ -143,7 +142,7 @@ def make_ll_modular_kernel( max_tokens_per_rank: int, dp_size: int, hidden_size: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, test_config: TestConfig, quant_config: FusedMoEQuantConfig, ) -> FusedMoEModularKernel: @@ -179,7 +178,7 @@ def make_ht_modular_kernel( pgi: ProcessGroupInfo, dp_size: int, num_local_experts: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, test_config: TestConfig, quant_config: FusedMoEQuantConfig, ) -> FusedMoEModularKernel: @@ -249,8 +248,8 @@ def deepep_deepgemm_moe_impl( test_tensors: TestTensors, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, ) -> torch.Tensor: test_config = test_tensors.config num_experts = test_config.num_experts diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index a1dabea1f0c7d..527c20fe6f80b 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -5,7 +5,6 @@ Test deepep dispatch-combine logic """ import dataclasses -from typing import Optional, Union import pytest import 
torch.distributed @@ -90,7 +89,7 @@ class TestConfig: @dataclasses.dataclass class TestTensors: rank_tokens: torch.Tensor # all ranks make this many tokens - rank_token_scales: Optional[torch.Tensor] + rank_token_scales: torch.Tensor | None topk: torch.Tensor topk_weights: torch.Tensor config: TestConfig @@ -128,12 +127,12 @@ def make_modular_kernel( dp_size: int, num_experts: int, num_local_experts: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, use_fp8_dispatch: bool, quant_config: FusedMoEQuantConfig, ) -> FusedMoEModularKernel: - ht_args: Optional[DeepEPHTArgs] = None - ll_args: Optional[DeepEPLLArgs] = None + ht_args: DeepEPHTArgs | None = None + ll_args: DeepEPLLArgs | None = None if low_latency_mode: ll_args = DeepEPLLArgs( @@ -148,16 +147,14 @@ def make_modular_kernel( ) ht_args = DeepEPHTArgs(num_local_experts=num_local_experts) - a2a: Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = ( - make_deepep_a2a( - pg=pg, - pgi=pgi, - dp_size=dp_size, - q_dtype=q_dtype, - block_shape=None, - deepep_ht_args=ht_args, - deepep_ll_args=ll_args, - ) + a2a: DeepEPHTPrepareAndFinalize | DeepEPLLPrepareAndFinalize = make_deepep_a2a( + pg=pg, + pgi=pgi, + dp_size=dp_size, + q_dtype=q_dtype, + block_shape=None, + deepep_ht_args=ht_args, + deepep_ll_args=ll_args, ) num_dispatchers = pgi.world_size // dp_size @@ -184,8 +181,8 @@ def deep_ep_moe_impl( test_tensors: TestTensors, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, num_experts: int, use_fp8_dispatch: bool, per_act_token_quant: bool, @@ -281,8 +278,8 @@ def torch_moe_impl( test_tensors: TestTensors, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, using_fp8_dispatch: bool, per_act_token_quant: bool, ): @@ -340,8 +337,8 @@ def 
_deep_ep_moe( config: TestConfig, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, use_fp8_dispatch: bool, per_act_token_quant: bool, ): diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index b028e676f086f..a86185a2dc461 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -5,7 +5,7 @@ import copy import textwrap import traceback from itertools import product -from typing import Any, Optional +from typing import Any import pytest import torch @@ -245,10 +245,10 @@ def test_modular_kernel_combinations_multigpu( n: int, e: int, dtype: torch.dtype, - quant_config: Optional[TestMoEQuantConfig], + quant_config: TestMoEQuantConfig | None, prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute, - chunk_size: Optional[int], + chunk_size: int | None, world_size: int, pytestconfig, ): @@ -287,10 +287,10 @@ def test_modular_kernel_combinations_singlegpu( n: int, e: int, dtype: torch.dtype, - quant_config: Optional[TestMoEQuantConfig], + quant_config: TestMoEQuantConfig | None, prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute, - chunk_size: Optional[int], + chunk_size: int | None, world_size: int, pytestconfig, ): diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index f357d149bd071..6b391c173f0bc 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -6,7 +6,7 @@ Run `pytest tests/kernels/test_moe.py`. 
""" import functools -from typing import Callable, Optional, Union +from collections.abc import Callable import pytest import torch @@ -80,7 +80,7 @@ vllm_config.scheduler_config.max_model_len = 8192 def run_moe_test( - baseline: Union[Callable, torch.Tensor], + baseline: Callable | torch.Tensor, moe_fn: Callable, a: torch.Tensor, w1: torch.Tensor, @@ -88,7 +88,7 @@ def run_moe_test( score: torch.Tensor, topk: int, global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, padding: bool = False, use_compile: bool = False, use_cudagraph: bool = False, @@ -212,7 +212,7 @@ def test_fused_moe( score: torch.Tensor, topk: int, global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, ) -> torch.Tensor: topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) return m_fused_moe_fn( diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index f92526e749557..6f779c6950150 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -5,8 +5,6 @@ Run `pytest tests/kernels/moe/test_moe_align_block_size.py`. """ -from typing import Optional - import pytest import torch @@ -94,7 +92,7 @@ def torch_moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, pad_sorted_ids: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index a6214437d404a..da9fe33a1c620 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -5,8 +5,6 @@ Run `pytest tests/kernels/test_moe_permute_unpermute.py`. 
""" -from typing import Optional - import numpy as np import pytest import torch @@ -34,8 +32,8 @@ def torch_permute( n_expert: int, n_local_expert: int, start_expert: int, - expert_map: Optional[torch.Tensor] = None, - align_block_size: Optional[int] = None, + expert_map: torch.Tensor | None = None, + align_block_size: int | None = None, fill_invalid_expert: int = -1, ) -> list[torch.Tensor]: n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1] @@ -210,7 +208,7 @@ def test_moe_permute_unpermute( n_expert: int, ep_size: int, dtype: torch.dtype, - align_block_size: Optional[int], + align_block_size: int | None, ): if not moe_permute_unpermute_supported(): pytest.skip("moe_permute_unpermute is not supported on this platform.") diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index dceed34f35125..7a5d10a87b741 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -4,7 +4,6 @@ import importlib.metadata from dataclasses import dataclass from importlib.util import find_spec -from typing import Optional import pytest import torch @@ -103,7 +102,7 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): assert output -def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: Optional[float] = None): +def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: float | None = None): # Note we add an extra bias of 1 to the linear layer x_glu, x_linear = torch.chunk(x, 2, dim=-1) if limit is not None: @@ -510,7 +509,7 @@ def test_trtllm_gen_mxfp4_fused_moe( hidden_size: int, alpha: float, beta: float, - limit: Optional[float], + limit: float | None, act_type: str, transpose_optimized: bool, ): @@ -660,7 +659,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe( hidden_size: int, alpha: float, beta: float, - limit: Optional[float], + limit: float | None, ): torch.manual_seed(42) device = "cuda:0" @@ -811,9 +810,9 @@ def 
test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe( num_tokens: int, intermediate_size: int, hidden_size: int, - alpha: Optional[float], - beta: Optional[float], - limit: Optional[float], + alpha: float | None, + beta: float | None, + limit: float | None, ): torch.manual_seed(42) device = "cuda:0" diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 4c7c6c6a4f529..ac7f3fc5e6f05 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -73,7 +72,7 @@ def pplx_cutlass_moe( out_dtype, per_act_token: bool, per_out_ch: bool, - group_name: Optional[str], + group_name: str | None, ): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( PplxPrepareAndFinalize, diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 223f095c0b553..e665c636fa265 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -9,7 +9,7 @@ import copy import itertools import textwrap import traceback -from typing import Callable, Optional, Union +from collections.abc import Callable import pytest import torch @@ -89,7 +89,7 @@ def torch_prepare( a: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, - max_num_tokens: Optional[int] = None, + max_num_tokens: int | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: assert topk_ids.dim() == 2 assert topk_ids.shape[0] == a.shape[0] @@ -214,10 +214,10 @@ def create_pplx_prepare_finalize( dp_size: int, world_size: int, in_dtype: torch.dtype, - quant_dtype: Optional[torch.dtype], - block_shape: Optional[list[int]], + quant_dtype: torch.dtype | None, + block_shape: list[int] | None, per_act_token_quant: bool, - group_name: Optional[str], + group_name: str | None, ): from 
vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( PplxPrepareAndFinalize, @@ -274,18 +274,14 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor: return t[(r * chunk) : (r + 1) * chunk] -def maybe_chunk_by_rank( - t: Optional[torch.Tensor], r: int, w: int -) -> Optional[torch.Tensor]: +def maybe_chunk_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None: if t is not None: return chunk_by_rank(t, r, w) else: return t -def chunk_scales_by_rank( - t: Optional[torch.Tensor], r: int, w: int -) -> Optional[torch.Tensor]: +def chunk_scales_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None: if t is not None and t.numel() > 1: chunk = rank_chunk(t.shape[0], r, w) return t[(r * chunk) : (r + 1) * chunk] @@ -293,9 +289,7 @@ def chunk_scales_by_rank( return t -def chunk_scales( - t: Optional[torch.Tensor], start: int, end: int -) -> Optional[torch.Tensor]: +def chunk_scales(t: torch.Tensor | None, start: int, end: int) -> torch.Tensor | None: if t is not None and t.numel() > 1: return t[start:end] else: @@ -313,10 +307,10 @@ def pplx_prepare_finalize( topk_weight: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, - quant_dtype: Optional[torch.dtype], - block_shape: Optional[list[int]], + quant_dtype: torch.dtype | None, + block_shape: list[int] | None, per_act_token_quant: bool, - group_name: Optional[str], + group_name: str | None, ) -> torch.Tensor: assert torch.cuda.current_device() == pgi.local_rank @@ -409,8 +403,8 @@ def _pplx_prepare_finalize( score: torch.Tensor, topk: torch.Tensor, num_experts: int, - quant_dtype: Optional[torch.dtype], - block_shape: Optional[list[int]], + quant_dtype: torch.dtype | None, + block_shape: list[int] | None, per_act_token_quant: bool, use_internode: bool, ): @@ -479,7 +473,7 @@ def test_pplx_prepare_finalize_slow( dtype: torch.dtype, world_dp_size: tuple[int, int], per_act_token_quant: bool, - block_shape: Optional[list[int]], + block_shape: list[int] | 
None, use_internode: bool, ): if dtype == torch.float8_e4m3fn: @@ -521,7 +515,7 @@ def test_pplx_prepare_finalize_slow( def pplx_moe( - group_name: Optional[str], + group_name: str | None, rank: int, world_size: int, dp_size: int, @@ -530,17 +524,17 @@ def pplx_moe( w2: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant=False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, use_compile: bool = False, use_cudagraphs: bool = True, - shared_experts: Optional[torch.nn.Module] = None, -) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + shared_experts: torch.nn.Module | None = None, +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: num_tokens, hidden_dim = a.shape num_experts = w1.shape[0] topk = topk_ids.shape[1] @@ -657,13 +651,13 @@ def _pplx_moe( score: torch.Tensor, topk: int, num_experts: int, - w1_s: Optional[torch.Tensor] = None, - w2_s: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_s: torch.Tensor | None = None, + w2_s: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, use_internode: bool = False, - shared_experts: Optional[torch.nn.Module] = None, + shared_experts: torch.nn.Module | None = None, ): try: if use_internode: @@ -812,7 +806,7 @@ def test_pplx_moe_slow( dtype: torch.dtype, world_dp_size: tuple[int, int], per_act_token_quant: bool, - block_shape: Optional[list[int]], + block_shape: 
list[int] | None, use_internode: bool, ): current_platform.seed_everything(7) diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 9466dacb0c111..65ce4073ad5bc 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union import torch @@ -27,13 +26,13 @@ def triton_moe( w2: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant=False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, ) -> torch.Tensor: quant_config = FusedMoEQuantConfig.make( quant_dtype, @@ -54,13 +53,13 @@ def batched_moe( w2: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, ) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) @@ -94,13 +93,13 @@ def naive_batched_moe( w2: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: Optional[torch.Tensor] = None, - 
w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, ) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) @@ -129,8 +128,8 @@ def naive_batched_moe( def chunk_scales( - scales: Optional[torch.Tensor], start: int, end: int -) -> Optional[torch.Tensor]: + scales: torch.Tensor | None, start: int, end: int +) -> torch.Tensor | None: if scales is not None: if scales.numel() == 1: return scales @@ -144,10 +143,10 @@ def make_quantized_test_activations( m: int, k: int, in_dtype: torch.dtype, - quant_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None, + quant_dtype: torch.dtype | None = None, + block_shape: list[int] | None = None, per_act_token_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: a = torch.randn((E, m, k), device="cuda", dtype=in_dtype) / 10 a_q = a a_scale = None @@ -172,11 +171,11 @@ def make_quantized_test_activations( def moe_quantize_weights( w: torch.Tensor, - w_s: Optional[torch.Tensor], - quant_dtype: Union[torch.dtype, str, None], + w_s: torch.Tensor | None, + quant_dtype: torch.dtype | str | None, per_token_quant: bool, - block_shape: Optional[list[int]], -) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + block_shape: list[int] | None, +) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]: assert ( quant_dtype == torch.float8_e4m3fn or quant_dtype == torch.int8 @@ -220,10 +219,10 @@ def make_test_weight( rows: int, cols: int, in_dtype: torch.dtype 
= torch.bfloat16, - quant_dtype: Union[torch.dtype, str, None] = None, - block_shape: Optional[list[int]] = None, + quant_dtype: torch.dtype | str | None = None, + block_shape: list[int] | None = None, per_out_ch_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]: w_16 = torch.randn((e, rows, cols), device="cuda", dtype=in_dtype) / 15 w_gs = None @@ -262,12 +261,12 @@ def make_test_weights( n: int, k: int, in_dtype: torch.dtype = torch.bfloat16, - quant_dtype: Union[torch.dtype, str, None] = None, - block_shape: Optional[list[int]] = None, + quant_dtype: torch.dtype | str | None = None, + block_shape: list[int] | None = None, per_out_ch_quant: bool = False, ) -> tuple[ - tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]], - tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]], + tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None], + tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None], ]: return ( make_test_weight( @@ -295,9 +294,9 @@ def make_test_quant_config( n: int, k: int, in_dtype: torch.dtype, - quant_dtype: Union[torch.dtype, str, None] = None, + quant_dtype: torch.dtype | str | None = None, per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, ) -> tuple[torch.Tensor, torch.Tensor, FusedMoEQuantConfig]: (_, w1, w1_s, w1_gs), (_, w2, w2_s, w2_gs) = make_test_weights( e, @@ -310,8 +309,8 @@ def make_test_quant_config( ) # Hacky/trivial scales for nvfp4. 
- a1_gscale: Optional[torch.Tensor] = None - a2_gscale: Optional[torch.Tensor] = None + a1_gscale: torch.Tensor | None = None + a2_gscale: torch.Tensor | None = None if quant_dtype == "nvfp4": a1_gscale = torch.ones((e,), device="cuda", dtype=torch.float32) a2_gscale = torch.ones((e,), device="cuda", dtype=torch.float32) @@ -348,9 +347,9 @@ def fused_moe( score: torch.Tensor, topk: int, renormalize: bool = False, - quant_config: Optional[FusedMoEQuantConfig] = None, + quant_config: FusedMoEQuantConfig | None = None, global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, ) -> torch.Tensor: topk_weights, topk_ids, _ = fused_topk( hidden_states, score.float(), topk, renormalize @@ -378,7 +377,7 @@ class BaselineMM(torch.nn.Module): self.b = b.to(dtype=torch.float32) self.out_dtype = out_dtype - def forward(self, a: torch.Tensor) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + def forward(self, a: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor | None]: return torch.mm(a.to(dtype=torch.float32), self.b).to(self.out_dtype), None @@ -422,8 +421,8 @@ class RealMLP(torch.nn.Module): quant_config=None, reduce_results: bool = True, prefix: str = "", - w1_s: Optional[torch.Tensor] = None, - w2_s: Optional[torch.Tensor] = None, + w1_s: torch.Tensor | None = None, + w2_s: torch.Tensor | None = None, ) -> None: from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -481,7 +480,7 @@ def make_shared_experts( N: int, K: int, in_dtype: torch.dtype = torch.bfloat16, - quant_dtype: Union[torch.dtype, str, None] = None, + quant_dtype: torch.dtype | str | None = None, ) -> torch.nn.Module: from vllm.model_executor.layers.quantization.fp8 import Fp8Config diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index d892f2a5acc09..9d11a7ef64138 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union import torch @@ -15,13 +14,13 @@ ROCM_FP8FNUZ_MAX = 224.0 FP8_DTYPE = current_platform.fp8_dtype() -def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: +def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor: return torch.as_tensor(x, dtype=torch.float32, device="cuda") def ref_dynamic_per_token_quant( - x: torch.tensor, quant_dtype: torch.dtype, scale_ub: Optional[torch.tensor] = None -) -> tuple[torch.tensor, torch.tensor]: + x: torch.Tensor, quant_dtype: torch.dtype, scale_ub: torch.Tensor | None = None +) -> tuple[torch.Tensor, torch.Tensor]: assert quant_dtype in [torch.int8, FP8_DTYPE] if scale_ub is not None: assert quant_dtype == FP8_DTYPE @@ -76,8 +75,8 @@ def ref_dynamic_per_token_quant( # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant # kernel def ref_dynamic_per_tensor_fp8_quant( - x: torch.tensor, -) -> tuple[torch.tensor, torch.tensor]: + x: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: fp8_traits = torch.finfo(FP8_DTYPE) fp8_traits_max = ( ROCM_FP8FNUZ_MAX @@ -250,10 +249,10 @@ def per_block_cast_to_int8( def dequant( t: torch.Tensor, - scale: Optional[torch.Tensor], - block_shape: Optional[list[int]], + scale: torch.Tensor | None, + block_shape: list[int] | None, per_act_token_quant: bool, - out_dtype: Optional[torch.dtype] = torch.float32, + out_dtype: torch.dtype | None = torch.float32, ) -> torch.Tensor: if scale is not None: f32 = torch.float32 @@ -267,10 +266,10 @@ def dequant( def batched_dequant( t: torch.Tensor, - scale: Optional[torch.Tensor], - block_shape: Optional[list[int]], + scale: torch.Tensor | None, + block_shape: list[int] | None, per_act_token_quant: bool, - out_dtype: Optional[torch.dtype] = torch.float32, + out_dtype: torch.dtype | None = torch.float32, ) -> torch.Tensor: if scale is not None: assert t.shape[0] == scale.shape[0] @@ -289,9 +288,9 @@ def 
native_batched_masked_quant_matmul( B: torch.Tensor, C: torch.Tensor, num_expert_tokens: torch.Tensor, - A_scale: Optional[torch.Tensor] = None, - B_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, + A_scale: torch.Tensor | None = None, + B_scale: torch.Tensor | None = None, + block_shape: list[int] | None = None, per_act_token_quant: bool = False, ) -> torch.Tensor: num_expert_tokens_cpu = num_expert_tokens.clone() diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py index a3d524fe90ed0..465e24fd7eb97 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8.py +++ b/tests/kernels/quantization/test_cutlass_w4a8.py @@ -6,7 +6,6 @@ Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`. """ from dataclasses import dataclass -from typing import Optional import pytest import torch @@ -60,10 +59,10 @@ SCHEDULES = [ class TypeConfig: act_type: torch.dtype weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] + output_type: torch.dtype | None + group_scale_type: torch.dtype | None + channel_scale_type: torch.dtype | None + token_scale_type: torch.dtype | None @dataclass @@ -80,7 +79,7 @@ class Tensors: # (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints, # Ch Scales Type, Tok Scales Type) TestTypeTuple = tuple[ - list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool + list[torch.dtype], ScalarType, torch.dtype | None, torch.dtype | None, bool ] TEST_TYPES = [ *( @@ -116,8 +115,8 @@ def cutlass_quantize_and_pack( atype: torch.dtype, w: torch.Tensor, wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], + stype: torch.dtype | None, + group_size: int | None, zero_points: bool = False, ): assert wtype.is_integer(), "TODO: support floating point weights" @@ -143,7 +142,7 @@ def 
cutlass_quantize_and_pack( def create_test_tensors( - shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] + shape: tuple[int, int, int], types: TypeConfig, group_size: int | None ) -> Tensors: m, n, k = shape @@ -185,8 +184,8 @@ def create_test_tensors( def mm_test_helper( types: TypeConfig, tensors: Tensors, - group_size: Optional[int] = None, - schedule: Optional[str] = None, + group_size: int | None = None, + schedule: str | None = None, ): # CUTLASS upstream uses fp8 with fastaccum as reference # https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406 diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index b32523bb85d9a..efa81de158d38 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -7,7 +7,6 @@ Run `pytest tests/kernels/quantization/test_machete_mm.py`. import math from dataclasses import dataclass, fields -from typing import Optional import pytest import torch @@ -50,11 +49,11 @@ MNK_SHAPES = [ class TypeConfig: act_type: torch.dtype weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - group_zero_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] + output_type: torch.dtype | None + group_scale_type: torch.dtype | None + group_zero_type: torch.dtype | None + channel_scale_type: torch.dtype | None + token_scale_type: torch.dtype | None @dataclass @@ -63,10 +62,10 @@ class Tensors: a_ref: torch.Tensor a: torch.Tensor w_q: torch.Tensor - w_g_s: Optional[torch.Tensor] - w_g_zp: Optional[torch.Tensor] - w_ch_s: Optional[torch.Tensor] - w_tok_s: Optional[torch.Tensor] + w_g_s: torch.Tensor | None + w_g_zp: torch.Tensor | None + w_ch_s: torch.Tensor | None + w_tok_s: torch.Tensor | None # (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints, @@ -74,7 
+73,7 @@ class Tensors: # NOTE: None "Scale Type" means the act type is floating point # None "Output Type" means the output type is the same as the act type TestTypeTuple = tuple[ - list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool + list[torch.dtype], ScalarType, torch.dtype | None, torch.dtype | None, bool ] TEST_TYPES = [ # GPTQ style @@ -139,11 +138,11 @@ def rand_data(shape, dtype=torch.float16, scale=1, offset=0): return torch.randint(-8, 7, shape, dtype=dtype, device="cuda") -def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): +def maybe_convert_zeropoints(zps: torch.Tensor | None, s: torch.Tensor): return zps if zps is None else -1 * s * (zps.to(s.dtype)) -def group_size_valid(shape: tuple[int, int, int], group_size: Optional[int]) -> bool: +def group_size_valid(shape: tuple[int, int, int], group_size: int | None) -> bool: return group_size is None or group_size == -1 or shape[2] % group_size == 0 @@ -151,8 +150,8 @@ def machete_quantize_and_pack( atype: torch.dtype, w: torch.Tensor, wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], + stype: torch.dtype | None, + group_size: int | None, zero_points: bool = False, ): assert wtype.is_integer(), "TODO: support floating point weights" @@ -178,8 +177,8 @@ def machete_quantize_and_pack( def create_test_tensors( shape: tuple[int, int, int], types: TypeConfig, - group_size: Optional[int], - subset_stride_factor: Optional[int] = None, + group_size: int | None, + subset_stride_factor: int | None = None, ) -> Tensors: m, n, k = shape factor = subset_stride_factor or 1 @@ -243,8 +242,8 @@ def create_test_tensors( def machete_mm_test_helper( types: TypeConfig, tensors: Tensors, - group_size: Optional[int] = None, - schedule: Optional[str] = None, + group_size: int | None = None, + schedule: str | None = None, ): output_ref = torch.matmul(tensors.a_ref, tensors.w_ref) output_ref_type = output_ref.dtype @@ -294,7 +293,7 @@ def 
machete_mm_test_helper( @pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x)) @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_all_schedules(shape, types: TypeConfig): - group_sizes: list[Optional[int]] = [] + group_sizes: list[int | None] = [] if types.group_scale_type is None: group_sizes = [None] else: @@ -323,7 +322,7 @@ def test_machete_all_schedules(shape, types: TypeConfig): @pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x)) @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_heuristic(shape, types: TypeConfig): - group_sizes: list[Optional[int]] = [] + group_sizes: list[int | None] = [] if types.group_scale_type is None: group_sizes = [None] else: diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 1026332d99f89..6633a8bbd3c60 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -6,7 +6,6 @@ Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`. 
""" import importlib -from typing import Optional import pytest import torch @@ -27,7 +26,7 @@ def torch_scaled_mm( scale_a: torch.Tensor, scale_b: torch.Tensor, out_dtype: type[torch.dtype], - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, ) -> torch.Tensor: out = torch.mm(a.to(torch.float32), b.to(torch.float32)) out = scale_a * out diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py index 9f78c177a81f0..c9eca1f86d3a1 100644 --- a/tests/kernels/test_onednn.py +++ b/tests/kernels/test_onednn.py @@ -2,8 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Integration tests for FlexAttention backend vs default backend""" -from typing import Optional - import pytest import torch @@ -38,8 +36,8 @@ def ref_int8_scaled_mm( b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - azp: Optional[torch.Tensor], - bias: Optional[torch.Tensor], + azp: torch.Tensor | None, + bias: torch.Tensor | None, output_type: torch.dtype, ): if azp is not None: diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 015424d9ee0f7..6c7ff984b4337 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -7,7 +7,7 @@ import random import unittest from collections.abc import Sequence from numbers import Number -from typing import Any, NamedTuple, Optional, Union +from typing import Any, NamedTuple import pytest import torch @@ -96,10 +96,10 @@ class PackedQKVInputs(NamedTuple): query: torch.Tensor key: torch.Tensor value: torch.Tensor - q_start_loc_list: Optional[list[int]] - kv_start_loc_list: Optional[list[int]] - q_seq_lens: Optional[list[int]] - kv_seq_lens: Optional[list[int]] + q_start_loc_list: list[int] | None + kv_start_loc_list: list[int] | None + q_seq_lens: list[int] | None + kv_seq_lens: list[int] | None class PackedQKVO(NamedTuple): @@ -115,7 +115,7 @@ class PackedQKVO(NamedTuple): x head_size) known-correct attention output """ - packed_qkv: Optional[PackedQKVInputs] + 
packed_qkv: PackedQKVInputs | None ideal_output: torch.Tensor @@ -149,12 +149,12 @@ class PhaseTestParameters(NamedTuple): """ packed_qkvo: PackedQKVO - kv_mmap: Optional[KVMemoryMap] + kv_mmap: KVMemoryMap | None def maybe_make_int_tensor( - _list: Optional[list[int]], - device: Union[torch.device, str], + _list: list[int] | None, + device: torch.device | str, ) -> torch.Tensor: """ Convert Python int list to a 1D int torch.Tensor on `device` @@ -170,8 +170,8 @@ def maybe_make_int_tensor( def maybe_make_long_tensor( - _list: Optional[list[int]], - device: Union[torch.device, str], + _list: list[int] | None, + device: torch.device | str, ) -> torch.Tensor: """ Convert Python int list to a 1D long torch.Tensor on `device` @@ -186,7 +186,7 @@ def maybe_make_long_tensor( ) -def maybe_max(_list: Optional[list]) -> Optional[Number]: +def maybe_max(_list: list | None) -> Number | None: """ Returns: @@ -241,9 +241,9 @@ def ref_masked_attention( key: torch.Tensor, value: torch.Tensor, scale: float, - custom_mask: Optional[torch.Tensor] = None, - q_seq_lens: Optional[list] = None, - kv_seq_lens: Optional[list] = None, + custom_mask: torch.Tensor | None = None, + q_seq_lens: list | None = None, + kv_seq_lens: list | None = None, ) -> torch.Tensor: """ "Golden" masked attention reference. 
Supports two types of masking: @@ -302,11 +302,11 @@ def ref_masked_attention( def make_qkv( batch_size: int, max_q_seq_len: int, - max_kv_seq_len: Optional[int], + max_kv_seq_len: int | None, num_heads: int, head_size: int, - device: Union[torch.device, str], - force_kv_seq_lens: Optional[list[int]] = None, + device: torch.device | str, + force_kv_seq_lens: list[int] | None = None, attn_type: AttentionType = AttentionType.ENCODER_DECODER, force_max_len: bool = False, ) -> tuple[QKVInputs, QKVInputs, QKVInputs]: @@ -436,7 +436,7 @@ def make_qkv( def pack_tensor( - unpacked_tensor: torch.Tensor, seq_lens: list[int], device: Union[torch.device, str] + unpacked_tensor: torch.Tensor, seq_lens: list[int], device: torch.device | str ) -> tuple[torch.Tensor, list[int]]: """ Pack a batch_size x padded_seq_len x num_heads x head_size tensor into an @@ -470,7 +470,7 @@ def pack_tensor( return packed_tensor, start_loc_list -def pack_qkv(qkv: QKVInputs, device: Union[torch.device, str]) -> PackedQKVInputs: +def pack_qkv(qkv: QKVInputs, device: torch.device | str) -> PackedQKVInputs: """ Individually pack each of Q, K and V, each with dimensions batch_size x padded_seq_len x num_heads x head_size, into respective number_of_tokens x @@ -594,19 +594,19 @@ def make_alibi_bias( def _make_metadata_tensors( - seq_lens: Optional[list[int]], - context_lens: Optional[list[int]], - encoder_seq_lens: Optional[list[int]], - device: Union[torch.device, str], + seq_lens: list[int] | None, + context_lens: list[int] | None, + encoder_seq_lens: list[int] | None, + device: torch.device | str, ) -> tuple[ torch.Tensor, torch.Tensor, Any, Any, - Optional[torch.Tensor], + torch.Tensor | None, torch.Tensor, torch.Tensor, - Optional[int], + int | None, ]: """ Build scalar & tensor values required to build attention metadata structure. 
@@ -678,7 +678,7 @@ def make_kv_cache( num_heads: int, head_size: int, block_size: int, - device: Union[torch.device, str], + device: torch.device | str, backend: str, default_val: float = 0.0, ) -> torch.Tensor: @@ -726,18 +726,18 @@ def _num_tokens_to_min_blocks(num_tokens: int, block_size: int) -> int: return (num_tokens + block_size) // block_size -def make_empty_slot_mapping_tensor(device: Union[torch.device, str]): +def make_empty_slot_mapping_tensor(device: torch.device | str): return maybe_make_long_tensor([], device) -def make_empty_block_tables_tensor(device: Union[torch.device, str]): +def make_empty_block_tables_tensor(device: torch.device | str): return torch.tensor([], device=device) def split_slot_mapping( slot_mapping_list: torch.Tensor, seq_lens: list[int], - device: Union[torch.device, str], + device: torch.device | str, ): """ Split a slot mapping into valid prefill- and decode-phase slot mappings. @@ -799,7 +799,7 @@ def split_slot_mapping( def make_block_tables_slot_mapping( block_size: int, seq_lens: list[int], - device: Union[torch.device, str], + device: torch.device | str, block_base_addr: int = 0, ) -> tuple[torch.Tensor, list[int], int]: """ @@ -880,11 +880,11 @@ def make_block_tables_slot_mapping( def make_test_metadata( attn_backend: _Backend, is_prompt: bool, - seq_lens: Optional[list[int]], - decoder_test_params: Optional[PhaseTestParameters], - device: Union[torch.device, str], - encoder_test_params: Optional[PhaseTestParameters] = None, - cross_test_params: Optional[PhaseTestParameters] = None, + seq_lens: list[int] | None, + decoder_test_params: PhaseTestParameters | None, + device: torch.device | str, + encoder_test_params: PhaseTestParameters | None = None, + cross_test_params: PhaseTestParameters | None = None, ) -> AttentionMetadata: """ Construct fake attention metadata for a given test phase @@ -1142,16 +1142,16 @@ def torch_experts( topk_weight: torch.Tensor, topk_ids: torch.Tensor, global_num_experts: int = -1, - b_bias1: 
Optional[torch.Tensor] = None, - b_bias2: Optional[torch.Tensor] = None, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + b_bias1: torch.Tensor | None = None, + b_bias2: torch.Tensor | None = None, + expert_map: torch.Tensor | None = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant=False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, apply_router_weights_on_input: bool = False, ) -> torch.Tensor: assert ( @@ -1261,10 +1261,10 @@ def torch_moe( w2: torch.Tensor, score: torch.Tensor, topk: int, - b_bias1: Optional[torch.Tensor] = None, - b_bias2: Optional[torch.Tensor] = None, + b_bias1: torch.Tensor | None = None, + b_bias2: torch.Tensor | None = None, global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, ) -> torch.Tensor: score = torch.softmax(score, dim=-1, dtype=torch.float32) topk_weight, topk_ids = torch.topk(score, topk) @@ -1298,15 +1298,13 @@ def torch_moe_single(a, w, score, topk): # A special version of op check that has a restricted default set of test_utils # and a patched version of allclose that supports fp8 types. 
def opcheck( - op: Union[ - torch._ops.OpOverload, - torch._ops.OpOverloadPacket, - torch._library.custom_ops.CustomOpDef, - ], + op: torch._ops.OpOverload + | torch._ops.OpOverloadPacket + | torch._library.custom_ops.CustomOpDef, args: tuple[Any, ...], - kwargs: Optional[dict[str, Any]] = None, + kwargs: dict[str, Any] | None = None, *, - test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS, + test_utils: str | Sequence[str] = ALL_OPCHECK_TEST_UTILS, raise_exception: bool = True, cond: bool = True, ) -> dict[str, str]: @@ -1338,7 +1336,7 @@ def baseline_scaled_mm( scale_a: torch.Tensor, scale_b: torch.Tensor, out_dtype: type[torch.dtype], - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, ) -> torch.Tensor: # We treat N-dimensional group scaling as extended numpy-style broadcasting # in numpy simply stretches dimensions with an extent of 1 to match diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 695e06e7c1d63..8f18f01441932 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -4,7 +4,6 @@ import random from copy import deepcopy from dataclasses import dataclass -from typing import Optional from unittest.mock import patch import pytest @@ -106,7 +105,7 @@ def skip_cuda_with_stage_false(request): def get_random_id_to_index( num_loras: int, num_slots: int, log: bool = True -) -> list[Optional[int]]: +) -> list[int | None]: """Creates a random lora_id_to_index mapping. Args: @@ -122,7 +121,7 @@ def get_random_id_to_index( "num_loras must be less than or equal to num_slots." 
) - slots: list[Optional[int]] = [None] * num_slots + slots: list[int | None] = [None] * num_slots random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist() for lora_id, slot_idx in enumerate(random_slot_selections, start=1): slots[slot_idx] = lora_id @@ -134,7 +133,7 @@ def get_random_id_to_index( def populate_loras( - id_to_index: list[Optional[int]], + id_to_index: list[int | None], layer: BaseLayerWithLoRA, layer_weights: torch.Tensor, generate_embeddings_tensor: int = 0, diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 0d9431bd7aaea..50fd63d35cded 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys -from typing import Union import vllm from vllm import LLM @@ -27,7 +26,7 @@ def do_sample( llm: vllm.LLM, lora_path: str, lora_id: int, - tensorizer_config_dict: Union[dict, None] = None, + tensorizer_config_dict: dict | None = None, ) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 @@ -73,9 +72,7 @@ def do_sample( return generated_texts -def generate_and_test( - llm, sql_lora_files, tensorizer_config_dict: Union[dict, None] = None -): +def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None): print("lora adapter created") print("lora 1") assert ( diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 894263bd0ba38..1800ca107a426 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional import pytest @@ -20,7 +19,7 @@ 
class TestConfig: max_loras: int = 2 max_lora_rank: int = 16 max_model_len: int = 4096 - mm_processor_kwargs: Optional[dict[str, int]] = None + mm_processor_kwargs: dict[str, int] | None = None def __post_init__(self): if self.mm_processor_kwargs is None: @@ -61,7 +60,7 @@ class Qwen2VLTester: self, images: list[ImageAsset], expected_outputs: list[str], - lora_id: Optional[int] = None, + lora_id: int | None = None, temperature: float = 0, max_tokens: int = 5, ): @@ -92,7 +91,7 @@ class Qwen2VLTester: self, images: list[ImageAsset], expected_outputs: list[list[str]], - lora_id: Optional[int] = None, + lora_id: int | None = None, temperature: float = 0, beam_width: int = 2, max_tokens: int = 5, diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py index c70e58a375c78..9b5dedc4327fb 100644 --- a/tests/lora/test_resolver.py +++ b/tests/lora/test_resolver.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest @@ -14,7 +13,7 @@ class DummyLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str - ) -> Optional[LoRARequest]: + ) -> LoRARequest | None: if lora_name == "test_lora": return LoRARequest( lora_name=lora_name, diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index c861a52d68721..eb026c2ec0209 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import OrderedDict -from typing import NamedTuple, Optional +from typing import NamedTuple from unittest.mock import patch import pytest @@ -21,7 +21,7 @@ class LoRANameParserTestConfig(NamedTuple): name: str module_name: str is_lora_a: bool - weights_mapper: Optional[WeightsMapper] = None + weights_mapper: WeightsMapper | None = None def test_parse_fine_tuned_lora_name_valid(): diff --git a/tests/lora/utils.py 
b/tests/lora/utils.py index b522aa6b08743..d30b77f094665 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -4,7 +4,6 @@ import json import os from dataclasses import dataclass -from typing import Optional, Union import torch from safetensors.torch import save_file @@ -81,7 +80,7 @@ class DummyLoRAManager: module_name: str, input_dim: int, output_dims: list[int], - noop_lora_index: Optional[list[int]] = None, + noop_lora_index: list[int] | None = None, rank: int = 8, ): base_loras: list[LoRALayerWeights] = [] @@ -113,7 +112,7 @@ def assert_close(a, b): @dataclass class PunicaTensors: inputs_tensor: torch.Tensor - lora_weights: Union[torch.Tensor, list[torch.Tensor]] + lora_weights: torch.Tensor | list[torch.Tensor] our_out_tensor: torch.Tensor ref_out_tensor: torch.Tensor b_seq_start_loc: torch.Tensor diff --git a/tests/model_executor/model_loader/tensorizer_loader/conftest.py b/tests/model_executor/model_loader/tensorizer_loader/conftest.py index add6d3742ff53..74724a3b398dd 100644 --- a/tests/model_executor/model_loader/tensorizer_loader/conftest.py +++ b/tests/model_executor/model_loader/tensorizer_loader/conftest.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable +from collections.abc import Callable import pytest diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 12aad4cb8da0f..bf290079469aa 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -76,7 +75,7 @@ class Relu3(ReLUSquaredActivation): ], ) def test_enabled_ops( - env: Optional[str], + env: str | None, torch_level: int, use_inductor: bool, ops_enabled: list[int], diff --git 
a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index b161cc7153b8f..ad37d1ad82c03 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -138,7 +137,7 @@ def test_models( example_prompts, max_tokens, num_logprobs ) - prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None + prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None prompt_token_ids = [] for prompt in example_prompts: diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index abedd15b0d7eb..fd2df329f17f9 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable +from collections.abc import Callable import pytest diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py index 43f6066b1c85e..cfa09635effc1 100644 --- a/tests/models/language/generation_ppl_test/ppl_utils.py +++ b/tests/models/language/generation_ppl_test/ppl_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://huggingface.co/docs/transformers/perplexity -from typing import Optional, cast +from typing import cast import pytest import torch @@ -85,7 +85,7 @@ def wikitext_ppl_test( n_tokens = 0 for output in outputs: output = cast(TokensTextLogprobsPromptLogprobs, output) - token_datas = cast(list[Optional[dict[int, Logprob]]], output[3]) + token_datas = cast(list[dict[int, 
Logprob] | None], output[3]) assert token_datas[0] is None token_log_probs = [] diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 261ab80ae86bc..4ac40656bc62a 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Optional import pytest @@ -13,7 +12,7 @@ def run_embedding_correctness_test( hf_model: "HfRunner", inputs: list[str], vllm_outputs: Sequence[list[float]], - dimensions: Optional[int] = None, + dimensions: int | None = None, ): hf_outputs = hf_model.encode(inputs) if dimensions: diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index c9574dca498ee..c8deffbf66dba 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest @@ -66,7 +65,7 @@ def test_models( pooling_type="MEAN", normalize=False ) - max_model_len: Optional[int] = 512 + max_model_len: int | None = 512 if model in [ "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/stsb-roberta-base-v2", diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 14308ac06c03e..0adc9b5cf25f6 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import numpy as np import openai import pytest diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py 
b/tests/models/language/pooling_mteb_test/mteb_utils.py index 65ad49fad3653..f2a8177377491 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_utils.py @@ -3,7 +3,6 @@ import tempfile from collections.abc import Sequence -from typing import Optional import mteb import numpy as np @@ -51,7 +50,7 @@ class VllmMtebEncoder(mteb.Encoder): def predict( self, - sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, str | None]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: @@ -100,7 +99,7 @@ class ScoreClientMtebEncoder(mteb.Encoder): def predict( self, - sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, str | None]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: @@ -294,7 +293,7 @@ def mteb_test_rerank_models_hf( original_predict = hf_model.predict def _predict( - sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, str | None]], # query, corpus, prompt *args, **kwargs, ): diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index 9e95dd74c3978..2927a37111364 100644 --- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Any import numpy as np import pytest @@ -111,7 +111,7 @@ class GemmaMtebEncoder(VllmMtebEncoder): def predict( self, - sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, str | None]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: diff --git 
a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index ef08b1916aa5f..e39dfc888779e 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Optional import pytest from transformers import AutoModelForSpeechSeq2Seq @@ -18,8 +17,8 @@ HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], -) -> tuple[list[int], str, Optional[SampleLogprobs]]: + vllm_output: tuple[list[int], str, SampleLogprobs | None], +) -> tuple[list[int], str, SampleLogprobs | None]: """Sanitize hf output to be comparable with vllm output.""" output_ids, output_str, out_logprobs = vllm_output @@ -46,7 +45,7 @@ def run_test( max_tokens: int, num_logprobs: int, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ): """Inference result should be the same between hf and vllm. 
diff --git a/tests/models/multimodal/generation/test_phi4_multimodal.py b/tests/models/multimodal/generation/test_phi4_multimodal.py index 132c69285c5c7..cbc7dfca0234d 100644 --- a/tests/models/multimodal/generation/test_phi4_multimodal.py +++ b/tests/models/multimodal/generation/test_phi4_multimodal.py @@ -3,7 +3,6 @@ import os from collections.abc import Sequence -from typing import Optional import librosa import pytest @@ -57,7 +56,7 @@ if current_platform.is_rocm(): def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]], + inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]], model: str, *, max_model_len: int, @@ -66,7 +65,7 @@ def run_test( num_logprobs: int, mm_limit: int, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ): """Inference result should be the same between hf and vllm. 
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index e69d44c6a1319..5619cecc081d2 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -3,7 +3,6 @@ import os from collections.abc import Sequence -from typing import Optional import librosa import pytest @@ -48,7 +47,7 @@ models = [model_path] def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str + vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str ): """Sanitize vllm output to be comparable with hf output.""" _, output_str, out_logprobs = vllm_output @@ -79,7 +78,7 @@ if current_platform.is_rocm(): def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]], + inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]], model: str, *, max_model_len: int, @@ -88,7 +87,7 @@ def run_test( num_logprobs: int, mm_limit: int, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ): """Inference result should be the same between hf and vllm. 
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index bde07da9101ac..3cad2c43d5623 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from dataclasses import asdict -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any import pytest from mistral_common.multimodal import download_image @@ -117,7 +117,7 @@ FIXTURE_LOGPROBS_CHAT = { MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json", } -OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]] +OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]] # For the test author to store golden output in JSON diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index a8f0ba8701850..a4abf6e405f74 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional, TypedDict, Union +from typing import Any, TypedDict import numpy.typing as npt import pytest @@ -83,7 +83,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( - image_batches: list[Union[Image.Image, list[Image.Image]]], + image_batches: list[Image.Image | list[Image.Image]], processor, llm: VllmRunner, ) -> list[Qwen2VLPromptImageEmbeddingInput]: @@ -272,7 +272,7 @@ def run_embedding_input_test( num_logprobs: int, mm_limit: int, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ): """Inference result should be the same between original image/video input and image/video 
embeddings input. diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 766f09b0d3207..eca2b61e37d53 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest @@ -92,7 +91,7 @@ def run_test( model: str, *, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ) -> None: prompt_list = PROMPTS * 10 expected_list = EXPECTED[model] * 10 diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 096931cca09f7..6252f33bdfad7 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -2,9 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Helpers for building inputs that can be leveraged for different test types.""" -from collections.abc import Iterable +from collections.abc import Callable, Iterable from pathlib import PosixPath -from typing import Callable, Optional, Union import torch @@ -47,9 +46,9 @@ def replace_test_placeholder( def get_model_prompts( base_prompts: Iterable[str], - img_idx_to_prompt: Optional[Callable[[int], str]], - video_idx_to_prompt: Optional[Callable[[int], str]], - audio_idx_to_prompt: Optional[Callable[[int], str]], + img_idx_to_prompt: Callable[[int], str] | None, + video_idx_to_prompt: Callable[[int], str] | None, + audio_idx_to_prompt: Callable[[int], str] | None, prompt_formatter: Callable[[str], str], ) -> list[str]: """Given a model-agnostic base prompt and test configuration for a model(s) @@ -93,7 +92,7 @@ def build_single_image_inputs_from_test_info( test_info: VLMTestInfo, 
image_assets: ImageTestAssets, size_wrapper: ImageSizeWrapper, - tmp_path: Optional[PosixPath] = None, + tmp_path: PosixPath | None = None, ) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError("Prompt formatter must be set to build single image inputs") @@ -147,7 +146,7 @@ def build_multi_image_inputs_from_test_info( test_info: VLMTestInfo, image_assets: ImageTestAssets, size_wrapper: ImageSizeWrapper, - tmp_path: Optional[PosixPath] = None, + tmp_path: PosixPath | None = None, ) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError("Prompt formatter must be set to build multi image inputs") @@ -266,9 +265,7 @@ def build_video_inputs_from_test_info( ] -def apply_image_size_scaling( - image, size: Union[float, tuple[int, int]], size_type: SizeType -): +def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType): """Applies a size scaler to one image; this can be an image size factor, which scales the image while maintaining the aspect ratio""" # Special case for embeddings; if it's a tensor, it's only valid if we diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 5748ccc14c294..8d0e9b3eee9fd 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Core test implementation to be shared across modalities.""" -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any import torch from transformers.models.auto.auto_factory import _BaseAutoModelClass @@ -27,21 +28,21 @@ def run_test( enforce_eager: bool, max_model_len: int, max_num_seqs: int, - hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], - vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], + 
hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None, + vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None, auto_cls: type[_BaseAutoModelClass], use_tokenizer_eos: bool, comparator: Callable[..., None], - get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]], - stop_str: Optional[list[str]], + get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None, + stop_str: list[str] | None, limit_mm_per_prompt: dict[str, int], - vllm_runner_kwargs: Optional[dict[str, Any]], - hf_model_kwargs: Optional[dict[str, Any]], - patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], + vllm_runner_kwargs: dict[str, Any] | None, + hf_model_kwargs: dict[str, Any] | None, + patch_hf_runner: Callable[[HfRunner], HfRunner] | None, runner: RunnerOption = "auto", - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, tensor_parallel_size: int = 1, - vllm_embeddings: Optional[torch.Tensor] = None, + vllm_embeddings: torch.Tensor | None = None, ): """Modality agnostic test executor for comparing HF/vLLM outputs.""" # In the case of embeddings, vLLM takes separate input tensors diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index 8f2f8bba39ca2..8c9c390911bdc 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom input builders for edge-cases in different models.""" -from typing import Callable +from collections.abc import Callable from vllm.assets.image import ImageAsset from vllm.multimodal.image import rescale_image_size diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index e51d895772c05..d9c1d53b61c28 100644 --- 
a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -7,7 +7,6 @@ typically specific to a small subset of models. import types from pathlib import PosixPath -from typing import Optional, Union import numpy as np import numpy.typing as npt @@ -58,7 +57,7 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutpu def qwen_vllm_to_hf_output( vllm_output: RunnerOutput, model: str -) -> tuple[list[int], str, Optional[SampleLogprobs]]: +) -> tuple[list[int], str, SampleLogprobs | None]: """Sanitize vllm output [qwen models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -69,7 +68,7 @@ def qwen_vllm_to_hf_output( def qwen2_vllm_to_hf_output( vllm_output: RunnerOutput, model: str -) -> tuple[list[int], str, Optional[SampleLogprobs]]: +) -> tuple[list[int], str, SampleLogprobs | None]: """Sanitize vllm output [qwen2 models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -80,7 +79,7 @@ def qwen2_vllm_to_hf_output( def kimiv_vl_vllm_to_hf_output( vllm_output: RunnerOutput, model: str -) -> tuple[list[int], str, Optional[SampleLogprobs]]: +) -> tuple[list[int], str, SampleLogprobs | None]: """Sanitize vllm output [kimi_vl models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -99,7 +98,7 @@ def llava_image_vllm_to_hf_output( def llava_video_vllm_to_hf_output( vllm_output: RunnerOutput, model: str -) -> tuple[list[int], str, Optional[SampleLogprobs]]: +) -> tuple[list[int], str, SampleLogprobs | None]: config = AutoConfig.from_pretrained(model) mm_token_id = config.video_token_index return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id) @@ -263,7 +262,7 @@ def get_llava_embeddings(image_assets: ImageTestAssets): ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( - tmp_path: PosixPath, prompt: str, 
assets: Union[list[ImageAsset], ImageTestAssets] + tmp_path: PosixPath, prompt: str, assets: list[ImageAsset] | ImageTestAssets ) -> str: """Given a temporary dir path, export one or more image assets into the tempdir & replace its contents with the local path to the string so that @@ -440,7 +439,7 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): + def __call__(self, text: str, images: Image | list[Image], **kwargs): from vllm.model_executor.models.h2ovl import ( IMG_CONTEXT, IMG_END, @@ -499,7 +498,7 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner: self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): + def __call__(self, text: str, images: Image | list[Image], **kwargs): from vllm.model_executor.models.skyworkr1v import ( IMG_CONTEXT, IMG_END, @@ -560,8 +559,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def __call__( self, text: str, - images: Union[Image, list[Image]] = None, - videos: Union[npt.NDArray, list[npt.NDArray]] = None, + images: Image | list[Image] = None, + videos: npt.NDArray | list[npt.NDArray] = None, **kwargs, ): from vllm.model_executor.models.internvl import ( @@ -650,7 +649,7 @@ def _internvl_generate( self, pixel_values: torch.FloatTensor, input_ids: torch.FloatTensor, - attention_mask: Optional[torch.LongTensor] = None, + attention_mask: torch.LongTensor | None = None, **generate_kwargs, ) -> torch.LongTensor: """Generate method for InternVL2 model without fixed use_cache.""" diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index 6e82f7e3306ab..fe02f71884324 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ 
b/tests/models/multimodal/generation/vlm_utils/types.py @@ -2,10 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Types for writing multimodal model tests.""" -from collections.abc import Iterable +from collections.abc import Callable, Iterable from enum import Enum from pathlib import PosixPath -from typing import Any, Callable, NamedTuple, Optional, Union +from typing import Any, NamedTuple import torch from pytest import MarkDecorator @@ -52,16 +52,16 @@ VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?" IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)] EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)] -RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]] +RunnerOutput = tuple[list[int], str, SampleLogprobs | None] class PromptWithMultiModalInput(NamedTuple): """Holds the multimodal input for a single test case.""" prompts: list[str] - image_data: Optional[PromptImageInput] = None - video_data: Optional[PromptVideoInput] = None - audio_data: Optional[PromptAudioInput] = None + image_data: PromptImageInput | None = None + video_data: PromptVideoInput | None = None + audio_data: PromptAudioInput | None = None class VLMTestType(Enum): @@ -87,17 +87,17 @@ class ImageSizeWrapper(NamedTuple): type: SizeType # A size factor is a wrapper of 0+ floats, # while a fixed size contains an iterable of integer pairs - data: Union[Iterable[float], Iterable[tuple[int, int]]] + data: Iterable[float] | Iterable[tuple[int, int]] class VLMTestInfo(NamedTuple): """Holds the configuration for 1+ tests for one model architecture.""" models: list[str] - test_type: Union[VLMTestType, Iterable[VLMTestType]] + test_type: VLMTestType | Iterable[VLMTestType] # Should be None only if this is a CUSTOM_INPUTS test - prompt_formatter: Optional[Callable[[str], str]] = None + prompt_formatter: Callable[[str], str] | None = None img_idx_to_prompt: Callable[[int], str] = lambda idx: "\n" video_idx_to_prompt: 
Callable[[int], str] = lambda idx: "