mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-05-06 11:37:59 +08:00)

commit 0fa9111e82

update

Signed-off-by: bk-201 <joy25810@foxmail.com>
@@ -25,25 +25,28 @@ function cpu_tests() {
 
 # offline inference
 podman exec -it "$container_id" bash -c "
-set -e
+set -xve
-python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
 # Run basic model test
 podman exec -it "$container_id" bash -c "
-set -e
+set -evx
 pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
 pip install sentence-transformers datamodel_code_generator
-pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+# Note: disable Bart until supports V1
+# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
 pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
 pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
 pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
 pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 
 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
 
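Note on the shell changes above: the test scripts now run with tracing enabled and append their output to log files. A minimal illustrative sketch of what the new flags do (plain bash; the command and log path are the ones used in the script):

    set -e   # abort on the first failing command
    set -v   # echo each script line as it is read
    set -x   # trace each command after argument expansion
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m >> "$HOME/test_basic.log"   # append stdout to the log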
@@ -44,6 +44,5 @@ docker run \
 pytest -v -s v1/structured_output
 pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
 pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-pytest -v -s v1/test_metrics
 pytest -v -s v1/test_serial_utils.py
 '
.github/CODEOWNERS (vendored): 8 changed lines

@@ -121,3 +121,11 @@ mkdocs.yaml @hmellor
 
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
 
 import aiohttp
 import huggingface_hub.constants

@@ -28,13 +27,13 @@ class RequestFuncInput:
 prompt_len: int
 output_len: int
 model: str
-model_name: Optional[str] = None
+model_name: str | None = None
-logprobs: Optional[int] = None
+logprobs: int | None = None
-extra_body: Optional[dict] = None
+extra_body: dict | None = None
-multi_modal_content: Optional[dict | list[dict]] = None
+multi_modal_content: dict | list[dict] | None = None
 ignore_eos: bool = False
-language: Optional[str] = None
+language: str | None = None
-request_id: Optional[str] = None
+request_id: str | None = None
 
 
 @dataclass

@@ -52,7 +51,7 @@ class RequestFuncOutput:
 
 async def async_request_tgi(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith("generate_stream")

@@ -133,7 +132,7 @@ async def async_request_tgi(
 
 async def async_request_trt_llm(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith("generate_stream")

@@ -204,7 +203,7 @@ async def async_request_trt_llm(
 
 async def async_request_deepspeed_mii(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("completions", "profile")), (

@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
 
 async def async_request_openai_completions(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("completions", "profile")), (

@@ -367,7 +366,7 @@ async def async_request_openai_completions(
 
 async def async_request_openai_chat_completions(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 api_url = request_func_input.api_url
 assert api_url.endswith(("chat/completions", "profile")), (

@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
 
 async def async_request_openai_audio(
 request_func_input: RequestFuncInput,
-pbar: Optional[tqdm] = None,
+pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
 # Lazy import without PlaceholderModule to avoid vllm dep.
 import soundfile

@@ -610,7 +609,7 @@ def get_tokenizer(
 tokenizer_mode: str = "auto",
 trust_remote_code: bool = False,
 **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
 if pretrained_model_name_or_path is not None and not os.path.exists(
 pretrained_model_name_or_path
 ):
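The typing changes in this and the following files all follow the same pattern: PEP 604 union syntax (Python 3.10+) replaces typing.Optional and typing.Union, and Callable is imported from collections.abc instead of typing. An illustrative before/after sketch (the function and variable names here are hypothetical, not taken from the diff):

    # Before: typing-module spellings
    from typing import Callable, Optional, Union

    def fetch(url: str, timeout: Optional[float] = None) -> Union[str, bytes]: ...
    on_done: Optional[Callable[[str], None]] = None

    # After: PEP 604 unions and collections.abc.Callable (requires Python >= 3.10)
    from collections.abc import Callable

    def fetch(url: str, timeout: float | None = None) -> str | bytes: ...
    on_done: Callable[[str], None] | None = None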
@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import PreTrainedTokenizerBase
 

@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
 input_length_range: tuple[int, int],
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 ) -> list[Request]:
 if fixed_output_len is not None and fixed_output_len < 4:
 raise ValueError("output_len too small")

@@ -128,7 +127,7 @@ def sample_requests_from_random(
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
 input_length_range: tuple[int, int],
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 prefix_len: int,
 ) -> list[Request]:
 requests = []
@@ -7,7 +7,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 

@@ -24,7 +23,7 @@ def sample_requests(
 dataset_path: str,
 num_requests: int,
 tokenizer: PreTrainedTokenizerBase,
-fixed_output_len: Optional[int],
+fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
 if fixed_output_len is not None and fixed_output_len < 4:
 raise ValueError("output_len too small")
@@ -32,7 +32,6 @@ import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from typing import Optional
 
 import datasets
 import numpy as np

@@ -316,7 +315,7 @@ def calculate_metrics(
 tokenizer: PreTrainedTokenizerBase,
 selected_percentile_metrics: list[str],
 selected_percentiles: list[float],
-goodput_config_dict: Optional[dict[str, float]] = None,
+goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
 actual_output_lens: list[int] = []
 total_input = 0

@@ -436,9 +435,9 @@ async def benchmark(
 selected_percentile_metrics: list[str],
 selected_percentiles: list[str],
 ignore_eos: bool,
-max_concurrency: Optional[int],
+max_concurrency: int | None,
 structured_output_ratio: float,
-goodput_config_dict: Optional[dict[str, float]] = None,
+goodput_config_dict: dict[str, float] | None = None,
 ):
 if backend in ASYNC_REQUEST_FUNCS:
 request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any
 
 
 def convert_to_pytorch_benchmark_format(

@@ -92,7 +92,7 @@ class TimeCollector:
 def __init__(self, scale: int) -> None:
 self.cnt: int = 0
 self._sum: int = 0
-self._max: Optional[int] = None
+self._max: int | None = None
 self.scale = scale
 self.start_time: int = time.monotonic_ns()
 

@@ -104,13 +104,13 @@ class TimeCollector:
 else:
 self._max = max(self._max, v)
 
-def avg(self) -> Union[float, str]:
+def avg(self) -> float | str:
 return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
 
-def max(self) -> Union[float, str]:
+def max(self) -> float | str:
 return self._max / self.scale if self._max else "N/A"
 
-def dump_avg_max(self) -> list[Union[float, str]]:
+def dump_avg_max(self) -> list[float | str]:
 return [self.avg(), self.max()]
 
 def __enter__(self) -> None:

@@ -118,8 +118,8 @@ class TimeCollector:
 
 def __exit__(
 self,
-exc_type: Optional[type[BaseException]],
+exc_type: type[BaseException] | None,
-exc_value: Optional[BaseException],
+exc_value: BaseException | None,
-exc_traceback: Optional[TracebackType],
+exc_traceback: TracebackType | None,
 ) -> None:
 self.collect(time.monotonic_ns() - self.start_time)
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
-from typing import Callable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
-from typing import Callable, Optional
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -53,7 +52,7 @@ def bench_int8(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 """Benchmark INT8-based kernels."""
 assert dtype == torch.int8

@@ -108,7 +107,7 @@ def bench_fp8(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 """Benchmark FP8-based kernels."""
 assert dtype == torch.float8_e4m3fn

@@ -183,7 +182,7 @@ def bench(
 n: int,
 label: str,
 sub_label: str,
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 if dtype == torch.int8:
 return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)

@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
 dtype: torch.dtype,
 MKNs: Iterable[tuple[int, int, int]],
-bench_kernels: Optional[list[str]] = None,
+bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
 results = []
 for m, k, n in MKNs:
@@ -3,10 +3,9 @@
 
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
 rms_norm_layer: RMSNorm,
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 # Norm

@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
 rms_norm_layer: RMSNorm,
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 # Norm

@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
 rms_norm_layer: RMSNorm,  # this stores the weights
 x: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 quant_dtype: torch.dtype,
 ):
 out, _ = ops.rms_norm_dynamic_per_token_quant(
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch
 
 import pandas as pd
@@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional
 
 import torch
 import torch.distributed as dist

@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
 def benchmark_allreduce_single(
 self,
 sequence_length: int,
-allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
 should_use_fn: Callable[[torch.Tensor], bool],
 context,
 num_warmup: int,
 num_trials: int,
-) -> Optional[float]:
+) -> float | None:
 """Benchmark method with CUDA graph optimization."""
 try:
 # Create test tensor (2D: sequence_length x hidden_size)
@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -158,7 +159,7 @@ def ref_group_gemm(
 seq_lens_cpu: torch.Tensor,
 prompt_lora_mapping_cpu: torch.Tensor,
 scaling: float,
-add_inputs: Optional[bool],
+add_inputs: bool | None,
 ):
 """
 Torch group gemm reference implementation to test correctness of

@@ -316,8 +317,8 @@ class BenchmarkContext:
 lora_rank: int
 sort_by_lora_id: bool
 dtype: torch.dtype
-seq_length: Optional[int] = None
+seq_length: int | None = None
-num_slices: Optional[int] = None  # num_slices for slice based ops
+num_slices: int | None = None  # num_slices for slice based ops
 
 def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
 ctx = copy.copy(self)

@@ -561,7 +562,7 @@ class BenchmarkTensors:
 }
 
 def bench_fn_kwargs(
-self, op_type: OpType, add_inputs: Optional[bool] = None
+self, op_type: OpType, add_inputs: bool | None = None
 ) -> dict[str, Any]:
 if op_type.is_shrink_fn():
 assert add_inputs is None

@@ -575,7 +576,7 @@ class BenchmarkTensors:
 raise ValueError(f"Unrecognized optype {self}")
 
 def test_correctness(
-self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+self, op_type: OpType, expand_fn_add_inputs: bool | None
 ) -> bool:
 """
 Test correctness of op_type implementation against a grouped gemm

@@ -611,8 +612,8 @@ def bench_optype(
 ctx: BenchmarkContext,
 arg_pool_size: int,
 op_type: OpType,
-cuda_graph_nops: Optional[int] = None,
+cuda_graph_nops: int | None = None,
-expand_fn_add_inputs: Optional[bool] = None,
+expand_fn_add_inputs: bool | None = None,
 test_correctness: bool = False,
 ) -> TMeasurement:
 assert arg_pool_size >= 1

@@ -679,7 +680,7 @@ def bench_torch_mm(
 ctx: BenchmarkContext,
 arg_pool_size: int,
 op_type: OpType,
-cuda_graph_nops: Optional[int] = None,
+cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
 """
 Benchmark basic torch.mm as a roofline.

@@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
 """
 
 
-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
 compare = TBenchmark.Compare(timers)
 compare.print()
 
@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import pandas as pd
 import torch

@@ -63,23 +62,23 @@ class BenchmarkTensors:
 a: torch.Tensor
 
 w_q: torch.Tensor
-group_size: Optional[int]
+group_size: int | None
 wtype: ScalarType
 w_g_s: torch.Tensor
-w_g_zp: Optional[torch.Tensor]
+w_g_zp: torch.Tensor | None
-w_ch_s: Optional[torch.Tensor]
+w_ch_s: torch.Tensor | None
-w_tok_s: Optional[torch.Tensor]
+w_tok_s: torch.Tensor | None
 
 
 @dataclass
 class TypeConfig:
 act_type: torch.dtype
 weight_type: ScalarType
-output_type: Optional[torch.dtype]
+output_type: torch.dtype | None
-group_scale_type: Optional[torch.dtype]
+group_scale_type: torch.dtype | None
-group_zero_type: Optional[torch.dtype]
+group_zero_type: torch.dtype | None
-channel_scale_type: Optional[torch.dtype]
+channel_scale_type: torch.dtype | None
-token_scale_type: Optional[torch.dtype]
+token_scale_type: torch.dtype | None
 
 
 def rand_data(shape, dtype=torch.float16, scale=1):

@@ -93,8 +92,8 @@ def quantize_and_pack(
 atype: torch.dtype,
 w: torch.Tensor,
 wtype: ScalarType,
-stype: Optional[torch.dtype],
+stype: torch.dtype | None,
-group_size: Optional[int],
+group_size: int | None,
 zero_points: bool = False,
 ):
 assert wtype.is_integer(), "TODO: support floating point weights"

@@ -113,7 +112,7 @@ def quantize_and_pack(
 
 
 def create_bench_tensors(
-shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
 m, n, k = shape
 

@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
 return res
 
 
-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
 
 
 def bench(
@@ -3,7 +3,6 @@
 
 import random
 import time
-from typing import Optional
 
 import torch
 

@@ -37,7 +36,7 @@ def main(
 seed: int,
 do_profile: bool,
 device: str = "cuda",
-kv_cache_dtype: Optional[str] = None,
+kv_cache_dtype: str | None = None,
 ) -> None:
 current_platform.seed_everything(seed)
 
@@ -3,8 +3,8 @@
 
 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch
 
 import torch
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from typing import Optional, Union
 
 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm

@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
 def forward(
 self,
 x: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
-) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
 orig_dtype = x.dtype
 x = x.to(torch.float32)
 if residual is not None:

@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)

@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 orig_shape = x.shape

@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
 x: torch.Tensor,
 weight: torch.Tensor,
-residual: Optional[torch.Tensor] = None,
+residual: torch.Tensor | None = None,
 eps: float = 1e-6,
 ):
 orig_shape = x.shape
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate
-from typing import Optional
 
 import nvtx
 import torch

@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
 seq_len: int,
 num_heads: int,
 head_size: int,
-rotary_dim: Optional[int],
+rotary_dim: int | None,
 dtype: torch.dtype,
 seed: int,
 device: str,
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional
 
 import flashinfer
 import torch

@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_decode(
 dtype: torch.dtype,
-quant_dtypes: tuple[
-Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-],
+quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
 batch_size: int,
 max_seq_len: int,
 num_heads: tuple[int, int] = (64, 8),
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional
 
 import flashinfer
 import torch

@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
 dtype: torch.dtype,
-quant_dtypes: tuple[
-Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-],
+quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
 batch_size: int,
 max_seq_len: int,
 num_heads: tuple[int, int] = (64, 8),
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark

@@ -55,7 +55,7 @@ class Bench:
 
 def __init__(
 self,
-cuda_graph_params: Optional[CudaGraphBenchParams],
+cuda_graph_params: CudaGraphBenchParams | None,
 label: str,
 sub_label: str,
 description: str,
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore

@@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
 def __init__(
 self,
-min_val: Union[int, float],
+min_val: int | float,
-max_val: Union[int, float],
+max_val: int | float,
 is_integer: bool = True,
 ) -> None:
 self.min_val = min_val

@@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
 
 
 class ConstantDistribution(Distribution):
-def __init__(self, value: Union[int, float]) -> None:
+def __init__(self, value: int | float) -> None:
 self.value = value
 self.max_val = value
 

@@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
 
 
 class ZipfDistribution(Distribution):
-def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+def __init__(self, alpha: float, max_val: int | None = None) -> None:
 self.alpha = alpha
 self.max_val = max_val
 

@@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
 
 
 class PoissonDistribution(Distribution):
-def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+def __init__(self, alpha: float, max_val: int | None = None) -> None:
 self.alpha = alpha
 self.max_val = max_val
 

@@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
 def __init__(
 self,
-mean: Optional[float] = None,
+mean: float | None = None,
-sigma: Optional[float] = None,
+sigma: float | None = None,
-average: Optional[int] = None,
+average: int | None = None,
-median_ratio: Optional[float] = None,
+median_ratio: float | None = None,
-max_val: Optional[int] = None,
+max_val: int | None = None,
 ) -> None:
 self.average = average
 self.median_ratio = median_ratio
@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Union
+from typing import NamedTuple
 
 import aiohttp  # type: ignore
 import numpy as np  # type: ignore

@@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
 def __init__(self, logger: logging.Logger, window_size: int) -> None:
 self.logger = logger
-self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+self.metrics: dict[str, MovingAverage | MetricStats] = {
 "moving_avg_ttft_ms": MovingAverage(window_size),
 "moving_avg_tpot_ms": MovingAverage(window_size),
 "ttft_ms": MetricStats(),

@@ -636,7 +636,7 @@ async def client_main(
 
 if args.verbose:
 curr_time_sec: float = time.perf_counter()
-time_since_last_turn: Union[str, float] = "N/A"
+time_since_last_turn: str | float = "N/A"
 if conv_id in time_of_last_turn:
 time_since_last_turn = round(
 curr_time_sec - time_of_last_turn[conv_id], 3

@@ -928,13 +928,13 @@ async def main_mp(
 f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501
 )
 
-rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+rps: str | float = round(len(client_metrics) / runtime_sec, 3)
 if len(client_metrics) < (5 * bench_args.num_clients):
 # Do not estimate the RPS if the number of samples is very low
 # (threshold can be tuned if needed)
 rps = "N/A"
 
-runtime_left_sec: Union[str, float] = round(
+runtime_left_sec: str | float = round(
 (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
 )
 if percent < 0.05:
@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any
 
 import pandas as pd  # type: ignore
 import tqdm  # type: ignore

@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
 
 
 def content_is_valid(
-content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
 if min_content_len and len(content) < min_content_len:
 return False

@@ -37,7 +37,7 @@ def content_is_valid(
 
 
 def print_stats(
-conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
 # Collect statistics
 stats = []

@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
 seed: int,
 input_file: str,
 output_file: str,
-max_items: Optional[int],
+max_items: int | None,
-min_content_len: Optional[int] = None,
+min_content_len: int | None = None,
-max_content_len: Optional[int] = None,
+max_content_len: int | None = None,
-min_turns: Optional[int] = None,
+min_turns: int | None = None,
-max_turns: Optional[int] = None,
+max_turns: int | None = None,
-model: Optional[str] = None,
+model: str | None = None,
 ) -> None:
 if min_turns and max_turns:
 assert min_turns <= max_turns
@@ -198,13 +198,24 @@ else()
 endif()
 
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
-FetchContent_Declare(
-oneDNN
-GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-GIT_TAG v3.9
-GIT_PROGRESS TRUE
-GIT_SHALLOW TRUE
-)
+set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+FetchContent_Declare(
+oneDNN
+SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+)
+else()
+message(STATUS "Downloading oneDNN from GitHub")
+FetchContent_Declare(
+oneDNN
+GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+GIT_TAG v3.9
+GIT_PROGRESS TRUE
+GIT_SHALLOW TRUE
+)
+endif()
 
 if(USE_ACL)
 find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)

@@ -227,7 +238,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
 set(ONEDNN_ENABLE_ITT_TASKS "OFF")
 set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
 set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-set(ONEDNN_VERBOSE "ON")
+set(ONEDNN_VERBOSE "OFF")
 set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
 FetchContent_MakeAvailable(oneDNN)

@@ -309,4 +320,4 @@ define_gpu_extension_target(
 WITH_SOABI
 )
 
 message(STATUS "Enabling C extension.")
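With the change above, the oneDNN sources can be supplied from a local checkout instead of being cloned from GitHub at configure time. A hedged usage sketch (the checkout path is hypothetical; either the environment variable read by the new set() call or an explicit cache entry at configure time should select the local copy):

    # Hypothetical local clone of oneDNN (e.g. checked out at the v3.9 tag)
    export FETCHCONTENT_SOURCE_DIR_ONEDNN=$HOME/src/oneDNN
    # ...then run the usual CPU build; the new if() branch passes SOURCE_DIR to FetchContent instead of GIT_REPOSITORY.
    # Equivalently, the cache variable can be set directly when CMake is invoked:
    #   cmake -DFETCHCONTENT_SOURCE_DIR_ONEDNN=$HOME/src/oneDNN <other options>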
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
-from typing import Union
 
 from cutlass_library import *
 

@@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
 TmaWarpSpecializedCooperative = enum_auto()
 
 
-VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
 **DataTypeNames,  # type: ignore
 **{
 VLLMDataType.u4b8: "u4b8",

@@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
 },
 }
 
-VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
 **DataTypeTag,  # type: ignore
 **{
 VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",

@@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 },
 }
 
-VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
+VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
 **DataTypeSize,  # type: ignore
 **{
 VLLMDataType.u4b8: 4,

@@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
 },
 }
 
-VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
 VLLMDataType.u4b8: "vllm::kU4B8",
 VLLMDataType.u8b128: "vllm::kU8B128",
 DataType.u4: "vllm::kU4",

@@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 DataType.bf16: "vllm::kBfloat16",
 }
 
-VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
 DataType.u8: "at::ScalarType::Byte",
 DataType.s8: "at::ScalarType::Char",
 DataType.e4m3: "at::ScalarType::Float8_e4m3fn",

@@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
 DataType.f32: "at::ScalarType::Float",
 }
 
-VLLMKernelScheduleTag: dict[
-Union[MixedInputKernelScheduleType, KernelScheduleType], str
-] = {
+VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
 **KernelScheduleTag,  # type: ignore
 **{
 MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from copy import deepcopy
 from dataclasses import dataclass, fields
 from functools import reduce
-from typing import Optional, Union
 
 import jinja2
 from vllm_cutlass_library_extension import (

@@ -259,7 +258,7 @@ class ScheduleConfig:
 @dataclass(frozen=True)
 class TypeConfig:
 a: DataType
-b: Union[DataType, VLLMDataType]
+b: DataType | VLLMDataType
 b_group_scale: DataType
 b_group_zeropoint: DataType
 b_channel_scale: DataType

@@ -280,7 +279,7 @@ class PrepackTypeConfig:
 class ImplConfig:
 types: TypeConfig
 schedules: list[ScheduleConfig]
-heuristic: list[tuple[Optional[str], ScheduleConfig]]
+heuristic: list[tuple[str | None, ScheduleConfig]]
 
 
 def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
@@ -1,4 +1,4 @@
-ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
+ARG BASE_UBI_IMAGE_TAG=9.6-1754584681

 ###############################################################
 # Stage to build openblas
@@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder

 ARG MAX_JOBS
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
 && source /opt/rh/gcc-toolset-13/enable \
 && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
@@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel
 FROM centos-deps-builder AS base-builder

 ARG PYTHON_VERSION=3.12
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30

 # Set Environment Variables for venv, cargo & openblas
 ENV VIRTUAL_ENV=/opt/vllm
@@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
 pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
 libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
 harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
-python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
+python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \
 && dnf clean all \
 && PREFIX=/usr/local make -C /openblas install \
 && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
@@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
 FROM base-builder AS torch-builder

 ARG MAX_JOBS
-ARG TORCH_VERSION=2.6.0
+ARG TORCH_VERSION=2.7.0
 ARG _GLIBCXX_USE_CXX11_ABI=1
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30

 RUN --mount=type=cache,target=/root/.cache/uv \
 source /opt/rh/gcc-toolset-13/enable && \
@@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 MAX_JOBS=${MAX_JOBS:-$(nproc)} \
 PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/

-ARG TORCHVISION_VERSION=0.21.0
+ARG TORCHVISION_VERSION=0.22.0
 ARG TORCHVISION_USE_NVJPEG=0
 ARG TORCHVISION_USE_FFMPEG=0
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 BUILD_VERSION=${TORCHVISION_VERSION} \
 uv build --wheel --out-dir /torchwheels/ --no-build-isolation

-ARG TORCHAUDIO_VERSION=2.6.0
+ARG TORCHAUDIO_VERSION=2.7.0
 ARG BUILD_SOX=1
 ARG BUILD_KALDI=1
 ARG BUILD_RNNT=1
@@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder

 ARG MAX_JOBS
 ARG PYARROW_PARALLEL
-ARG PYARROW_VERSION=19.0.1
+ARG PYARROW_VERSION=21.0.0
 RUN --mount=type=cache,target=/root/.cache/uv \
 source /opt/rh/gcc-toolset-13/enable && \
 git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
@@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 make install -j ${MAX_JOBS:-$(nproc)} && \
 cd ../../python/ && \
 uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
-pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
 PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
 python setup.py build_ext \
 --build-type=release --bundle-arrow-cpp \
@@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
 && make -j ${MAX_JOBS:-$(nproc)}


+###############################################################
+# Stage to build numba
+###############################################################
+
+FROM base-builder AS numba-builder
+
+ARG MAX_JOBS
+ARG NUMBA_VERSION=0.61.2
+
+# Clone all required dependencies
+RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
+git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
+cd ./numba && \
+if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
+sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
+fi && python -m build --wheel --installer=uv --outdir /numbawheels/
+
 ###############################################################
 # Stage to build vllm - this stage builds and installs
 # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
@@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null
 COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
+COPY --from=numba-builder /tmp/control /dev/null

 ARG VLLM_TARGET_DEVICE=cpu
 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
@@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
 # this step installs vllm and populates uv cache
 # with all the transitive dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
+dnf install llvm15 llvm15-devel -y && \
+rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
 source /opt/rh/gcc-toolset-13/enable && \
 git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
 uv pip install maturin && \
@@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
 --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
 --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
+--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
 --mount=type=bind,src=.,dst=/src/,rw \
 source /opt/rh/gcc-toolset-13/enable && \
-uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
+export PATH=$PATH:/usr/lib64/llvm15/bin && \
+uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
 sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
-uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
+sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \
+uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \
 make -C /numactl install && \
 # sentencepiece.pc is in some pkgconfig inside uv cache
 export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
-uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
+nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
 cd /src/ && \
 uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
 uv pip install /vllmwheel/*.whl
@@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai

 ARG PYTHON_VERSION=3.12
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30

 # Set Environment Variables for venv & openblas
 ENV VIRTUAL_ENV=/opt/vllm
@@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
 COPY --from=lapack-builder /tmp/control /dev/null
 COPY --from=openblas-builder /tmp/control /dev/null
+COPY --from=numba-builder /tmp/control /dev/null

 # install gcc-11, python, openblas, numactl, lapack
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
 rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
 microdnf install --nodocs -y \
-tar findutils openssl \
+libomp tar findutils openssl llvm15 llvm15-devel \
 pkgconfig xsimd g++ gcc-fortran libsndfile \
 libtiff libjpeg openjpeg2 zlib zeromq \
 freetype lcms2 libwebp tcl tk utf8proc \
-harfbuzz fribidi libraqm libimagequant libxcb \
+harfbuzz fribidi libraqm libimagequant libxcb util-linux \
 python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
-&& microdnf clean all \
+&& export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \
 && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
 && python -m pip install -U pip uv --no-cache \
 && make -C /numactl install \
@@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
 --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
 --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
-HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
+--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
+export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \
+HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl


 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
@@ -314,4 +340,4 @@ WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

 ENTRYPOINT ["vllm", "serve"]
@@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils

+# install nixl from source code
+RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
+
 ENTRYPOINT ["vllm", "serve"]
@@ -16,7 +16,7 @@ Declare supported languages and capabilities:

 ??? code "supported_languages and supports_transcription_only"
 ```python
-from typing import ClassVar, Mapping, Optional, Literal
+from typing import ClassVar, Mapping, Literal
 import numpy as np
 import torch
 from torch import nn
@@ -81,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
 audio: np.ndarray,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-language: Optional[str],
+language: str | None,
 task_type: Literal["transcribe", "translate"],
 request_prompt: str,
-to_language: Optional[str],
+to_language: str | None,
 ) -> PromptType:
 # Example with a free-form instruction prompt
 task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@@ -117,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
 audio: np.ndarray,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-language: Optional[str],
+language: str | None,
 task_type: Literal["transcribe", "translate"],
 request_prompt: str,
-to_language: Optional[str],
+to_language: str | None,
 ) -> PromptType:
 if language is None:
 raise ValueError("Language must be specified")
@@ -150,7 +150,7 @@ If your model requires a language and you want a default, override this method (
 ??? code "validate_language()"
 ```python
 @classmethod
-def validate_language(cls, language: Optional[str]) -> Optional[str]:
+def validate_language(cls, language: str | None) -> str | None:
 if language is None:
 logger.warning(
 "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
@@ -175,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
 audio_duration_s: float,
 stt_config: SpeechToTextConfig,
 model_config: ModelConfig,
-) -> Optional[int]:
+) -> int | None:
 # Return None if unknown; otherwise return an estimate.
 return int(audio_duration_s * stt_config.sample_rate // 320)  # example
 ```
@@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
 from collections.abc import Sequence
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 import torch

@@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
 @abstractmethod
 def update_state(
 self,
-batch_update: Optional["BatchUpdate"],
+batch_update: "BatchUpdate" | None,
 ) -> None:
 """Called when there are new output tokens, prior
 to each forward pass.
@@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
 * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
 * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling

-* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
 * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
 * Use the `BatchUpdate` members to update logits processor internal state
 * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
@@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes
 ??? code "Example custom logits processor definition"

 ``` python
-from typing import Optional
 import torch
 from vllm.config import VllmConfig
 from vllm.sampling_params import SamplingParams
@@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes
 """Never impacts greedy sampling"""
 return False

-def update_state(self, batch_update: Optional[BatchUpdate]):
+def update_state(self, batch_update: BatchUpdate | None):
 if not batch_update:
 return

@@ -10,7 +10,7 @@ on HuggingFace model repository.

 import os
 from dataclasses import asdict
-from typing import Any, NamedTuple, Optional
+from typing import Any, NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -30,11 +30,11 @@ question_per_audio_count = {

 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
-prompt: Optional[str] = None
+prompt: str | None = None
-prompt_token_ids: Optional[dict[str, list[int]]] = None
+prompt_token_ids: dict[str, list[int]] | None = None
-multi_modal_data: Optional[dict[str, Any]] = None
+multi_modal_data: dict[str, Any] | None = None
-stop_token_ids: Optional[list[int]] = None
+stop_token_ids: list[int] | None = None
-lora_requests: Optional[list[LoRARequest]] = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import logging
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@@ -81,7 +81,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):

 def get_finished(
 self, finished_req_ids: set[str]
-) -> tuple[Optional[set[str]], Optional[set[str]]]:
+) -> tuple[set[str] | None, set[str] | None]:
 if self._async_load:
 meta = self._get_connector_metadata()
 assert isinstance(meta, RogueSharedStorageConnectorMetadata)
@@ -33,8 +33,6 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """

-from typing import Optional
-
 import torch

 from vllm import LLM, SamplingParams
@@ -58,7 +56,7 @@ class DummyLogitsProcessor(LogitsProcessor):
 def is_argmax_invariant(self) -> bool:
 return False

-def update_state(self, batch_update: Optional[BatchUpdate]):
+def update_state(self, batch_update: BatchUpdate | None):
 process_dict_updates(
 self.req_info,
 batch_update,
@@ -39,7 +39,7 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """

-from typing import Any, Optional
+from typing import Any

 import torch

@@ -82,7 +82,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 def new_req_logits_processor(
 self,
 params: SamplingParams,
-) -> Optional[RequestLogitsProcessor]:
+) -> RequestLogitsProcessor | None:
 """This method returns a new request-level logits processor, customized
 to the `target_token` value associated with a particular request.

@@ -96,7 +96,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 Returns:
 `Callable` request logits processor, or None
 """
-target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+target_token: Any | None = params.extra_args and params.extra_args.get(
 "target_token"
 )
 if target_token is None:
@@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda"
 device, the first and third requests would not repeat the same token.
 """

-from typing import Optional
-
 import torch

 from vllm import LLM, SamplingParams
@@ -91,7 +89,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
 def new_req_logits_processor(
 self,
 params: SamplingParams,
-) -> Optional[RequestLogitsProcessor]:
+) -> RequestLogitsProcessor | None:
 """This method returns a new request-level logits processor, customized
 to the `target_token` value associated with a particular request.

@@ -8,7 +8,6 @@ Requires HuggingFace credentials for access.
 """

 import gc
-from typing import Optional

 import torch
 from huggingface_hub import snapshot_download
@@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
 lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
 return [
 # this is an example of using quantization without LoRA
 (
@@ -56,7 +55,7 @@ def create_test_prompts(

 def process_requests(
 engine: LLMEngine,
-test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
 """Continuously process a list of prompts and handle the outputs."""
 request_id = 0
@@ -78,7 +77,7 @@ def process_requests(


 def initialize_engine(
-model: str, quantization: str, lora_repo: Optional[str]
+model: str, quantization: str, lora_repo: str | None
 ) -> LLMEngine:
 """Initialize the LLMEngine."""

@@ -7,8 +7,6 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """

-from typing import Optional
-
 from huggingface_hub import snapshot_download

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest

 def create_test_prompts(
 lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
 """Create a list of test prompts with their sampling parameters.

 2 requests for base model, 4 requests for the LoRA. We define 2
@@ -68,7 +66,7 @@ def create_test_prompts(

 def process_requests(
 engine: LLMEngine,
-test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
 """Continuously process a list of prompts and handle the outputs."""
 request_id = 0
@@ -3,7 +3,6 @@
 import argparse
 import datetime
 import os
-from typing import Union

 import albumentations
 import numpy as np
@@ -160,7 +159,7 @@ def load_example(
 file_paths: list[str],
 mean: list[float] = None,
 std: list[float] = None,
-indices: Union[list[int], None] = None,
+indices: list[int] | None = None,
 ):
 """Build an input example by loading images in *file_paths*.

@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-from typing import Callable, Optional, TypedDict
+from collections.abc import Callable
+from typing import TypedDict

 import torch
 import zmq
@@ -71,7 +72,7 @@ class WorkerExtension:


 def rebuild_ipc(
-handle: tuple[Callable, tuple], device_id: Optional[int] = None
+handle: tuple[Callable, tuple], device_id: int | None = None
 ) -> torch.Tensor:
 func, args = handle
 list_args = list(args)
@@ -109,7 +110,7 @@ class ColocateWorkerExtension:
 self._zmq_ctx = zmq.Context()
 socket = self._zmq_ctx.socket(zmq.REP)
 socket.connect(zmq_handles[self.report_device_id()])
-buffer: Optional[torch.Tensor] = None
+buffer: torch.Tensor | None = None
 while True:
 payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
 socket.recv_pyobj()
@@ -12,7 +12,7 @@ import os
 import random
 from contextlib import contextmanager
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -28,8 +28,8 @@ from vllm.utils import FlexibleArgumentParser
 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
 prompts: list[str]
-stop_token_ids: Optional[list[int]] = None
+stop_token_ids: list[int] | None = None
-lora_requests: Optional[list[LoRARequest]] = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -9,7 +9,7 @@ using the chat template defined by the model.
 import os
 from argparse import Namespace
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple

 from huggingface_hub import snapshot_download
 from PIL.Image import Image
@@ -41,9 +41,9 @@ class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
 prompt: str
 image_data: list[Image]
-stop_token_ids: Optional[list[int]] = None
+stop_token_ids: list[int] | None = None
-chat_template: Optional[str] = None
+chat_template: str | None = None
-lora_requests: Optional[list[LoRARequest]] = None
+lora_requests: list[LoRARequest] | None = None


 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -1251,7 +1251,7 @@ model_example_map = {
 }


-def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
+def run_generate(model, question: str, image_urls: list[str], seed: int | None):
 req_data = model_example_map[model](question, image_urls)

 engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
@@ -1277,7 +1277,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int
 print("-" * 50)


-def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
+def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
 req_data = model_example_map[model](question, image_urls)

 # Disable other modalities to save memory
@@ -11,7 +11,7 @@ on HuggingFace model repository.
 from argparse import Namespace
 from dataclasses import asdict
 from pathlib import Path
-from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args

 from PIL.Image import Image

@@ -47,15 +47,15 @@ class TextImagesQuery(TypedDict):


 QueryModality = Literal["text", "image", "text+image", "text+images"]
-Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
+Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery


 class ModelRequestData(NamedTuple):
 engine_args: EngineArgs
-prompt: Optional[str] = None
+prompt: str | None = None
-image: Optional[Image] = None
+image: Image | None = None
-query: Optional[str] = None
+query: str | None = None
-documents: Optional[ScoreMultiModalParam] = None
+documents: ScoreMultiModalParam | None = None


 def run_clip(query: Query) -> ModelRequestData:
@@ -281,7 +281,7 @@ def get_query(modality: QueryModality):
 raise ValueError(msg)


-def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
+def run_encode(model: str, modality: QueryModality, seed: int | None):
 query = get_query(modality)
 req_data = model_example_map[model](query)

@@ -311,7 +311,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
 print("-" * 50)


-def run_score(model: str, modality: QueryModality, seed: Optional[int]):
+def run_score(model: str, modality: QueryModality, seed: int | None):
 query = get_query(modality)
 req_data = model_example_map[model](query)

@@ -23,7 +23,7 @@ import logging
 import os
 import sys
 from abc import ABC, abstractmethod
-from typing import Callable, Optional
+from collections.abc import Callable

 import aiohttp
 import requests
@@ -49,12 +49,9 @@ class Proxy:
 decode_instances: list[str],
 model: str,
 scheduling_policy: SchedulingPolicy,
-custom_create_completion: Optional[
-Callable[[Request], StreamingResponse]
-] = None,
-custom_create_chat_completion: Optional[
-Callable[[Request], StreamingResponse]
-] = None,
+custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
+custom_create_chat_completion: Callable[[Request], StreamingResponse]
+| None = None,
 ):
 self.prefill_instances = prefill_instances
 self.decode_instances = decode_instances
@@ -348,9 +345,9 @@ class ProxyServer:
 def __init__(
 self,
 args: argparse.Namespace,
-scheduling_policy: Optional[SchedulingPolicy] = None,
+scheduling_policy: SchedulingPolicy | None = None,
-create_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+create_completion: Callable[[Request], StreamingResponse] | None = None,
-create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
 ):
 self.validate_parsed_serve_args(args)
 self.port = args.port
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union
+from typing import Any

 import msgspec
 import zmq
@@ -25,16 +25,16 @@ class KVCacheEvent(

 class BlockStored(KVCacheEvent):
 block_hashes: list[ExternalBlockHash]
-parent_block_hash: Optional[ExternalBlockHash]
+parent_block_hash: ExternalBlockHash | None
 token_ids: list[int]
 block_size: int
-lora_id: Optional[int]
+lora_id: int | None
-medium: Optional[str]
+medium: str | None


 class BlockRemoved(KVCacheEvent):
 block_hashes: list[ExternalBlockHash]
-medium: Optional[str]
+medium: str | None


 class AllBlocksCleared(KVCacheEvent):
@@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent):


 class KVEventBatch(EventBatch):
-events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+events: list[BlockStored | BlockRemoved | AllBlocksCleared]


 def process_event(event_batch):
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-from typing import Optional

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -43,7 +42,7 @@ async def main():
 )

 prompt = "Who won the 2004 World Series?"
-final_output: Optional[RequestOutput] = None
+final_output: RequestOutput | None = None
 async for output in engine_client.generate(
 prompt=prompt,
 sampling_params=sampling_params,
@@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example.
 run: vllm serve BAAI/bge-reranker-base
 """

-from typing import Union
-
 import cohere
 from cohere import Client, ClientV2

@@ -25,7 +23,7 @@ documents = [


 def cohere_rerank(
-client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
+client: Client | ClientV2, model: str, query: str, documents: list[str]
 ) -> dict:
 return client.rerank(model=model, query=query, documents=documents)

@@ -9,7 +9,7 @@ Refer to each `run_*` function for the command to run the server for that model.
 import argparse
 import base64
 import io
-from typing import Literal, Union
+from typing import Literal

 from openai import OpenAI
 from openai._types import NOT_GIVEN, NotGiven
@@ -29,7 +29,7 @@ def create_chat_embeddings(
 *,
 messages: list[ChatCompletionMessageParam],
 model: str,
-encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
 ) -> CreateEmbeddingResponse:
 """
 Convenience function for accessing vLLM's Chat Embeddings API,
@@ -1,21 +1,15 @@
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from __future__ import annotations
-
 import argparse
 import asyncio
 import enum
 import os
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal

 import openai
 import pydantic
+from openai.types.chat import ChatCompletionChunk
-if TYPE_CHECKING:
-from openai.types.chat import ChatCompletionChunk
-

 ConstraintsFormat = Literal[
 "choice",
@@ -84,12 +84,6 @@ ignore = [
 "B007",
 # f-string format
 "UP032",
-# Can remove once 3.10+ is the minimum Python version
-"UP007",
-"UP027",
-"UP035",
-"UP038",
-"UP045",
 ]

 [tool.ruff.format]
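The hunk above explains the pattern repeated throughout this commit: with Python 3.10+ as the minimum, the pyupgrade rules previously ignored here (UP007, UP035, UP045, and friends) are re-enabled, so `Optional[X]` and `Union[X, Y]` annotations get rewritten as PEP 604 unions. A minimal before/after sketch, purely illustrative (the function names below are invented, not from the diff):

```python
# Before: typing.Optional / typing.Union spellings
from typing import Optional, Union

def lookup_old(key: str, default: Optional[int] = None) -> Union[int, str]:
    return default if default is not None else key

# After: PEP 604 unions; no typing imports needed for these annotations
def lookup_new(key: str, default: int | None = None) -> int | str:
    return default if default is not None else key
```

Both spellings are equivalent at runtime on Python 3.10+; the `X | None` form is what the re-enabled ruff rules now enforce.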
@@ -10,7 +10,6 @@ wheel
 jinja2>=3.1.6
 datasets   # for benchmark scripts
 numba == 0.61.2 # Required for N-gram speculative decoding
-nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision
setup.py

@@ -540,6 +540,11 @@ def get_gaudi_sw_version():


 def get_vllm_version() -> str:
+# Allow overriding the version. This is useful to build platform-specific
+# wheels (e.g. CPU, TPU) without modifying the source.
+if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
+return env_version
+
 version = get_version(write_to="vllm/_version.py")
 sep = "+" if "+" not in version else "."  # dev versions might contain +

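The new `VLLM_VERSION_OVERRIDE` check takes precedence over the setuptools-scm-derived version. A hedged sketch of that precedence (the helper name and version strings below are invented for illustration; in practice the variable would simply be exported before building a platform-specific wheel):

```python
import os

def resolve_version(scm_version: str) -> str:
    # Mirrors the new lookup order: an explicit override wins,
    # otherwise the version derived from git metadata is used.
    if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
        return env_version
    return scm_version

os.environ["VLLM_VERSION_OVERRIDE"] = "1.2.3+cpu"   # made-up value
print(resolve_version("1.2.3.dev0+gabcdef0"))       # -> "1.2.3+cpu"
```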
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-from typing import Any, NamedTuple, Optional, cast
+from typing import Any, NamedTuple, cast

 import numpy as np
 import pytest
@@ -185,8 +185,8 @@ def _collect_mm_samples(
 output_len: int = 5,
 base_items_per_request: int = 2,
 num_mm_items_range_ratio: float = 0.0,
-limit_mm_per_prompt: Optional[dict[str, int]] = None,
+limit_mm_per_prompt: dict[str, int] | None = None,
-bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
+bucket_config: dict[tuple[int, int, int], float] | None = None,
 enable_multimodal_chat: bool = False,
 ) -> list[SampleRequest]:
 if limit_mm_per_prompt is None:
@@ -5,13 +5,14 @@ These envs only work for a small part of the tests, fix what you need!
 """

 import os
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any

 if TYPE_CHECKING:
 VLLM_CI_NO_SKIP: bool = False
-VLLM_CI_DTYPE: Optional[str] = None
+VLLM_CI_DTYPE: str | None = None
-VLLM_CI_HEAD_DTYPE: Optional[str] = None
+VLLM_CI_HEAD_DTYPE: str | None = None
-VLLM_CI_HF_DTYPE: Optional[str] = None
+VLLM_CI_HF_DTYPE: str | None = None

 environment_variables: dict[str, Callable[[], Any]] = {
 # A model family has many models with the same architecture.
@@ -2,9 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import weakref
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from copy import deepcopy
-from typing import Callable, Union

 from torch import fx
 from torch._ops import OpOverload
@@ -44,7 +43,7 @@ class TestBackend:
 Inductor config is default-initialized from VllmConfig.CompilationConfig.
 """

-def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]):
+def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
 self.custom_passes = list(passes)
 compile_config = get_current_vllm_config().compilation_config
 self.inductor_config = compile_config.inductor_compile_config
@@ -10,7 +10,7 @@ initialized randomly with a fixed seed.
 """

 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any

 import pytest
 import torch
@@ -162,7 +162,7 @@ class LlamaDecoderLayer(nn.Module):
 self,
 positions: torch.Tensor,
 hidden_states: torch.Tensor,
-residual: Optional[torch.Tensor],
+residual: torch.Tensor | None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
 """
 For tractable computation:
@@ -217,7 +217,7 @@ class LlamaModel(nn.Module):

 def forward(
 self,
-input_ids: Optional[torch.Tensor],
+input_ids: torch.Tensor | None,
 positions: torch.Tensor,
 ) -> torch.Tensor:
 hidden_states = self.embedding_tokens(input_ids)
@@ -142,7 +142,7 @@ class TestScaledMMRSModel(_BaseScaledMMModel):
 return [torch.ops.vllm.reduce_scatter.default]

 def ops_in_model_after(self):
-return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default]
+return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default]


 class TestAGScaledMMModel(_BaseScaledMMModel):
@@ -195,7 +195,7 @@ class TestCutlassScaledMMRSModel(_BaseScaledMMModel):
 return [torch.ops.vllm.reduce_scatter.default]

 def ops_in_model_after(self):
-return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default]
+return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default]


 class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
@@ -243,9 +243,15 @@ class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dynamic", [True, False])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_async_tp_pass_replace(
-test_model: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype
+test_model: str,
+batch_size: int,
+seq_len: int,
+hidden_size: int,
+dtype: torch.dtype,
+dynamic: bool,
 ):
 if (
 test_model
@@ -269,7 +275,15 @@ def test_async_tp_pass_replace(
 # torch.distributed and cuda
 torch.multiprocessing.spawn(
 fn,
-args=(num_processes, test_model, batch_size, seq_len, hidden_size, dtype),
+args=(
+num_processes,
+test_model,
+batch_size,
+seq_len,
+hidden_size,
+dtype,
+dynamic,
+),
 nprocs=nprocs,
 )

@@ -284,6 +298,7 @@ def async_tp_pass_on_test_model(
 seq_len: int,
 hidden_size: int,
 dtype: torch.dtype,
+dynamic: bool,
 ):
 current_platform.seed_everything(0)

@@ -331,6 +346,9 @@ def async_tp_pass_on_test_model(
 (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
 )

+if dynamic:
+torch._dynamo.mark_dynamic(hidden_states, 0)
+
 compiled_model = torch.compile(model, backend=backend)
 compiled_model(hidden_states)

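The new `dynamic` parametrization above exercises the async-TP pass with a symbolic batch dimension rather than a fixed one. A hedged, standalone sketch of the same mechanism (the model and shapes below are invented for illustration):

```python
import torch

model = torch.nn.Linear(16, 16)
x = torch.randn(8, 16)

# Mark dim 0 as dynamic so torch.compile traces a symbolic batch size
# instead of specializing on the first observed size and recompiling later.
torch._dynamo.mark_dynamic(x, 0)

compiled = torch.compile(model)
compiled(x)
compiled(torch.randn(32, 16))  # served by the same dynamic-shape graph
```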
@ -1,7 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||

@@ -1,11 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from __future__ import annotations
-
import logging
import tempfile
-from typing import Any, Union
+from typing import Any

import pytest
import torch
@@ -217,7 +215,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):


def run_model(
-compile_config: Union[int, CompilationConfig],
+compile_config: int | CompilationConfig,
model: str,
model_kwargs: dict[str, Any],
):

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
-from typing import Optional

import pytest
import torch._dynamo
@@ -41,8 +40,8 @@ FP8_DTYPE = current_platform.fp8_dtype()
FP4_DTYPE = torch.uint8

# globals needed for string-import custom Dynamo backend field
-backend: Optional[TestBackend] = None
-backend_unfused: Optional[TestBackend] = None
+backend: TestBackend | None = None
+backend_unfused: TestBackend | None = None


class AttentionQuantPatternModel(torch.nn.Module):

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional

import torch

@@ -10,7 +9,7 @@ from vllm.config import CompilationLevel


class MyMod(torch.nn.Module):
-def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
if cache is not None:
return x + cache
return x * 2
@@ -24,11 +23,11 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
)

-def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# this is the function to be compiled
return self.model(x, cache)

-def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
# let torch.compile compile twice
if len(self.compiled_codes) == 2:
dispatch_id = 0 if cache is None else 1

@@ -21,7 +21,7 @@ import threading
from collections.abc import Generator
from contextlib import nullcontext
from enum import Enum
-from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
+from typing import Any, Callable, TypedDict, TypeVar, cast

import numpy as np
import pytest
@@ -68,7 +68,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

_M = TypeVar("_M")

-_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
+_PromptMultiModalInput = list[_M] | list[list[_M]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
@@ -267,7 +267,7 @@ class HfRunner:

return "cpu" if current_platform.is_cpu() else current_platform.device_type

-def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+def wrap_device(self, x: _T, device: str | None = None) -> _T:
if x is None or isinstance(x, (bool,)):
return x

@@ -287,14 +287,14 @@ class HfRunner:
model_name: str,
dtype: str = "auto",
*,
-model_kwargs: Optional[dict[str, Any]] = None,
+model_kwargs: dict[str, Any] | None = None,
trust_remote_code: bool = True,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
# Set this to avoid hanging issue
-default_torch_num_threads: Optional[int] = None,
+default_torch_num_threads: int | None = None,
) -> None:
init_ctx = (
nullcontext()
@@ -319,7 +319,7 @@ class HfRunner:
model_name: str,
dtype: str = "auto",
*,
-model_kwargs: Optional[dict[str, Any]] = None,
+model_kwargs: dict[str, Any] | None = None,
trust_remote_code: bool = True,
is_sentence_transformer: bool = False,
is_cross_encoder: bool = False,
@@ -406,11 +406,11 @@ class HfRunner:

def get_inputs(
self,
-prompts: Union[list[str], list[list[int]]],
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
-) -> list[Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]]:
+prompts: list[str] | list[list[int]],
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
+) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
if images is not None:
assert len(prompts) == len(images)

@@ -420,9 +420,7 @@ class HfRunner:
if audios is not None:
assert len(prompts) == len(audios)

-all_inputs: list[
-Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]
-] = []
+all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
for i, prompt in enumerate(prompts):
if isinstance(prompt, str):
processor_kwargs: dict[str, Any] = {
@@ -494,10 +492,10 @@ class HfRunner:

def generate(
self,
-prompts: Union[list[str], list[list[int]]],
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+prompts: list[str] | list[list[int]],
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[list[int]], list[str]]]:
all_inputs = self.get_inputs(
@@ -522,11 +520,11 @@ class HfRunner:

def generate_greedy(
self,
-prompts: Union[list[str], list[list[int]]],
+prompts: list[str] | list[list[int]],
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[int], str]]:
outputs = self.generate(
@@ -546,9 +544,9 @@ class HfRunner:
prompts: list[str],
beam_width: int,
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
) -> list[tuple[list[list[int]], list[str]]]:
outputs = self.generate(
prompts,
@@ -574,9 +572,9 @@ class HfRunner:
self,
prompts: list[str],
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[list[torch.Tensor]]:
all_inputs = self.get_inputs(
@@ -624,7 +622,7 @@ class HfRunner:
def _hidden_states_to_logprobs(
self,
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
-num_logprobs: Optional[int],
+num_logprobs: int | None,
) -> tuple[list[dict[int, float]], int]:
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
output_len = len(hidden_states)
@@ -652,10 +650,10 @@ class HfRunner:
self,
prompts: list[str],
max_tokens: int,
-num_logprobs: Optional[int],
-images: Optional[PromptImageInput] = None,
-audios: Optional[PromptAudioInput] = None,
-videos: Optional[PromptVideoInput] = None,
+num_logprobs: int | None,
+images: PromptImageInput | None = None,
+audios: PromptAudioInput | None = None,
+videos: PromptVideoInput | None = None,
**kwargs: Any,
) -> list[TokensTextLogprobs]:
all_inputs = self.get_inputs(
@@ -734,20 +732,20 @@ class VllmRunner:
model_name: str,
runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
-tokenizer_name: Optional[str] = None,
+tokenizer_name: str | None = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
-seed: Optional[int] = 0,
-max_model_len: Optional[int] = 1024,
+seed: int | None = 0,
+max_model_len: int | None = 1024,
dtype: str = "auto",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64,
-enable_chunked_prefill: Optional[bool] = False,
+enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
-enforce_eager: Optional[bool] = False,
+enforce_eager: bool | None = False,
# Set this to avoid hanging issue
-default_torch_num_threads: Optional[int] = None,
+default_torch_num_threads: int | None = None,
**kwargs,
) -> None:
init_ctx = (
@@ -785,10 +783,10 @@ class VllmRunner:

def get_inputs(
self,
-prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+prompts: list[str] | list[torch.Tensor] | list[list[int]],
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
) -> list[dict[str, Any]]:
if any(
x is not None and len(x) != len(prompts) for x in [images, videos, audios]
@@ -824,11 +822,11 @@ class VllmRunner:

def generate(
self,
-prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
+prompts: list[str] | list[torch.Tensor] | list[list[int]],
sampling_params: SamplingParams,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
@@ -871,11 +869,11 @@ class VllmRunner:
self,
prompts: list[str],
sampling_params: SamplingParams,
-images: Optional[PromptImageInput] = None,
-audios: Optional[PromptAudioInput] = None,
-videos: Optional[PromptVideoInput] = None,
+images: PromptImageInput | None = None,
+audios: PromptAudioInput | None = None,
+videos: PromptVideoInput | None = None,
**kwargs: Any,
-) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
+) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

req_outputs = self.llm.generate(
@@ -894,11 +892,11 @@ class VllmRunner:

def generate_greedy(
self,
-prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
+prompts: list[str] | list[torch.Tensor] | list[list[int]],
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
**kwargs: Any,
) -> list[tuple[list[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
@@ -916,15 +914,15 @@ class VllmRunner:
self,
prompts: list[str],
max_tokens: int,
-num_logprobs: Optional[int],
-num_prompt_logprobs: Optional[int] = None,
-images: Optional[PromptImageInput] = None,
-audios: Optional[PromptAudioInput] = None,
-videos: Optional[PromptVideoInput] = None,
-stop_token_ids: Optional[list[int]] = None,
-stop: Optional[list[str]] = None,
+num_logprobs: int | None,
+num_prompt_logprobs: int | None = None,
+images: PromptImageInput | None = None,
+audios: PromptAudioInput | None = None,
+videos: PromptVideoInput | None = None,
+stop_token_ids: list[int] | None = None,
+stop: list[str] | None = None,
**kwargs: Any,
-) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
+) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
@@ -957,7 +955,7 @@ class VllmRunner:
perplexities = []
for output in outputs:
output = cast(TokensTextLogprobsPromptLogprobs, output)
-token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+token_datas = cast(list[dict[int, Logprob] | None], output[3])
assert token_datas[0] is None
token_log_probs = []
for token_data in token_datas[1:]:
@@ -976,10 +974,10 @@ class VllmRunner:
prompts: list[str],
beam_width: int,
max_tokens: int,
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
-concurrency_limit: Optional[int] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
+concurrency_limit: int | None = None,
) -> list[tuple[list[list[int]], list[str]]]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)

@@ -1002,9 +1000,9 @@ class VllmRunner:
def embed(
self,
prompts: list[str],
-images: Optional[PromptImageInput] = None,
-videos: Optional[PromptVideoInput] = None,
-audios: Optional[PromptAudioInput] = None,
+images: PromptImageInput | None = None,
+videos: PromptVideoInput | None = None,
+audios: PromptAudioInput | None = None,
*args,
**kwargs,
) -> list[list[float]]:
@@ -1023,8 +1021,8 @@ class VllmRunner:

def score(
self,
-text_1: Union[str, list[str]],
-text_2: Union[str, list[str]],
+text_1: list[str] | str,
+text_2: list[str] | str,
*args,
**kwargs,
) -> list[float]:
@@ -1226,8 +1224,8 @@ def _find_free_port() -> int:
class LocalAssetServer:
address: str
port: int
-server: Optional[http.server.ThreadingHTTPServer]
-thread: Optional[threading.Thread]
+server: http.server.ThreadingHTTPServer | None
+thread: threading.Thread | None

def __init__(self, address: str = "127.0.0.1") -> None:
self.address = address

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Any, Optional
+from typing import Any

import pytest

@@ -15,8 +15,8 @@ def _test_stopping(
llm: LLM,
expected_output: str,
expected_reason: Any,
-stop: Optional[list[str]] = None,
-stop_token_ids: Optional[list[int]] = None,
+stop: list[str] | None = None,
+stop_token_ids: list[int] | None = None,
include_in_output: bool = False,
) -> None:
output = llm.generate(

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
-from typing import Optional, Union

import msgspec
import msgspec.msgpack
@@ -78,8 +77,8 @@ class MockSubscriber:

def __init__(
self,
-pub_endpoints: Union[str, list[str]],
-replay_endpoints: Optional[Union[str, list[str]]] = None,
+pub_endpoints: str | list[str],
+replay_endpoints: str | list[str] | None = None,
topic: str = "",
decode_type=SampleBatch,
):
@@ -111,7 +110,7 @@ class MockSubscriber:
self.last_seq = -1
self.decoder = msgspec.msgpack.Decoder(type=decode_type)

-def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]:
+def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None:
"""Receive a single message with timeout"""
if not self.sub.poll(timeout):
return None

@@ -5,9 +5,8 @@
Run `pytest tests/distributed/test_comm_ops.py`.
"""

-from __future__ import annotations
-
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any

import pytest
import ray

@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple

import pytest

@@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):

class CPTestOptions(NamedTuple):
multi_node_only: bool
-load_format: Optional[str] = None
+load_format: str | None = None


@dataclass
@@ -54,7 +54,7 @@ class CPTestSettings:
dcp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False]:

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple

import pytest

@@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple):

class EPTestOptions(NamedTuple):
trust_remote_code: bool
-tokenizer_mode: Optional[str]
-load_format: Optional[str] = None
-hf_overrides: Optional[str] = None
+tokenizer_mode: str | None
+load_format: str | None = None
+hf_overrides: str | None = None


@dataclass
@@ -40,9 +40,9 @@ class EPTestSettings:
tp_base: int = 2,
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
-tokenizer_mode: Optional[str] = None,
-load_format: Optional[str] = None,
-hf_overrides: Optional[str] = None,
+tokenizer_mode: str | None = None,
+load_format: str | None = None,
+hf_overrides: str | None = None,
):
return EPTestSettings(
parallel_setups=[
@@ -72,9 +72,9 @@ class EPTestSettings:
tp_base: int = 2,
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
-tokenizer_mode: Optional[str] = None,
-load_format: Optional[str] = None,
-hf_overrides: Optional[str] = None,
+tokenizer_mode: str | None = None,
+load_format: str | None = None,
+hf_overrides: str | None = None,
):
return EPTestSettings(
parallel_setups=[

@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple

import pytest

@@ -35,7 +35,7 @@ class ParallelSetup(NamedTuple):

class PPTestOptions(NamedTuple):
multi_node_only: bool
-load_format: Optional[str] = None
+load_format: str | None = None


@dataclass
@@ -52,7 +52,7 @@ class PPTestSettings:
pp_base: int = 2,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
return PPTestSettings(
parallel_setups=[
@@ -76,7 +76,7 @@ class PPTestSettings:
pp_base: int = 2,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
return PPTestSettings(
parallel_setups=[

@@ -1,16 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
import pytest
+from typing_extensions import LiteralString

from ..utils import compare_two_settings, create_new_process_for_each_test

-if TYPE_CHECKING:
-from typing_extensions import LiteralString
-

@pytest.mark.parametrize(
"PP_SIZE, MODEL_NAME",

@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import json
import os
from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple

import pytest

@@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):

class SPTestOptions(NamedTuple):
multi_node_only: bool
-load_format: Optional[str] = None
+load_format: str | None = None


@dataclass
@@ -53,7 +53,7 @@ class SPTestSettings:
pp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False, True]:
@@ -84,7 +84,7 @@ class SPTestSettings:
pp_base: int = 1,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
parallel_setups = []
for eager_mode_val in [False, True]:
@@ -115,7 +115,7 @@ class SPTestSettings:
pp_base: int = 1,
runner: RunnerOption = "auto",
multi_node_only: bool = False,
-load_format: Optional[str] = None,
+load_format: str | None = None,
):
parallel_setups = []
for fusion_val in [False, True]:

@@ -5,7 +5,7 @@ import json
from argparse import ArgumentError
from contextlib import nullcontext
from dataclasses import dataclass, field
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal

import pytest

@@ -115,9 +115,9 @@ class NestedConfig:
class DummyConfig:
regular_bool: bool = True
"""Regular bool with default True"""
-optional_bool: Optional[bool] = None
+optional_bool: bool | None = None
"""Optional bool with default None"""
-optional_literal: Optional[Literal["x", "y"]] = None
+optional_literal: Literal["x", "y"] | None = None
"""Optional literal with default None"""
tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
"""Tuple with variable length"""
@@ -127,7 +127,7 @@ class DummyConfig:
"""List with variable length"""
list_literal: list[Literal[1, 2]] = field(default_factory=list)
"""List with literal choices"""
-list_union: list[Union[str, type[object]]] = field(default_factory=list)
+list_union: list[str | type[object]] = field(default_factory=list)
"""List with union type"""
literal_literal: Literal[Literal[1], Literal[2]] = 1
"""Literal of literals with default 1"""
@@ -152,11 +152,11 @@ def test_is_not_builtin(type_hint, expected):
("type_hint", "expected"),
[
(Annotated[int, "annotation"], {int}),
-(Optional[int], {int, type(None)}),
-(Annotated[Optional[int], "annotation"], {int, type(None)}),
-(Optional[Annotated[int, "annotation"]], {int, type(None)}),
+(int | None, {int, type(None)}),
+(Annotated[int | None, "annotation"], {int, type(None)}),
+(Annotated[int, "annotation"] | None, {int, type(None)}),
],
-ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"],
+ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"],
)
def test_get_type_hints(type_hint, expected):
assert get_type_hints(type_hint) == expected

@@ -3,7 +3,7 @@

import asyncio
import random
-from typing import Callable
+from collections.abc import Callable

import openai
import pytest

@@ -3,7 +3,6 @@

# imports for structured outputs tests
import json
-from typing import Optional

import jsonschema
import openai  # use the official client for correctness check
@@ -176,7 +175,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: st
[(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
)
async def test_prompt_logprobs_chat(
-client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]
+client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
):
params: dict = {
"messages": [

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
-from typing import Union

import openai  # use the official client for correctness check
import pytest
@@ -166,7 +165,7 @@ async def test_function_tool_use(
client: openai.AsyncOpenAI,
model_name: str,
stream: bool,
-tool_choice: Union[str, dict],
+tool_choice: str | dict,
enable_thinking: bool,
):
if not stream:

@@ -4,7 +4,6 @@
from contextlib import suppress
from dataclasses import dataclass, field
from http import HTTPStatus
-from typing import Optional
from unittest.mock import AsyncMock, MagicMock

import pytest
@@ -38,13 +37,13 @@ class MockModelConfig:
trust_remote_code: bool = False
tokenizer_mode: str = "auto"
max_model_len: int = 100
-tokenizer_revision: Optional[str] = None
+tokenizer_revision: str | None = None
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
-logits_processor_pattern: Optional[str] = None
-diff_sampling_param: Optional[dict] = None
+logits_processor_pattern: str | None = None
+diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
-allowed_media_domains: Optional[list[str]] = None
+allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
skip_tokenizer_init: bool = False
@@ -56,7 +55,7 @@ class MockModelConfig:
class MockLoRAResolver(LoRAResolver):
async def resolve_lora(
self, base_model_name: str, lora_name: str
-) -> Optional[LoRARequest]:
+) -> LoRARequest | None:
if lora_name == "test-lora":
return LoRARequest(
lora_name="test-lora",

@@ -1,16 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
import asyncio
from contextlib import suppress
from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any
+from typing import Any
from unittest.mock import AsyncMock, MagicMock

import pytest
import pytest_asyncio
+from openai import OpenAI

from vllm.config.multimodal import MultiModalConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
@@ -21,9 +19,6 @@ from vllm.v1.engine.async_llm import AsyncLLM

from ...utils import RemoteOpenAIServer

-if TYPE_CHECKING:
-from openai import OpenAI
-
GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"



@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Iterable
-from typing import Union

from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
@@ -84,10 +83,10 @@ class StreamingToolReconstructor:
def run_tool_extraction(
tool_parser: ToolParser,
model_output: str,
-request: Union[ChatCompletionRequest, None] = None,
+request: ChatCompletionRequest | None = None,
streaming: bool = False,
assert_one_tool_per_delta: bool = True,
-) -> tuple[Union[str, None], list[ToolCall]]:
+) -> tuple[str | None, list[ToolCall]]:
if streaming:
reconstructor = run_tool_extraction_streaming(
tool_parser,
@@ -105,7 +104,7 @@ def run_tool_extraction(
def run_tool_extraction_nonstreaming(
tool_parser: ToolParser,
model_output: str,
-request: Union[ChatCompletionRequest, None] = None,
+request: ChatCompletionRequest | None = None,
) -> ExtractedToolCallInformation:
request = request or ChatCompletionRequest(messages=[], model="test-model")
return tool_parser.extract_tool_calls(model_output, request)
@@ -114,7 +113,7 @@ def run_tool_extraction_nonstreaming(
def run_tool_extraction_streaming(
tool_parser: ToolParser,
model_deltas: Iterable[str],
-request: Union[ChatCompletionRequest, None] = None,
+request: ChatCompletionRequest | None = None,
assert_one_tool_per_delta: bool = True,
) -> StreamingToolReconstructor:
request = request or ChatCompletionRequest(messages=[], model="test-model")

@@ -4,8 +4,6 @@
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""

-from typing import Optional
-
import openai
import pytest

@@ -103,14 +101,14 @@ async def test_matryoshka(
run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)

if model_info.is_matryoshka:
-valid_dimensions: list[Optional[int]] = [None]
+valid_dimensions: list[int | None] = [None]
if model_info.matryoshka_dimensions is not None:
valid_dimensions += model_info.matryoshka_dimensions[:2]

for dimensions in valid_dimensions:
await make_request_and_correctness_test(dimensions)

-invalid_dimensions: list[Optional[int]] = [-1]
+invalid_dimensions: list[int | None] = [-1]
if model_info.matryoshka_dimensions is not None:
assert 5 not in model_info.matryoshka_dimensions
invalid_dimensions.append(5)

@@ -5,7 +5,6 @@ import multiprocessing
import socket
import threading
import time
-from typing import Optional
from unittest.mock import patch

import pytest
@@ -105,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args):
assert len(manager.processes) == 3

# Create a result capture for the thread
-result: dict[str, Optional[Exception]] = {"exception": None}
+result: dict[str, Exception | None] = {"exception": None}

def run_with_exception_capture():
try:
@@ -218,7 +217,7 @@ def test_external_process_monitoring(api_server_args):
assert len(manager.processes) == 3

# Create a result capture for the thread
-result: dict[str, Optional[Exception]] = {"exception": None}
+result: dict[str, Exception | None] = {"exception": None}

def run_with_exception_capture():
try:

@@ -3,7 +3,7 @@

import warnings
from collections.abc import Mapping
-from typing import Literal, Optional
+from typing import Literal

import pytest
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
@@ -152,9 +152,9 @@ def audio_url():


def _assert_mm_data_is_image_input(
-mm_data: Optional[MultiModalDataDict],
+mm_data: MultiModalDataDict | None,
image_count: int,
-skipped_image_indices: Optional[list] = None,
+skipped_image_indices: list | None = None,
) -> None:
assert mm_data is not None
assert set(mm_data.keys()) == {"image"}
@@ -169,9 +169,9 @@ def _assert_mm_data_is_image_input(


def _assert_mm_uuids(
-mm_uuids: Optional[MultiModalUUIDDict],
+mm_uuids: MultiModalUUIDDict | None,
media_count: int,
-expected_uuids: list[Optional[str]],
+expected_uuids: list[str | None],
modality: str = "image",
) -> None:
if len(expected_uuids) > 0:
@@ -193,9 +193,9 @@ MultiModalDataCounts = Mapping[ModalityType, int]


def _assert_mm_data_inputs(
-mm_data: Optional[MultiModalDataDict],
+mm_data: MultiModalDataDict | None,
data_count: MultiModalDataCounts,
-skipped_media_indices: Optional[dict[str, list]] = None, # modality -> list[int]
+skipped_media_indices: dict[str, list] | None = None, # modality -> list[int]
) -> None:
assert mm_data is not None
assert set(data_count.keys()) == (set(mm_data.keys()))

@@ -3,7 +3,6 @@

import io
from dataclasses import dataclass
-from typing import Optional
from unittest.mock import AsyncMock, MagicMock

import pybase64
@@ -17,7 +16,7 @@ from vllm.inputs.data import is_embeds_prompt
@dataclass
class MockModelConfig:
max_model_len: int = 100
-encoder_config: Optional[dict] = None
+encoder_config: dict | None = None


class MockTokenizerResult:

@@ -12,7 +12,6 @@ import json
import os
import time
from collections.abc import Generator
-from typing import Optional, Union

import aiohttp
import numpy as np
@@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm
INVALID = -9999999


-def download_and_cache_file(url: str, filename: Optional[str] = None) -> str:
+def download_and_cache_file(url: str, filename: str | None = None) -> str:
"""Download and cache a file from a URL."""
if filename is None:
filename = os.path.join("/tmp", url.split("/")[-1])
@@ -81,9 +80,9 @@ async def call_vllm_api(
prompt: str,
temperature: float,
max_tokens: int,
-stop: Optional[list[str]] = None,
-url: Optional[str] = None,
-seed: Optional[int] = None,
+stop: list[str] | None = None,
+url: str | None = None,
+seed: int | None = None,
) -> str:
"""Call vLLM's OpenAI-compatible completions endpoint."""
data = {
@@ -112,8 +111,8 @@ def evaluate_gsm8k(
host: str = "http://127.0.0.1",
port: int = 8000,
temperature: float = 0.0,
-seed: Optional[int] = 42,
-) -> dict[str, Union[float, int]]:
+seed: int | None = 42,
+) -> dict[str, float | int]:
"""
Evaluate GSM8K accuracy using vLLM serve endpoint.


@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional

import pytest
import torch
@@ -27,8 +26,8 @@ def ref_paged_attn(
kv_lens: list[int],
block_tables: torch.Tensor,
scale: float,
-sliding_window: Optional[int] = None,
-soft_cap: Optional[float] = None,
+sliding_window: int | None = None,
+soft_cap: float | None = None,
) -> torch.Tensor:
num_seqs = len(query_lens)
block_tables = block_tables.cpu().numpy()
@@ -94,12 +93,12 @@ def test_varlen_with_paged_kv(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
head_size: int,
-sliding_window: Optional[int],
+sliding_window: int | None,
dtype: torch.dtype,
block_size: int,
-soft_cap: Optional[float],
+soft_cap: float | None,
num_blocks: int,
-q_dtype: Optional[torch.dtype],
+q_dtype: torch.dtype | None,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import random
-from typing import Optional

import pytest
import torch
@@ -50,7 +49,7 @@ def ref_masked_attention(
key: torch.Tensor,
value: torch.Tensor,
scale: float,
-attn_mask: Optional[torch.Tensor] = None,
+attn_mask: torch.Tensor | None = None,
) -> torch.Tensor:
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
if attn_mask is not None:
@@ -69,7 +68,7 @@ def ref_single_query_cached_kv_attention(
block_tables: torch.Tensor,
seq_lens: torch.Tensor,
scale: float,
-alibi_slopes: Optional[torch.Tensor],
+alibi_slopes: torch.Tensor | None,
) -> None:
num_query_heads = query.shape[1]
num_kv_heads = value_cache.shape[1]
@@ -415,7 +414,7 @@ def ref_multi_query_kv_attention(
key: torch.Tensor,
value: torch.Tensor,
scale: float,
-alibi_bias: Optional[list[torch.Tensor]],
+alibi_bias: list[torch.Tensor] | None,
dtype: torch.dtype,
) -> torch.Tensor:
num_seqs = len(cu_seq_lens) - 1

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional

import pytest
import torch
@@ -85,7 +84,7 @@ def test_cascade(
head_size: int,
dtype: torch.dtype,
block_size: int,
-soft_cap: Optional[float],
+soft_cap: float | None,
num_blocks: int,
fa_version: int,
) -> None:

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import random
-from typing import Optional

import pytest
import torch
@@ -17,7 +16,7 @@ def cal_diff(
y: torch.Tensor,
name: str,
use_fp8: bool = False,
-diff_threshold: Optional[float] = None,
+diff_threshold: float | None = None,
) -> None:
x, y = x.double(), y.double()
cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -34,8 +33,8 @@ def ref_paged_attn(
|
|||||||
kv_lens: list[int],
|
kv_lens: list[int],
|
||||||
block_tables: torch.Tensor,
|
block_tables: torch.Tensor,
|
||||||
scale: float,
|
scale: float,
|
||||||
sliding_window: Optional[int] = None,
|
sliding_window: int | None = None,
|
||||||
soft_cap: Optional[float] = None,
|
soft_cap: float | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
num_seqs = len(query_lens)
|
num_seqs = len(query_lens)
|
||||||
block_tables = block_tables.cpu().numpy()
|
block_tables = block_tables.cpu().numpy()
|
||||||
@ -103,11 +102,11 @@ def test_flash_attn_with_paged_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
num_blocks: int,
|
num_blocks: int,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
fa_version: int,
|
fa_version: int,
|
||||||
q_dtype: Optional[torch.dtype],
|
q_dtype: torch.dtype | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
if not is_fa_version_supported(fa_version):
|
if not is_fa_version_supported(fa_version):
|
||||||
@ -221,13 +220,13 @@ def test_varlen_with_paged_kv(
|
|||||||
seq_lens: list[tuple[int, int]],
|
seq_lens: list[tuple[int, int]],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
head_size: int,
|
head_size: int,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
num_blocks: int,
|
num_blocks: int,
|
||||||
fa_version: int,
|
fa_version: int,
|
||||||
q_dtype: Optional[torch.dtype],
|
q_dtype: torch.dtype | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
if not is_fa_version_supported(fa_version):
|
if not is_fa_version_supported(fa_version):
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import flashinfer
|
import flashinfer
|
||||||
import pytest
|
import pytest
|
||||||
@ -26,8 +25,8 @@ def ref_paged_attn(
|
|||||||
kv_lens: list[int],
|
kv_lens: list[int],
|
||||||
block_tables: torch.Tensor,
|
block_tables: torch.Tensor,
|
||||||
scale: float,
|
scale: float,
|
||||||
sliding_window: Optional[int] = None,
|
sliding_window: int | None = None,
|
||||||
soft_cap: Optional[float] = None,
|
soft_cap: float | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
num_seqs = len(query_lens)
|
num_seqs = len(query_lens)
|
||||||
block_tables = block_tables.cpu().numpy()
|
block_tables = block_tables.cpu().numpy()
|
||||||
@ -90,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
current_platform.seed_everything(0)
|
current_platform.seed_everything(0)
|
||||||
@ -185,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
current_platform.seed_everything(0)
|
current_platform.seed_everything(0)
|
||||||
@ -288,7 +287,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
pytest.skip("TODO: fix the accuracy issue")
|
pytest.skip("TODO: fix the accuracy issue")
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
|
|||||||
head_size: int,
|
head_size: int,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
# test doesn't work for num_heads = (16,16)
|
# test doesn't work for num_heads = (16,16)
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import flashinfer
|
import flashinfer
|
||||||
import pytest
|
import pytest
|
||||||
@ -68,9 +67,7 @@ NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation.
|
|||||||
@torch.inference_mode
|
@torch.inference_mode
|
||||||
def test_flashinfer_trtllm_decode_with_baseline(
|
def test_flashinfer_trtllm_decode_with_baseline(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
quant_dtypes: tuple[
|
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
|
||||||
],
|
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
max_seq_lens: tuple[int, int],
|
max_seq_lens: tuple[int, int],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
@ -78,7 +75,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
|||||||
kv_layout: str,
|
kv_layout: str,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
window_left: int,
|
window_left: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
has_sinks: bool,
|
has_sinks: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
@ -267,9 +264,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
|||||||
@torch.inference_mode
|
@torch.inference_mode
|
||||||
def test_flashinfer_trtllm_prefill_with_baseline(
|
def test_flashinfer_trtllm_prefill_with_baseline(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
quant_dtypes: tuple[
|
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
|
||||||
],
|
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
max_seq_lens: tuple[int, int],
|
max_seq_lens: tuple[int, int],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
@ -277,7 +272,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
|
|||||||
kv_layout: str,
|
kv_layout: str,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
window_left: int,
|
window_left: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
has_sinks: bool,
|
has_sinks: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -20,7 +19,7 @@ def merge_attn_states_torch(
|
|||||||
prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||||
suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
||||||
suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||||
output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS]
|
output_lse: torch.Tensor | None = None, # [NUM_HEADS, NUM_TOKENS]
|
||||||
):
|
):
|
||||||
p_lse = prefix_lse
|
p_lse = prefix_lse
|
||||||
s_lse = suffix_lse
|
s_lse = suffix_lse
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -32,8 +31,8 @@ def ref_paged_attn(
|
|||||||
kv_lens: list[int],
|
kv_lens: list[int],
|
||||||
block_tables: torch.Tensor,
|
block_tables: torch.Tensor,
|
||||||
scale: float,
|
scale: float,
|
||||||
sliding_window: Optional[int] = None,
|
sliding_window: int | None = None,
|
||||||
soft_cap: Optional[float] = None,
|
soft_cap: float | None = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
num_seqs = len(query_lens)
|
num_seqs = len(query_lens)
|
||||||
block_tables = block_tables.cpu().numpy()
|
block_tables = block_tables.cpu().numpy()
|
||||||
@ -98,12 +97,12 @@ def test_triton_unified_attn(
|
|||||||
seq_lens: list[tuple[int, int]],
|
seq_lens: list[tuple[int, int]],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
head_size: int,
|
head_size: int,
|
||||||
sliding_window: Optional[int],
|
sliding_window: int | None,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
soft_cap: Optional[float],
|
soft_cap: float | None,
|
||||||
num_blocks: int,
|
num_blocks: int,
|
||||||
q_dtype: Optional[torch.dtype],
|
q_dtype: torch.dtype | None,
|
||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional, Union
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -31,13 +30,13 @@ EPS = 1e-6
|
|||||||
## Helpers
|
## Helpers
|
||||||
|
|
||||||
|
|
||||||
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
|
def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
|
||||||
return torch.as_tensor(x, dtype=torch.float32, device="cuda")
|
return torch.as_tensor(x, dtype=torch.float32, device="cuda")
|
||||||
|
|
||||||
|
|
||||||
def ref_rms_norm(
|
def ref_rms_norm(
|
||||||
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]
|
rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None
|
||||||
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||||
if residual is not None:
|
if residual is not None:
|
||||||
residual = residual.clone()
|
residual = residual.clone()
|
||||||
out, residual = rms_norm_layer.forward_native(x, residual)
|
out, residual = rms_norm_layer.forward_native(x, residual)
|
||||||
@ -51,9 +50,9 @@ def ref_dynamic_per_token_quant(
|
|||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
residual: Optional[torch.Tensor],
|
residual: torch.Tensor | None,
|
||||||
scale_ub: Optional[torch.Tensor],
|
scale_ub: torch.Tensor | None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||||
if scale_ub is not None:
|
if scale_ub is not None:
|
||||||
assert quant_dtype == torch.float8_e4m3fn
|
assert quant_dtype == torch.float8_e4m3fn
|
||||||
|
|
||||||
@ -76,9 +75,9 @@ def ref_impl(
|
|||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
residual: Optional[torch.Tensor],
|
residual: torch.Tensor | None,
|
||||||
scale_ub: Optional[torch.Tensor],
|
scale_ub: torch.Tensor | None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||||
return ref_dynamic_per_token_quant(
|
return ref_dynamic_per_token_quant(
|
||||||
rms_norm_layer, x, quant_dtype, residual, scale_ub
|
rms_norm_layer, x, quant_dtype, residual, scale_ub
|
||||||
)
|
)
|
||||||
@ -88,9 +87,9 @@ def ops_dynamic_per_token_quant(
|
|||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
residual: Optional[torch.Tensor],
|
residual: torch.Tensor | None,
|
||||||
scale_ub: Optional[torch.Tensor],
|
scale_ub: torch.Tensor | None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||||
if residual is not None:
|
if residual is not None:
|
||||||
residual = residual.clone()
|
residual = residual.clone()
|
||||||
out, scales = ops.rms_norm_dynamic_per_token_quant(
|
out, scales = ops.rms_norm_dynamic_per_token_quant(
|
||||||
@ -103,9 +102,9 @@ def ops_impl(
|
|||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
residual: Optional[torch.Tensor],
|
residual: torch.Tensor | None,
|
||||||
scale_ub: Optional[torch.Tensor],
|
scale_ub: torch.Tensor | None,
|
||||||
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||||
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
|
return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,8 +1,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from collections.abc import Callable
|
||||||
from itertools import product
|
from itertools import product
|
||||||
from typing import Callable, Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -68,7 +68,7 @@ def test_rotary_embedding(
|
|||||||
seq_len: int,
|
seq_len: int,
|
||||||
num_heads: int,
|
num_heads: int,
|
||||||
head_size: int,
|
head_size: int,
|
||||||
rotary_dim: Optional[int],
|
rotary_dim: int | None,
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
seed: int,
|
seed: int,
|
||||||
device: str,
|
device: str,
|
||||||
|
|||||||
@ -4,8 +4,6 @@
|
|||||||
Tests for miscellaneous utilities
|
Tests for miscellaneous utilities
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -17,7 +15,7 @@ def rotary_embedding_opcheck(
|
|||||||
rot,
|
rot,
|
||||||
positions: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
query: torch.Tensor,
|
query: torch.Tensor,
|
||||||
key: Optional[torch.Tensor] = None,
|
key: torch.Tensor | None = None,
|
||||||
):
|
):
|
||||||
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
|
cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
@ -19,11 +18,11 @@ from vllm.platforms import current_platform
|
|||||||
def causal_conv1d_ref(
|
def causal_conv1d_ref(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
bias: Optional[torch.Tensor] = None,
|
bias: torch.Tensor | None = None,
|
||||||
initial_states: Optional[torch.Tensor] = None,
|
initial_states: torch.Tensor | None = None,
|
||||||
return_final_states: bool = False,
|
return_final_states: bool = False,
|
||||||
final_states_out: Optional[torch.Tensor] = None,
|
final_states_out: torch.Tensor | None = None,
|
||||||
activation: Optional[str] = "silu",
|
activation: str | None = "silu",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
x: (batch, dim, seqlen)
|
x: (batch, dim, seqlen)
|
||||||
@ -117,12 +116,12 @@ def causal_conv1d_update_ref(
|
|||||||
def causal_conv1d_opcheck_fn(
|
def causal_conv1d_opcheck_fn(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
bias: Optional[torch.Tensor] = None,
|
bias: torch.Tensor | None = None,
|
||||||
cu_seq_len: Optional[torch.Tensor] = None,
|
cu_seq_len: torch.Tensor | None = None,
|
||||||
cache_indices: Optional[torch.Tensor] = None,
|
cache_indices: torch.Tensor | None = None,
|
||||||
has_initial_state: Optional[torch.Tensor] = None,
|
has_initial_state: torch.Tensor | None = None,
|
||||||
conv_states: Optional[torch.Tensor] = None,
|
conv_states: torch.Tensor | None = None,
|
||||||
activation: Optional[str] = "silu",
|
activation: str | None = "silu",
|
||||||
pad_slot_id: int = PAD_SLOT_ID,
|
pad_slot_id: int = PAD_SLOT_ID,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, Optional, Union
|
from typing import Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ from .mk_objects import (
|
|||||||
from .parallel_utils import ProcessGroupInfo
|
from .parallel_utils import ProcessGroupInfo
|
||||||
|
|
||||||
|
|
||||||
def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
|
def _describe_tensor(t: torch.Tensor | None, name: str) -> str:
|
||||||
if t is None:
|
if t is None:
|
||||||
return f"{name} : None"
|
return f"{name} : None"
|
||||||
else:
|
else:
|
||||||
@ -44,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Config:
|
class Config:
|
||||||
Ms: Union[list[int], int]
|
Ms: list[int] | int
|
||||||
K: int
|
K: int
|
||||||
N: int
|
N: int
|
||||||
E: int
|
E: int
|
||||||
topks: Union[list[int], int]
|
topks: list[int] | int
|
||||||
dtype: torch.dtype
|
dtype: torch.dtype
|
||||||
quant_config: Optional[TestMoEQuantConfig]
|
quant_config: TestMoEQuantConfig | None
|
||||||
|
|
||||||
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
|
prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
|
||||||
fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute
|
fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute
|
||||||
|
|
||||||
fused_moe_chunk_size: Optional[int]
|
fused_moe_chunk_size: int | None
|
||||||
world_size: int
|
world_size: int
|
||||||
|
|
||||||
torch_trace_dir_path: Optional[str] = None
|
torch_trace_dir_path: str | None = None
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
if self.quant_config is None:
|
if self.quant_config is None:
|
||||||
@ -93,7 +93,7 @@ class Config:
|
|||||||
return self.Ms
|
return self.Ms
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def quant_dtype(self) -> Union[torch.dtype, str, None]:
|
def quant_dtype(self) -> torch.dtype | str | None:
|
||||||
assert self.quant_config is not None
|
assert self.quant_config is not None
|
||||||
return self.quant_config.quant_dtype
|
return self.quant_config.quant_dtype
|
||||||
|
|
||||||
@ -112,7 +112,7 @@ class Config:
|
|||||||
return self.quant_config.per_out_ch_quant
|
return self.quant_config.per_out_ch_quant
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def quant_block_shape(self) -> Optional[list[int]]:
|
def quant_block_shape(self) -> list[int] | None:
|
||||||
assert self.quant_config is not None
|
assert self.quant_config is not None
|
||||||
return self.quant_config.block_shape
|
return self.quant_config.block_shape
|
||||||
|
|
||||||
@ -209,7 +209,7 @@ class Config:
|
|||||||
info = prepare_finalize_info(self.prepare_finalize_type)
|
info = prepare_finalize_info(self.prepare_finalize_type)
|
||||||
return info.backend
|
return info.backend
|
||||||
|
|
||||||
def is_valid(self) -> tuple[bool, Optional[str]]:
|
def is_valid(self) -> tuple[bool, str | None]:
|
||||||
# Check prepare-finalize and fused-experts compatibility
|
# Check prepare-finalize and fused-experts compatibility
|
||||||
if self.is_batched_prepare_finalize():
|
if self.is_batched_prepare_finalize():
|
||||||
if not self.is_batched_fused_experts():
|
if not self.is_batched_fused_experts():
|
||||||
@ -280,10 +280,10 @@ class Config:
|
|||||||
class WeightTensors:
|
class WeightTensors:
|
||||||
w1: torch.Tensor
|
w1: torch.Tensor
|
||||||
w2: torch.Tensor
|
w2: torch.Tensor
|
||||||
w1_scale: Optional[torch.Tensor]
|
w1_scale: torch.Tensor | None
|
||||||
w2_scale: Optional[torch.Tensor]
|
w2_scale: torch.Tensor | None
|
||||||
w1_gs: Optional[torch.Tensor] = None
|
w1_gs: torch.Tensor | None = None
|
||||||
w2_gs: Optional[torch.Tensor] = None
|
w2_gs: torch.Tensor | None = None
|
||||||
|
|
||||||
def describe(self):
|
def describe(self):
|
||||||
s = ""
|
s = ""
|
||||||
@ -351,11 +351,11 @@ class WeightTensors:
|
|||||||
@dataclass
|
@dataclass
|
||||||
class RankTensors:
|
class RankTensors:
|
||||||
hidden_states: torch.Tensor
|
hidden_states: torch.Tensor
|
||||||
hidden_states_scale: Optional[torch.Tensor]
|
hidden_states_scale: torch.Tensor | None
|
||||||
|
|
||||||
topk_weights: torch.Tensor
|
topk_weights: torch.Tensor
|
||||||
topk_ids: torch.Tensor
|
topk_ids: torch.Tensor
|
||||||
expert_map: Optional[torch.Tensor]
|
expert_map: torch.Tensor | None
|
||||||
|
|
||||||
def describe(self):
|
def describe(self):
|
||||||
s = ""
|
s = ""
|
||||||
@ -370,7 +370,7 @@ class RankTensors:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def make_hidden_states(
|
def make_hidden_states(
|
||||||
config: Config,
|
config: Config,
|
||||||
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||||
"""
|
"""
|
||||||
Return hidden_states
|
Return hidden_states
|
||||||
"""
|
"""
|
||||||
|
|||||||
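Note: the hunks above all apply the same typing cleanup: Optional[X] and Union[A, B] annotations are rewritten as PEP 604 unions (X | None, A | B), and Callable is imported from collections.abc instead of typing. A minimal sketch of the resulting style, using a hypothetical helper that is not part of this diff:

    # Hypothetical example -- illustrates the annotation style only, not vLLM code.
    from collections.abc import Callable  # previously: from typing import Callable

    import torch


    def scaled_masked_attention(
        query: torch.Tensor,  # [num_q, head_dim]
        key: torch.Tensor,  # [num_k, head_dim]
        scale: float,
        attn_mask: torch.Tensor | None = None,  # previously: Optional[torch.Tensor]
        post_process: Callable[[torch.Tensor], torch.Tensor] | None = None,
    ) -> torch.Tensor:
        # Toy 2-D reference attention used only to exercise the annotations.
        weights = scale * torch.einsum("qd,kd->qk", query.float(), key.float())
        if attn_mask is not None:
            weights = weights + attn_mask.float()
        out = torch.softmax(weights, dim=-1)
        return post_process(out) if post_process is not None else out

The X | None syntax needs Python 3.10+ at runtime (or from __future__ import annotations), which is consistent with dropping the typing imports throughout these files.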
Some files were not shown because too many files have changed in this diff.